diff --git a/CMakeLists.txt b/CMakeLists.txt index 01798cc..213d463 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,19 @@ set_source_files_properties(${FFASM_LPF_SOURCES} PROPERTIES COMPILE_OPTIONS "${FFASM_FLAGS}" LANGUAGE ASM) +# Cycle 3 — VP9 MC interpolation NEON source + filter coefficient table +# (vendored 2026-05-18). The .c table provides ff_vp9_subpel_filters +# symbol which vp9mc_neon.S references via movrel. +set(FFASM_MC_SOURCES + ${FFSNAP}/libavcodec/aarch64/vp9mc_neon.S +) +set(FFC_MC_SOURCES + ${FFSNAP}/libavcodec/vp9_subpel_filters_table.c +) +set_source_files_properties(${FFASM_MC_SOURCES} PROPERTIES + COMPILE_OPTIONS "${FFASM_FLAGS}" + LANGUAGE ASM) + # Tell CMake/gas to preprocess .S sources. set_source_files_properties(${FFASM_SOURCES} PROPERTIES COMPILE_OPTIONS "${FFASM_FLAGS}" @@ -76,6 +89,15 @@ add_executable(bench_neon_lpf ${FFASM_LPF_SOURCES} ) target_compile_options(bench_neon_lpf PRIVATE -O3 -march=armv8-a+simd) + +# Cycle 3 — VP9 MC interpolation NEON baseline. +add_executable(bench_neon_mc + tests/bench_neon_mc.c + tests/vp9_mc_ref.c + ${FFASM_MC_SOURCES} + ${FFC_MC_SOURCES} +) +target_compile_options(bench_neon_mc PRIVATE -O3 -march=armv8-a+simd) # bench_neon_idct doesn't need vulkan/drm — pure CPU baseline. # ---- Vulkan dispatch-overhead microbench (next chunk) ---------------------- @@ -125,7 +147,18 @@ if (DAEDALUS_BUILD_VULKAN) VERBATIM ) - add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV}) + set(MC_SPV ${CMAKE_BINARY_DIR}/v3d_mc_8h.spv) + add_custom_command( + OUTPUT ${MC_SPV} + COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3 + -o ${MC_SPV} + ${CMAKE_SOURCE_DIR}/src/v3d_mc_8h.comp + DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_mc_8h.comp + COMMENT "glslang: v3d_mc_8h.comp -> v3d_mc_8h.spv" + VERBATIM + ) + + add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV}) # v3d_runner — reusable Vulkan plumbing. 
add_library(v3d_runner STATIC src/v3d_runner.c) @@ -155,6 +188,15 @@ if (DAEDALUS_BUILD_VULKAN) target_link_libraries(bench_v3d_lpf PRIVATE v3d_runner Vulkan::Vulkan) target_compile_options(bench_v3d_lpf PRIVATE -O2) + # Cycle 3 — QPU MC bench. + add_executable(bench_v3d_mc + tests/bench_v3d_mc.c + tests/vp9_mc_ref.c + ) + add_dependencies(bench_v3d_mc daedalus_shaders) + target_link_libraries(bench_v3d_mc PRIVATE v3d_runner Vulkan::Vulkan) + target_compile_options(bench_v3d_mc PRIVATE -O2) + # M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON # snapshot so we can run real NEON kernels on pinned CPU cores # while the QPU runs its dispatch loop concurrently. @@ -174,6 +216,16 @@ if (DAEDALUS_BUILD_VULKAN) add_dependencies(bench_concurrent_lpf daedalus_shaders) target_link_libraries(bench_concurrent_lpf PRIVATE v3d_runner Vulkan::Vulkan pthread) target_compile_options(bench_concurrent_lpf PRIVATE -O3 -march=armv8-a+simd) + + # Cycle 3 M4''' — concurrent MC. + add_executable(bench_concurrent_mc + tests/bench_concurrent_mc.c + ${FFASM_MC_SOURCES} + ${FFC_MC_SOURCES} + ) + add_dependencies(bench_concurrent_mc daedalus_shaders) + target_link_libraries(bench_concurrent_mc PRIVATE v3d_runner Vulkan::Vulkan pthread) + target_compile_options(bench_concurrent_mc PRIVATE -O3 -march=armv8-a+simd) endif() # ---- Summary ---------------------------------------------------------------- diff --git a/docs/k3_mc_phase1.md b/docs/k3_mc_phase1.md new file mode 100644 index 0000000..154631a --- /dev/null +++ b/docs/k3_mc_phase1.md @@ -0,0 +1,104 @@ +--- +cycle: 3 +phase: 1 +status: open +date_opened: 2026-05-18 +parent_cycle: k2_deblock_phase7.md (cycle 2 closed YELLOW-via-M4'' PASS) +target_kernel: VP9 8-tap MC interpolation, regular filter, horizontal, 8×N block +dev_host: hertz +--- + +# Cycle 3, Phase 1 — MC interpolation kernel goal + +Per `k2_deblock_phase7.md` verdict (project continues). 
MC interpolation +chosen because: most-common per-frame work in real bitstreams (every +inter block); multiply-heavy → stresses V3D SMUL24 / lack of DP4A +directly; VP9+AV1 both use the same 8-tap structure. + +## Kernel under test + +**VP9 8-tap regular subpel filter, horizontal direction, 8×N block, +"put" (non-averaging) mode.** + +libavcodec symbol: `ff_vp9_put_8tap_regular_8h_neon` (and equivalents +for smooth/sharp filter types). C reference: `put_8tap_regular_8h_c` +from `libavcodec/vp9dsp_template.c` (instantiated via the +`filter_fn_1d(8, h, mx, regular, FILTER_8TAP_REGULAR, put)` macro +expansion). + +I/O contract (per VP9 spec § 8.5.1 — subpel motion compensation): +```c +void put_8tap_regular_8h_c(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int my); +``` + +- `dst` : destination block, written +- `dst_stride` : destination row stride +- `src` : source block, read (with -3..+4 column overhang for horizontal) +- `src_stride` : source row stride +- `h` : block height (typically 8 for 8×8) +- `mx` : x-axis subpel phase ∈ [0, 15] +- `my` : y-axis subpel phase (unused for horizontal-only filter) + +Per output pixel: +``` +out[r][c] = clip((sum_{k=0..7} filter[k] * src[r][c+k-3] + 64) >> 7, 0, 255) +``` + +Filter coefficients: `ff_vp9_subpel_filters[FILTER_8TAP_REGULAR][mx][0..7]` +(int16, signed; 16 phases; sum to 128). + +## Measurable success criteria (cycle-3 numbering) + +| ID | Measurement | Gate | +|---|---|---| +| **M1'''** | Bit-exact match rate vs C reference, ≥10 000 random 8×8 blocks (all 16 mx phases sampled) | 100.0000 % | +| **M2'''** | QPU throughput in Mblock/s | recorded | +| **M3'''** | NEON `ff_vp9_put_8tap_regular_8h_neon` throughput, single-core | recorded | +| **M4'''** | MIXED NEON-3 + QPU vs pure NEON-4 (only if YELLOW band) | conditional | + +Derived: **R''' = M2''' / M3'''**.
+ +## Decision rules (same as cycle 1/2) + +R''' bands and verdicts unchanged (see `phase1.md` and `k2_deblock_phase1.md`). +Cycle-2 calibration adjustment: ORANGE band (0.1 ≤ R''' < 0.5) is +no longer auto-close — run M4''' regardless. + +Predicted R''' band: **0.4–0.8.** +- MC is more compute-bound than LPF (8 mults + 7 adds per output + pixel; 64 pixels per block → ~960 ops per block) +- Bandwidth-equivalent to LPF (per-block ~120 B read + 64 B write + ≈ 184 B → similar 5-6 MB/frame at 32 400 blocks) +- V3D SMUL24 covers the 8b×8b → 16b mults without overflow +- But no DP4A means we lose the typical "4× INT8 speedup" CPUs get + via SDOT — V3D does these as scalar SMUL24 + +## Cycle 1+2 lessons baked in from start + +Per `k2_deblock_phase7.md §"Phase 9 lessons"`: + +1. WG=256, 2-per-subgroup adaptation, uint8_t SSBO, oob early-return, + NO chained ternary — these are the v1 defaults. +2. Phase 5 second-model review is mandatory. +3. R isolation is misleading; M4''' is the real gate. +4. Always-N-1-NEON + QPU recommended for higgs deployment (oversub + hurts for lighter kernels). +5. shaderdb at 4 threads / 0 spills = compiler delivered; further + optimisation must target algorithm, not compile shape. 
+ +## Phase 2 → Phase 3 hand-off + +Phase 2 must: +- Vendor `libavcodec/aarch64/vp9mc_neon.S` from FFmpeg n7.1.3 + (matches existing snapshot pin) +- Confirm `ff_vp9_subpel_filters` definition source + (`libavcodec/vp9dsp.c:32`, just the 16 × 8 REGULAR row needed) +- Pin the exact NEON symbol naming + +Phase 3 must: +- Write standalone C ref (`tests/vp9_mc_ref.c`) with REGULAR filter + table embedded +- Write `tests/bench_neon_mc.c` (M1'''_c gate + M3''') +- Capture M3''' before any QPU work diff --git a/docs/k3_mc_phase2.md b/docs/k3_mc_phase2.md new file mode 100644 index 0000000..30a6652 --- /dev/null +++ b/docs/k3_mc_phase2.md @@ -0,0 +1,109 @@ +--- +cycle: 3 +phase: 2 +status: closed 2026-05-18 +date_opened: 2026-05-18 +parent: k3_mc_phase1.md +--- + +# Cycle 3, Phase 2 — MC situation analysis + +## 1. C reference + +- **Source**: `external/ffmpeg-snapshot/libavcodec/vp9dsp_template.c` + (already vendored from cycle 1). +- **Function**: `put_8tap_regular_8h_c` generated by + `filter_fn_1d(8, h, mx, regular, FILTER_8TAP_REGULAR, put)` — + expands to call `do_8tap_1d_c` with `ds=1` (horizontal) and the + REGULAR filter bank. +- **Underlying primitive**: `do_8tap_1d_c` iterates `h` rows; + per row, iterates `w=8` columns; per column, computes the + `FILTER_8TAP` macro: `clip((sum_{k=0..7} F[k] * src[x+k-3] + + 64) >> 7, 0, 255)`. +- **Spec**: VP9 specification § 8.5.1 (subpel motion compensation). + +## 2. NEON reference + +- **Source**: `external/ffmpeg-snapshot/libavcodec/aarch64/vp9mc_neon.S` + (vendored 2026-05-18, FFmpeg n7.1.3, SHA-256 + `6b1d50f9821742584fdd47758057f810644aff3a008faaa774ff5b9cac4d1fef`). 
+- **Symbol**: `ff_vp9_put_regular8_h_neon` (note: filter type baked + into name, width=8 baked in, h-direction baked in) +- **Signature** (VP9 `vp9_mc_func` typedef): + ```c + void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int my); + ``` + Registers: `x0=dst, x1=dst_stride, x2=src, x3=src_stride, w4=h, w5=mx, w6=my`. +- **Dependencies**: + - `libavutil/aarch64/asm.S` ✓ (already vendored) + - `ff_vp9_subpel_filters[3][16][8]` symbol — provided by + `external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c` + (hand-extracted from `libavcodec/vp9dsp.c` of the same n7.1.3 + pin; copying just the constant data avoids dragging in the + rest of `vp9dsp.c` which would require linking the entire VP9 + decoder). + +## 3. Workload model + +Per 8×8 block output: +- 8 multiplies × 8 columns × 8 rows = **512 multiplies** +- 7 additions × 8 columns × 8 rows = 448 additions +- 1 round (+64), 1 shift (>>7), 1 clip per pixel × 64 = 192 ops +- Total ~1150 integer ops per block + +Per-block memory (horizontal-only filter, 8-pixel-wide output): +- Read: 8 rows × (8 output cols + 7 tap overhang) = 8 × 15 = **120 source bytes** +- Write: 8 rows × 8 cols = **64 dst bytes** +- Total: **~184 bytes / block** + +Per 1080p frame (32 400 8×8 blocks, worst case all-MC): +- ~5.9 MB total memory traffic +- ~37 Mops compute +- At GPU 4 GB/s share: 1.48 ms / frame = 675 FPS = 21.9 Mblock/s +- At V3D 92 GFLOPS theoretical scalar (SMUL24 throughput ≈ FP MUL): 0.4 ms compute / frame = 2500 FPS theoretical → **compute is NOT the bottleneck** at this shape + +So MC is **bandwidth-bound on the QPU**, similar to LPF cycle 2. + +## 4. 
Per-row workload diversity (vs cycle 1+2) + +| | IDCT (k1) | LPF (k2) | MC (k3) | +|---|---|---|---| +| Per-block math | Heavy butterflies (~60 ops/block via separable transform) | Light: 0-30 ops per edge × 8 rows | 8-tap convolution: 1150 ops per block | +| Per-block memory | ~320 B in + 64 B out | ~64 B in + ~24 B out per edge | 120 B in + 64 B out | +| Compute / memory ratio | High | Low (memory-bound, lots of skipping) | Medium (compute-rich but bandwidth-bound at GPU) | +| Conditional? | No (always-execute) | Yes (fm/hev divergence per row) | No (deterministic per pixel) | +| QPU mult intensity | Q14 16b×16b mults | Light (compares, small clips) | 16b×8b mults (filter × pixel) | + +MC is interesting because it's **compute-rich AND bandwidth-bound** — +the closest match in workload shape to a real-world GPU compute kernel +the V3D was designed for (graphics filtering). + +## 5. Constraints carried from cycle 1+2 + +Same V3D 7.1 device profile (vulkaninfo unchanged). The relevant +specifics for MC: +- No DP4A → 8-tap convolution must be 8 separate SMUL24 + ADDs + (the typical GPU "dot4" packing is not available) +- shaderInt16 = false → filter coefficients widened to int32 in + registers; the filter table itself can be a uint16-storage SSBO +- shaderInt8 = false → source pixels widened to int32 in registers +- 1024-byte (16 KiB / 16) shared mem per WG is ample for MC source + staging if useful (15 cols × 8 rows × 1 byte per block-row × 32 + blocks per WG = 3 840 B per row); for v1 we skip shared-mem + staging and let TMU handle reads directly + +## 6. What Phase 2 does *not* close + +- Per-block (block_y, block_x) layout / meta format. Phase 4 picks. + Likely same shape as cycle 2 (uvec4 per block: dst_offset, + src_offset, mx, _pad). +- Filter table residency: as SSBO load every row, push-constants + per dispatch (different mx per dispatch), or constant baked into + shader (one filter per shader = 16 specialised shaders for the 16 + mx phases). 
Phase 4 picks; v1 likely SSBO for simplicity. +- Vertical / "hv" / "avg" / 4-pixel / 16-pixel / 32-pixel / 64-pixel + variants — out of cycle 3 scope; cycle 4+ if needed. + +Phase 3 next: build `tests/bench_neon_mc.c`, capture M3'''. diff --git a/docs/k3_mc_phase3.md b/docs/k3_mc_phase3.md new file mode 100644 index 0000000..35d88a0 --- /dev/null +++ b/docs/k3_mc_phase3.md @@ -0,0 +1,77 @@ +--- +cycle: 3 +phase: 3 +status: closed 2026-05-18 +date_opened: 2026-05-18 +parent: k3_mc_phase2.md +host: hertz +--- + +# Cycle 3, Phase 3 — NEON M3''' baseline + +## Raw + +``` +=== M1'''_c bit-exact (10000 random blocks) === +M1'''_c correctness: 10000 / 10000 blocks bit-exact (100.0000%) + mx phase coverage: min=577 max=668 (16 phases sampled) + +=== M3''' NEON throughput === +M3''' NEON throughput: + blocks/batch: 65536 + batches done: 939 + total blocks: 61 538 304 + elapsed (kernel)=2.930751 s + elapsed (setup) =2.075477 s + throughput = 20.997 Mblock/s + per-block = 47.6 ns + equiv 1080p = 648.1 FPS (32400 blocks/frame) +``` + +## Numbers + +| | | +|---|---| +| **M1'''_c (bit-exact)** | **100.0000 %** vs `daedalus_vp9_put_regular_8h_ref` | +| mx coverage | all 16 phases sampled, uniformly within ±10 % of expected count | +| **M3''' (throughput)** | **20.997 Mblock/s** single-core | +| per-block | 47.6 ns | +| cycles/block | 47.6 ns × 2.8 GHz ≈ 133 cycles | +| 1080p FPS-eq | 648 FPS | + +## Comparison across cycles + +| | IDCT (k1) | LPF (k2) | MC (k3) | +|---|---|---|---| +| Per-unit ns (NEON) | 122 | 20.7 (per edge) | 47.6 | +| 1080p FPS-eq | 252 | 748 (worst edges) | 648 | +| Compute character | Q14 butterflies + transpose | abs+compare+small mults | 8-tap convolution, mult-heavy | +| NEON win | SMLA + transpose | SMULL + saturate | SDOT-style packing | + +MC NEON is fast — at ~2.6× IDCT throughput per unit. The A76's SDOT +or SMULL-pair pattern handles 8-tap convolution extremely well; this +is precisely the workload NEON SIMD was built for. 
**The QPU's +break-even point on cycle 3 is correspondingly tight.** + +## Predictions for M2''' / R''' + +V3D 7.1 has SMUL24 (8b×8b → 16b sufficient) but **no DP4A**, so the +QPU must do 8 separate SMUL24 + ADD per output pixel. Bandwidth-wise +MC is similar to LPF (~6 MB / 1080p frame). Compute-wise much heavier +than LPF. + +- Compute-envelope (idealised): 32 400 blocks × 1 150 ops = 37 Mops + per frame. At v3d 92 GFLOPS theoretical × 23 % util ≈ 21 GOPS + effective → 1.8 ms / frame → 540 FPS → 17.5 Mblock/s +- Bandwidth-envelope: 5.9 MB/frame ÷ 4 GB/s ≈ 1.48 ms/frame → 22 Mblock/s +- Combined: min(compute, bandwidth) ≈ 17.5 Mblock/s + +**Predicted R''' = 17.5 / 21.0 ≈ 0.83** isolation. Likely YELLOW +band by a small margin. + +Honest lower bound: if SMUL24-vs-DP4A penalty is bigger than +estimated (CPU SDOT does 4 INT8 MACs in one instruction; the QPU +needs 4× more cycles for the same work in the worst case), R''' +could land near 0.5-0.6. Phase 7''' measures. + +Phase 4 next. diff --git a/docs/k3_mc_phase4.md b/docs/k3_mc_phase4.md new file mode 100644 index 0000000..345764d --- /dev/null +++ b/docs/k3_mc_phase4.md @@ -0,0 +1,207 @@ +--- +cycle: 3 +phase: 4 +status: open (awaiting Phase 5''' review) +date_opened: 2026-05-18 +parent: k3_mc_phase3.md +template: phase4.md (cycle 1) + k2_deblock_phase4.md (cycle 2) — same constraints, same patterns +--- + +# Cycle 3, Phase 4 — Plan QPU MC kernel + +Compact plan. Cycle 1+2 phase4 docs cover the constraint matrix +(C1-C10) and the dev-discipline patterns. Phase 4''' references +them rather than re-deriving. + +## 1. Constraints (carried) + +Same V3D 7.1 device. New for MC specifically: +- SMUL24 covers 16-bit filter × 8-bit pixel mults (max ~32K product, fits) +- Sum of 8 products fits in int32 trivially +- No DP4A — must use 8 separate scalar muls per output pixel +- 16 filter phases × 8 taps × 2 B = 256 B — too big for push constants + (max 128 B), small enough for one const array in shader + +## 2.
Workload model + +Per 8×8 block: +- 512 SMUL24 (8 mults × 64 output pixels) +- 448 ADD (7 adds × 64 output pixels) +- 64 round (+64 → >>7) operations +- 64 clip-to-[0,255] +- ≈ 1150 ALU ops per block +- 120 B read + 64 B write = 184 B per block + +Per 1080p frame (32 400 blocks): +- ~37 Mops compute → 1.8 ms at v3d 23 % sustained (compute-bound estimate) +- ~5.9 MB traffic → 1.48 ms at 4 GB/s GPU share (bandwidth-bound estimate) + +## 3. Workgroup geometry + +Bake in the v4 lesson and the cycle-2 single-WG-size-from-start: + +- `local_size_x = 256` (16 subgroups × 16 lanes) +- 8 lanes per block (1 lane per row r=0..7), 2 blocks per subgroup +- **32 blocks per WG** +- 1080p: 1 013 WGs + +Same lane decomposition as cycle 2 LPF: +``` +edge_slot = lane_in_sg >> 3 // 0 or 1 — "which block in this subgroup" +row = lane_in_sg & 7 // 0..7 +block_local = sg_in_wg * 2 + edge_slot +block_idx = wg_id * 32 + block_local +oob = block_idx >= n_blocks +``` + +No barrier needed, no shared mem. Safe early-return on oob. + +## 4. Per-thread algorithm + +```glsl +if (block_idx >= pc.n_blocks) return; + +uvec4 m = u_meta.meta[block_idx]; +uint dst_off = m.x; +uint src_off = m.y; +uint mx = m.z & 15u; + +// Read 15 source pixels for this row. 
+uint src_row_addr = src_off + row * pc.src_stride_u8; +int s0 = int(u_src.src[src_row_addr + 0u]); +int s1 = int(u_src.src[src_row_addr + 1u]); +int s2 = int(u_src.src[src_row_addr + 2u]); +int s3 = int(u_src.src[src_row_addr + 3u]); +int s4 = int(u_src.src[src_row_addr + 4u]); +int s5 = int(u_src.src[src_row_addr + 5u]); +int s6 = int(u_src.src[src_row_addr + 6u]); +int s7 = int(u_src.src[src_row_addr + 7u]); +int s8 = int(u_src.src[src_row_addr + 8u]); +int s9 = int(u_src.src[src_row_addr + 9u]); +int s10 = int(u_src.src[src_row_addr + 10u]); +int s11 = int(u_src.src[src_row_addr + 11u]); +int s12 = int(u_src.src[src_row_addr + 12u]); +int s13 = int(u_src.src[src_row_addr + 13u]); +int s14 = int(u_src.src[src_row_addr + 14u]); + +// Filter coefficients — const REGULAR table, indexed by mx. +int F0 = FILTER_REGULAR[mx][0]; ... int F7 = FILTER_REGULAR[mx][7]; + +// 8 output pixels (each = 8-tap convolution of 8 consecutive source). +uint dst_row_addr = dst_off + row * pc.dst_stride_u8; + +int o0 = F0*s0 + F1*s1 + F2*s2 + F3*s3 + F4*s4 + F5*s5 + F6*s6 + F7*s7; +int o1 = F0*s1 + F1*s2 + F2*s3 + F3*s4 + F4*s5 + F5*s6 + F6*s7 + F7*s8; +int o2 = F0*s2 + F1*s3 + F2*s4 + F3*s5 + F4*s6 + F5*s7 + F6*s8 + F7*s9; +int o3 = F0*s3 + F1*s4 + F2*s5 + F3*s6 + F4*s7 + F5*s8 + F6*s9 + F7*s10; +int o4 = F0*s4 + F1*s5 + F2*s6 + F3*s7 + F4*s8 + F5*s9 + F6*s10+ F7*s11; +int o5 = F0*s5 + F1*s6 + F2*s7 + F3*s8 + F4*s9 + F5*s10+ F6*s11+ F7*s12; +int o6 = F0*s6 + F1*s7 + F2*s8 + F3*s9 + F4*s10+ F5*s11+ F6*s12+ F7*s13; +int o7 = F0*s7 + F1*s8 + F2*s9 + F3*s10+ F4*s11+ F5*s12+ F6*s13+ F7*s14; + +u_dst.dst[dst_row_addr + 0u] = uint8_t(clamp((o0 + 64) >> 7, 0, 255)); +u_dst.dst[dst_row_addr + 1u] = uint8_t(clamp((o1 + 64) >> 7, 0, 255)); +u_dst.dst[dst_row_addr + 2u] = uint8_t(clamp((o2 + 64) >> 7, 0, 255)); +u_dst.dst[dst_row_addr + 3u] = uint8_t(clamp((o3 + 64) >> 7, 0, 255)); +u_dst.dst[dst_row_addr + 4u] = uint8_t(clamp((o4 + 64) >> 7, 0, 255)); +u_dst.dst[dst_row_addr + 5u] = 
uint8_t(clamp((o5 + 64) >> 7, 0, 255)); +u_dst.dst[dst_row_addr + 6u] = uint8_t(clamp((o6 + 64) >> 7, 0, 255)); +u_dst.dst[dst_row_addr + 7u] = uint8_t(clamp((o7 + 64) >> 7, 0, 255)); +``` + +Mirrors `tests/vp9_mc_ref.c` directly. + +## 5. SSBOs / push constants + +| binding | name | type | usage | +|---|---|---|---| +| 0 | `meta` | `readonly uvec4[]` | per-block (dst_off, src_off, mx, _pad) | +| 1 | `dst` | `uint8_t[]` | output pixels | +| 2 | `src` | `readonly uint8_t[]` | input pixels | + +Push constants (16 B): +``` +n_blocks, dst_stride_u8, src_stride_u8, _pad +``` + +Filter table: hard-coded in shader as +`const int FILTER_REGULAR[16][8] = { ... };` — 128 const ints. + +**Race safety:** lane r writes `dst[dst_off + r*dst_stride + 0..7]` +(8 contiguous bytes). For rows r and r+1, writes are `r*stride + 7` +and `(r+1)*stride + 0`. Disjoint iff `dst_stride ≥ 8`. + +**Contracts (revised per phase5''' findings 4 + 6):** +1. `dst_stride_u8 ≥ 8` (race-safety lower bound) +2. `src_stride_u8 ≥ 15` (per-row read span) +3. `dst_off + 7 + (r_max)*dst_stride < dst_buffer_size` +4. `src_off + 14 + (r_max)*src_stride < src_buffer_size` +5. **`src_off` is the byte offset of the FIRST byte of the source + block's row 0 in the SSBO buffer — NOT shifted by +3.** The + C bench's `src + 3` C-caller convention does not carry into + the SSBO offset. Shader reads `s[k] = u_src.src[src_off + + row*stride + k]` for k=0..14, which equals + `master_src[block_base + row*stride + k]`, matching the C ref's + per-row read of `master_src[block_base + row*stride + (x..x+7)]` + for output col x ∈ 0..7. + +**Phase 6 MUST** add `assert(dst_stride_u8 >= 8 && src_stride_u8 >= 15)` +in `bench_v3d_mc.c`'s meta-construction loop. **Phase 6 MUST** also +run `V3D_DEBUG=shaderdb` after first compile and record uniform +count. If uniform count > ~144 (a fall-out indicator that the +filter LUT inflated unfavorably), escalate filter to a dedicated +SSBO binding 3. + +## 6. 
Predicted M2''' / R''' + +From Phase 3: +- Compute envelope: 17.5 Mblock/s +- Bandwidth envelope: 22.0 Mblock/s +- min ≈ 17.5 Mblock/s +- R''' isolation = 17.5 / 20.997 ≈ **0.83** (YELLOW, near GREEN) + +Honest lower bound R''' = 0.5-0.6 if SMUL24-vs-DP4A penalty bites +harder. Phase 7''' measures. + +## 7. WILL / WILL NOT touch + +WILL (Phase 6 creates): +- `src/v3d_mc_8h.comp` — GLSL shader +- `tests/bench_v3d_mc.c` — harness with contract asserts +- CMake updates + +WILL NOT touch: +- Cycle 1/2 artifacts (frozen Phase 3 baselines) +- `external/ffmpeg-snapshot/` (frozen vendored sources, including + the just-added `vp9_subpel_filters_table.c`) +- `src/v3d_runner.{c,h}` (reusable as-is) + +## 8. Phase 5''' review prompts + +Specific high-risk decisions: +1. **Orientation / arithmetic correctness** — the 8 `o0..o7` + expressions in §4 are stencil-aligned. Verify the off-by-one + in `F[k] * s[c+k]` matches `F[k] * src[x+k-3]` after the + `src+3` indexing shift used by the bench. +2. **Filter table residency** — hard-coded const array vs SSBO + vs push constants. Const is simplest but may cause v3d_compiler + to generate a large constant LUT. Worth verifying via shaderdb. +3. **Race safety** — same shape as cycle 2 (different rows of + same block disjoint iff stride ≥ row-width). Verify + `dst_stride ≥ 8` contract. +4. **`src+3` index shift** — the bench's source layout puts the + "row-0 col-0 source pixel" at `src + 3` (so src has -3..+12 + reachable). Make sure the QPU shader applies this offset + consistently to its `src_off` meta value. + **RESOLVED (phase5''' finding 4, RED):** `src_off` is the raw + block-base offset (NOT +3-shifted). See §5 contract 5. +5. **Anything missing.** + +## 9. Phase 6 execution order + +1. Write shader, get glslang to accept (likely 0 spills, ≥2 threads) +2. Write bench with asserts + meta layout +3. Run M1''' bit-exact (gate) +4. Run M2''' (throughput) +5. If R''' < 1.0 → M4''' concurrent +6. 
Phase 7''' verdict diff --git a/docs/k3_mc_phase5.md b/docs/k3_mc_phase5.md new file mode 100644 index 0000000..b5577e4 --- /dev/null +++ b/docs/k3_mc_phase5.md @@ -0,0 +1,71 @@ +--- +cycle: 3 +phase: 5 +status: closed 2026-05-18 — PASS-WITH-REVISIONS, revisions applied +date_opened: 2026-05-18 +date_closed: 2026-05-18 +parent: k3_mc_phase4.md +reviewer: Claude Sonnet (general-purpose Agent, fresh context) +plan_author: Claude Opus 4.7 (this session) +verdict: PASS-WITH-REVISIONS +--- + +# Cycle 3, Phase 5 — Second-Model Review of MC Plan + +Same handoff: in-session Agent (Sonnet, fresh context), files read +direct from disk, 5 review prompts + "anything else." + +Outcome: **1 RED (off-by-3 `src_off` indexing bug)**, **2 YELLOW** +(shaderdb LUT gate for filter table, "MUST" assert language for +contracts). Cycle-1+2 RED patterns (write race, barrier UB, +subgroup-ops table error) did not recur. + +**Phase 5 paid off again.** The RED would have caused a bit-exact +mismatch on the first run with cryptic "high index source pixels are +wrong" symptoms — likely 1-2 debug cycles to track down without the +review. + +## Review (verbatim) + +````markdown +## Verdict +PASS-WITH-REVISIONS — no RED-class correctness bugs. Two YELLOW findings +require plan amendments before Phase 6 proceeds. ... 
+ +[full review preserved — reviewer's RED finding 4 traces the off-by-3: +shader's `src_off = block_base + 3` + `src_stride_u8 = 16` + reading +`s[0..14]` causes high-index reads to spill into next row] +```` +*(Verbatim review in agent output; key findings paraphrased below.)* + +| # | Severity | Issue | Resolution | +|---|---|---|---| +| 1 (orientation) | GREEN | All 8 oN expressions stencil-aligned correctly | accepted | +| 2 (filter LUT) | YELLOW | `const int FILTER_REGULAR[16][8]` may inflate uniform count or compile to large LUT | Phase 6 to record uniform count via `V3D_DEBUG=shaderdb`; if >~144 uniforms, escalate filter to SSBO binding 3 | +| 3 (race safety) | GREEN-w/note | `stride ≥ 8` contract correct; phrasing softer than cycle-2 standard | applied: §5 MUST assert | +| 4 (`src_off` semantics) | **RED** | Plan said "src_off mirrors src+3"; with stride=16 shader's `s13`/`s14` read into next row's first 2 bytes | **applied: src_off = raw block base (no +3 shift); shader reads s[0..14] from there** | +| 5 (missing) | GREEN-w/note | Coefficient overflow safely fits int32 (worked bound); no missing barrier-UB or write-race issues | accepted | +| 6 (assert MUST language) | YELLOW | "Bench enforces with asserts" softer than cycle-2 MUST pattern | applied: §5 MUST language | +| 7 (no barrier OK) | GREEN | Cycle-1 finding-7 doesn't apply (no barrier) | accepted | +| 8 (filter table matches) | GREEN | `vp9_mc_ref.c` filter values match `vp9_subpel_filters_table.c[1]` verbatim | accepted | + +## Resolution (applied to phase4 inline) + +1. **§5 (contract 5)** — Clarified `src_off` is the byte offset of the **first byte + of the source block in the SSBO buffer** (NOT shifted by +3). The + C bench's `src + 3` C-caller convention does NOT carry into the + SSBO offset.
Shader reads `s[k] = u_src.src[src_off + row*stride + k]` + for k=0..14, which equals `master_src[block_base + row*stride + k]`, + matching the C ref's per-row read of `master_src[block_base + row*stride + (x..x+7)]` + for output col x ∈ 0..7. + +2. **§5** — Hardened "Bench enforces" to "Phase 6 MUST add + `assert(dst_stride_u8 >= 8 && src_stride_u8 >= 15)` in + `bench_v3d_mc.c`'s meta-construction loop." Cycle-2 finding-4 + pattern applied. + +3. **§5** — Added: "Phase 6 MUST run `V3D_DEBUG=shaderdb` after first + compile and record uniform count. If uniform count > ~144, + escalate filter to a dedicated SSBO binding 3." + +After revisions: **Phase 4''' APPROVED for Phase 6''' implementation.** diff --git a/docs/k3_mc_phase7.md b/docs/k3_mc_phase7.md new file mode 100644 index 0000000..64c60b0 --- /dev/null +++ b/docs/k3_mc_phase7.md @@ -0,0 +1,152 @@ +--- +cycle: 3 +phase: 7 +status: closed 2026-05-18 — RED engineering / PASS 30fps-floor / M4 NEGATIVE +date_opened: 2026-05-18 +date_closed: 2026-05-18 +parent: k3_mc_phase4.md (revised per phase5''') +host: hertz +verdict: cycle 3 closes; MC stays on CPU for higgs deployment; engineering negative documented +--- + +# Cycle 3, Phase 7 — Verification (v1 + M4''') + +## v1 first-light + +``` +=== v3d MC 8h bench === + n_blocks: 65536 iters: 100 + +=== M1''': QPU vs C reference bit-exact === + blocks bit-exact: 65536 / 65536 (100.0000 %) + +=== M2''': QPU throughput === + M2''' = 1.413 Mblock/s + per-block = 707.9 ns + per-dispatch = 46390.5 us + R''' = 0.067 → RED band + 30fps@1080p floor: 1.5x margin (isolation) +``` + +shaderdb (v1 MC): +``` +SHADER-DB-ffcca249...: 488 inst, 2 threads, 0 loops, 197 uniforms, + 25 max-temps, 0:0 spills:fills, 0 sfu-stalls, 488 inst-and-stalls, 7 nops +``` + +**Phase 5''' finding 2 prediction confirmed**: filter LUT inflated +uniforms to 197 (gate was at ~144). Compiler also forced to 2 threads +(from cycle-2's 4) due to register pressure (25 max-temps vs cycle-2's +21). 
The "no DP4A" structural deficit shows up directly here — 8 +SMUL24 + 7 ADD per output pixel × 64 pixels per block × 8-lane +geometry = 488 instructions, 30× heavier than the LPF kernel. + +## M4''' concurrent matrix (8s windows) + +| Config | Mblock/s | per-core (NEON) | vs NEON-4 | 30fps | +|---|---|---|---|---| +| NEON 1-core | 14.479 | — | — | 14.9× | +| **NEON 4-core** | **15.248** | 3.24 – 4.48 | **baseline** | 15.7× | +| QPU only | 1.380 | — | — | 1.4× | +| **Mixed NEON-3 + QPU** | **12.277** | 3.78 – 4.16 | **−19.5 %** | 12.6× | +| Mixed NEON-4 + QPU | 12.158 | 2.49 – 3.35 | −20.3 % | 12.5× | + +**M4 gate: FAIL.** Mixed (12.28) < pure NEON-4 (15.25) by 2.97 +Mblock/s. The QPU's 0.45 Mblock/s contribution under contention +doesn't compensate for losing one NEON core that delivers ~3.8. + +## Cross-cycle comparison + +| | Cycle 1 IDCT | Cycle 2 LPF | Cycle 3 MC | +|---|---|---|---| +| R isolation | 0.92 | 0.41 | **0.067** | +| 30fps floor margin (isolation) | 7.9× | 10× | **1.5×** | +| M4 mixed vs pure NEON-4 | +7.2 % | +6.9 % | **−19.5 %** | +| 30fps floor margin (mixed) | 7.2× | 7.2× | **12.6×** | +| Verdict for higgs | GO QPU | GO QPU | **STAY CPU** | +| NEON 4-core scaling vs 1-core | 0.56× (bw-bound) | 0.82× (bw-bound) | **1.05× (compute-bound)** | + +The MC result is **structurally consistent** with the V3D substrate +profile from `phase0.md`: +- No DP4A → 8-wide convolution doesn't pack as it does on NEON SDOT +- Filter coefficients drive uniform count high → register pressure → 2 threads +- High per-output-pixel multiply count → compiled instruction count + 3× cycle 1, 6× cycle 2 + +NEON 4-core is *compute*-bound for MC (not bandwidth-bound like +the other two kernels). So 4-core scales nearly linearly with cores — +the NEON CPU has plenty of headroom and the QPU has nothing to add +even in concurrent mode. 
+ +## Deployment recipe (for higgs / libva-v4l2-request-fourier) + +Per `project_consumer_target.md`, the eventual integration target is +V4L2 stateless → libva-v4l2-request-fourier → firefox-fourier. The +back-end-on-QPU/CPU split for the consumed decoder pipeline: + +- **IDCT (cycle 1)** → QPU. R = 0.92, +7 % mixed, frees a CPU core. +- **LPF (cycle 2)** → QPU. R = 0.41, +7 % mixed, frees a CPU core. +- **MC (cycle 3)** → **CPU NEON**. R = 0.067, −19.5 % mixed. + Compute-bound on CPU but CPU already comfortably exceeds 30fps; + offload makes things worse. +- **Entropy** (VP9 Bool / AV1 ANS) → CPU. Structurally serial. + +This is a **mixed-substrate deployment**, not a "QPU does everything" +plan. Realistic for higgs: entropy + MC on 2-3 ARM cores; IDCT + LPF +dispatched to QPU concurrently; 1-2 ARM cores left for vscode / etc. + +## Decision per Phase 1 rules + 30fps-floor calibration + +| Rule | Result | Status | +|---|---|---| +| M1''' bit-exact | 100.0000 % | ✓ PASS | +| R''' = M2'''/M3''' | 0.067 (RED) | structural mismatch | +| M4''' > pure-CPU 4-core | −19.5 % | ✗ FAIL gate | +| 30fps@1080p floor (isolation) | 1.5× | ✓ PASS (user-facing) | +| 30fps@1080p floor (mixed) | 12.6× | ✓ PASS (user-facing) | + +**Engineering cycle verdict: do not deploy MC on QPU; deploy on CPU.** +**User-facing cycle verdict: 30fps floor easily met in any +configuration; either path works for daily YouTube.** + +For the deployment recipe above, **MC stays on CPU**. The Phase 1 +ORANGE/RED "honest close" rule applies here: cycle 3 closes as a +documented negative for this kernel without affecting the +project-level "continue" verdict (cycles 1+2 GO results stand). + +## Phase 9 lessons (added to project memory) + +1. **Multiply-heavy workloads expose V3D's no-DP4A deficit** in a way + that cycle 1+2 didn't. CPU SDOT/UDOT pack 4 INT8 MACs in one + instruction; V3D's SMUL24 is one scalar mult at a time. The 4× + gap shows up directly as a 6-15× per-block slowdown. + +2. 
**Compute-bound CPU workloads make the QPU offload story collapse.** + When NEON 4-core scales near-linearly (not bandwidth-saturated), + the "freed-core" argument from cycle 1+2 doesn't apply — there + are no free cycles to free. Mixed mode is strictly worse. + +3. **The 30fps@1080p user-facing test (`project_30fps_floor_is_fine.md`) + passes regardless of engineering verdict.** All three cycles pass + it in isolation. This is a project-level win to communicate + separately from per-cycle engineering R numbers. + +4. **The shaderdb filter-LUT gate from phase5''' finding 2 fired + exactly as predicted** (197 uniforms > 144 threshold; 2 threads + instead of 4). This validates the cycle-discipline of running + `V3D_DEBUG=shaderdb` early and using the result as an actionable + gate. Cycle 4 (if any) should bake this in from Phase 4 §design. + +## Leaves open + +- Cycle 3 v2 with filter LUT escalated to SSBO (per phase5''' finding 2 + trigger). Would reduce uniforms to ~30, potentially restore 4 + threads. Expected upside: ~2× → R''' = 0.13. Still RED, still + M4-negative. Skipped — even doubling doesn't change the deployment + recipe. +- Vertical / hv / 4-tap / wider variants — all share cycle 3's + multiply shape, so the same structural verdict is expected. Not worth + Phase 1+ for those. +- Cycle 4 candidates (per phase7_M4.md §"Cycle 3 candidates"): + CDEF (AV1-only directional filter), Loop Restoration (AV1-only), + or higgs deployment plumbing. diff --git a/external/ffmpeg-snapshot/PROVENANCE.md b/external/ffmpeg-snapshot/PROVENANCE.md index f6c761e..71e2385 100644 --- a/external/ffmpeg-snapshot/PROVENANCE.md +++ b/external/ffmpeg-snapshot/PROVENANCE.md @@ -25,6 +25,8 @@ tagged commit, no modifications.
| `libavcodec/vp9dsp_template.c` | 2578 | 89045 | `41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f` | | `libavcodec/aarch64/vp9itxfm_neon.S` | 1580 | 63534 | `82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6` | | `libavcodec/aarch64/vp9lpf_neon.S` | 1334 | — | `384e49e7a6e838d9e38aedc00838ed4aebfa6c5bdb343ecaf23ef639bc10fbb7` | +| `libavcodec/aarch64/vp9mc_neon.S` | 665 | — | `6b1d50f9821742584fdd47758057f810644aff3a008faaa774ff5b9cac4d1fef` | +| `libavcodec/vp9_subpel_filters_table.c` | — | — | hand-extracted from `libavcodec/vp9dsp.c` at same n7.1.3 pin — provides `ff_vp9_subpel_filters` for `vp9mc_neon.S` to link against without dragging in vp9dsp.c's full init machinery | | `libavcodec/aarch64/neon.S` | 173 | 7496 | `72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538` | | `libavutil/aarch64/asm.S` | 260 | 8069 | `c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3` | | `COPYING.LGPLv2.1` | 502 | — | `b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe` | diff --git a/external/ffmpeg-snapshot/libavcodec/aarch64/vp9mc_neon.S b/external/ffmpeg-snapshot/libavcodec/aarch64/vp9mc_neon.S new file mode 100644 index 0000000..38f44ca --- /dev/null +++ b/external/ffmpeg-snapshot/libavcodec/aarch64/vp9mc_neon.S @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// All public functions in this file have the following signature: +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, +// const uint8_t *ref, ptrdiff_t ref_stride, +// int h, int mx, int my); + +function ff_vp9_avg64_neon, export=1 + mov x5, x0 +1: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3 + urhadd v0.16b, v0.16b, v4.16b + urhadd v1.16b, v1.16b, v5.16b + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 + urhadd v2.16b, v2.16b, v6.16b + urhadd v3.16b, v3.16b, v7.16b + subs w4, w4, #2 + urhadd v16.16b, v16.16b, v20.16b + urhadd v17.16b, v17.16b, v21.16b + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1 + urhadd v18.16b, v18.16b, v22.16b + urhadd v19.16b, v19.16b, v23.16b + st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg32_neon, export=1 +1: + ld1 {v2.16b, v3.16b}, [x2], x3 + ld1 {v0.16b, v1.16b}, [x0] + urhadd v0.16b, v0.16b, v2.16b + urhadd v1.16b, v1.16b, v3.16b + subs w4, w4, #1 + st1 {v0.16b, v1.16b}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy16_neon, export=1 + add x5, x0, x1 + lsl x1, x1, #1 + add x6, x2, x3 + lsl x3, x3, #1 +1: + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x6], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x6], x3 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x5], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg16_neon, export=1 + mov x5, x0 +1: + ld1 {v2.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + urhadd v0.16b, v0.16b, v2.16b + ld1 {v1.16b}, [x0], x1 + urhadd v1.16b, v1.16b, v3.16b + subs w4, w4, #2 + st1 
{v0.16b}, [x5], x1 + st1 {v1.16b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy8_neon, export=1 +1: + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x2], x3 + subs w4, w4, #2 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg8_neon, export=1 + mov x5, x0 +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.8b}, [x0], x1 + urhadd v1.8b, v1.8b, v3.8b + subs w4, w4, #2 + st1 {v0.8b}, [x5], x1 + st1 {v1.8b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy4_neon, export=1 +1: + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + st1 {v0.s}[0], [x0], x1 + ld1 {v2.s}[0], [x2], x3 + st1 {v1.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + subs w4, w4, #4 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg4_neon, export=1 + mov x5, x0 +1: + ld1 {v2.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v2.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v1.s}[0], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v1.s}[1], [x0], x1 + subs w4, w4, #4 + urhadd v0.8b, v0.8b, v2.8b + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.s}[0], [x5], x1 + st1 {v0.s}[1], [x5], x1 + st1 {v1.s}[0], [x5], x1 + st1 {v1.s}[1], [x5], x1 + b.ne 1b + ret +endfunc + + +// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 +// for size >= 16), and multiply-accumulate into dst1 and dst3 (or +// dst1-dst2 and dst3-dst4 for size >= 16) +.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size + ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) + ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) +.if \size >= 16 + mla \dst1\().8h, v20.8h, v0.h[\offset] + ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) + mla \dst3\().8h, v22.8h, v0.h[\offset] + ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) + mla \dst2\().8h, v21.8h, v0.h[\offset] + mla \dst4\().8h, v23.8h, 
v0.h[\offset] +.elseif \size == 8 + mla \dst1\().8h, v20.8h, v0.h[\offset] + mla \dst3\().8h, v22.8h, v0.h[\offset] +.else + mla \dst1\().4h, v20.4h, v0.h[\offset] + mla \dst3\().4h, v22.4h, v0.h[\offset] +.endif +.endm +// The same as above, but don't accumulate straight into the +// destination, but use a temp register and accumulate with saturation. +.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size + ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) + ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) +.if \size >= 16 + mul v20.8h, v20.8h, v0.h[\offset] + ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) + mul v22.8h, v22.8h, v0.h[\offset] + ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) + mul v21.8h, v21.8h, v0.h[\offset] + mul v23.8h, v23.8h, v0.h[\offset] +.elseif \size == 8 + mul v20.8h, v20.8h, v0.h[\offset] + mul v22.8h, v22.8h, v0.h[\offset] +.else + mul v20.4h, v20.4h, v0.h[\offset] + mul v22.4h, v22.4h, v0.h[\offset] +.endif +.if \size == 4 + sqadd \dst1\().4h, \dst1\().4h, v20.4h + sqadd \dst3\().4h, \dst3\().4h, v22.4h +.else + sqadd \dst1\().8h, \dst1\().8h, v20.8h + sqadd \dst3\().8h, \dst3\().8h, v22.8h +.if \size >= 16 + sqadd \dst2\().8h, \dst2\().8h, v21.8h + sqadd \dst4\().8h, \dst4\().8h, v23.8h +.endif +.endif +.endm + + +// Instantiate a horizontal filter function for the given size. +// This can work on 4, 8 or 16 pixels in parallel; for larger +// widths it will do 16 pixels at a time and loop horizontally. +// The actual width is passed in x5, the height in w4 and the +// filter coefficients in x9. idx2 is the index of the largest +// filter coefficient (3 or 4) and idx1 is the other one of them. 
+.macro do_8tap_h type, size, idx1, idx2 +function \type\()_8tap_\size\()h_\idx1\idx2 + sub x2, x2, #3 + add x6, x0, x1 + add x7, x2, x3 + add x1, x1, x1 + add x3, x3, x3 + // Only size >= 16 loops horizontally and needs + // reduced dst stride +.if \size >= 16 + sub x1, x1, x5 +.elseif \size == 4 + add x12, x2, #8 + add x13, x7, #8 +.endif + // size >= 16 loads two qwords and increments x2, + // for size 4/8 it's enough with one qword and no + // postincrement +.if \size >= 16 + sub x3, x3, x5 + sub x3, x3, #8 +.endif + // Load the filter vector + ld1 {v0.8h}, [x9] +1: +.if \size >= 16 + mov x9, x5 +.endif + // Load src +.if \size >= 16 + ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24 + ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24 +.elseif \size == 8 + ld1 {v4.8b, v5.8b}, [x2] + ld1 {v16.8b, v17.8b}, [x7] +.else // \size == 4 + ld1 {v4.8b}, [x2] + ld1 {v16.8b}, [x7] + ld1 {v5.s}[0], [x12], x3 + ld1 {v17.s}[0], [x13], x3 +.endif + uxtl v4.8h, v4.8b + uxtl v5.8h, v5.8b + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b +.if \size >= 16 + uxtl v6.8h, v6.8b + uxtl v18.8h, v18.8b +.endif +2: + + // Accumulate, adding idx2 last with a separate + // saturating add. The positive filter coefficients + // for all indices except idx2 must add up to less + // than 127 for this not to overflow. 
+ mul v1.8h, v4.8h, v0.h[0] + mul v24.8h, v16.8h, v0.h[0] +.if \size >= 16 + mul v2.8h, v5.8h, v0.h[0] + mul v25.8h, v17.8h, v0.h[0] +.endif + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size + extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size + + // Round, shift and saturate + sqrshrun v1.8b, v1.8h, #7 + sqrshrun v24.8b, v24.8h, #7 +.if \size >= 16 + sqrshrun2 v1.16b, v2.8h, #7 + sqrshrun2 v24.16b, v25.8h, #7 +.endif + // Average +.ifc \type,avg +.if \size >= 16 + ld1 {v2.16b}, [x0] + ld1 {v3.16b}, [x6] + urhadd v1.16b, v1.16b, v2.16b + urhadd v24.16b, v24.16b, v3.16b +.elseif \size == 8 + ld1 {v2.8b}, [x0] + ld1 {v3.8b}, [x6] + urhadd v1.8b, v1.8b, v2.8b + urhadd v24.8b, v24.8b, v3.8b +.else + ld1 {v2.s}[0], [x0] + ld1 {v3.s}[0], [x6] + urhadd v1.8b, v1.8b, v2.8b + urhadd v24.8b, v24.8b, v3.8b +.endif +.endif + // Store and loop horizontally (for size >= 16) +.if \size >= 16 + subs x9, x9, #16 + st1 {v1.16b}, [x0], #16 + st1 {v24.16b}, [x6], #16 + b.eq 3f + mov v4.16b, v6.16b + mov v16.16b, v18.16b + ld1 {v6.16b}, [x2], #16 + ld1 {v18.16b}, [x7], #16 + uxtl v5.8h, v6.8b + uxtl2 v6.8h, v6.16b + uxtl v17.8h, v18.8b + uxtl2 v18.8h, v18.16b + b 2b +.elseif \size == 8 + st1 {v1.8b}, [x0] + st1 {v24.8b}, [x6] +.else // \size == 4 + st1 {v1.s}[0], [x0] + st1 {v24.s}[0], [x6] +.endif +3: + // Loop vertically + add x0, x0, x1 + add x6, x6, x1 + add x2, x2, x3 + add x7, x7, x3 + subs w4, w4, #2 + b.ne 1b + ret +endfunc +.endm + +.macro do_8tap_h_size size +do_8tap_h put, \size, 3, 4 +do_8tap_h avg, \size, 3, 4 +do_8tap_h put, \size, 4, 3 +do_8tap_h avg, \size, 4, 3 +.endm + +do_8tap_h_size 4 +do_8tap_h_size 8 +do_8tap_h_size 16 + 
+.macro do_8tap_h_func type, filter, offset, size +function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1 + movrel x6, X(ff_vp9_subpel_filters), 256*\offset + cmp w5, #8 + add x9, x6, w5, uxtw #4 + mov x5, #\size +.if \size >= 16 + b.ge \type\()_8tap_16h_34 + b \type\()_8tap_16h_43 +.else + b.ge \type\()_8tap_\size\()h_34 + b \type\()_8tap_\size\()h_43 +.endif +endfunc +.endm + +.macro do_8tap_h_filters size +do_8tap_h_func put, regular, 1, \size +do_8tap_h_func avg, regular, 1, \size +do_8tap_h_func put, sharp, 2, \size +do_8tap_h_func avg, sharp, 2, \size +do_8tap_h_func put, smooth, 0, \size +do_8tap_h_func avg, smooth, 0, \size +.endm + +do_8tap_h_filters 64 +do_8tap_h_filters 32 +do_8tap_h_filters 16 +do_8tap_h_filters 8 +do_8tap_h_filters 4 + + +// Vertical filters + +// Round, shift and saturate and store reg1-reg2 over 4 lines +.macro do_store4 reg1, reg2, tmp1, tmp2, type + sqrshrun \reg1\().8b, \reg1\().8h, #7 + sqrshrun \reg2\().8b, \reg2\().8h, #7 +.ifc \type,avg + ld1 {\tmp1\().s}[0], [x7], x1 + ld1 {\tmp2\().s}[0], [x7], x1 + ld1 {\tmp1\().s}[1], [x7], x1 + ld1 {\tmp2\().s}[1], [x7], x1 + urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b + urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b +.endif + st1 {\reg1\().s}[0], [x0], x1 + st1 {\reg2\().s}[0], [x0], x1 + st1 {\reg1\().s}[1], [x0], x1 + st1 {\reg2\().s}[1], [x0], x1 +.endm + +// Round, shift and saturate and store reg1-4 +.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type + sqrshrun \reg1\().8b, \reg1\().8h, #7 + sqrshrun \reg2\().8b, \reg2\().8h, #7 + sqrshrun \reg3\().8b, \reg3\().8h, #7 + sqrshrun \reg4\().8b, \reg4\().8h, #7 +.ifc \type,avg + ld1 {\tmp1\().8b}, [x7], x1 + ld1 {\tmp2\().8b}, [x7], x1 + ld1 {\tmp3\().8b}, [x7], x1 + ld1 {\tmp4\().8b}, [x7], x1 + urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b + urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b + urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b + urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b +.endif + st1 {\reg1\().8b}, [x0], 
x1 + st1 {\reg2\().8b}, [x0], x1 + st1 {\reg3\().8b}, [x0], x1 + st1 {\reg4\().8b}, [x0], x1 +.endm + +// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2 +// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately +// at the end with saturation. Indices 0 and 7 always have negative or zero +// coefficients, so they can be accumulated into tmp1-tmp2 together with the +// largest coefficient. +.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2 + mul \dst1\().8h, \src2\().8h, v0.h[1] + mul \dst2\().8h, \src3\().8h, v0.h[1] + mul \tmp1\().8h, \src1\().8h, v0.h[0] + mul \tmp2\().8h, \src2\().8h, v0.h[0] + mla \dst1\().8h, \src3\().8h, v0.h[2] + mla \dst2\().8h, \src4\().8h, v0.h[2] +.if \idx1 == 3 + mla \dst1\().8h, \src4\().8h, v0.h[3] + mla \dst2\().8h, \src5\().8h, v0.h[3] +.else + mla \dst1\().8h, \src5\().8h, v0.h[4] + mla \dst2\().8h, \src6\().8h, v0.h[4] +.endif + mla \dst1\().8h, \src6\().8h, v0.h[5] + mla \dst2\().8h, \src7\().8h, v0.h[5] + mla \tmp1\().8h, \src8\().8h, v0.h[7] + mla \tmp2\().8h, \src9\().8h, v0.h[7] + mla \dst1\().8h, \src7\().8h, v0.h[6] + mla \dst2\().8h, \src8\().8h, v0.h[6] +.if \idx2 == 3 + mla \tmp1\().8h, \src4\().8h, v0.h[3] + mla \tmp2\().8h, \src5\().8h, v0.h[3] +.else + mla \tmp1\().8h, \src5\().8h, v0.h[4] + mla \tmp2\().8h, \src6\().8h, v0.h[4] +.endif + sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h + sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h +.endm + +// Load pixels and extend them to 16 bit +.macro loadl dst1, dst2, dst3, dst4 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 +.ifnb \dst4 + ld1 {v4.8b}, [x2], x3 +.endif + uxtl \dst1\().8h, v1.8b + uxtl \dst2\().8h, v2.8b + uxtl \dst3\().8h, v3.8b +.ifnb \dst4 + uxtl \dst4\().8h, v4.8b +.endif +.endm + +// Instantiate a vertical filter function for filtering 8 pixels at a time. +// The height is passed in x4, the width in x5 and the filter coefficients +// in x6. 
idx2 is the index of the largest filter coefficient (3 or 4) +// and idx1 is the other one of them. +.macro do_8tap_8v type, idx1, idx2 +function \type\()_8tap_8v_\idx1\idx2 + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +1: +.ifc \type,avg + mov x7, x0 +.endif + mov x6, x4 + + loadl v17, v18, v19 + + loadl v20, v21, v22, v23 +2: + loadl v24, v25, v26, v27 + convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6 + convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6 + do_store v1, v2, v3, v4, v5, v6, v7, v28, \type + + subs x6, x6, #4 + b.eq 8f + + loadl v16, v17, v18, v19 + convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6 + convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6 + do_store v1, v2, v3, v4, v5, v6, v7, v28, \type + + subs x6, x6, #4 + b.eq 8f + + loadl v20, v21, v22, v23 + convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6 + convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6 + do_store v1, v2, v3, v4, v5, v6, v7, v28, \type + + subs x6, x6, #4 + b.ne 2b + +8: + subs x5, x5, #8 + b.eq 9f + // x0 -= h * dst_stride + msub x0, x1, x4, x0 + // x2 -= h * src_stride + msub x2, x3, x4, x2 + // x2 -= 8 * src_stride + sub x2, x2, x3, lsl #3 + // x2 += 1 * src_stride + add x2, x2, x3 + add x2, x2, #8 + add x0, x0, #8 + b 1b +9: + ret +endfunc +.endm + +do_8tap_8v put, 3, 4 +do_8tap_8v put, 4, 3 +do_8tap_8v avg, 3, 4 +do_8tap_8v avg, 4, 3 + + +// Instantiate a vertical filter function for filtering a 4 pixels wide +// slice. The first half of the registers contain one row, while the second +// half of a register contains the second-next row (also stored in the first +// half of the register two steps ahead). The convolution does two outputs +// at a time; the output of v17-v24 into one, and v18-v25 into another one. 
+// The first half of first output is the first output row, the first half +// of the other output is the second output row. The second halves of the +// registers are rows 3 and 4. +// This only is designed to work for 4 or 8 output lines. +.macro do_8tap_4v type, idx1, idx2 +function \type\()_8tap_4v_\idx1\idx2 + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +.ifc \type,avg + mov x7, x0 +.endif + + ld1 {v1.s}[0], [x2], x3 + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x2], x3 + ld1 {v4.s}[0], [x2], x3 + ld1 {v5.s}[0], [x2], x3 + ld1 {v6.s}[0], [x2], x3 + trn1 v1.2s, v1.2s, v3.2s + ld1 {v7.s}[0], [x2], x3 + trn1 v2.2s, v2.2s, v4.2s + ld1 {v26.s}[0], [x2], x3 + uxtl v17.8h, v1.8b + trn1 v3.2s, v3.2s, v5.2s + ld1 {v27.s}[0], [x2], x3 + uxtl v18.8h, v2.8b + trn1 v4.2s, v4.2s, v6.2s + ld1 {v28.s}[0], [x2], x3 + uxtl v19.8h, v3.8b + trn1 v5.2s, v5.2s, v7.2s + ld1 {v29.s}[0], [x2], x3 + uxtl v20.8h, v4.8b + trn1 v6.2s, v6.2s, v26.2s + uxtl v21.8h, v5.8b + trn1 v7.2s, v7.2s, v27.2s + uxtl v22.8h, v6.8b + trn1 v26.2s, v26.2s, v28.2s + uxtl v23.8h, v7.8b + trn1 v27.2s, v27.2s, v29.2s + uxtl v24.8h, v26.8b + uxtl v25.8h, v27.8b + + convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4 + do_store4 v1, v2, v5, v6, \type + + subs x4, x4, #4 + b.eq 9f + + ld1 {v1.s}[0], [x2], x3 + ld1 {v2.s}[0], [x2], x3 + trn1 v28.2s, v28.2s, v1.2s + trn1 v29.2s, v29.2s, v2.2s + ld1 {v1.s}[1], [x2], x3 + uxtl v26.8h, v28.8b + ld1 {v2.s}[1], [x2], x3 + uxtl v27.8h, v29.8b + uxtl v28.8h, v1.8b + uxtl v29.8h, v2.8b + + convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4 + do_store4 v1, v2, v5, v6, \type + +9: + ret +endfunc +.endm + +do_8tap_4v put, 3, 4 +do_8tap_4v put, 4, 3 +do_8tap_4v avg, 3, 4 +do_8tap_4v avg, 4, 3 + + +.macro do_8tap_v_func type, filter, offset, size +function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1 + uxtw x4, w4 + movrel x5, X(ff_vp9_subpel_filters), 256*\offset + cmp w6, #8 + add x6, x5, w6, 
uxtw #4 + mov x5, #\size +.if \size >= 8 + b.ge \type\()_8tap_8v_34 + b \type\()_8tap_8v_43 +.else + b.ge \type\()_8tap_4v_34 + b \type\()_8tap_4v_43 +.endif +endfunc +.endm + +.macro do_8tap_v_filters size +do_8tap_v_func put, regular, 1, \size +do_8tap_v_func avg, regular, 1, \size +do_8tap_v_func put, sharp, 2, \size +do_8tap_v_func avg, sharp, 2, \size +do_8tap_v_func put, smooth, 0, \size +do_8tap_v_func avg, smooth, 0, \size +.endm + +do_8tap_v_filters 64 +do_8tap_v_filters 32 +do_8tap_v_filters 16 +do_8tap_v_filters 8 +do_8tap_v_filters 4 diff --git a/external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c b/external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c new file mode 100644 index 0000000..559bb5a --- /dev/null +++ b/external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c @@ -0,0 +1,82 @@ +/* + * VP9 8-tap subpel filter table — verbatim transcription of + * ff_vp9_subpel_filters from FFmpeg n7.1.3 libavcodec/vp9dsp.c + * (commit f46e514). Provided as a standalone .c so the vendored + * vp9mc_neon.S has the `ff_vp9_subpel_filters` symbol to link + * against, without pulling in the full vp9dsp.c init machinery + * (which would chain-include the entire VP9 decoder). + * + * Enum order from libavcodec/vp9dsp.h:64-67: + * FILTER_8TAP_SMOOTH = 0 + * FILTER_8TAP_REGULAR = 1 + * FILTER_8TAP_SHARP = 2 + * + * License: LGPL-2.1-or-later (matches vp9dsp.c upstream). 
+ */ +#include <stdint.h> + +#ifdef __GNUC__ +#define DAEDALUS_ALIGNED(n) __attribute__((aligned(n))) +#else +#define DAEDALUS_ALIGNED(n) +#endif + +const DAEDALUS_ALIGNED(16) int16_t ff_vp9_subpel_filters[3][16][8] = { + /* [0] = FILTER_8TAP_SMOOTH */ + { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { -3, -1, 32, 64, 38, 1, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, + { -2, -2, 26, 63, 43, 4, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, + { -2, -3, 21, 60, 49, 7, -4, 0 }, + { -1, -4, 18, 59, 51, 9, -4, 0 }, + { -1, -4, 16, 57, 53, 12, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, + { -1, -4, 12, 53, 57, 16, -4, -1 }, + { 0, -4, 9, 51, 59, 18, -4, -1 }, + { 0, -4, 7, 49, 60, 21, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, + { 0, -4, 4, 43, 63, 26, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, + { 0, -3, 1, 38, 64, 32, -1, -3 }, + }, + /* [1] = FILTER_8TAP_REGULAR */ + { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { 0, 1, -5, 126, 8, -3, 1, 0 }, + { -1, 3, -10, 122, 18, -6, 2, 0 }, + { -1, 4, -13, 118, 27, -9, 3, -1 }, + { -1, 4, -16, 112, 37, -11, 4, -1 }, + { -1, 5, -18, 105, 48, -14, 4, -1 }, + { -1, 5, -19, 97, 58, -16, 5, -1 }, + { -1, 6, -19, 88, 68, -18, 5, -1 }, + { -1, 6, -19, 78, 78, -19, 6, -1 }, + { -1, 5, -18, 68, 88, -19, 6, -1 }, + { -1, 5, -16, 58, 97, -19, 5, -1 }, + { -1, 4, -14, 48, 105, -18, 5, -1 }, + { -1, 4, -11, 37, 112, -16, 4, -1 }, + { -1, 3, -9, 27, 118, -13, 4, -1 }, + { 0, 2, -6, 18, 122, -10, 3, -1 }, + { 0, 1, -3, 8, 126, -5, 1, 0 }, + }, + /* [2] = FILTER_8TAP_SHARP */ + { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { -1, 3, -7, 127, 8, -3, 1, 0 }, + { -2, 5, -13, 125, 17, -6, 3, -1 }, + { -3, 7, -17, 121, 27, -10, 5, -2 }, + { -4, 9, -20, 115, 37, -13, 6, -2 }, + { -4, 10, -23, 108, 48, -16, 8, -3 }, + { -4, 10, -24, 100, 59, -19, 9, -3 }, + { -4, 11, -24, 90, 70, -21, 10, -4 }, + { -4, 11, -23, 80, 80, -23, 11, -4 }, + { -4, 10, -21, 70, 90, -24, 11, -4 }, + { -3, 9, -19, 59, 100, -24, 10, -4 }, + { -3, 8, -16, 48, 108, -23, 10, -4 }, + { -2, 6, -13, 37, 115, -20, 9,
-4 }, + { -2, 5, -10, 27, 121, -17, 7, -3 }, + { -1, 3, -6, 17, 125, -13, 5, -2 }, + { 0, 1, -3, 8, 127, -7, 3, -1 }, + }, +}; diff --git a/src/v3d_mc_8h.comp b/src/v3d_mc_8h.comp new file mode 100644 index 0000000..5cc129b --- /dev/null +++ b/src/v3d_mc_8h.comp @@ -0,0 +1,142 @@ +// daedalus-fourier cycle 3 — VP9 8-tap "regular" subpel filter, +// horizontal direction, 8-wide output, h rows. V3D 7.1 via Mesa v3dv. +// +// Bakes in cycle-1+2 v4 winning patterns from start: +// - local_size_x = 256 +// - 8 lanes per block (1 lane per output row), 2 blocks per +// 16-lane subgroup, 16 subgroups per WG → 32 blocks per WG +// - uint8_t SSBO via storageBuffer8BitAccess +// - oob early-return safe (no barrier) +// +// Contracts (per k3_mc_phase4.md §5, revised per phase5''' findings): +// - meta[i].x: dst_off (byte offset of block's row-0 col-0 dst pixel) +// - meta[i].y: src_off (byte offset of block's row-0 col-0 SOURCE +// pixel — note: NO +3 shift; the C bench's `src + 3` C-caller +// convention does NOT carry into the SSBO offset. Shader reads +// s[k] = SSBO[src_off + row*stride + k] for k=0..14, matching +// C ref's per-row read of `master_src[block_base + row*stride +// + (x..x+7)]` for output col x ∈ 0..7). +// - meta[i].z: mx (subpel phase in [0..15]) +// - dst_stride_u8 ≥ 8 (race-safety lower bound; bench asserts) +// - src_stride_u8 ≥ 15 (per-row read span; bench asserts) +// +// License: BSD-2-Clause. Algorithm transcribed from tests/vp9_mc_ref.c +// which mirrors libavcodec/vp9dsp_template.c FILTER_8TAP macro. 
+ +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 0) readonly buffer Meta { + uvec4 meta[]; // per block: (dst_off, src_off, mx, _pad) +} u_meta; + +layout(binding = 1) buffer Dst { + uint8_t dst[]; +} u_dst; + +layout(binding = 2) readonly buffer Src { + uint8_t src[]; +} u_src; + +layout(push_constant) uniform PC { + uint n_blocks; + uint dst_stride_u8; + uint src_stride_u8; + uint _pad; +} pc; + +// VP9 8-tap REGULAR filter table — verbatim from +// external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c +// (index [1] = FILTER_8TAP_REGULAR). 16 subpel phases × 8 taps. +// +// shaderdb-gate (phase5''' finding 2): if uniform count > ~144 after +// first compile, escalate this LUT to SSBO binding 3. +const int FILTER_REGULAR[16][8] = int[16][8]( + int[8]( 0, 0, 0, 128, 0, 0, 0, 0 ), + int[8]( 0, 1, -5, 126, 8, -3, 1, 0 ), + int[8](-1, 3, -10, 122, 18, -6, 2, 0 ), + int[8](-1, 4, -13, 118, 27, -9, 3, -1 ), + int[8](-1, 4, -16, 112, 37, -11, 4, -1 ), + int[8](-1, 5, -18, 105, 48, -14, 4, -1 ), + int[8](-1, 5, -19, 97, 58, -16, 5, -1 ), + int[8](-1, 6, -19, 88, 68, -18, 5, -1 ), + int[8](-1, 6, -19, 78, 78, -19, 6, -1 ), + int[8](-1, 5, -18, 68, 88, -19, 6, -1 ), + int[8](-1, 5, -16, 58, 97, -19, 5, -1 ), + int[8](-1, 4, -14, 48, 105, -18, 5, -1 ), + int[8](-1, 4, -11, 37, 112, -16, 4, -1 ), + int[8](-1, 3, -9, 27, 118, -13, 4, -1 ), + int[8]( 0, 2, -6, 18, 122, -10, 3, -1 ), + int[8]( 0, 1, -3, 8, 126, -5, 1, 0 ) +); + +void main() +{ + uint gid = gl_GlobalInvocationID.x; + uint wg_id = gid / 256u; + uint lane_in_wg = gid & 255u; + uint sg_in_wg = lane_in_wg >> 4; + uint lane_in_sg = lane_in_wg & 15u; + uint block_slot = lane_in_sg >> 3; + uint row = lane_in_sg & 7u; + + uint block_local = sg_in_wg * 2u + block_slot; + uint block_idx = wg_id * 32u + block_local; + + // No barrier follows — safe 
early-return. + if (block_idx >= pc.n_blocks) return; + + uvec4 m = u_meta.meta[block_idx]; + uint dst_off = m.x; + uint src_off = m.y; + uint mx = m.z & 15u; + + // Read 15 source pixels for this row. + uint src_row = src_off + row * pc.src_stride_u8; + int s0 = int(u_src.src[src_row + 0u]); + int s1 = int(u_src.src[src_row + 1u]); + int s2 = int(u_src.src[src_row + 2u]); + int s3 = int(u_src.src[src_row + 3u]); + int s4 = int(u_src.src[src_row + 4u]); + int s5 = int(u_src.src[src_row + 5u]); + int s6 = int(u_src.src[src_row + 6u]); + int s7 = int(u_src.src[src_row + 7u]); + int s8 = int(u_src.src[src_row + 8u]); + int s9 = int(u_src.src[src_row + 9u]); + int s10 = int(u_src.src[src_row + 10u]); + int s11 = int(u_src.src[src_row + 11u]); + int s12 = int(u_src.src[src_row + 12u]); + int s13 = int(u_src.src[src_row + 13u]); + int s14 = int(u_src.src[src_row + 14u]); + + int F0 = FILTER_REGULAR[mx][0]; + int F1 = FILTER_REGULAR[mx][1]; + int F2 = FILTER_REGULAR[mx][2]; + int F3 = FILTER_REGULAR[mx][3]; + int F4 = FILTER_REGULAR[mx][4]; + int F5 = FILTER_REGULAR[mx][5]; + int F6 = FILTER_REGULAR[mx][6]; + int F7 = FILTER_REGULAR[mx][7]; + + int o0 = F0*s0 + F1*s1 + F2*s2 + F3*s3 + F4*s4 + F5*s5 + F6*s6 + F7*s7; + int o1 = F0*s1 + F1*s2 + F2*s3 + F3*s4 + F4*s5 + F5*s6 + F6*s7 + F7*s8; + int o2 = F0*s2 + F1*s3 + F2*s4 + F3*s5 + F4*s6 + F5*s7 + F6*s8 + F7*s9; + int o3 = F0*s3 + F1*s4 + F2*s5 + F3*s6 + F4*s7 + F5*s8 + F6*s9 + F7*s10; + int o4 = F0*s4 + F1*s5 + F2*s6 + F3*s7 + F4*s8 + F5*s9 + F6*s10 + F7*s11; + int o5 = F0*s5 + F1*s6 + F2*s7 + F3*s8 + F4*s9 + F5*s10 + F6*s11 + F7*s12; + int o6 = F0*s6 + F1*s7 + F2*s8 + F3*s9 + F4*s10 + F5*s11 + F6*s12 + F7*s13; + int o7 = F0*s7 + F1*s8 + F2*s9 + F3*s10 + F4*s11 + F5*s12 + F6*s13 + F7*s14; + + uint dst_row = dst_off + row * pc.dst_stride_u8; + u_dst.dst[dst_row + 0u] = uint8_t(clamp((o0 + 64) >> 7, 0, 255)); + u_dst.dst[dst_row + 1u] = uint8_t(clamp((o1 + 64) >> 7, 0, 255)); + u_dst.dst[dst_row + 2u] = uint8_t(clamp((o2 + 
64) >> 7, 0, 255)); + u_dst.dst[dst_row + 3u] = uint8_t(clamp((o3 + 64) >> 7, 0, 255)); + u_dst.dst[dst_row + 4u] = uint8_t(clamp((o4 + 64) >> 7, 0, 255)); + u_dst.dst[dst_row + 5u] = uint8_t(clamp((o5 + 64) >> 7, 0, 255)); + u_dst.dst[dst_row + 6u] = uint8_t(clamp((o6 + 64) >> 7, 0, 255)); + u_dst.dst[dst_row + 7u] = uint8_t(clamp((o7 + 64) >> 7, 0, 255)); +} diff --git a/tests/bench_concurrent_mc.c b/tests/bench_concurrent_mc.c new file mode 100644 index 0000000..b550467 --- /dev/null +++ b/tests/bench_concurrent_mc.c @@ -0,0 +1,286 @@ +/* + * Cycle 3 M4''' — concurrent CPU(NEON MC) + QPU(V3D MC) throughput. + * Same pthread/barrier pattern as bench_concurrent{,_lpf}.c. + * License: BSD-2-Clause. + */ +#define _GNU_SOURCE +#include <assert.h> +#include <pthread.h> +#include <sched.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <vulkan/vulkan.h> + +#include "v3d_runner.h" + +extern void ff_vp9_put_regular8_h_neon( + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int my); + +#define SRC_W 16 +#define DST_W 8 +#define SRC_H 8 +#define DST_H 8 +#define SRC_BYTES (SRC_H * SRC_W) +#define DST_BYTES (DST_H * DST_W) + +static inline uint64_t xs_step(uint64_t *s) { + uint64_t x = *s; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return *s = x; +} +static uint64_t xs_init(uint64_t s) { return s ?
s : 0xa57edbeef5717ULL; } +static double now_s(void) { + struct timespec t; clock_gettime(CLOCK_MONOTONIC_RAW, &t); + return t.tv_sec + t.tv_nsec * 1e-9; +} + +static volatile int g_stop = 0; +static pthread_barrier_t g_start; + +/* --- NEON worker ----------- */ + +#define NEON_BATCH 8192 + +typedef struct { + int worker_id, affinity_core; + uint64_t blocks_done; + double elapsed_s; +} neon_args; + +static void *neon_worker(void *p) +{ + neon_args *a = p; + cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs); + pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs); + + uint64_t s = xs_init((uint64_t) a->worker_id * 0xc01dbeefULL); + uint8_t *master = malloc((size_t) NEON_BATCH * SRC_BYTES); + uint8_t *work = malloc((size_t) NEON_BATCH * SRC_BYTES); + uint8_t *dsts = malloc((size_t) NEON_BATCH * DST_BYTES); + int *mxs = malloc(NEON_BATCH * sizeof(int)); + for (int i = 0; i < NEON_BATCH; i++) { + for (int j = 0; j < SRC_BYTES; j++) + master[(size_t)i * SRC_BYTES + j] = (uint8_t)(xs_step(&s) & 0xff); + mxs[i] = (int)(xs_step(&s) & 15); + } + + pthread_barrier_wait(&g_start); + double t0 = now_s(); + uint64_t done = 0; + while (!g_stop) { + memcpy(work, master, (size_t) NEON_BATCH * SRC_BYTES); + for (int i = 0; i < NEON_BATCH; i++) + ff_vp9_put_regular8_h_neon( + dsts + (size_t)i * DST_BYTES, DST_W, + work + (size_t)i * SRC_BYTES + 3, SRC_W, + DST_H, mxs[i], 0); + done += NEON_BATCH; + } + a->elapsed_s = now_s() - t0; + a->blocks_done = done; + free(master); free(work); free(dsts); free(mxs); + return NULL; +} + +/* --- QPU worker ----------- */ + +typedef struct { + int affinity_core, n_blocks; + uint64_t blocks_done; + double elapsed_s; +} qpu_args; + +typedef struct { + uint32_t n_blocks, dst_stride_u8, src_stride_u8, _pad; +} push_consts; + +static void *qpu_worker(void *p) +{ + qpu_args *a = p; + cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs); + pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs); + + v3d_runner *r = 
v3d_runner_create(); + if (!r) return NULL; + + int n_blocks = a->n_blocks; + size_t meta_bytes = (size_t) n_blocks * 4 * sizeof(uint32_t); + size_t src_bytes = (size_t) n_blocks * SRC_BYTES; + size_t dst_bytes = (size_t) n_blocks * DST_BYTES; + + v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0}; + v3d_runner_create_buffer(r, meta_bytes, &buf_meta); + v3d_runner_create_buffer(r, dst_bytes, &buf_dst); + v3d_runner_create_buffer(r, src_bytes, &buf_src); + + uint64_t s = 0xfeedfacecafebabeULL; + uint8_t *master = malloc(src_bytes); + for (size_t i = 0; i < src_bytes; i++) master[i] = (uint8_t)(xs_step(&s) & 0xff); + memcpy(buf_src.mapped, master, src_bytes); + + uint32_t *meta = buf_meta.mapped; + assert(DST_W >= 8); assert(SRC_W >= 15); + for (int i = 0; i < n_blocks; i++) { + meta[4*i + 0] = (uint32_t)((size_t)i * DST_BYTES); /* dst_off */ + meta[4*i + 1] = (uint32_t)((size_t)i * SRC_BYTES); /* src_off (RAW, no +3) */ + meta[4*i + 2] = (uint32_t)(xs_step(&s) & 15); /* mx */ + meta[4*i + 3] = 0; + } + + v3d_pipeline pipe = {0}; + v3d_runner_create_pipeline(r, "v3d_mc_8h.spv", 3, sizeof(push_consts), &pipe); + v3d_buffer bufs[3] = { buf_meta, buf_dst, buf_src }; + v3d_runner_bind_buffers(r, &pipe, bufs, 3); + + const uint32_t bpw = 32; + uint32_t gc = (uint32_t)((n_blocks + bpw - 1) / bpw); + push_consts pc = { .n_blocks = (uint32_t) n_blocks, + .dst_stride_u8 = DST_W, + .src_stride_u8 = SRC_W }; + + VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); + VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; + vkBeginCommandBuffer(cb, &cbbi); + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, + pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); + vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pc), &pc); + vkCmdDispatch(cb, gc, 1, 1); + vkEndCommandBuffer(cb); + + for (int i = 0; i < 5; i++) v3d_runner_submit_wait(r, cb); + + 
pthread_barrier_wait(&g_start); + double t0 = now_s(); + uint64_t done = 0; + while (!g_stop) { + memset(buf_dst.mapped, 0, dst_bytes); + v3d_runner_submit_wait(r, cb); + done += n_blocks; + } + a->elapsed_s = now_s() - t0; + a->blocks_done = done; + + free(master); + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_src); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + return NULL; +} + +typedef struct { double duration_s; } timer_args; +static void *timer_thread(void *p) { + timer_args *a = p; + pthread_barrier_wait(&g_start); + double end = now_s() + a->duration_s; + while (now_s() < end) { + struct timespec ts = {0, 1000000}; nanosleep(&ts, NULL); + } + g_stop = 1; + return NULL; +} + +enum mode { MODE_NEON, MODE_QPU, MODE_MIXED }; + +int main(int argc, char **argv) +{ + enum mode mode = MODE_NEON; + int n_neon = 4, qpu_core = 3, qpu_n_blocks = 65536; + double duration = 8.0; + + static struct option opts[] = { + {"mode", required_argument, 0, 'm'}, + {"neon-threads", required_argument, 0, 'n'}, + {"qpu-core", required_argument, 0, 'c'}, + {"qpu-blocks", required_argument, 0, 'b'}, + {"duration", required_argument, 0, 'd'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "m:n:c:b:d:", opts, 0)) != -1;) { + switch (c) { + case 'm': + if (!strcmp(optarg, "neon-only")) mode = MODE_NEON; + else if (!strcmp(optarg, "qpu-only")) mode = MODE_QPU; + else if (!strcmp(optarg, "mixed")) mode = MODE_MIXED; + else { fprintf(stderr, "bad mode\n"); return 2; } + break; + case 'n': n_neon = atoi(optarg); break; + case 'c': qpu_core = atoi(optarg); break; + case 'b': qpu_n_blocks = atoi(optarg); break; + case 'd': duration = atof(optarg); break; + default: return 2; + } + } + int has_qpu = (mode == MODE_QPU || mode == MODE_MIXED); + int has_neon = (mode == MODE_NEON || mode == MODE_MIXED); + int n_workers = (has_neon ? n_neon : 0) + (has_qpu ? 
1 : 0); + int barrier_count = n_workers + 1 + 1; + + printf("=== M4''' concurrent MC bench ===\n"); + printf(" mode: %s, neon: %d, qpu: core %d / %d blocks, %.1fs\n", + mode == MODE_NEON ? "neon-only" : mode == MODE_QPU ? "qpu-only" : "mixed", + has_neon ? n_neon : 0, + has_qpu ? qpu_core : -1, + has_qpu ? qpu_n_blocks : 0, + duration); + + pthread_barrier_init(&g_start, NULL, barrier_count); + + pthread_t timer_tid; timer_args ta = { .duration_s = duration }; + pthread_create(&timer_tid, NULL, timer_thread, &ta); + + pthread_t neon_tids[16] = {0}; + neon_args n_args[16] = {0}; + if (has_neon) { + for (int i = 0; i < n_neon; i++) { + n_args[i] = (neon_args){ .worker_id = i, .affinity_core = i }; + pthread_create(&neon_tids[i], NULL, neon_worker, &n_args[i]); + } + } + pthread_t qpu_tid = 0; + qpu_args q_args = {0}; + if (has_qpu) { + q_args = (qpu_args){ .affinity_core = qpu_core, .n_blocks = qpu_n_blocks }; + pthread_create(&qpu_tid, NULL, qpu_worker, &q_args); + } + + pthread_barrier_wait(&g_start); + + pthread_join(timer_tid, NULL); + if (has_neon) for (int i = 0; i < n_neon; i++) pthread_join(neon_tids[i], NULL); + if (has_qpu) pthread_join(qpu_tid, NULL); + + uint64_t total = 0; double max_e = 0; + if (has_neon) { + printf("NEON per-thread:\n"); + for (int i = 0; i < n_neon; i++) { + double mbs = n_args[i].blocks_done / n_args[i].elapsed_s / 1e6; + printf(" core %d: %.3f Mblock/s\n", n_args[i].affinity_core, mbs); + total += n_args[i].blocks_done; + if (n_args[i].elapsed_s > max_e) max_e = n_args[i].elapsed_s; + } + } + if (has_qpu) { + double mbs = q_args.blocks_done / q_args.elapsed_s / 1e6; + printf("QPU (core %d): %.3f Mblock/s\n", q_args.affinity_core, mbs); + total += q_args.blocks_done; + if (q_args.elapsed_s > max_e) max_e = q_args.elapsed_s; + } + + double total_mbs = total / max_e / 1e6; + printf("\n=== AGGREGATE ===\n"); + printf(" Mblock/s : %.3f\n", total_mbs); + printf(" 30fps@1080p floor: 0.972 Mblock/s — %.1fx margin\n", + total_mbs / 0.972); + 
+ pthread_barrier_destroy(&g_start); + return 0; +} diff --git a/tests/bench_neon_mc.c b/tests/bench_neon_mc.c new file mode 100644 index 0000000..fc780a5 --- /dev/null +++ b/tests/bench_neon_mc.c @@ -0,0 +1,220 @@ +/* + * Cycle 3 Phase 3 — NEON M3''' baseline for VP9 8-tap regular + * horizontal MC interpolation, 8×8 block. + * + * Reports: + * M1'''_c (correctness): C-ref ↔ NEON bit-exact rate, N random + * 8×8 blocks with random source pixels and + * random subpel phase mx ∈ [0, 15] + * M3''' (throughput): NEON sustained Mblock/s, single-thread, + * time-based + * + * License: LGPL-2.1+ (statically links FFmpeg NEON snapshot). + */ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include +#include +#include +#include + +extern void daedalus_vp9_put_regular_8h_ref( + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int my); + +extern void ff_vp9_put_regular8_h_neon( + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int my); + +/* RNG ------------------------------------------------------------ */ + +static uint64_t xs_state; +static inline uint64_t xs(void) { + uint64_t x = xs_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs_state = x; +} + +/* Block layout: each block gets its own 8×16 source buffer + 8×8 dst. + * - source buffer is 16 cols wide; the filter is called with + * src = block_src + 3, so it reads cols [src+0-3..src+8+4] = + * [0..14] of the 16-col buffer. col 15 is unused padding. + * - dst is 8 cols × 8 rows. 
+ */ +#define SRC_W 16 +#define SRC_H 8 +#define DST_W 8 +#define DST_H 8 +#define SRC_BYTES (SRC_H * SRC_W) /* 128 */ +#define DST_BYTES (DST_H * DST_W) /* 64 */ + +static void gen_src(uint8_t *buf) +{ + for (int i = 0; i < SRC_BYTES; i++) + buf[i] = (uint8_t)(xs() & 0xff); +} + +static double now_seconds(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +/* M1'''_c correctness gate -------------------------------------- */ + +static int correctness_check(uint64_t seed, int n_blocks) +{ + xs_state = seed ? seed : 0xabcdef1234567890ULL; + int mismatches = 0; + uint8_t src[SRC_BYTES]; + uint8_t dst_a[DST_BYTES], dst_b[DST_BYTES]; + + int mx_hist[16] = {0}; + + for (int i = 0; i < n_blocks; i++) { + gen_src(src); + int mx = (int)(xs() & 15); + mx_hist[mx]++; + + memset(dst_a, 0, DST_BYTES); + memset(dst_b, 0, DST_BYTES); + + daedalus_vp9_put_regular_8h_ref(dst_a, DST_W, src + 3, SRC_W, DST_H, mx, 0); + ff_vp9_put_regular8_h_neon (dst_b, DST_W, src + 3, SRC_W, DST_H, mx, 0); + + if (memcmp(dst_a, dst_b, DST_BYTES) != 0) { + if (mismatches < 3) { + fprintf(stderr, "MISMATCH block %d mx=%d:\n", i, mx); + fprintf(stderr, " ref:"); + for (int r = 0; r < 8; r++) { + fprintf(stderr, "\n r%d ", r); + for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_a[r*8+c]); + } + fprintf(stderr, "\n neon:"); + for (int r = 0; r < 8; r++) { + fprintf(stderr, "\n r%d ", r); + for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_b[r*8+c]); + } + fprintf(stderr, "\n"); + } + mismatches++; + } + } + printf("M1'''_c correctness: %d / %d blocks bit-exact (%.4f%%)\n", + n_blocks - mismatches, n_blocks, + 100.0 * (n_blocks - mismatches) / n_blocks); + /* mx histogram — confirms all 16 phases get exercised. 
*/ + int min_mx = mx_hist[0], max_mx = mx_hist[0]; + for (int i = 1; i < 16; i++) { + if (mx_hist[i] < min_mx) min_mx = mx_hist[i]; + if (mx_hist[i] > max_mx) max_mx = mx_hist[i]; + } + printf(" mx phase coverage: min=%d max=%d (16 phases sampled)\n", + min_mx, max_mx); + return mismatches; +} + +/* M3''' throughput ---------------------------------------------- */ + +static void throughput_neon(uint64_t seed, int n_blocks, double duration_s) +{ + xs_state = seed ? seed : 0xdeadbeef12345678ULL; + + uint8_t *master_src = malloc((size_t) n_blocks * SRC_BYTES); + uint8_t *work_src = malloc((size_t) n_blocks * SRC_BYTES); + uint8_t *dsts = malloc((size_t) n_blocks * DST_BYTES); + int *mxs = malloc(n_blocks * sizeof(int)); + if (!master_src || !work_src || !dsts || !mxs) { fprintf(stderr, "alloc fail\n"); exit(1); } + + for (int i = 0; i < n_blocks; i++) { + gen_src(master_src + (size_t)i * SRC_BYTES); + mxs[i] = (int)(xs() & 15); + } + + /* Warm. */ + memcpy(work_src, master_src, (size_t) n_blocks * SRC_BYTES); + for (int i = 0; i < n_blocks; i++) + ff_vp9_put_regular8_h_neon(dsts + (size_t)i * DST_BYTES, DST_W, + work_src + (size_t)i * SRC_BYTES + 3, SRC_W, + DST_H, mxs[i], 0); + + double t0 = now_seconds(); + double t_end = t0 + duration_s; + uint64_t done = 0; + while (now_seconds() < t_end) { + memcpy(work_src, master_src, (size_t) n_blocks * SRC_BYTES); + for (int i = 0; i < n_blocks; i++) + ff_vp9_put_regular8_h_neon(dsts + (size_t)i * DST_BYTES, DST_W, + work_src + (size_t)i * SRC_BYTES + 3, SRC_W, + DST_H, mxs[i], 0); + done += n_blocks; + } + double elapsed = now_seconds() - t0; + + /* setup-only subtraction */ + int setup_iters = (int) (done / n_blocks); + double s0 = now_seconds(); + for (int it = 0; it < setup_iters; it++) + memcpy(work_src, master_src, (size_t) n_blocks * SRC_BYTES); + double s1 = now_seconds(); + + double kernel_seconds = elapsed - (s1 - s0); + double mbps = done / kernel_seconds / 1e6; + + printf("M3''' NEON throughput:\n"); + printf(" 
blocks/batch: %d\n", n_blocks); + printf(" batches done: %d\n", setup_iters); + printf(" total blocks: %llu\n", (unsigned long long) done); + printf(" elapsed (kernel)=%.6f s\n", kernel_seconds); + printf(" elapsed (setup) =%.6f s\n", s1 - s0); + printf(" throughput = %.3f Mblock/s\n", mbps); + printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9); + /* 1080p: 32400 blocks/frame */ + printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n", + mbps * 1e6 / 32400.0); + + free(master_src); free(work_src); free(dsts); free(mxs); +} + +int main(int argc, char **argv) +{ + int n_blocks = 65536; + double duration = 5.0; + uint64_t seed = 0; + int do_correctness = 1; + + static struct option opts[] = { + {"blocks", required_argument, 0, 'b'}, + {"duration", required_argument, 0, 'd'}, + {"seed", required_argument, 0, 's'}, + {"no-correctness", no_argument, 0, 'C'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) { + switch (c) { + case 'b': n_blocks = atoi(optarg); break; + case 'd': duration = atof(optarg); break; + case 's': seed = strtoull(optarg, 0, 0); break; + case 'C': do_correctness = 0; break; + default: return 2; + } + } + + if (do_correctness) { + printf("=== M1'''_c bit-exact (10000 random blocks) ===\n"); + if (correctness_check(seed, 10000) != 0) { + fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); + return 1; + } + printf("\n"); + } + + printf("=== M3''' NEON throughput ===\n"); + throughput_neon(seed, n_blocks, duration); + return 0; +} diff --git a/tests/bench_v3d_mc.c b/tests/bench_v3d_mc.c new file mode 100644 index 0000000..cd24045 --- /dev/null +++ b/tests/bench_v3d_mc.c @@ -0,0 +1,303 @@ +/* + * Cycle 3 Phase 6 — QPU bench for VP9 8-tap "regular" subpel filter, + * horizontal, 8-wide output on V3D 7.1. 
+ * + * Reports: + * M1''' (correctness): QPU output vs C reference, N blocks across + * all 16 mx phases + * M2''' (throughput): QPU sustained Mblock/s + * + * Per k3_mc_phase4.md §5 (revised per phase5''' findings 4 + 6): + * - src_off is the RAW block base (no +3 shift) + * - assert(dst_stride_u8 >= 8 && src_stride_u8 >= 15) + * + * License: BSD-2-Clause. + */ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v3d_runner.h" + +extern void daedalus_vp9_put_regular_8h_ref( + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int my); + +/* Per-block layout: src buffer 8 rows × 16 cols = 128 bytes. The + * C bench's src+3 convention: NEON/C ref is called with + * `src = block_base + 3, src_stride = 16`. The shader's src_off + * is the RAW block_base (no +3 shift), and the shader reads + * s[0..14] from src_off + row*stride. Together this means: + * shader's s[k] for k=0..14 = master_src[block_base + row*16 + k] + * C ref's `src[x+k-3]` for x=0..7, k=0..7 with `src = block_base+3` + * = master_src[block_base + row*16 + (x+k)] + * = master_src[block_base + row*16 + (0..14)] + * which is exactly what the shader reads. 
*/ + +#define SRC_W 16 +#define SRC_H 8 +#define DST_W 8 +#define DST_H 8 +#define SRC_BYTES (SRC_H * SRC_W) +#define DST_BYTES (DST_H * DST_W) + +static uint64_t xs_state; +static inline uint64_t xs(void) { + uint64_t x = xs_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs_state = x; +} +static void gen_src(uint8_t *b) { + for (int i = 0; i < SRC_BYTES; i++) b[i] = (uint8_t)(xs() & 0xff); +} +static double now_seconds(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +typedef struct { + uint32_t n_blocks; + uint32_t dst_stride_u8; + uint32_t src_stride_u8; + uint32_t _pad; +} push_consts; + +int main(int argc, char **argv) +{ + int n_blocks = 65536; + int iters = 100; + uint64_t seed = 0; + int verify_only = 0; + const char *spv_path = "v3d_mc_8h.spv"; + + static struct option opts[] = { + {"blocks", required_argument, 0, 'b'}, + {"iters", required_argument, 0, 'i'}, + {"seed", required_argument, 0, 's'}, + {"spv", required_argument, 0, 'S'}, + {"verify-only", no_argument, 0, 'V'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "b:i:s:S:V", opts, 0)) != -1;) { + switch (c) { + case 'b': n_blocks = atoi(optarg); break; + case 'i': iters = atoi(optarg); break; + case 's': seed = strtoull(optarg, 0, 0); break; + case 'S': spv_path = optarg; break; + case 'V': verify_only = 1; break; + default: return 2; + } + } + + xs_state = seed ? seed : 0xabcdef1234567890ULL; + + v3d_runner *r = v3d_runner_create(); + if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; } + printf("=== v3d MC 8h bench ===\n"); + printf(" device: %s\n", v3d_runner_device_name(r)); + printf(" n_blocks: %d iters: %d\n", n_blocks, iters); + + /* Buffers: meta + dst + src, all blocks contiguous. 
*/ + size_t meta_bytes = (size_t) n_blocks * 4 * sizeof(uint32_t); + size_t src_bytes = (size_t) n_blocks * SRC_BYTES; + size_t dst_bytes = (size_t) n_blocks * DST_BYTES; + + v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0}; + if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1; + if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1; + if (v3d_runner_create_buffer(r, src_bytes, &buf_src)) return 1; + + uint8_t *master_src = malloc(src_bytes); + uint8_t *expected = malloc(dst_bytes); + int *mxs = malloc(n_blocks * sizeof(int)); + if (!master_src || !expected || !mxs) { fprintf(stderr, "alloc\n"); return 1; } + for (int i = 0; i < n_blocks; i++) { + gen_src(master_src + (size_t)i * SRC_BYTES); + mxs[i] = (int)(xs() & 15); + } + + /* Build C-ref expected. C ref takes `src + 3, src_stride = SRC_W`. */ + memset(expected, 0, dst_bytes); + for (int i = 0; i < n_blocks; i++) { + daedalus_vp9_put_regular_8h_ref( + expected + (size_t)i * DST_BYTES, DST_W, + master_src + (size_t)i * SRC_BYTES + 3, SRC_W, + DST_H, mxs[i], 0); + } + + /* Populate GPU buffers. Contracts (phase4 §5) enforced via asserts. */ + uint32_t dst_stride_u8 = DST_W; + uint32_t src_stride_u8 = SRC_W; + assert(dst_stride_u8 >= 8 && "phase4 §5 contract 1"); + assert(src_stride_u8 >= 15 && "phase4 §5 contract 2"); + + uint32_t *meta = (uint32_t *) buf_meta.mapped; + for (int i = 0; i < n_blocks; i++) { + /* src_off: RAW block base. NO +3 shift. (phase5''' finding 4) */ + uint32_t src_off = (uint32_t)((size_t)i * SRC_BYTES); + uint32_t dst_off = (uint32_t)((size_t)i * DST_BYTES); + meta[4*i + 0] = dst_off; + meta[4*i + 1] = src_off; + meta[4*i + 2] = (uint32_t) mxs[i]; + meta[4*i + 3] = 0; + } + memcpy(buf_src.mapped, master_src, src_bytes); + memset(buf_dst.mapped, 0, dst_bytes); + + /* Pipeline. 
*/ + v3d_pipeline pipe = {0}; + if (v3d_runner_create_pipeline(r, spv_path, + /*n_ssbos=*/3, + /*push_const_size=*/sizeof(push_consts), + &pipe)) return 1; + v3d_buffer bind_bufs[3] = { buf_meta, buf_dst, buf_src }; + if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1; + + const uint32_t blocks_per_wg = 32; + uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1) / blocks_per_wg); + printf(" dispatch: %u WGs × 256 invocations = %u blocks (rounded up from %d)\n", + group_count_x, group_count_x * blocks_per_wg, n_blocks); + + push_consts pc = { + .n_blocks = (uint32_t) n_blocks, + .dst_stride_u8 = dst_stride_u8, + .src_stride_u8 = src_stride_u8, + ._pad = 0, + }; + + VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); + VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; + vkBeginCommandBuffer(cb, &cbbi); + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, + pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); + vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pc), &pc); + vkCmdDispatch(cb, group_count_x, 1, 1); + vkEndCommandBuffer(cb); + + /* --- M1''' bit-exact --- */ + printf("\n=== M1''': QPU vs C reference bit-exact ===\n"); + memset(buf_dst.mapped, 0, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + + int mismatch_blocks = 0; + int total_byte_diffs = 0; + int prints = 0; + for (int i = 0; i < n_blocks; i++) { + const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * DST_BYTES; + const uint8_t *e = expected + (size_t)i * DST_BYTES; + if (memcmp(q, e, DST_BYTES) != 0) { + int diffs = 0; + for (int j = 0; j < DST_BYTES; j++) if (q[j] != e[j]) diffs++; + total_byte_diffs += diffs; + if (prints < 3) { + fprintf(stderr, "MISMATCH block %d mx=%d: %d/64 bytes differ\n", + i, mxs[i], diffs); + fprintf(stderr, " ref:"); + for (int r0 = 0; r0 < 8; r0++) { + fprintf(stderr, "\n r%d ", r0); + for 
(int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]); + } + fprintf(stderr, "\n qpu:"); + for (int r0 = 0; r0 < 8; r0++) { + fprintf(stderr, "\n r%d ", r0); + for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]); + } + fprintf(stderr, "\n"); + prints++; + } + mismatch_blocks++; + } + } + printf(" blocks bit-exact: %d / %d (%.4f%%)\n", + n_blocks - mismatch_blocks, n_blocks, + 100.0 * (n_blocks - mismatch_blocks) / n_blocks); + printf(" total byte diffs: %d / %zu (%.4f%%)\n", + total_byte_diffs, (size_t) n_blocks * DST_BYTES, + 100.0 * total_byte_diffs / ((double) n_blocks * DST_BYTES)); + + if (mismatch_blocks > 0) { + fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_src); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + return 1; + } + + if (verify_only) { + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_src); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + return 0; + } + + /* --- M2''' throughput --- */ + printf("\n=== M2''': QPU throughput ===\n"); + + for (int i = 0; i < 10; i++) { + memset(buf_dst.mapped, 0, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + } + + double t0 = now_seconds(); + for (int i = 0; i < iters; i++) { + memset(buf_dst.mapped, 0, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + } + double t1 = now_seconds(); + + double s0 = now_seconds(); + for (int i = 0; i < iters; i++) memset(buf_dst.mapped, 0, dst_bytes); + double s1 = now_seconds(); + + double kernel_seconds = (t1 - t0) - (s1 - s0); + double total_blocks = (double) n_blocks * iters; + double mbps = total_blocks / kernel_seconds / 1e6; + + printf(" blocks/dispatch: %d\n", n_blocks); + printf(" iters: %d\n", iters); + printf(" total blocks: %.0f\n", total_blocks); + printf(" elapsed 
(kernel)=%.6f s\n", kernel_seconds); + printf(" elapsed (setup) =%.6f s\n", s1 - s0); + printf(" M2''' throughput = %.3f Mblock/s\n", mbps); + printf(" per-block = %.1f ns\n", kernel_seconds / total_blocks * 1e9); + printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6); + + double M3 = 20.997; /* from k3_mc_phase3.md */ + double R = mbps / M3; + printf("\n Cycle 3 NEON M3''' = %.3f Mblock/s\n", M3); + printf(" R''' = M2'''/M3''' = %.3f\n", R); + if (R >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n"); + else if (R >= 0.5) printf(" decision band = YELLOW: M4''' decides\n"); + else if (R >= 0.1) printf(" decision band = ORANGE: M4''' may still rescue\n"); + else printf(" decision band = RED: structural mismatch\n"); + + /* 30fps@1080p floor check (per project_30fps_floor_is_fine.md) */ + double mblocks_per_1080p = 32400.0 * 30.0 / 1e6; + printf("\n 30fps@1080p floor : %.3f Mblock/s (32400 blocks × 30 fps)\n", + mblocks_per_1080p); + printf(" isolation margin : %.1fx over 30fps floor\n", + mbps / mblocks_per_1080p); + + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_src); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + free(master_src); free(expected); free(mxs); + return 0; +} diff --git a/tests/vp9_mc_ref.c b/tests/vp9_mc_ref.c new file mode 100644 index 0000000..fddc11a --- /dev/null +++ b/tests/vp9_mc_ref.c @@ -0,0 +1,72 @@ +/* + * Standalone bit-exact C reference for VP9 8-tap "regular" subpel + * filter, horizontal direction, 8-pixel-wide output. Transcribed + * from FFmpeg's libavcodec/vp9dsp_template.c FILTER_8TAP macro + * (vendored at external/ffmpeg-snapshot/). 8-bit pixels only. + * + * Filter coefficients embedded inline (REGULAR filter only, all 16 + * subpel phases). Same values as ff_vp9_subpel_filters[1][mx] in + * external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c. + * + * License: LGPL-2.1-or-later. 
// */
/*
 * (The line above ends the file header comment that is still open at this
 * point in the file; read on its own it is an ordinary line comment.)
 *
 * Spec source: VP9 specification §8.5.1 — subpel motion compensation.
 */
#include <stddef.h>   /* ptrdiff_t */
#include <stdint.h>   /* uint8_t, int16_t */

/*
 * 8-tap coefficients of the "regular" filter, indexed [mx][tap] for the 16
 * subpel phases.  Every row sums to 128, which is why the result is
 * normalised with (sum + 64) >> 7 below.  Row 0 is the pass-through kernel.
 */
static const int16_t vp9_8tap_regular_filters[16][8] = {
    {  0,  0,   0, 128,   0,   0,  0,  0 },
    {  0,  1,  -5, 126,   8,  -3,  1,  0 },
    { -1,  3, -10, 122,  18,  -6,  2,  0 },
    { -1,  4, -13, 118,  27,  -9,  3, -1 },
    { -1,  4, -16, 112,  37, -11,  4, -1 },
    { -1,  5, -18, 105,  48, -14,  4, -1 },
    { -1,  5, -19,  97,  58, -16,  5, -1 },
    { -1,  6, -19,  88,  68, -18,  5, -1 },
    { -1,  6, -19,  78,  78, -19,  6, -1 },
    { -1,  5, -18,  68,  88, -19,  6, -1 },
    { -1,  5, -16,  58,  97, -19,  5, -1 },
    { -1,  4, -14,  48, 105, -18,  5, -1 },
    { -1,  4, -11,  37, 112, -16,  4, -1 },
    { -1,  3,  -9,  27, 118, -13,  4, -1 },
    {  0,  2,  -6,  18, 122, -10,  3, -1 },
    {  0,  1,  -3,   8, 126,  -5,  1,  0 },
};

/* Saturate an int to the 8-bit pixel range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    return (uint8_t)(x > 255 ? 255 : x < 0 ? 0 : x);
}

/*
 * 8x8 horizontal 8-tap "put" (non-averaging).  Width hard-coded 8.
 * `src` must point at the row-0 output-column-0 source pixel; valid
 * source memory must extend src[r*src_stride + (-3..+11)] for r=0..h-1.
 * `dst` is written at dst[r*dst_stride + 0..7] for r=0..h-1.
 *
 * Parameters:
 *   dst, dst_stride  output rows and the byte distance between them
 *   src, src_stride  input rows and the byte distance between them
 *   h                number of rows; h <= 0 writes nothing
 *   mx               horizontal subpel phase; only the low 4 bits are used
 *   my               ignored (kept so the signature matches the NEON entry)
 *
 * Matches ff_vp9_put_regular8_h_neon byte-for-byte on 8-bit input.
 */
void daedalus_vp9_put_regular_8h_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                     const uint8_t *src, ptrdiff_t src_stride,
                                     int h, int mx, int my)
{
    (void) my;   /* horizontal-only filter ignores y phase */
    const int16_t *F = vp9_8tap_regular_filters[mx & 15];

    for (int r = 0; r < h; r++) {
        for (int x = 0; x < 8; x++) {
            /* Taps are centred so that F[3] weights the pixel at column x. */
            int sum = F[0] * (int) src[x - 3]
                    + F[1] * (int) src[x - 2]
                    + F[2] * (int) src[x - 1]
                    + F[3] * (int) src[x + 0]
                    + F[4] * (int) src[x + 1]
                    + F[5] * (int) src[x + 2]
                    + F[6] * (int) src[x + 3]
                    + F[7] * (int) src[x + 4];
            /* Round to nearest, drop the 7 fractional bits, then saturate.
             * sum may be negative; this assumes >> on a negative int is an
             * arithmetic shift — TODO confirm for the target compiler. */
            dst[x] = clip_u8((sum + 64) >> 7);
        }
        dst += dst_stride;
        src += src_stride;
    }
}