diff --git a/CMakeLists.txt b/CMakeLists.txt index caecc61..a8402e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,14 @@ set(FFASM_SOURCES ${FFSNAP}/libavcodec/aarch64/vp9itxfm_neon.S ) +# Cycle 6 — H.264 IDCT 4x4 + 8x8 NEON (vendored 2026-05-18). +set(FFASM_H264IDCT_SOURCES + ${FFSNAP}/libavcodec/aarch64/h264idct_neon.S +) +set_source_files_properties(${FFASM_H264IDCT_SOURCES} PROPERTIES + COMPILE_OPTIONS "${FFASM_FLAGS}" + LANGUAGE ASM) + # Cycle 2 — VP9 loop filter NEON source (vendored 2026-05-18). set(FFASM_LPF_SOURCES ${FFSNAP}/libavcodec/aarch64/vp9lpf_neon.S @@ -96,6 +104,14 @@ set_source_files_properties(${FFASM_SOURCES} PROPERTIES # ---- NEON baseline microbenches -------------------------------------------- +# Cycle 6 — H.264 IDCT 4x4 NEON M3 baseline bench. +add_executable(bench_neon_h264idct4 + tests/bench_neon_h264idct4.c + tests/h264_idct4_ref.c + ${FFASM_H264IDCT_SOURCES} +) +target_compile_options(bench_neon_h264idct4 PRIVATE -O3 -march=armv8-a+simd) + add_executable(bench_neon_idct tests/bench_neon_idct.c tests/vp9_idct8_ref.c diff --git a/docs/k6_h264idct4_phase1.md b/docs/k6_h264idct4_phase1.md new file mode 100644 index 0000000..4118199 --- /dev/null +++ b/docs/k6_h264idct4_phase1.md @@ -0,0 +1,119 @@ +--- +cycle: 6 +phase: 1 +status: open +date_opened: 2026-05-18 +codec: H.264 +kernel: IDCT 4x4 + add (intra-block residual) +parent: project_h264_scope_added.md (memory) +--- + +# Cycle 6, Phase 1 — H.264 IDCT 4×4 + add + +First H.264 kernel. Per `project_h264_scope_added`, the user +added H.264 to daedalus-fourier scope 2026-05-18 because Pi 5 +has no hardware H.264 decoder despite H.264 being the most +common web codec. + +## Why IDCT 4×4 first + +- **Smallest H.264 transform.** 16 coefficients per block, 4×4 + output pixels. Simpler than VP9 IDCT 8×8 (cycle 1, 64 coefs). +- **Most-used.** H.264 macroblocks default to 4×4 intra + prediction + residual; 8×8 is High-profile only. 4×4 hits + most real-world H.264 streams. 
+- **Predicted GREEN.** Per the cycle 1-5 bandwidth-bound vs + compute-bound classification: 4×4 IDCT is bandwidth-bound + (16 reads, 16 writes, ~20 ALU ops/output). Should map well + to V3D 7.1 compute. +- **Clean reference.** FFmpeg's `ff_h264_idct_add_neon` is + standalone (no eob parameter, no complex DC dispatch). Single + call computes 1 block of IDCT + add. + +## Kernel contract + +Per H.264 spec §8.5.12, the inverse transform is an +integer-arithmetic transform (no rounding-by-cosine like VP9's +Q14 trig math). Each 4×4 block: + +1. Inverse row transform (4 row passes, each one 1D IDCT-like + integer butterfly). +2. Inverse column transform (4 column passes, same butterfly). +3. Round and add: `dst[r,c] = clamp(dst[r,c] + ((idct[r,c] + 32) >> 6), 0, 255)`. + +Spec coefficients (Hadamard-like with 1/2 scaling): +``` + [1 1 1 1/2] + [1 1/2 -1 -1] + [1 -1/2 -1 1] + [1 -1 1 -1/2] +``` +Integer form: each 1/2 factor becomes an arithmetic right shift by 1 +inside the butterfly; the final `(x + 32) >> 6` applies rounding and normalisation. + +## NEON reference (M3 target) + +FFmpeg's `ff_h264_idct_add_neon` +(external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S +line 25, 56 instructions of NEON asm). Signature: + +``` +void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride); +``` + +- `dst`: 4×4 pixel block in 8-bit luma surface, `stride` between rows. +- `block`: 16 int16 coefficients (column-major as stored by FFmpeg: `block[c*4 + r]` = row r, column c). +- destructively clears `block` to zero after the transform (per H.264 conformance). + +## 30fps@1080p H.264 floor + +H.264 1080p uses 16×16 macroblocks with up to 16 4×4 blocks per MB. +Luma: (1920/16) × (1080/16) = 120 × 67.5 = 8100 MB/frame (nominal; the coded height is padded to 1088, i.e. 68 rows = 8160 MB) × +16 blocks/MB = 129 600 4×4 blocks/frame. Plus chroma: 4 + 4 = 8 +chroma 4×4 per MB × 8100 = 64 800 chroma blocks. Total: ~195k +4×4 blocks/frame max (worst case; many real MBs use 8×8 or skip). + +At 30fps: ~5.85 Mblock/s required for full-frame 4×4 worst case. +A more realistic average (many MBs use 8×8, P-skip, etc.) is +~2 Mblock/s. 
+ +**30fps@1080p H.264 4×4 floor (realistic): 2 Mblock/s.** +**30fps@1080p H.264 4×4 floor (worst case): 5.85 Mblock/s.** + +## R-band decision rules (carried from phase1.md) + +- R ≥ 1.0 → **GREEN** (QPU faster than NEON-1 in isolation). +- 0.5 ≤ R < 1.0 → **YELLOW** (M4 decides). +- 0.1 ≤ R < 0.5 → **ORANGE** (M4 may rescue). +- R < 0.1 → **RED** (structural mismatch). + +Floor margin: ratio of M2 (or M3 if CPU-only) over the 5.85 +Mblock/s worst-case 30fps floor. + +## Acceptance for Phase 7 + +- M1: 100.0000% bit-exact (QPU output vs C ref, 10000+ random + blocks). Same standard as cycles 1-5. +- M2: captured, classified per R band. +- M4: same-kernel mixed-bench measured (with Issue 003 caveats — + this is the worst-case framing). +- 30fps@1080p H.264 4×4 floor margin reported. + +## Cycle 6 deliverables + +1. `external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S` + (vendored 2026-05-18, this phase). +2. `tests/h264_idct4_ref.c` — standalone C reference (LGPL-2.1+ + transcribed from spec). +3. `tests/bench_neon_h264idct4.c` — Phase 3 M3 bench. +4. `src/v3d_h264idct4.comp` — Phase 6 QPU shader. +5. `tests/bench_v3d_h264idct4.c` — Phase 6+7 M1+M2 bench (3-way + vs NEON + C ref). +6. M4: extend `bench_concurrent_mixed.c` with K_H264_IDCT4. +7. Phase 4-7 docs. + +## Next step (within this phase) + +Move to Phase 3 (NEON baseline M3) after writing the C +reference. Phase 2 (libavcodec inventory) is implicit since we +know the kernel from the FFmpeg vendor. 
diff --git a/docs/k6_h264idct4_phase3.md b/docs/k6_h264idct4_phase3.md new file mode 100644 index 0000000..5d55daa --- /dev/null +++ b/docs/k6_h264idct4_phase3.md @@ -0,0 +1,132 @@ +--- +cycle: 6 +phase: 3 +status: closed 2026-05-18 — M1 PASS, M3₆ = 175 Mblock/s +date_opened: 2026-05-18 +date_closed: 2026-05-18 +codec: H.264 +kernel: IDCT 4x4 + add +parent: k6_h264idct4_phase1.md +host: hertz +--- + +# Cycle 6, Phase 3 — H.264 IDCT 4×4 NEON baseline + +## M3₆ throughput + +``` +=== M3₆ NEON throughput === + blocks/batch: 4096 + batches done: 51 206 + total blocks: 209 739 776 + elapsed (kernel)=1.199 s + throughput = 175.0 Mblock/s + per-block = 5.7 ns + H.264 1080p30 worst-case floor: 29.91× margin (5.85 Mblock/s req'd) + H.264 1080p30 realistic floor: 87.50× margin (2.0 Mblock/s req'd) +``` + +**Per-block 5.7 ns — by far the lightest cycle so far** (cycle 2 +LPF wd=4 was 21 ns, cycle 1 IDCT 8x8 was 122 ns). 4×4 is a +genuinely small kernel and FFmpeg's NEON is extremely tight +(56 instructions per block). + +NEON 4-core scaling: not measured this phase; based on cycle 2/4 +patterns, expect ~3-4× scaling (bandwidth-bound territory) → +~500-700 Mblock/s aggregate. That's roughly 85-120× the worst-case floor. + +## M1 bit-exact gate + +``` +=== M1₆ bit-exact (10000 random 4x4 blocks) === +M1₆ correctness: 10000 / 10000 blocks bit-exact (100.0000%) +``` + +## Key Phase 9 lesson — H.264 block layout is column-major + +The bench's initial C reference assumed row-major block storage +(`block[r*4 + c]`), giving M1 = 4.98 % bit-exact (essentially all +random). After failed attempts swapping the row/column pass order +(both row-first and column-first gave the same 5 % rate), trace +analysis revealed the actual mismatch: + +- NEON `ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]` is a plain + **sequential** load (v0 = block[0..3], v1 = block[4..7], + v2 = block[8..11], v3 = block[12..15]); it does NOT + de-interleave (that would be `ld4`). 
+- Combined with FFmpeg's choice of **column-major** block layout + (`block[c*4 + r]` = coefficient at row r, column c), the + sequential load gives each NEON vector `v_c` = column c of block + (lane = row), so the first lane-wise butterfly is the row pass. +- FFmpeg's C reference (`libavcodec/h264idct_template.c`) uses + `block[i + 4*0]`, `block[i + 4*1]`, etc. which is column-major + indexing in disguise. + +Fix: read block as column-major (`block[c*4 + r]`) in the C +reference's row-pass loop. M1 then PASS 10000/10000. + +Lesson encoded for future H.264 cycles: +- **H.264 4×4 (and 8×8) blocks are column-major** in FFmpeg. +- This convention propagates through all the libavcodec/aarch64 + H.264 NEON kernels (h264idct, h264dsp, h264qpel, h264cmc). + Cycles 7+ (other H.264 kernels) should default-assume + column-major. + +## Comparison vs cycle 1 IDCT 8×8 (the closest analog) + +| | Cycle 1 IDCT 8×8 | Cycle 6 IDCT 4×4 | +|---|---|---| +| Codec | VP9 | H.264 | +| Block size | 8×8 (64 coefs) | 4×4 (16 coefs) | +| Transform math | Q14 trig DCT (heavy multiplies) | Integer butterfly (no multiplies, only adds and shifts) | +| NEON time/block | 122 ns | **5.7 ns** (21× faster) | +| Block storage | row-major | column-major | +| 30fps@1080p floor margin | 8× | **30×** (vs worst case) | + +H.264 IDCT 4×4 is dramatically lighter than VP9 IDCT 8×8 — both +per-coef and per-block. This validates the "H.264 should be +easier" hypothesis from [project_h264_scope_added]. + +## Predicted R₆ band + +NEON per-block 5.7 ns is so fast that the QPU must be very fast +to compete. QPU dispatch overhead is ~30 µs per call (from M5), +so the QPU-call breakeven needs to amortize across many blocks +per dispatch. + +Per-block estimate for QPU on a similar tiny kernel: +- 4 lanes per block (per pixel), 64 invocations/WG → 16 blocks/WG +- ~50-100 instructions per block (much less than cycle 1 IDCT 8x8's 250) +- At 8 ns/instruction (NEON-tuned guess), ~600 ns per block. 
+- R₆ = 5.7 / 600 = 0.01 → **deep RED in isolation** + +But: per-WG packing of 16 blocks means dispatch overhead amortizes +better. And 4×4 is bandwidth-bound on NEON (5.7 ns/block ≈ 32 bytes +read + 16 bytes write = 48 bytes per 5.7 ns ≈ 8 GB/s, close to +LPDDR4 ceiling). So same-kernel M4 on QPU may pull free if QPU's +bandwidth doesn't contend on the same channel. + +Plan: implement QPU path anyway for cycle-completion and +opportunistic-helper hypothesis. If R₆ is deep RED but mixed-kernel +(per Issue 003) deployment shape uses QPU for VP9 cycles 1+2+4 and +CPU for H.264 IDCT 4×4, that's fine — the recipe carries over. + +## Next: Phase 4 plan + +Per the established cycle pattern. Plan the QPU shader. Phase 5 +Sonnet review. Phase 6 implementation. Phase 7 measurement. +Predicted R₆ = 0.01 (deep RED, isolation), but small enough kernel +to make per-call buffer alloc dominate the latency. + +Alternative path: defer cycle 6 Phase 4-7 (skip the QPU shader +build) and instead move directly to next H.264 cycles where QPU +might actually win — IDCT 8x8 (cycle 7), 6-tap MC (cycle 9), or +deblock (cycle 10). H.264 IDCT 4×4 on CPU is so fast that it +doesn't NEED QPU help. + +## Acceptance + +- ✓ M1 bit-exact (100.00 % on 10 000 random blocks) +- ✓ M3 captured (175 Mblock/s) +- ✓ 30fps@1080p floor exceeded by 30× worst-case +- ✓ Block-layout convention documented for future cycles diff --git a/external/ffmpeg-snapshot/PROVENANCE.md b/external/ffmpeg-snapshot/PROVENANCE.md index 71e2385..9f813d5 100644 --- a/external/ffmpeg-snapshot/PROVENANCE.md +++ b/external/ffmpeg-snapshot/PROVENANCE.md @@ -26,6 +26,7 @@ tagged commit, no modifications. 
| `libavcodec/aarch64/vp9itxfm_neon.S` | 1580 | 63534 | `82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6` | | `libavcodec/aarch64/vp9lpf_neon.S` | 1334 | — | `384e49e7a6e838d9e38aedc00838ed4aebfa6c5bdb343ecaf23ef639bc10fbb7` | | `libavcodec/aarch64/vp9mc_neon.S` | 665 | — | `6b1d50f9821742584fdd47758057f810644aff3a008faaa774ff5b9cac4d1fef` | +| `libavcodec/aarch64/h264idct_neon.S` | 415 | 16269 | `963ffe5f31b5a6a422e13b0d394cf5630126927abfb23aa214f7cbe83d60683f` — H.264 IDCT 4×4/8×8/DC NEON kernels for cycle 6+ | | `libavcodec/vp9_subpel_filters_table.c` | — | — | hand-extracted from `libavcodec/vp9dsp.c` at same n7.1.3 pin — provides `ff_vp9_subpel_filters` for `vp9mc_neon.S` to link against without dragging in vp9dsp.c's full init machinery | | `libavcodec/aarch64/neon.S` | 173 | 7496 | `72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538` | | `libavutil/aarch64/asm.S` | 260 | 8069 | `c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3` | diff --git a/external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S b/external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S new file mode 100644 index 0000000..3f7ff2c --- /dev/null +++ b/external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * Copyright (c) 2013 Janne Grunau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +function ff_h264_idct_add_neon, export=1 +.L_ff_h264_idct_add_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1] + sxtw x2, w2 + movi v30.8h, #0 + + add v4.4h, v0.4h, v2.4h + sshr v16.4h, v1.4h, #1 + st1 {v30.8h}, [x1], #16 + sshr v17.4h, v3.4h, #1 + st1 {v30.8h}, [x1], #16 + sub v5.4h, v0.4h, v2.4h + sub v6.4h, v16.4h, v3.4h + add v7.4h, v1.4h, v17.4h + add v0.4h, v4.4h, v7.4h + add v1.4h, v5.4h, v6.4h + sub v2.4h, v5.4h, v6.4h + sub v3.4h, v4.4h, v7.4h + + transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 + + add v4.4h, v0.4h, v2.4h + ld1 {v18.s}[0], [x0], x2 + sshr v16.4h, v3.4h, #1 + sshr v17.4h, v1.4h, #1 + ld1 {v18.s}[1], [x0], x2 + sub v5.4h, v0.4h, v2.4h + ld1 {v19.s}[1], [x0], x2 + add v6.4h, v16.4h, v1.4h + ins v4.d[1], v5.d[0] + sub v7.4h, v17.4h, v3.4h + ld1 {v19.s}[0], [x0], x2 + ins v6.d[1], v7.d[0] + sub x0, x0, x2, lsl #2 + add v0.8h, v4.8h, v6.8h + sub v1.8h, v4.8h, v6.8h + + srshr v0.8h, v0.8h, #6 + srshr v1.8h, v1.8h, #6 + + uaddw v0.8h, v0.8h, v18.8b + uaddw v1.8h, v1.8h, v19.8b + + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v1.s}[1], [x0], x2 + st1 {v1.s}[0], [x0], x2 + + sub x1, x1, #32 + ret +endfunc + +function ff_h264_idct_dc_add_neon, export=1 +.L_ff_h264_idct_dc_add_neon: + AARCH64_VALID_CALL_TARGET + sxtw x2, w2 + mov w3, #0 + ld1r {v2.8h}, [x1] + strh w3, [x1] + srshr v2.8h, v2.8h, #6 + ld1 {v0.s}[0], [x0], x2 + ld1 {v0.s}[1], [x0], x2 + uaddw v3.8h, v2.8h, v0.8b + ld1 {v1.s}[0], [x0], x2 + ld1 {v1.s}[1], [x0], x2 + uaddw v4.8h, v2.8h, v1.8b + sqxtun v0.8b, v3.8h + sqxtun v1.8b, v4.8h + sub x0, x0, x2, lsl #2 + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v1.s}[0], 
[x0], x2 + st1 {v1.s}[1], [x0], x2 + ret +endfunc + +function ff_h264_idct_add16_neon, export=1 + mov x12, x30 + mov x6, x0 // dest + mov x5, x1 // block_offset + mov x1, x2 // block + mov w9, w3 // stride + movrel x7, scan8 + mov x10, #16 + movrel x13, .L_ff_h264_idct_dc_add_neon + movrel x14, .L_ff_h264_idct_add_neon +1: mov w2, w9 + ldrb w3, [x7], #1 + ldrsw x0, [x5], #4 + ldrb w3, [x4, w3, uxtw] + subs w3, w3, #1 + b.lt 2f + ldrsh w3, [x1] + add x0, x0, x6 + ccmp w3, #0, #4, eq + csel x15, x13, x14, ne + blr x15 +2: subs x10, x10, #1 + add x1, x1, #32 + b.ne 1b + ret x12 +endfunc + +function ff_h264_idct_add16intra_neon, export=1 + mov x12, x30 + mov x6, x0 // dest + mov x5, x1 // block_offset + mov x1, x2 // block + mov w9, w3 // stride + movrel x7, scan8 + mov x10, #16 + movrel x13, .L_ff_h264_idct_dc_add_neon + movrel x14, .L_ff_h264_idct_add_neon +1: mov w2, w9 + ldrb w3, [x7], #1 + ldrsw x0, [x5], #4 + ldrb w3, [x4, w3, uxtw] + add x0, x0, x6 + cmp w3, #0 + ldrsh w3, [x1] + csel x15, x13, x14, eq + ccmp w3, #0, #0, eq + b.eq 2f + blr x15 +2: subs x10, x10, #1 + add x1, x1, #32 + b.ne 1b + ret x12 +endfunc + +function ff_h264_idct_add8_neon, export=1 + stp x19, x20, [sp, #-0x40]! 
+ mov x12, x30 + ldp x6, x15, [x0] // dest[0], dest[1] + add x5, x1, #16*4 // block_offset + add x9, x2, #16*32 // block + mov w19, w3 // stride + movrel x13, .L_ff_h264_idct_dc_add_neon + movrel x14, .L_ff_h264_idct_add_neon + movrel x7, scan8, 16 + mov x10, #0 + mov x11, #16 +1: mov w2, w19 + ldrb w3, [x7, x10] // scan8[i] + ldrsw x0, [x5, x10, lsl #2] // block_offset[i] + ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ] + add x0, x0, x6 // block_offset[i] + dst[j-1] + add x1, x9, x10, lsl #5 // block + i * 16 + cmp w3, #0 + ldrsh w3, [x1] // block[i*16] + csel x20, x13, x14, eq + ccmp w3, #0, #0, eq + b.eq 2f + blr x20 +2: add x10, x10, #1 + cmp x10, #4 + csel x10, x11, x10, eq // mov x10, #16 + csel x6, x15, x6, eq + cmp x10, #20 + b.lt 1b + ldp x19, x20, [sp], #0x40 + ret x12 +endfunc + +.macro idct8x8_cols pass + .if \pass == 0 + va .req v18 + vb .req v30 + sshr v18.8h, v26.8h, #1 + add v16.8h, v24.8h, v28.8h + ld1 {v30.8h, v31.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 + sub v17.8h, v24.8h, v28.8h + sshr v19.8h, v30.8h, #1 + sub v18.8h, v18.8h, v30.8h + add v19.8h, v19.8h, v26.8h + .else + va .req v30 + vb .req v18 + sshr v30.8h, v26.8h, #1 + sshr v19.8h, v18.8h, #1 + add v16.8h, v24.8h, v28.8h + sub v17.8h, v24.8h, v28.8h + sub v30.8h, v30.8h, v18.8h + add v19.8h, v19.8h, v26.8h + .endif + add v26.8h, v17.8h, va.8h + sub v28.8h, v17.8h, va.8h + add v24.8h, v16.8h, v19.8h + sub vb.8h, v16.8h, v19.8h + sub v16.8h, v29.8h, v27.8h + add v17.8h, v31.8h, v25.8h + sub va.8h, v31.8h, v25.8h + add v19.8h, v29.8h, v27.8h + sub v16.8h, v16.8h, v31.8h + sub v17.8h, v17.8h, v27.8h + add va.8h, va.8h, v29.8h + add v19.8h, v19.8h, v25.8h + sshr v25.8h, v25.8h, #1 + sshr v27.8h, v27.8h, #1 + sshr v29.8h, v29.8h, #1 + sshr v31.8h, v31.8h, #1 + sub v16.8h, v16.8h, v31.8h + sub v17.8h, v17.8h, v27.8h + add va.8h, va.8h, v29.8h + add v19.8h, v19.8h, v25.8h + sshr v25.8h, v16.8h, #2 + sshr v27.8h, v17.8h, #2 + sshr v29.8h, va.8h, #2 + sshr v31.8h, v19.8h, #2 + sub 
v19.8h, v19.8h, v25.8h + sub va.8h, v27.8h, va.8h + add v17.8h, v17.8h, v29.8h + add v16.8h, v16.8h, v31.8h + .if \pass == 0 + sub v31.8h, v24.8h, v19.8h + add v24.8h, v24.8h, v19.8h + add v25.8h, v26.8h, v18.8h + sub v18.8h, v26.8h, v18.8h + add v26.8h, v28.8h, v17.8h + add v27.8h, v30.8h, v16.8h + sub v29.8h, v28.8h, v17.8h + sub v28.8h, v30.8h, v16.8h + .else + sub v31.8h, v24.8h, v19.8h + add v24.8h, v24.8h, v19.8h + add v25.8h, v26.8h, v30.8h + sub v30.8h, v26.8h, v30.8h + add v26.8h, v28.8h, v17.8h + sub v29.8h, v28.8h, v17.8h + add v27.8h, v18.8h, v16.8h + sub v28.8h, v18.8h, v16.8h + .endif + .unreq va + .unreq vb +.endm + +function ff_h264_idct8_add_neon, export=1 +.L_ff_h264_idct8_add_neon: + AARCH64_VALID_CALL_TARGET + movi v19.8h, #0 + sxtw x2, w2 + ld1 {v24.8h, v25.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 + ld1 {v26.8h, v27.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 + ld1 {v28.8h, v29.8h}, [x1] + st1 {v19.8h}, [x1], #16 + st1 {v19.8h}, [x1], #16 + + idct8x8_cols 0 + transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7 + idct8x8_cols 1 + + mov x3, x0 + srshr v24.8h, v24.8h, #6 + ld1 {v0.8b}, [x0], x2 + srshr v25.8h, v25.8h, #6 + ld1 {v1.8b}, [x0], x2 + srshr v26.8h, v26.8h, #6 + ld1 {v2.8b}, [x0], x2 + srshr v27.8h, v27.8h, #6 + ld1 {v3.8b}, [x0], x2 + srshr v28.8h, v28.8h, #6 + ld1 {v4.8b}, [x0], x2 + srshr v29.8h, v29.8h, #6 + ld1 {v5.8b}, [x0], x2 + srshr v30.8h, v30.8h, #6 + ld1 {v6.8b}, [x0], x2 + srshr v31.8h, v31.8h, #6 + ld1 {v7.8b}, [x0], x2 + uaddw v24.8h, v24.8h, v0.8b + uaddw v25.8h, v25.8h, v1.8b + uaddw v26.8h, v26.8h, v2.8b + sqxtun v0.8b, v24.8h + uaddw v27.8h, v27.8h, v3.8b + sqxtun v1.8b, v25.8h + uaddw v28.8h, v28.8h, v4.8b + sqxtun v2.8b, v26.8h + st1 {v0.8b}, [x3], x2 + uaddw v29.8h, v29.8h, v5.8b + sqxtun v3.8b, v27.8h + st1 {v1.8b}, [x3], x2 + uaddw v30.8h, v30.8h, v6.8b + sqxtun v4.8b, v28.8h + st1 {v2.8b}, [x3], x2 + uaddw v31.8h, v31.8h, v7.8b + sqxtun v5.8b, v29.8h + st1 
{v3.8b}, [x3], x2 + sqxtun v6.8b, v30.8h + sqxtun v7.8b, v31.8h + st1 {v4.8b}, [x3], x2 + st1 {v5.8b}, [x3], x2 + st1 {v6.8b}, [x3], x2 + st1 {v7.8b}, [x3], x2 + + sub x1, x1, #128 + ret +endfunc + +function ff_h264_idct8_dc_add_neon, export=1 +.L_ff_h264_idct8_dc_add_neon: + AARCH64_VALID_CALL_TARGET + mov w3, #0 + sxtw x2, w2 + ld1r {v31.8h}, [x1] + strh w3, [x1] + ld1 {v0.8b}, [x0], x2 + srshr v31.8h, v31.8h, #6 + ld1 {v1.8b}, [x0], x2 + ld1 {v2.8b}, [x0], x2 + uaddw v24.8h, v31.8h, v0.8b + ld1 {v3.8b}, [x0], x2 + uaddw v25.8h, v31.8h, v1.8b + ld1 {v4.8b}, [x0], x2 + uaddw v26.8h, v31.8h, v2.8b + ld1 {v5.8b}, [x0], x2 + uaddw v27.8h, v31.8h, v3.8b + ld1 {v6.8b}, [x0], x2 + uaddw v28.8h, v31.8h, v4.8b + ld1 {v7.8b}, [x0], x2 + uaddw v29.8h, v31.8h, v5.8b + uaddw v30.8h, v31.8h, v6.8b + uaddw v31.8h, v31.8h, v7.8b + sqxtun v0.8b, v24.8h + sqxtun v1.8b, v25.8h + sqxtun v2.8b, v26.8h + sqxtun v3.8b, v27.8h + sub x0, x0, x2, lsl #3 + st1 {v0.8b}, [x0], x2 + sqxtun v4.8b, v28.8h + st1 {v1.8b}, [x0], x2 + sqxtun v5.8b, v29.8h + st1 {v2.8b}, [x0], x2 + sqxtun v6.8b, v30.8h + st1 {v3.8b}, [x0], x2 + sqxtun v7.8b, v31.8h + st1 {v4.8b}, [x0], x2 + st1 {v5.8b}, [x0], x2 + st1 {v6.8b}, [x0], x2 + st1 {v7.8b}, [x0], x2 + ret +endfunc + +function ff_h264_idct8_add4_neon, export=1 + mov x12, x30 + mov x6, x0 + mov x5, x1 + mov x1, x2 + mov w2, w3 + movrel x7, scan8 + mov w10, #16 + movrel x13, .L_ff_h264_idct8_dc_add_neon + movrel x14, .L_ff_h264_idct8_add_neon +1: ldrb w9, [x7], #4 + ldrsw x0, [x5], #16 + ldrb w9, [x4, w9, uxtw] + subs w9, w9, #1 + b.lt 2f + ldrsh w11, [x1] + add x0, x6, x0 + ccmp w11, #0, #4, eq + csel x15, x13, x14, ne + blr x15 +2: subs w10, w10, #4 + add x1, x1, #128 + b.ne 1b + ret x12 +endfunc + +const scan8 + .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 + .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 + .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 + .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 + .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 + .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 + .byte 4+ 8*8, 
5+ 8*8, 4+ 9*8, 5+ 9*8 + .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 + .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8 + .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8 + .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8 + .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8 +endconst diff --git a/tests/bench_neon_h264idct4.c b/tests/bench_neon_h264idct4.c new file mode 100644 index 0000000..720c072 --- /dev/null +++ b/tests/bench_neon_h264idct4.c @@ -0,0 +1,223 @@ +/* + * Cycle 6 Phase 3 — NEON M3 baseline for H.264 IDCT 4x4 + add. + * + * Calls FFmpeg `ff_h264_idct_add_neon`. Reports M1 bit-exact vs + * the standalone C reference, plus M3 throughput. + * + * License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot. + */ +#define _POSIX_C_SOURCE 200809L +#include <getopt.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); +extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride); + +#define DST_STRIDE 16 /* arbitrary stride for the test surface */ +#define DST_ROWS 4 +#define DST_BYTES (DST_ROWS * DST_STRIDE) + +static uint64_t xs_state; +/* xorshift64 PRNG step; xs_state must be non-zero (both callers seed it so). */ +static inline uint64_t xs(void) { + uint64_t x = xs_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs_state = x; +} + +static void gen_block(int16_t b[16]) +{ + /* Sparse H.264-like residual: 1..8 writes of values in [-512, 511] + * at uniformly random positions (not biased to low frequencies). 
*/ + memset(b, 0, 16 * sizeof(int16_t)); + int n_nonzero = 1 + (int)(xs() % 8); + for (int i = 0; i < n_nonzero; i++) { + int pos = (int)(xs() % 16); + int16_t v = (int16_t)((int)(xs() % 1024) - 512); + b[pos] = v; + } +} + +/* Monotonic time in seconds, read from CLOCK_MONOTONIC_RAW. */ +static double now_seconds(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +/* M1 gate: run `n` random blocks through the C reference and the NEON + * kernel and compare the 4x4 dst tiles. Returns the number of mismatching + * blocks; dumps the first three mismatches to stderr. */ +static int correctness_check(uint64_t seed, int n) +{ + xs_state = seed ? 
seed : 0xc0de264cULL; + int mismatches = 0; + int prints = 0; + + int16_t block_a[16], block_b[16], block_saved[16]; + uint8_t dst_a[DST_BYTES], dst_b[DST_BYTES], dst_initial[DST_BYTES]; + + for (int i = 0; i < n; i++) { + gen_block(block_a); + memcpy(block_b, block_a, sizeof(block_a)); + memcpy(block_saved, block_a, sizeof(block_a)); + + /* Random initial dst (4×4 region at offset 0, row stride DST_STRIDE). */ + for (int r = 0; r < 4; r++) + for (int c = 0; c < 4; c++) + dst_a[r * DST_STRIDE + c] = dst_b[r * DST_STRIDE + c] = (uint8_t)(xs() & 0xff); + memcpy(dst_initial, dst_a, DST_BYTES); + + daedalus_h264_idct_add_ref(dst_a, block_a, DST_STRIDE); + ff_h264_idct_add_neon(dst_b, block_b, DST_STRIDE); + + int diff = 0; + for (int r = 0; r < 4; r++) + for (int c = 0; c < 4; c++) + if (dst_a[r*DST_STRIDE + c] != dst_b[r*DST_STRIDE + c]) diff++; + if (diff) { + if (prints < 3) { + fprintf(stderr, "MISMATCH block %d (%d/16 pix diff):\n", i, diff); + fprintf(stderr, " input block (logical rows; stored column-major):"); + for (int r = 0; r < 4; r++) { + fprintf(stderr, "\n r%d ", r); + for (int c = 0; c < 4; c++) fprintf(stderr, "%6d ", block_saved[c*4 + r]); + } + fprintf(stderr, "\n initial dst:"); + for (int r = 0; r < 4; r++) { + fprintf(stderr, "\n r%d ", r); + for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_initial[r*DST_STRIDE + c]); + } + fprintf(stderr, "\n"); + fprintf(stderr, " ref:"); + for (int r = 0; r < 4; r++) { + fprintf(stderr, "\n r%d ", r); + for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_a[r*DST_STRIDE+c]); + } + fprintf(stderr, "\n neon:"); + for (int r = 0; r < 4; r++) { + fprintf(stderr, "\n r%d ", r); + for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_b[r*DST_STRIDE+c]); + } + fprintf(stderr, "\n"); + prints++; + } + mismatches++; + } + } + + printf("M1₆ correctness: %d / %d blocks bit-exact (%.4f%%)\n", + n - mismatches, n, 100.0 * (n - mismatches) / n); + return mismatches; +} + +/* M3: time the NEON kernel over batches of 
`n_blocks` for about `duration_s` + * seconds, subtract the re-measured memcpy reset cost, and print the results. */ +static void throughput_neon(uint64_t seed, int n_blocks, double 
duration_s) +{ + xs_state = seed ? seed : 0xc0de264cULL; + int16_t *master_blocks = malloc((size_t) n_blocks * 16 * sizeof(int16_t)); + int16_t *work_blocks = malloc((size_t) n_blocks * 16 * sizeof(int16_t)); + uint8_t *master_dst = malloc((size_t) n_blocks * 16); + uint8_t *work_dst = malloc((size_t) n_blocks * 16); + if (!master_blocks || !work_blocks || !master_dst || !work_dst) { + fprintf(stderr, "alloc fail\n"); exit(1); + } + for (int i = 0; i < n_blocks; i++) { + gen_block(master_blocks + i * 16); + for (int j = 0; j < 16; j++) master_dst[i * 16 + j] = (uint8_t)(xs() & 0xff); + } + + /* Warm-up. Each block owns a packed 16-byte 4x4 dst tile, hence stride 4. */ + memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t)); + memcpy(work_dst, master_dst, (size_t) n_blocks * 16); + for (int i = 0; i < n_blocks; i++) + ff_h264_idct_add_neon(work_dst + i * 16, work_blocks + i * 16, 4); + + double t0 = now_seconds(); + double t_end = t0 + duration_s; + uint64_t done = 0; + while (now_seconds() < t_end) { + memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t)); + memcpy(work_dst, master_dst, (size_t) n_blocks * 16); + for (int i = 0; i < n_blocks; i++) + ff_h264_idct_add_neon(work_dst + i * 16, work_blocks + i * 16, 4); + done += n_blocks; + } + double elapsed = now_seconds() - t0; + + /* Subtract setup cost. 
*/ + int iters = (int)(done / n_blocks); + double s0 = now_seconds(); + for (int i = 0; i < iters; i++) { + memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t)); + memcpy(work_dst, master_dst, (size_t) n_blocks * 16); + } + double s1 = now_seconds(); + + double kernel_seconds = elapsed - (s1 - s0); + double mbps = done / kernel_seconds / 1e6; + + printf("M3₆ NEON throughput:\n"); + printf(" blocks/batch: %d\n", n_blocks); + printf(" batches done: %d\n", iters); + printf(" total blocks: %llu\n", (unsigned long long) done); + printf(" elapsed (kernel)=%.6f s\n", kernel_seconds); + printf(" throughput = %.3f Mblock/s\n", mbps); + printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9); + /* H.264 1080p 4×4 floor: ~5.85 Mblock/s worst-case, ~2 realistic. */ + printf(" H.264 1080p30 worst-case floor: %.2fx margin (5.85 Mblock/s req'd)\n", mbps / 5.85); + printf(" H.264 1080p30 realistic floor: %.2fx margin (2.0 Mblock/s req'd)\n", mbps / 2.0); + + free(master_blocks); free(work_blocks); free(master_dst); free(work_dst); +} + +int main(int argc, char **argv) +{ + int n_blocks = 65536; + double duration = 5.0; + uint64_t seed = 0; + int do_correctness = 1; + + static struct option opts[] = { + {"blocks", required_argument, 0, 'b'}, + {"duration", required_argument, 0, 'd'}, + {"seed", required_argument, 0, 's'}, + {"no-correctness", no_argument, 0, 'C'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) { + switch (c) { + case 'b': n_blocks = atoi(optarg); break; + case 'd': duration = atof(optarg); break; + case 's': seed = strtoull(optarg, 0, 0); break; + case 'C': do_correctness = 0; break; + default: return 2; + } + } + /* Reject sizes that would divide by zero or overflow the int `i * 16` + * indexing in throughput_neon(), and non-positive or NaN durations. 
*/ + if (n_blocks <= 0 || n_blocks > INT32_MAX / 16 || !(duration > 0.0)) { + fprintf(stderr, "invalid --blocks (1..%d) or --duration (> 0 s)\n", INT32_MAX / 16); + return 2; + } + + if (do_correctness) { + printf("=== M1₆ bit-exact (10000 random 4x4 blocks) ===\n"); + int mis = correctness_check(seed, 10000); + if (mis != 0) { + fprintf(stderr, "M1 gate FAILED — refusing to measure throughput.\n"); + return 1; + } + printf("\n"); + } + + printf("=== M3₆ NEON 
throughput ===\n"); + throughput_neon(seed, n_blocks, duration); + return 0; +} diff --git a/tests/h264_idct4_ref.c b/tests/h264_idct4_ref.c new file mode 100644 index 0000000..7050583 --- /dev/null +++ b/tests/h264_idct4_ref.c @@ -0,0 +1,81 @@ +/* + * Standalone bit-exact C reference for H.264 4x4 inverse integer + * transform + add. Algorithm per H.264 spec §8.5.12.1 (4x4 IT for + * blocks coded with TransformBypassFlag = 0). + * + * Mirrors FFmpeg `ff_h264_idct_add_neon` in + * external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S + * (n7.1.3 pin). Destructively zeroes `block` to match upstream + * convention (post-call block must be zero for the H.264 conformance + * residual loop). + * + * Signature mirrors the NEON convention: + * void(uint8_t *dst, int16_t *block, ptrdiff_t stride); + * + * License: LGPL-2.1-or-later (matches FFmpeg upstream the algorithm + * was transcribed from). Spec is H.264 ITU-T Rec H.264 / ISO/IEC + * 14496-10. + */ +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; } + +/* 1D butterfly per H.264 spec §8.5.12.1. + * d[0..3] are input, e/f/g/h are intermediate, h_c[0..3] are output. `>> 1` on a negative int must be arithmetic to match NEON sshr. */ +static inline void h264_idct4_butterfly(const int d[4], int h_c[4]) +{ + int e = d[0] + d[2]; + int f = d[0] - d[2]; + int g = (d[1] >> 1) - d[3]; + int h = d[1] + (d[3] >> 1); + h_c[0] = e + h; + h_c[1] = f + g; + h_c[2] = f - g; + h_c[3] = e - h; +} + +void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride) +{ + /* H.264/FFmpeg block layout is COLUMN-MAJOR: + * block[c*4 + r] = coefficient at row r, column c. + * NEON ld1 {v0.4h-v3.4h} is a plain sequential load (no de-interleave; + * that would be ld4): v_c = block[4c..4c+3], so with column-major + * source v_c[r] = block at (row=r, col=c). The first lane-wise butterfly + * (v0+v2 etc.) then combines column 0 and column 2 within each row → row pass. + * JM and FFmpeg C reference both do row-first then column-pass. 
+ * + * dst is row-major (dst[r*stride + c]). + */ + int tmp[4][4]; + + /* Row pass FIRST. Read block as column-major (block[c*4 + r]). */ + for (int r = 0; r < 4; r++) { + int d[4] = { block[0*4 + r], block[1*4 + r], + block[2*4 + r], block[3*4 + r] }; + int h_c[4]; + h264_idct4_butterfly(d, h_c); + for (int c = 0; c < 4; c++) tmp[r][c] = h_c[c]; + } + + /* Column pass NEXT (on row-major tmp). */ + int col_out[4][4]; + for (int c = 0; c < 4; c++) { + int d[4] = { tmp[0][c], tmp[1][c], tmp[2][c], tmp[3][c] }; + int h_c[4]; + h264_idct4_butterfly(d, h_c); + for (int r = 0; r < 4; r++) col_out[r][c] = h_c[r]; + } + + /* Round (+32) >> 6, add to dst, clip to u8. */ + for (int r = 0; r < 4; r++) { + for (int c = 0; c < 4; c++) { + int rounded = (col_out[r][c] + 32) >> 6; + dst[r * stride + c] = (uint8_t) clip_u8(dst[r * stride + c] + rounded); + } + } + + /* FFmpeg convention: zero the block after the transform. */ + memset(block, 0, 16 * sizeof(int16_t)); +}