From 9d5451e0fe6866bfed7691f118b776804191fe86 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sun, 24 May 2026 23:28:56 +0200 Subject: [PATCH 1/2] =?UTF-8?q?h264:=20deblock=5Fluma=5Fh=20=E2=80=94=20CP?= =?UTF-8?q?U/NEON=20via=20vendored=20ff=5Fh264=5Fh=5Floop=5Ffilter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the horizontal-edge sibling of cycle 8's deblock_luma_v. The vendored FFmpeg snapshot already includes ff_h264_h_loop_filter_luma_neon in libavcodec/aarch64/h264dsp_neon.S — this PR wires up the symbol, the bit-exact reference, and the recipe-table entry so daedalus-decoder and other consumers can call the H variant through the same dispatch shape they use for _v. Scope: - Public API: daedalus_dispatch_h264_deblock_luma_h(ctx, sub, ...) + daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, ...) wrapper. - Internal: dispatch_h264_deblock_h_cpu() calls the NEON entry. - Recipe table: new DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10, mapped to DAEDALUS_SUBSTRATE_CPU until a QPU shader is written. An explicit SUBSTRATE_QPU request on the H dispatch returns -1 (fails fast, no silent CPU degradation). - C reference: tests/h264_h_loop_filter_luma_ref.c — the column-axis transpose of h264_deblock_ref.c. Same per-segment kernel; pix[-4..+3] accesses cols instead of rows*stride. - Test: test_api_h264 grows a test_deblock_h() with 8 tiles (8 cols x 16 rows each, edge at col 4), random alpha/beta/tc0; compares NEON dispatch against reference byte-for-byte. 
Verified on hertz (Pi 5 / V3D 7.1): $ ./build/test_api_h264 === Phase 8a API smoke: H.264 kernels via recipe dispatch === H264_IDCT4 recipe substrate: 2 (1=CPU, 2=QPU) H264_IDCT8 recipe substrate: 2 H264_DEBLOCK_LV recipe substrate: 2 H264_QPEL_MC20 recipe substrate: 2 H264_DEBLOCK_LH recipe substrate: 1 (CPU, no QPU H shader yet) H.264 IDCT 4x4: 2048/2048 bytes bit-exact (100.0000%) H.264 IDCT 8x8: 2048/2048 bytes bit-exact (100.0000%) H.264 deblock luma v: 2048/2048 bytes bit-exact (100.0000%) H.264 deblock luma h: 1024/1024 bytes bit-exact (100.0000%) H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%) All 5 kernels bit-exact PASS. The new H variant joins the suite with 128 random-input bytes per tile x 8 tiles (1024 bytes in total). Why CPU-only for now: the daedalus-decoder downstream needs the H edge dispatched somewhere — even at CPU NEON cost (~6 ns/edge per the cycle 8 M3 baseline) a frame's worth at 1080p is ~ 8160 MBs * 4 edges = 32 640 edges = ~200 us — well inside the 30 fps budget. Writing the V3D H-edge shader is a follow-up (would be cycle 8' or similar; the V-edge shader's transpose isn't mechanical because of how the workgroup organisation maps to columns vs rows). Backlog addition (out of scope for this PR): - V3D shader for the H variant (mirror of v3d_h264deblock.spv). - bS=4 intra-strength filter (different algebra; both _v and _h). - Chroma deblock counterparts of luma_v/_h (8-cell variants). 
--- .claude/scheduled_tasks.lock | 1 + CMakeLists.txt | 1 + include/daedalus.h | 24 ++++++ src/daedalus_core.c | 49 ++++++++++++ tests/h264_h_loop_filter_luma_ref.c | 116 ++++++++++++++++++++++++++++ tests/test_api_h264.c | 50 ++++++++++++ 6 files changed, 241 insertions(+) create mode 100644 .claude/scheduled_tasks.lock create mode 100644 tests/h264_h_loop_filter_luma_ref.c diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock new file mode 100644 index 0000000..08f5855 --- /dev/null +++ b/.claude/scheduled_tasks.lock @@ -0,0 +1 @@ +{"sessionId":"f7ed922b-c4ce-4e57-9fe1-019511e83999","pid":896,"procStart":"36288444","acquiredAt":1779566389820} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index c7747b9..f5c7f4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -519,6 +519,7 @@ add_executable(test_api_h264 tests/h264_idct4_ref.c tests/h264_idct8_ref.c tests/h264_deblock_ref.c + tests/h264_h_loop_filter_luma_ref.c tests/h264_qpel8_mc20_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) diff --git a/include/daedalus.h b/include/daedalus.h index ee434f9..1cb8f1f 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -263,6 +263,29 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta); +/* H.264 luma "h_loop_filter" — sibling of _v, applies filter + * HORIZONTALLY across a VERTICAL edge (16 rows tall; pix points to + * row 0 of the right block, col 0 = leftmost output column). Same + * non-intra (bS < 4) variant. + * + * Each tile is 8 cols x 16 rows of context (cols -4..+3 around the + * edge). dst_off points to row 0 col 0 of the RIGHT block. + * + * Constraint: (dst_off % dst_stride) >= 4 (the kernel reads p3 at + * pix[-4]). Caller must ensure this. + * + * QPU shader for the H variant is not yet implemented; recipe table + * routes AUTO to CPU NEON. 
An explicit DAEDALUS_SUBSTRATE_QPU on + * the _h dispatch returns -1 rather than silently degrading. + */ +int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + /* ------------------------------------------------------------------- * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see @@ -309,6 +332,7 @@ typedef enum { DAEDALUS_KERNEL_H264_IDCT8 = 7, DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8, DAEDALUS_KERNEL_H264_QPEL_MC20 = 9, + DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index a7a184e..34ac6d0 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -130,6 +130,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct4.spv */ case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */ case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */ + case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */ case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */ } return DAEDALUS_SUBSTRATE_CPU; @@ -155,6 +156,8 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride); extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, 
ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); @@ -266,6 +269,21 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx, return 0; } +static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_edges; i++) { + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], + meta[i].tc0[2], meta[i].tc0[3] }; + ff_h264_h_loop_filter_luma_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta, tc0_local); + } + return 0; +} + static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) @@ -1165,6 +1183,29 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta); } +int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH); + /* No QPU shader for the H variant yet — always falls through to + * CPU. Mirror the _v shape anyway so the substrate switch is + * uniform; QPU just isn't a real option here yet. */ + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; + if (eff == DAEDALUS_SUBSTRATE_QPU) { + /* QPU shader for H deblock isn't implemented yet; recipe + * table returns CPU, so AUTO never lands here. An explicit + * QPU request fails fast rather than silently degrading to + * CPU — matches the principle from the IDCT QPU substrate + * (explicit means explicit). 
*/ + return -1; + } + return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta); +} + int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) @@ -1252,6 +1293,14 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, dst, dst_stride, n_edges, meta); } +int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + return daedalus_dispatch_h264_deblock_luma_h(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, n_edges, meta); +} + int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) diff --git a/tests/h264_h_loop_filter_luma_ref.c b/tests/h264_h_loop_filter_luma_ref.c new file mode 100644 index 0000000..2dcd756 --- /dev/null +++ b/tests/h264_h_loop_filter_luma_ref.c @@ -0,0 +1,116 @@ +/* + * Standalone bit-exact C reference for H.264 luma "horizontal" + * loop filter (h_loop_filter_luma): applies filter HORIZONTALLY + * across a VERTICAL edge. The edge spans the 16-row macroblock + * height, between columns -1 and 0. + * + * Mirrors FFmpeg `ff_h264_h_loop_filter_luma_neon` in + * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S + * line 134. Operates on an 8-col × 16-row region: + * pix[r*stride + c] for r in 0..15, c in -4..+3 + * With pix pointing to row 0, col 0 of the right block (= the + * leftmost column of the bottom-/right-block half of the edge). + * + * 16 rows divided into 4 segments of 4 rows; each segment has its + * own tc0 strength (tc0[0..3]). + * + * Note: FFmpeg's "h_loop_filter" naming uses the FILTER DIRECTION + * (horizontal = across the edge from the left), not the edge + * orientation (vertical). H.264 spec calls this the "vertical + * edge" filter. 
/*
 * Bit-exact C reference for the H.264 luma h_loop_filter (bS < 4):
 * filters horizontally across a vertical edge that is 16 rows tall.
 * For each row, p3..p0 are pix[-4..-1] and q0..q3 are pix[0..+3];
 * only p1, p0, q0 and q1 can be modified.
 */
#include <stddef.h>
#include <stdint.h>

/* Clamp v to the 8-bit pixel range. */
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* Clamp v to [lo, hi]. */
static inline int clip3(int v, int lo, int hi) {
    return v < lo ? lo : v > hi ? hi : v;
}

/* Absolute value of x. */
static inline int abs_i(int x) { return x < 0 ? -x : x; }

/*
 * Filter one row at the vertical edge.
 *
 * pix    points at q0 (first pixel right of the edge).
 * alpha  threshold on |p0 - q0|.
 * beta   threshold on |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|.
 * tc0_s  strength of the segment this row belongs to; must be >= 0.
 *
 * Writes pix[-2], pix[-1], pix[0], pix[+1] (p1, p0, q0, q1) when the
 * edge conditions hold; otherwise leaves the row untouched.
 */
static void h264_deblock_luma_row(uint8_t *pix,
                                  int alpha, int beta, int tc0_s)
{
    const int p2 = pix[-3], p1 = pix[-2], p0 = pix[-1];
    const int q0 = pix[ 0], q1 = pix[ 1], q2 = pix[ 2];

    /* Edge pre-conditions: all three must hold or the row is skipped. */
    if (abs_i(p0 - q0) >= alpha) return;
    if (abs_i(p1 - p0) >= beta)  return;
    if (abs_i(q1 - q0) >= beta)  return;

    /* Side conditions: each flat side raises tc by one and enables
     * the p1 / q1 update for that side. */
    const int ap_lt_beta = abs_i(p2 - p0) < beta;
    const int aq_lt_beta = abs_i(q2 - q0) < beta;
    const int tc = tc0_s + ap_lt_beta + aq_lt_beta;

    /* p0 / q0 update. */
    const int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
    const int p0p = clip_u8(p0 + delta);
    const int q0p = clip_u8(q0 - delta);

    /* Rounded average of the two edge pixels, shared by both updates. */
    const int avg = (p0 + q0 + 1) >> 1;

    /* p1 update (only if ap < beta). The shift is applied before
     * subtracting p1 so that no negative value is right-shifted. */
    int p1p = p1;
    if (ap_lt_beta)
        p1p = p1 + clip3(((p2 + avg) >> 1) - p1, -tc0_s, tc0_s);

    /* q1 update (only if aq < beta). */
    int q1p = q1;
    if (aq_lt_beta)
        q1p = q1 + clip3(((q2 + avg) >> 1) - q1, -tc0_s, tc0_s);

    pix[-2] = (uint8_t) p1p;
    pix[-1] = (uint8_t) p0p;
    pix[ 0] = (uint8_t) q0p;
    pix[ 1] = (uint8_t) q1p;
}

/*
 * Reference entry point.
 *
 * pix     points at row 0, column 0 of the block right of the edge.
 * stride  distance in bytes between consecutive rows.
 * alpha, beta  filter thresholds; if either is 0 nothing is filtered.
 * tc0     four per-segment strengths (4 rows each); a negative value
 *         skips that segment, and all four negative skips the edge.
 */
void daedalus_h264_h_loop_filter_luma_ref(
        uint8_t *pix, ptrdiff_t stride,
        int alpha, int beta, int8_t tc0[4])
{
    if (alpha == 0 || beta == 0) return;
    if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0) return;

    /* 16 rows = 4 segments of 4 rows, each with its own tc0. */
    for (int s = 0; s < 4; s++) {
        const int tc0_s = tc0[s];
        if (tc0_s < 0) continue;
        for (int r = 0; r < 4; r++) {
            const int row = s * 4 + r;
            h264_deblock_luma_row(pix + row * stride, alpha, beta, tc0_s);
        }
    }
}
Per-tile layout + * is now 8 cols x 16 rows (one vertical edge between cols 3 and 4 + * of the tile); EDGE_COL = 4 puts dst_off at the leftmost output + * column of the right block so the kernel's pix[-4..+3] read sits + * inside the tile. */ + enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16, + TILE_BYTES = TILE_STRIDE * TILE_ROWS, + TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t dst[TOTAL], dst_ref[TOTAL]; + daedalus_h264_deblock_meta meta[N_EDGES]; + + for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); + for (int i = 0; i < N_EDGES; i++) { + meta[i].dst_off = i * TILE_BYTES + EDGE_COL; + meta[i].alpha = (int)(xs() % 64) + 1; + meta[i].beta = (int)(xs() % 16) + 1; + for (int s = 0; s < 4; s++) { + int r = (int)(xs() % 8); + meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); + } + } + + for (int i = 0; i < N_EDGES; i++) { + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; + daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, + meta[i].alpha, meta[i].beta, tc0_local); + } + + int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE, + N_EDGES, meta); + if (rc) { fprintf(stderr, "deblock_h dispatch rc=%d\n", rc); return 1; } + int diff = 0; + for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n", + TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + static int test_qpel_mc20(void) { /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. 
Each tile @@ -197,10 +243,14 @@ int main(void) printf(" H264_QPEL_MC20 recipe substrate: %d\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20)); + printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH)); + int fail = 0; fail |= test_idct4(); fail |= test_idct8(); fail |= test_deblock(); + fail |= test_deblock_h(); fail |= test_qpel_mc20(); return fail; } -- 2.47.3 From 818e71560ec7a5ed0578e3fed742a2d6cf2e58a0 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sun, 24 May 2026 23:29:06 +0200 Subject: [PATCH 2/2] gitignore: exclude .claude/ runtime files The previous commit unintentionally added .claude/scheduled_tasks.lock which is an agent-runtime artefact, not source. Untrack it and add .claude/ to .gitignore so it stays out of future commits. --- .claude/scheduled_tasks.lock | 1 - .gitignore | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 .claude/scheduled_tasks.lock diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock deleted file mode 100644 index 08f5855..0000000 --- a/.claude/scheduled_tasks.lock +++ /dev/null @@ -1 +0,0 @@ -{"sessionId":"f7ed922b-c4ce-4e57-9fe1-019511e83999","pid":896,"procStart":"36288444","acquiredAt":1779566389820} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 7a6eee8..f1ca54c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ build-*/ # Forensic snapshot of the corrupted .git from 2026-05-18 10:25 # working-tree wipe. Retained on disk for inspection; not tracked. .git-broken-2026-05-18/ +.claude/ -- 2.47.3