h264: deblock_luma_h — CPU/NEON via vendored ff_h264_h_loop_filter

Adds the horizontal-edge sibling of cycle 8's deblock_luma_v.  The
vendored FFmpeg snapshot already includes ff_h264_h_loop_filter_luma_neon
in libavcodec/aarch64/h264dsp_neon.S — this PR wires up the symbol,
the bit-exact reference, and the recipe-table entry so daedalus-decoder
and other consumers can call the H variant through the same dispatch
shape they use for _v.

Scope:
  - Public API: daedalus_dispatch_h264_deblock_luma_h(ctx, sub, ...)
    + daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, ...) wrapper.
  - Internal: dispatch_h264_deblock_h_cpu() calls the NEON entry.
  - Recipe table: new DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10, mapped
    to DAEDALUS_SUBSTRATE_CPU until a QPU shader is written.  An
    explicit SUBSTRATE_QPU request on the H dispatch returns -1
    (fails fast, no silent CPU degradation).
  - C reference: tests/h264_h_loop_filter_luma_ref.c — the
    column-axis transpose of h264_deblock_ref.c.  Same per-segment
    kernel; pix[-4..+3] accesses cols instead of rows*stride.
  - Test: test_api_h264 grows a test_deblock_h() with 8 tiles
    (8 cols x 16 rows each, edge at col 4), random alpha/beta/tc0;
    compares NEON dispatch against reference byte-for-byte.

Verified on hertz (Pi 5 / V3D 7.1):

  $ ./build/test_api_h264
  === Phase 8a API smoke: H.264 kernels via recipe dispatch ===
    H264_IDCT4 recipe substrate:      2 (1=CPU, 2=QPU)
    H264_IDCT8 recipe substrate:      2
    H264_DEBLOCK_LV recipe substrate: 2
    H264_QPEL_MC20 recipe substrate:  2
    H264_DEBLOCK_LH recipe substrate: 1 (CPU, no QPU H shader yet)
    H.264 IDCT 4x4: 2048/2048 bytes bit-exact (100.0000%)
    H.264 IDCT 8x8: 2048/2048 bytes bit-exact (100.0000%)
    H.264 deblock luma v: 2048/2048 bytes bit-exact (100.0000%)
    H.264 deblock luma h: 1024/1024 bytes bit-exact (100.0000%)
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)

  All 5 kernels bit-exact PASS.  The new H variant joins the suite
  with 1024 random-input bytes per tile x 8 tiles.

Why CPU-only for now: the daedalus-decoder downstream needs the H
edge dispatched somewhere — even at CPU NEON cost (~6 ns/edge per
the cycle 8 M3 baseline) a frame's worth at 1080p is
~ 8160 MBs * 4 edges = 32 640 edges = ~200 us — well inside the
30 fps budget.  Writing the V3D H-edge shader is a follow-up
(would be cycle 8' or similar; the V-edge shader's transpose isn't
mechanical because of how the workgroup organisation maps to columns
vs rows).

Backlog addition (out of scope for this PR):
  - V3D shader for the H variant (mirror of v3d_h264deblock.spv).
  - bS=4 intra-strength filter (different algebra; both _v and _h).
  - Chroma deblock luma_v/_h (8-cell variants).
This commit is contained in:
2026-05-24 23:28:56 +02:00
parent 0d54d68f38
commit 9d5451e0fe
6 changed files with 241 additions and 0 deletions
+50
View File
@@ -16,6 +16,8 @@
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -145,6 +147,50 @@ static int test_deblock(void)
return diff == 0 ? 0 : 1;
}
static int test_deblock_h(void)
{
/* Mirror of test_deblock but for the H variant. Per-tile layout
* is now 8 cols x 16 rows (one vertical edge between cols 3 and 4
* of the tile); EDGE_COL = 4 puts dst_off at the leftmost output
* column of the right block so the kernel's pix[-4..+3] read sits
* inside the tile. */
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };
daedalus_ctx *ctx = daedalus_ctx_create();
if (!ctx) return 1;
uint8_t dst[TOTAL], dst_ref[TOTAL];
daedalus_h264_deblock_meta meta[N_EDGES];
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
for (int i = 0; i < N_EDGES; i++) {
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
meta[i].alpha = (int)(xs() % 64) + 1;
meta[i].beta = (int)(xs() % 16) + 1;
for (int s = 0; s < 4; s++) {
int r = (int)(xs() % 8);
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
}
}
for (int i = 0; i < N_EDGES; i++) {
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
meta[i].alpha, meta[i].beta, tc0_local);
}
int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
N_EDGES, meta);
if (rc) { fprintf(stderr, "deblock_h dispatch rc=%d\n", rc); return 1; }
int diff = 0;
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
daedalus_ctx_destroy(ctx);
return diff == 0 ? 0 : 1;
}
static int test_qpel_mc20(void)
{
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -197,10 +243,14 @@ int main(void)
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
int fail = 0;
fail |= test_idct4();
fail |= test_idct8();
fail |= test_deblock();
fail |= test_deblock_h();
fail |= test_qpel_mc20();
return fail;
}