2026-05-24 21:48:01 +00:00
6 changed files with 241 additions and 0 deletions
@@ -0,0 +1 @@
+{"sessionId":"f7ed922b-c4ce-4e57-9fe1-019511e83999","pid":896,"procStart":"36288444","acquiredAt":1779566389820}
@@ -519,6 +519,7 @@ add_executable(test_api_h264
    tests/h264_idct4_ref.c
    tests/h264_idct8_ref.c
    tests/h264_deblock_ref.c
+    tests/h264_h_loop_filter_luma_ref.c
    tests/h264_qpel8_mc20_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
@@ -263,6 +263,29 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);

+/* H.264 luma "h_loop_filter" — sibling of _v; applies the filter
+ * HORIZONTALLY across a VERTICAL edge (16 rows tall).  Same
+ * non-intra (bS < 4) variant.
+ *
+ * Each tile is 8 cols x 16 rows of context (cols -4..+3 around the
+ * edge).  dst_off points to row 0, col 0 of the RIGHT block, i.e.
+ * the first column to the right of the edge (q0).
+ *
+ * Constraint: (dst_off % dst_stride) >= 4 (the kernel reads p3 at
+ * pix[-4]).  Caller must ensure this.
+ *
+ * QPU shader for the H variant is not yet implemented; recipe table
+ * routes AUTO to CPU NEON.  An explicit DAEDALUS_SUBSTRATE_QPU on
+ * the _h dispatch returns -1 rather than silently degrading.
+ */
+int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
+int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
 /* -------------------------------------------------------------------
 * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
@@ -309,6 +332,7 @@ typedef enum {
    DAEDALUS_KERNEL_H264_IDCT8      = 7,
    DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
    DAEDALUS_KERNEL_H264_QPEL_MC20  = 9,
+    DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
 } daedalus_kernel;

 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
@@ -130,6 +130,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct4.spv */
    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264deblock.spv */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:  return DAEDALUS_SUBSTRATE_CPU;	/* QPU H shader pending */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    }
    return DAEDALUS_SUBSTRATE_CPU;
@@ -155,6 +156,8 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride
 extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
+extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
+                                              int alpha, int beta, int8_t *tc0);
 extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);

@@ -266,6 +269,21 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
    return 0;
 }

+/*
+ * CPU path for the H.264 luma "h" deblock: runs the NEON filter once per
+ * edge described in meta[0..n_edges).
+ *
+ *   ctx         unused on this path.
+ *   dst         base pointer; each edge is filtered at dst + meta[i].dst_off.
+ *   dst_stride  row pitch in bytes, forwarded to the kernel as ptrdiff_t.
+ *   n_edges     number of entries in meta.
+ *   meta        per-edge offset, alpha, beta and the four tc0 values.
+ *
+ * Always returns 0.
+ */
+static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
+    const daedalus_h264_deblock_meta *edge = meta;
+
+    (void) ctx;
+    for (size_t remaining = n_edges; remaining > 0; remaining--, edge++) {
+        /* The kernel takes a non-const int8_t *, while meta is const:
+         * hand it a private copy of this edge's tc0 values. */
+        int8_t tc0_copy[4];
+        for (int s = 0; s < 4; s++)
+            tc0_copy[s] = edge->tc0[s];
+
+        ff_h264_h_loop_filter_luma_neon(dst + edge->dst_off, pitch,
+                                         edge->alpha, edge->beta, tc0_copy);
+    }
+    return 0;
+}
+
 static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1165,6 +1183,29 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
 }

+/*
+ * Dispatch the H.264 luma "h" deblock (vertical edge, bS < 4) on the
+ * requested substrate.
+ *
+ *   sub   DAEDALUS_SUBSTRATE_AUTO defers to the recipe table entry for
+ *         DAEDALUS_KERNEL_H264_DEBLOCK_LH; any other value is taken as
+ *         an explicit request.
+ *   dst, dst_stride, n_edges, meta
+ *         forwarded unchanged to the CPU path.
+ *
+ * Returns the CPU path's result, or -1 when the effective substrate is
+ * QPU.  There is no QPU shader for the H variant yet, so a QPU request
+ * fails the same way whether or not the context has a QPU: an explicit
+ * request must never silently degrade to CPU depending on the hardware.
+ */
+int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH);
+
+    /* QPU is not a real option here yet: fail fast (explicit means
+     * explicit) instead of falling back to CPU behind the caller's back. */
+    if (eff == DAEDALUS_SUBSTRATE_QPU)
+        return -1;
+
+    return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
+}
+
 int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1252,6 +1293,14 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
                                                  dst, dst_stride, n_edges, meta);
 }

+/*
+ * Recipe-routed entry point for the H.264 luma "h" deblock: identical to
+ * daedalus_dispatch_h264_deblock_luma_h() called with
+ * DAEDALUS_SUBSTRATE_AUTO.  Returns whatever that call returns.
+ */
+int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;
+
+    return daedalus_dispatch_h264_deblock_luma_h(ctx, pick, dst, dst_stride,
+                                                  n_edges, meta);
+}
+
 int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -0,0 +1,116 @@
+/*
+ * Standalone bit-exact C reference for H.264 luma "horizontal"
+ * loop filter (h_loop_filter_luma): applies filter HORIZONTALLY
+ * across a VERTICAL edge. The edge spans the 16-row macroblock
+ * height, between columns -1 and 0.
+ *
+ * Mirrors FFmpeg `ff_h264_h_loop_filter_luma_neon` in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S
+ * line 134. Operates on an 8-col × 16-row region:
+ *   pix[r*stride + c] for r in 0..15, c in -4..+3
+ * with pix pointing to row 0, col 0 of the right block (= the
+ * first column to the right of the edge, i.e. q0).
+ *
+ * 16 rows divided into 4 segments of 4 rows; each segment has its
+ * own tc0 strength (tc0[0..3]).
+ *
+ * Note: FFmpeg's "h_loop_filter" naming uses the FILTER DIRECTION
+ * (horizontal = across the edge from the left), not the edge
+ * orientation (vertical). H.264 spec calls this the "vertical
+ * edge" filter.
+ *
+ * This is the column-axis transpose of h264_v_loop_filter_luma_ref:
+ *   - v variant: p3..p0 above the edge (pix[-4*stride..-1*stride]),
+ *     q0..q3 below (pix[0..+3*stride]).  16 columns × 4 segments.
+ *   - h variant: p3..p0 left of the edge (pix[-4..-1]),
+ *     q0..q3 right (pix[0..+3]).            16 rows × 4 segments.
+ * Same per-segment kernel; only the address arithmetic transposes.
+ *
+ * Signature:
+ *   void(uint8_t *pix, ptrdiff_t stride,
+ *        int alpha, int beta, int8_t tc0[4]);
+ *
+ * License: LGPL-2.1-or-later (matches FFmpeg upstream).
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Clamp v to the 8-bit pixel range [0, 255]. */
+static inline int clip_u8(int v)
+{
+    if (v < 0)
+        return 0;
+    if (v > 255)
+        return 255;
+    return v;
+}
+
+/* Clamp v to [lo, hi]; the lower bound is tested first. */
+static inline int clip3(int v, int lo, int hi)
+{
+    if (v < lo)
+        return lo;
+    if (v > hi)
+        return hi;
+    return v;
+}
+
+/* Absolute value of x. */
+static inline int abs_i(int x)
+{
+    if (x < 0)
+        return -x;
+    return x;
+}
+
+/* Filter a single row across the vertical edge (bS < 4 path).
+ *
+ * pix points at q0, the first pixel right of the edge:
+ *   p2, p1, p0 = pix[-3], pix[-2], pix[-1]   (left of the edge)
+ *   q0, q1, q2 = pix[ 0], pix[ 1], pix[ 2]   (right of the edge)
+ * p3 / q3 (pix[-4] / pix[3]) take no part in this path.
+ *
+ * tc0_s is the tc0 value of the segment this row belongs to; the
+ * caller only passes values >= 0.
+ *
+ * The row is left untouched unless |p0-q0| < alpha, |p1-p0| < beta and
+ * |q1-q0| < beta.  Otherwise pix[-2], pix[-1], pix[0], pix[1]
+ * (= p1, p0, q0, q1) are written back; p1 / q1 keep their value when
+ * their side condition (|p2-p0| < beta / |q2-q0| < beta) fails.
+ */
+static void h264_deblock_luma_row(uint8_t *pix,
+                                   int alpha, int beta, int tc0_s)
+{
+    const int p2 = pix[-3], p1 = pix[-2], p0 = pix[-1];
+    const int q0 = pix[ 0], q1 = pix[ 1], q2 = pix[ 2];
+
+    /* Edge pre-conditions: any failure leaves the row as it is. */
+    if (abs_i(p0 - q0) >= alpha ||
+        abs_i(p1 - p0) >= beta  ||
+        abs_i(q1 - q0) >= beta)
+        return;
+
+    /* Side conditions; each one that holds widens the p0/q0 clip by 1. */
+    const int p_side = abs_i(p2 - p0) < beta;
+    const int q_side = abs_i(q2 - q0) < beta;
+    const int tc = tc0_s + p_side + q_side;
+
+    /* Rounded mean of the two edge pixels, shared by the p1 and q1 terms. */
+    const int edge_avg = (p0 + q0 + 1) >> 1;
+
+    const int d0 = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
+
+    int new_p1 = p1;
+    int new_q1 = q1;
+    if (p_side)
+        new_p1 += clip3((p2 + edge_avg - 2*p1) >> 1, -tc0_s, tc0_s);
+    if (q_side)
+        new_q1 += clip3((q2 + edge_avg - 2*q1) >> 1, -tc0_s, tc0_s);
+
+    pix[-2] = (uint8_t) new_p1;
+    pix[-1] = (uint8_t) clip_u8(p0 + d0);
+    pix[ 0] = (uint8_t) clip_u8(q0 - d0);
+    pix[ 1] = (uint8_t) new_q1;
+}
+
+/*
+ * Reference H.264 luma loop filter across a vertical edge (bS < 4).
+ *
+ *   pix     row 0, col 0 of the right block; each row is read/written
+ *           around pix + row * stride.
+ *   stride  distance in bytes between consecutive rows.
+ *   alpha, beta  edge thresholds; a zero in either disables filtering.
+ *   tc0     one strength per 4-row segment; a negative entry skips
+ *           that segment.
+ */
+void daedalus_h264_h_loop_filter_luma_ref(
+    uint8_t *pix, ptrdiff_t stride,
+    int alpha, int beta, int8_t tc0[4])
+{
+    /* Outer precondition (h264_loop_filter_start macro check):
+     * nothing to do when alpha or beta is zero ... */
+    if (alpha == 0 || beta == 0)
+        return;
+
+    /* ... or when every segment is disabled. */
+    int any_active = 0;
+    for (int k = 0; k < 4; k++) {
+        if (tc0[k] >= 0)
+            any_active = 1;
+    }
+    if (!any_active)
+        return;
+
+    /* 16 rows = 4 segments of 4 rows, each with its own tc0. */
+    for (int seg = 0; seg < 4; seg++) {
+        const int strength = tc0[seg];
+        if (strength >= 0) {
+            const int first_row = seg * 4;
+            for (int row = first_row; row < first_row + 4; row++)
+                h264_deblock_luma_row(pix + row * stride, alpha, beta, strength);
+        }
+    }
+}
@@ -16,6 +16,8 @@

 extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
+extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
+                                                   int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -145,6 +147,50 @@ static int test_deblock(void)
    return diff == 0 ? 0 : 1;
 }

+/*
+ * Bit-exactness check for the H.264 luma "h" deblock dispatch.
+ *
+ * Fills N_EDGES independent tiles with pseudo-random pixels and random
+ * alpha / beta / tc0, filters one copy with the C reference and the
+ * other through daedalus_recipe_dispatch_h264_deblock_luma_h(), then
+ * compares every byte.
+ *
+ * Returns 0 when all bytes match; 1 on any mismatch, on context
+ * creation failure, or when the dispatch reports an error.
+ */
+static int test_deblock_h(void)
+{
+    /* Mirror of test_deblock but for the H variant.  Per-tile layout
+     * is 8 cols x 16 rows (one vertical edge between cols 3 and 4
+     * of the tile); EDGE_COL = 4 puts dst_off at the first column of
+     * the right block so the kernel's pix[-4..+3] access sits inside
+     * the tile. */
+    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
+           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
+           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_deblock_meta meta[N_EDGES];
+
+    /* Identical random pixels in both buffers. */
+    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N_EDGES; i++) {
+        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
+        meta[i].alpha = (int)(xs() % 64) + 1;
+        meta[i].beta  = (int)(xs() % 16) + 1;
+        for (int s = 0; s < 4; s++) {
+            /* One draw in eight yields -1 (segment skipped), the
+             * others yield a strength in 0..6. */
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    /* Expected output: C reference applied edge by edge. */
+    for (int i = 0; i < N_EDGES; i++) {
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
+        daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
+                                              meta[i].alpha, meta[i].beta, tc0_local);
+    }
+
+    int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
+                                                           N_EDGES, meta);
+    if (rc) {
+        fprintf(stderr, "deblock_h dispatch rc=%d\n", rc);
+        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
+        return 1;
+    }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
 static int test_qpel_mc20(void)
 {
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -197,10 +243,14 @@ int main(void)
    printf("  H264_QPEL_MC20 recipe substrate:  %d\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));

+    printf("  H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
+
    int fail = 0;
    fail |= test_idct4();
    fail |= test_idct8();
    fail |= test_deblock();
+    fail |= test_deblock_h();
    fail |= test_qpel_mc20();
    return fail;
 }
				`@@ -0,0 +1 @@`
				`{"sessionId":"f7ed922b-c4ce-4e57-9fe1-019511e83999","pid":896,"procStart":"36288444","acquiredAt":1779566389820}`