Phase 8a: H.264 kernels through public API

Extends include/daedalus.h with cycles 6, 7, 8 (H.264 IDCT 4x4, IDCT 8x8, luma deblock luma-v). All recipe-substrate = CPU (matches per-cycle Phase 7 verdicts). src/daedalus_core.c: NEON-path implementations + recipe routing. daedalus_core library now links the full FFmpeg H.264 NEON snapshot (h264idct + h264dsp) plus existing VP9 + dav1d. tests/test_api_h264.c: smoke test covering all 3 H.264 kernels via daedalus_recipe_dispatch_*. All pass 2048/2048 bit-exact. Public API coverage after this commit: - Cycles 1 IDCT 8x8 + 2 LPF4 + 4 LPF8: CPU+QPU+AUTO dispatch (test_api_idct, test_api_lpf, both pass) - Cycle 3 MC 8h: CPU only (QPU dispatch stub returns -1) - Cycle 5 CDEF: CPU only (QPU stub) - Cycle 6 H.264 IDCT 4x4: CPU only (recipe + only NEON wired) - Cycle 7 H.264 IDCT 8x8: CPU only - Cycle 8 H.264 deblock: CPU only (QPU opportunistic — not wired through API yet; bench_v3d_h264deblock exists for direct test) Next Phase 8 sub-step: wire opportunistic QPU dispatch for cycles 3+5+8 through the API (so override-mode users can request QPU). Then surface V4L2-wrapper architecture decisions to user. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:46:03 +00:00
parent 373f63a910
commit af8146a2cd
4 changed files with 354 additions and 10 deletions
@@ -347,6 +347,8 @@ add_library(daedalus_core STATIC
    ${FFASM_LPF_SOURCES}
    ${FFASM_MC_SOURCES}
    ${FFC_MC_SOURCES}
+    ${FFASM_H264IDCT_SOURCES}
+    ${FFASM_H264DSP_SOURCES}
    ${DAV1D_CDEF_ASM_SOURCES}
    ${DAV1D_CDEF_C_SOURCES}
 )
@@ -373,6 +375,15 @@ add_executable(test_api_lpf
 target_link_libraries(test_api_lpf PRIVATE daedalus_core)
 target_compile_options(test_api_lpf PRIVATE -O2)

+add_executable(test_api_h264
+    tests/test_api_h264.c
+    tests/h264_idct4_ref.c
+    tests/h264_idct8_ref.c
+    tests/h264_deblock_ref.c
+)
+target_link_libraries(test_api_h264 PRIVATE daedalus_core)
+target_compile_options(test_api_h264 PRIVATE -O2)
+
 if (DAEDALUS_BUILD_VULKAN)
 # (re-open the conditional so the closing endif() below balances)

@@ -195,15 +195,86 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
    const uint16_t *tmp,
    size_t n_blocks, const daedalus_cdef_meta *meta);

+/* -------------------------------------------------------------------
+ * H.264 IDCT 4x4 + add — cycle 6 (CPU by recipe; QPU unused)
+ *
+ * Per H.264 §8.5.12.1, integer 4x4 inverse transform. block is
+ * COLUMN-major: block[c*4 + r] = coefficient at (row r, col c).
+ * Block is destructively zeroed after the transform (FFmpeg
+ * convention).
+ *
+ * `coeffs` is an array of n_blocks * 16 int16. `dst_off` is byte
+ * offset into dst per block.
+ * ----------------------------------------------------------------- */
+typedef struct {
+    uint32_t dst_off;
+    uint32_t _pad0, _pad1, _pad2;
+} daedalus_h264_block_meta;
+
+int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs,           /* not const — destructively zeroed */
+    size_t n_blocks, const daedalus_h264_block_meta *meta);
+
+int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs,
+    size_t n_blocks, const daedalus_h264_block_meta *meta);
+
+/* H.264 IDCT 8x8 + add — cycle 7 (CPU by recipe).
+ * Per H.264 §8.5.13.2, integer 8x8 inverse transform.
+ * `coeffs` is an array of n_blocks * 64 int16, column-major per block.
+ */
+int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs,
+    size_t n_blocks, const daedalus_h264_block_meta *meta);
+
+int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs,
+    size_t n_blocks, const daedalus_h264_block_meta *meta);
+
+/* -------------------------------------------------------------------
+ * H.264 luma "v_loop_filter" — cycle 8 (CPU primary; QPU opportunistic)
+ *
+ * Filter applied VERTICALLY across a HORIZONTAL edge (16 columns
+ * wide; pix points to row 0 of the bottom block). Non-intra
+ * (bS < 4) variant.
+ *
+ * Each tile is 16 cols × 8 rows of context (rows -4..+3 around
+ * the edge). dst_off points to row 0 col 0 of the bottom block.
+ *
+ * Constraint: dst_off >= 4 * dst_stride (the kernel reads p3 at
+ * -4*stride). Caller must ensure this.
+ * ----------------------------------------------------------------- */
+typedef struct {
+    uint32_t dst_off;
+    int32_t  alpha;             /* 0..63 typical, table-derived */
+    int32_t  beta;              /* 0..63 typical */
+    int8_t   tc0[4];            /* per-segment filter strength; -1 means skip */
+} daedalus_h264_deblock_meta;
+
+int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
+int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
 /* -------------------------------------------------------------------
 * Recipe query — what does the API recommend for each kernel?
 * ----------------------------------------------------------------- */
 typedef enum {
-    DAEDALUS_KERNEL_VP9_IDCT8     = 1,
-    DAEDALUS_KERNEL_VP9_LPF4_INNER = 2,
-    DAEDALUS_KERNEL_VP9_MC_8H     = 3,
-    DAEDALUS_KERNEL_VP9_LPF8_INNER = 4,
-    DAEDALUS_KERNEL_AV1_CDEF_8X8  = 5,
+    DAEDALUS_KERNEL_VP9_IDCT8       = 1,
+    DAEDALUS_KERNEL_VP9_LPF4_INNER  = 2,
+    DAEDALUS_KERNEL_VP9_MC_8H       = 3,
+    DAEDALUS_KERNEL_VP9_LPF8_INNER  = 4,
+    DAEDALUS_KERNEL_AV1_CDEF_8X8    = 5,
+    DAEDALUS_KERNEL_H264_IDCT4      = 6,
+    DAEDALUS_KERNEL_H264_IDCT8      = 7,
+    DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
 } daedalus_kernel;

 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
@@ -76,11 +76,14 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
 {
    switch (k) {
-    case DAEDALUS_KERNEL_VP9_IDCT8:      return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_VP9_MC_8H:      return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_AV1_CDEF_8X8:   return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_VP9_IDCT8:        return DAEDALUS_SUBSTRATE_QPU;
+    case DAEDALUS_KERNEL_VP9_LPF4_INNER:   return DAEDALUS_SUBSTRATE_QPU;
+    case DAEDALUS_KERNEL_VP9_MC_8H:        return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_VP9_LPF8_INNER:   return DAEDALUS_SUBSTRATE_QPU;
+    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_CPU;
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -101,6 +104,10 @@ extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                          int pri_strength, int sec_strength,
                                          int dir, int damping, int h,
                                          size_t edges);
+extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
+extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
+extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
+                                              int alpha, int beta, int8_t *tc0);

 /* -------------------- CPU dispatch implementations -------------- */

@@ -168,6 +175,48 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx,
    return 0;
 }

+static int dispatch_h264_idct4_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs, size_t n_blocks,
+    const daedalus_h264_block_meta *meta)
+{
+    (void) ctx;
+    for (size_t i = 0; i < n_blocks; i++)
+        ff_h264_idct_add_neon(dst + meta[i].dst_off,
+                              coeffs + i * 16,
+                              (ptrdiff_t) dst_stride);
+    return 0;
+}
+
+static int dispatch_h264_idct8_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs, size_t n_blocks,
+    const daedalus_h264_block_meta *meta)
+{
+    (void) ctx;
+    for (size_t i = 0; i < n_blocks; i++)
+        ff_h264_idct8_add_neon(dst + meta[i].dst_off,
+                               coeffs + i * 64,
+                               (ptrdiff_t) dst_stride);
+    return 0;
+}
+
+static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    (void) ctx;
+    for (size_t i = 0; i < n_edges; i++) {
+        /* NEON expects mutable tc0 pointer; copy to a local. */
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
+                                 meta[i].tc0[2], meta[i].tc0[3] };
+        ff_h264_v_loop_filter_luma_neon(dst + meta[i].dst_off,
+                                         (ptrdiff_t) dst_stride,
+                                         meta[i].alpha, meta[i].beta, tc0_local);
+    }
+    return 0;
+}
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */

 typedef struct {
@@ -471,6 +520,32 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
                   dst, dst_stride, tmp, n_blocks, meta);
 }

+int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs, size_t n_blocks,
+    const daedalus_h264_block_meta *meta)
+{
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
+                   dst, dst_stride, coeffs, n_blocks, meta);
+}
+
+int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs, size_t n_blocks,
+    const daedalus_h264_block_meta *meta)
+{
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
+                   dst, dst_stride, coeffs, n_blocks, meta);
+}
+
+int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_DEBLOCK_LV, dispatch_h264_deblock_cpu,
+                   dst, dst_stride, n_edges, meta);
+}
+
 /* -------------------- Recipe convenience wrappers --------------- */

 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -515,3 +590,29 @@ int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
    return daedalus_dispatch_cdef_8x8(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                       dst, dst_stride, tmp, n_blocks, meta);
 }
+
+int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs, size_t n_blocks,
+    const daedalus_h264_block_meta *meta)
+{
+    return daedalus_dispatch_h264_idct4(ctx, DAEDALUS_SUBSTRATE_AUTO,
+                                         dst, dst_stride, coeffs, n_blocks, meta);
+}
+
+int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    int16_t *coeffs, size_t n_blocks,
+    const daedalus_h264_block_meta *meta)
+{
+    return daedalus_dispatch_h264_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
+                                         dst, dst_stride, coeffs, n_blocks, meta);
+}
+
+int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
+                                                  dst, dst_stride, n_edges, meta);
+}
@@ -0,0 +1,161 @@
+/*
+ * Phase 8a — H.264 kernels through the public API.
+ *
+ * Covers IDCT 4x4, IDCT 8x8, deblock luma vertical. Each kernel
+ * exercised through daedalus_recipe_dispatch_* and compared to
+ * the C reference. Recipe routes all 3 to CPU (per cycles 6+7+8
+ * verdicts).
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../include/daedalus.h"
+
+extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
+extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
+extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
+                                                  int alpha, int beta, int8_t tc0[4]);
+
+static uint64_t xs_state = 0xa11264ULL;
+static inline uint64_t xs(void) {
+    uint64_t x = xs_state;
+    x ^= x << 13; x ^= x >> 7; x ^= x << 17;
+    return xs_state = x;
+}
+
+static int test_idct4(void)
+{
+    enum { N = 64, STRIDE = 64, BYTES = 8 * STRIDE };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    int16_t coeffs[N * 16], coeffs_ref[N * 16];
+    uint8_t dst[BYTES], dst_ref[BYTES];
+    daedalus_h264_block_meta meta[N];
+
+    /* Layout: 8x8 grid of 4x4 blocks (each 4x4 occupies 4 rows x 4 cols).
+     * Block (bx, by) at byte offset by*4*STRIDE + bx*4. Need BYTES big
+     * enough: 8 row-blocks * 4 rows = 32 rows × 64 stride = 2048. Use
+     * 8 row-blocks. */
+    enum { BX = 8, BY = 8, FULL_BYTES = BY * 4 * STRIDE };
+    uint8_t big_dst[FULL_BYTES], big_dst_ref[FULL_BYTES];
+    for (int i = 0; i < FULL_BYTES; i++)
+        big_dst[i] = big_dst_ref[i] = (uint8_t)(xs() & 0xff);
+
+    for (int i = 0; i < N * 16; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512);
+
+    for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) {
+        int i = by * BX + bx;
+        meta[i].dst_off = by * 4 * STRIDE + bx * 4;
+    }
+
+    for (int i = 0; i < N; i++)
+        daedalus_h264_idct_add_ref(big_dst_ref + meta[i].dst_off,
+                                    coeffs_ref + i * 16, STRIDE);
+
+    int rc = daedalus_recipe_dispatch_h264_idct4(ctx, big_dst, STRIDE,
+                                                   coeffs, N, meta);
+    if (rc) { fprintf(stderr, "idct4 dispatch rc=%d\n", rc); return 1; }
+    int diff = 0;
+    for (int i = 0; i < FULL_BYTES; i++) if (big_dst[i] != big_dst_ref[i]) diff++;
+    printf("  H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n",
+           FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+static int test_idct8(void)
+{
+    enum { N = 16, STRIDE = 64, BYTES = (8 * 4) * STRIDE };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    int16_t coeffs[N * 64], coeffs_ref[N * 64];
+    uint8_t dst[BYTES], dst_ref[BYTES];
+    daedalus_h264_block_meta meta[N];
+
+    for (int i = 0; i < BYTES; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N * 64; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024);
+
+    /* 8 blocks per row × 4 row-blocks = 32 blocks. Use 8 cols × 2 rows-of-blocks
+     * for safety inside BYTES. Actually BYTES = 32*64 = 2048, supports 8*8=64
+     * blocks. Let me use 8 cols × 2 rows of blocks = 16 blocks. */
+    int BX = 8, BY = 2;   /* 16 blocks total */
+    for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) {
+        int i = by * BX + bx;
+        meta[i].dst_off = by * 8 * STRIDE + bx * 8;
+    }
+
+    for (int i = 0; i < N; i++)
+        daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off,
+                                     coeffs_ref + i * 64, STRIDE);
+
+    int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE,
+                                                   coeffs, N, meta);
+    if (rc) { fprintf(stderr, "idct8 dispatch rc=%d\n", rc); return 1; }
+    int diff = 0;
+    for (int i = 0; i < BYTES; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n",
+           BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+static int test_deblock(void)
+{
+    /* One edge per 16x16 tile. */
+    enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
+           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4, EDGE_OFF = EDGE_ROW * TILE_STRIDE };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_deblock_meta meta[N_EDGES];
+
+    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N_EDGES; i++) {
+        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
+        meta[i].alpha = (int)(xs() % 64) + 1;
+        meta[i].beta  = (int)(xs() % 16) + 1;
+        for (int s = 0; s < 4; s++) {
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    for (int i = 0; i < N_EDGES; i++) {
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
+        daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
+                                              meta[i].alpha, meta[i].beta, tc0_local);
+    }
+
+    int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE,
+                                                            N_EDGES, meta);
+    if (rc) { fprintf(stderr, "deblock dispatch rc=%d\n", rc); return 1; }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+int main(void)
+{
+    printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
+    printf("  H264_IDCT4 recipe substrate:      %d (1=CPU, 2=QPU)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4));
+    printf("  H264_IDCT8 recipe substrate:      %d\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
+    printf("  H264_DEBLOCK_LV recipe substrate: %d\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
+
+    int fail = 0;
+    fail |= test_idct4();
+    fail |= test_idct8();
+    fail |= test_deblock();
+    return fail;
+}