diff --git a/CMakeLists.txt b/CMakeLists.txt index 83057e9..678b32b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -347,6 +347,8 @@ add_library(daedalus_core STATIC ${FFASM_LPF_SOURCES} ${FFASM_MC_SOURCES} ${FFC_MC_SOURCES} + ${FFASM_H264IDCT_SOURCES} + ${FFASM_H264DSP_SOURCES} ${DAV1D_CDEF_ASM_SOURCES} ${DAV1D_CDEF_C_SOURCES} ) @@ -373,6 +375,15 @@ add_executable(test_api_lpf target_link_libraries(test_api_lpf PRIVATE daedalus_core) target_compile_options(test_api_lpf PRIVATE -O2) +add_executable(test_api_h264 + tests/test_api_h264.c + tests/h264_idct4_ref.c + tests/h264_idct8_ref.c + tests/h264_deblock_ref.c +) +target_link_libraries(test_api_h264 PRIVATE daedalus_core) +target_compile_options(test_api_h264 PRIVATE -O2) + if (DAEDALUS_BUILD_VULKAN) # (re-open the conditional so the closing endif() below balances) diff --git a/include/daedalus.h b/include/daedalus.h index 2e91795..6a26866 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -195,15 +195,86 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub, const uint16_t *tmp, size_t n_blocks, const daedalus_cdef_meta *meta); +/* ------------------------------------------------------------------- + * H.264 IDCT 4x4 + add — cycle 6 (CPU by recipe; QPU unused) + * + * Per H.264 §8.5.12.1, integer 4x4 inverse transform. block is + * COLUMN-major: block[c*4 + r] = coefficient at (row r, col c). + * Block is destructively zeroed after the transform (FFmpeg + * convention). + * + * `coeffs` is an array of n_blocks * 16 int16. `dst_off` is byte + * offset into dst per block. + * ----------------------------------------------------------------- */ +typedef struct { + uint32_t dst_off; + uint32_t _pad0, _pad1, _pad2; +} daedalus_h264_block_meta; + +int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, /* not const — destructively zeroed */ + size_t n_blocks, const daedalus_h264_block_meta *meta); + +int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, + size_t n_blocks, const daedalus_h264_block_meta *meta); + +/* H.264 IDCT 8x8 + add — cycle 7 (CPU by recipe). + * Per H.264 §8.5.13.2, integer 8x8 inverse transform. + * `coeffs` is an array of n_blocks * 64 int16, column-major per block. + */ +int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, + size_t n_blocks, const daedalus_h264_block_meta *meta); + +int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, + size_t n_blocks, const daedalus_h264_block_meta *meta); + +/* ------------------------------------------------------------------- + * H.264 luma "v_loop_filter" — cycle 8 (CPU primary; QPU opportunistic) + * + * Filter applied VERTICALLY across a HORIZONTAL edge (16 columns + * wide; pix points to row 0 of the bottom block). Non-intra + * (bS < 4) variant. + * + * Each tile is 16 cols × 8 rows of context (rows -4..+3 around + * the edge). dst_off points to row 0 col 0 of the bottom block. + * + * Constraint: dst_off >= 4 * dst_stride (the kernel reads p3 at + * -4*stride). Caller must ensure this. + * ----------------------------------------------------------------- */ +typedef struct { + uint32_t dst_off; + int32_t alpha; /* 0..63 typical, table-derived */ + int32_t beta; /* 0..63 typical */ + int8_t tc0[4]; /* per-segment filter strength; -1 means skip */ +} daedalus_h264_deblock_meta; + +int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + /* ------------------------------------------------------------------- * Recipe query — what does the API recommend for each kernel? * ----------------------------------------------------------------- */ typedef enum { - DAEDALUS_KERNEL_VP9_IDCT8 = 1, - DAEDALUS_KERNEL_VP9_LPF4_INNER = 2, - DAEDALUS_KERNEL_VP9_MC_8H = 3, - DAEDALUS_KERNEL_VP9_LPF8_INNER = 4, - DAEDALUS_KERNEL_AV1_CDEF_8X8 = 5, + DAEDALUS_KERNEL_VP9_IDCT8 = 1, + DAEDALUS_KERNEL_VP9_LPF4_INNER = 2, + DAEDALUS_KERNEL_VP9_MC_8H = 3, + DAEDALUS_KERNEL_VP9_LPF8_INNER = 4, + DAEDALUS_KERNEL_AV1_CDEF_8X8 = 5, + DAEDALUS_KERNEL_H264_IDCT4 = 6, + DAEDALUS_KERNEL_H264_IDCT8 = 7, + DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 9f624cf..f0ee7fa 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -76,11 +76,14 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx) daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) { switch (k) { - case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU; - case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU; - case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU; - case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU; + case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU; + case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU; + case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_CPU; } return DAEDALUS_SUBSTRATE_CPU; } @@ -101,6 +104,10 @@ extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride, int pri_strength, int sec_strength, int dir, int damping, int h, size_t edges); +extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride); +extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride); +extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); /* -------------------- CPU dispatch implementations -------------- */ @@ -168,6 +175,48 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx, return 0; } +static int dispatch_h264_idct4_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, size_t n_blocks, + const daedalus_h264_block_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_blocks; i++) + ff_h264_idct_add_neon(dst + meta[i].dst_off, + coeffs + i * 16, + (ptrdiff_t) dst_stride); + return 0; +} + +static int dispatch_h264_idct8_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, size_t n_blocks, + const daedalus_h264_block_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_blocks; i++) + ff_h264_idct8_add_neon(dst + meta[i].dst_off, + coeffs + i * 64, + (ptrdiff_t) dst_stride); + return 0; +} + +static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_edges; i++) { + /* NEON expects mutable tc0 pointer; copy to a local. */ + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], + meta[i].tc0[2], meta[i].tc0[3] }; + ff_h264_v_loop_filter_luma_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta, tc0_local); + } + return 0; +} + /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */ typedef struct { @@ -471,6 +520,32 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub, dst, dst_stride, tmp, n_blocks, meta); } +int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, size_t n_blocks, + const daedalus_h264_block_meta *meta) +{ + ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu, + dst, dst_stride, coeffs, n_blocks, meta); +} + +int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, size_t n_blocks, + const daedalus_h264_block_meta *meta) +{ + ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu, + dst, dst_stride, coeffs, n_blocks, meta); +} + +int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_DEBLOCK_LV, dispatch_h264_deblock_cpu, + dst, dst_stride, n_edges, meta); +} + /* -------------------- Recipe convenience wrappers --------------- */ int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx, @@ -515,3 +590,29 @@ int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx, return daedalus_dispatch_cdef_8x8(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, tmp, n_blocks, meta); } + +int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, size_t n_blocks, + const daedalus_h264_block_meta *meta) +{ + return daedalus_dispatch_h264_idct4(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, coeffs, n_blocks, meta); +} + +int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + int16_t *coeffs, size_t n_blocks, + const daedalus_h264_block_meta *meta) +{ + return daedalus_dispatch_h264_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, coeffs, n_blocks, meta); +} + +int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, n_edges, meta); +} diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c new file mode 100644 index 0000000..b9d29a2 --- /dev/null +++ b/tests/test_api_h264.c @@ -0,0 +1,161 @@ +/* + * Phase 8a — H.264 kernels through the public API. + * + * Covers IDCT 4x4, IDCT 8x8, deblock luma vertical. Each kernel + * exercised through daedalus_recipe_dispatch_* and compared to + * the C reference. Recipe routes all 3 to CPU (per cycles 6+7+8 + * verdicts). + */ +#include +#include +#include +#include +#include + +#include "../include/daedalus.h" + +extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); +extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); +extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t tc0[4]); + +static uint64_t xs_state = 0xa11264ULL; +static inline uint64_t xs(void) { + uint64_t x = xs_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs_state = x; +} + +static int test_idct4(void) +{ + enum { N = 64, STRIDE = 64, BYTES = 8 * STRIDE }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + int16_t coeffs[N * 16], coeffs_ref[N * 16]; + uint8_t dst[BYTES], dst_ref[BYTES]; + daedalus_h264_block_meta meta[N]; + + /* Layout: 8x8 grid of 4x4 blocks (each 4x4 occupies 4 rows x 4 cols). + * Block (bx, by) at byte offset by*4*STRIDE + bx*4. Need BYTES big + * enough: 8 row-blocks * 4 rows = 32 rows × 64 stride = 2048. Use + * 8 row-blocks. */ + enum { BX = 8, BY = 8, FULL_BYTES = BY * 4 * STRIDE }; + uint8_t big_dst[FULL_BYTES], big_dst_ref[FULL_BYTES]; + for (int i = 0; i < FULL_BYTES; i++) + big_dst[i] = big_dst_ref[i] = (uint8_t)(xs() & 0xff); + + for (int i = 0; i < N * 16; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512); + + for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) { + int i = by * BX + bx; + meta[i].dst_off = by * 4 * STRIDE + bx * 4; + } + + for (int i = 0; i < N; i++) + daedalus_h264_idct_add_ref(big_dst_ref + meta[i].dst_off, + coeffs_ref + i * 16, STRIDE); + + int rc = daedalus_recipe_dispatch_h264_idct4(ctx, big_dst, STRIDE, + coeffs, N, meta); + if (rc) { fprintf(stderr, "idct4 dispatch rc=%d\n", rc); return 1; } + int diff = 0; + for (int i = 0; i < FULL_BYTES; i++) if (big_dst[i] != big_dst_ref[i]) diff++; + printf(" H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n", + FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + +static int test_idct8(void) +{ + enum { N = 16, STRIDE = 64, BYTES = (8 * 4) * STRIDE }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + int16_t coeffs[N * 64], coeffs_ref[N * 64]; + uint8_t dst[BYTES], dst_ref[BYTES]; + daedalus_h264_block_meta meta[N]; + + for (int i = 0; i < BYTES; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); + for (int i = 0; i < N * 64; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024); + + /* 8 blocks per row × 4 row-blocks = 32 blocks. Use 8 cols × 2 rows-of-blocks + * for safety inside BYTES. Actually BYTES = 32*64 = 2048, supports 8*8=64 + * blocks. Let me use 8 cols × 2 rows of blocks = 16 blocks. */ + int BX = 8, BY = 2; /* 16 blocks total */ + for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) { + int i = by * BX + bx; + meta[i].dst_off = by * 8 * STRIDE + bx * 8; + } + + for (int i = 0; i < N; i++) + daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off, + coeffs_ref + i * 64, STRIDE); + + int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE, + coeffs, N, meta); + if (rc) { fprintf(stderr, "idct8 dispatch rc=%d\n", rc); return 1; } + int diff = 0; + for (int i = 0; i < BYTES; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n", + BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + +static int test_deblock(void) +{ + /* One edge per 16x16 tile. */ + enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE, + TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4, EDGE_OFF = EDGE_ROW * TILE_STRIDE }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t dst[TOTAL], dst_ref[TOTAL]; + daedalus_h264_deblock_meta meta[N_EDGES]; + + for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); + for (int i = 0; i < N_EDGES; i++) { + meta[i].dst_off = i * TILE_BYTES + EDGE_OFF; + meta[i].alpha = (int)(xs() % 64) + 1; + meta[i].beta = (int)(xs() % 16) + 1; + for (int s = 0; s < 4; s++) { + int r = (int)(xs() % 8); + meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); + } + } + + for (int i = 0; i < N_EDGES; i++) { + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; + daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, + meta[i].alpha, meta[i].beta, tc0_local); + } + + int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE, + N_EDGES, meta); + if (rc) { fprintf(stderr, "deblock dispatch rc=%d\n", rc); return 1; } + int diff = 0; + for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n", + TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + +int main(void) +{ + printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n"); + printf(" H264_IDCT4 recipe substrate: %d (1=CPU, 2=QPU)\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4)); + printf(" H264_IDCT8 recipe substrate: %d\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8)); + printf(" H264_DEBLOCK_LV recipe substrate: %d\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV)); + + int fail = 0; + fail |= test_idct4(); + fail |= test_idct8(); + fail |= test_deblock(); + return fail; +}