Phase 8a: H.264 kernels through public API
Extends include/daedalus.h with cycles 6, 7, 8 (H.264 IDCT 4x4, IDCT 8x8, luma deblock luma-v). All recipe-substrate = CPU (matches per-cycle Phase 7 verdicts). src/daedalus_core.c: NEON-path implementations + recipe routing. daedalus_core library now links the full FFmpeg H.264 NEON snapshot (h264idct + h264dsp) plus existing VP9 + dav1d. tests/test_api_h264.c: smoke test covering all 3 H.264 kernels via daedalus_recipe_dispatch_*. All pass 2048/2048 bit-exact. Public API coverage after this commit: - Cycles 1 IDCT 8x8 + 2 LPF4 + 4 LPF8: CPU+QPU+AUTO dispatch (test_api_idct, test_api_lpf, both pass) - Cycle 3 MC 8h: CPU only (QPU dispatch stub returns -1) - Cycle 5 CDEF: CPU only (QPU stub) - Cycle 6 H.264 IDCT 4x4: CPU only (recipe + only NEON wired) - Cycle 7 H.264 IDCT 8x8: CPU only - Cycle 8 H.264 deblock: CPU only (QPU opportunistic — not wired through API yet; bench_v3d_h264deblock exists for direct test) Next Phase 8 sub-step: wire opportunistic QPU dispatch for cycles 3+5+8 through the API (so override-mode users can request QPU). Then surface V4L2-wrapper architecture decisions to user. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+106
-5
@@ -76,11 +76,14 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
{
|
||||
switch (k) {
|
||||
case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU;
|
||||
case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
||||
case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
||||
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU;
|
||||
case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
||||
case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
||||
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
@@ -101,6 +104,10 @@ extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
int pri_strength, int sec_strength,
|
||||
int dir, int damping, int h,
|
||||
size_t edges);
|
||||
extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||
extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
|
||||
/* -------------------- CPU dispatch implementations -------------- */
|
||||
|
||||
@@ -168,6 +175,48 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_idct4_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_blocks; i++)
|
||||
ff_h264_idct_add_neon(dst + meta[i].dst_off,
|
||||
coeffs + i * 16,
|
||||
(ptrdiff_t) dst_stride);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_idct8_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_blocks; i++)
|
||||
ff_h264_idct8_add_neon(dst + meta[i].dst_off,
|
||||
coeffs + i * 64,
|
||||
(ptrdiff_t) dst_stride);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_edges; i++) {
|
||||
/* NEON expects mutable tc0 pointer; copy to a local. */
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
|
||||
meta[i].tc0[2], meta[i].tc0[3] };
|
||||
ff_h264_v_loop_filter_luma_neon(dst + meta[i].dst_off,
|
||||
(ptrdiff_t) dst_stride,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||
|
||||
typedef struct {
|
||||
@@ -471,6 +520,32 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
dst, dst_stride, tmp, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
|
||||
dst, dst_stride, coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
|
||||
dst, dst_stride, coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||||
{
|
||||
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_DEBLOCK_LV, dispatch_h264_deblock_cpu,
|
||||
dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
/* -------------------- Recipe convenience wrappers --------------- */
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||
@@ -515,3 +590,29 @@ int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
|
||||
return daedalus_dispatch_cdef_8x8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, dst_stride, tmp, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
return daedalus_dispatch_h264_idct4(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, dst_stride, coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
return daedalus_dispatch_h264_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, dst_stride, coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||||
{
|
||||
return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user