Phase 8a: H.264 kernels through public API
Extends include/daedalus.h with cycles 6, 7, 8 (H.264 IDCT 4x4, IDCT 8x8, and luma-v deblock). All three use recipe substrate = CPU (matches the per-cycle Phase 7 verdicts).

src/daedalus_core.c: NEON-path implementations + recipe routing. The daedalus_core library now links the full FFmpeg H.264 NEON snapshot (h264idct + h264dsp) in addition to the existing VP9 + dav1d code.

tests/test_api_h264.c: smoke test covering all 3 H.264 kernels via daedalus_recipe_dispatch_*. All pass 2048/2048 bit-exact.

Public API coverage after this commit:
- Cycles 1 (IDCT 8x8), 2 (LPF4), 4 (LPF8): CPU+QPU+AUTO dispatch (test_api_idct, test_api_lpf, both pass)
- Cycle 3 MC 8h: CPU only (QPU dispatch stub returns -1)
- Cycle 5 CDEF: CPU only (QPU stub)
- Cycle 6 H.264 IDCT 4x4: CPU only (recipe + only NEON wired)
- Cycle 7 H.264 IDCT 8x8: CPU only
- Cycle 8 H.264 deblock: CPU only (QPU opportunistic — not wired through the API yet; bench_v3d_h264deblock exists for direct testing)

Next Phase 8 sub-step: wire opportunistic QPU dispatch for cycles 3+5+8 through the API (so override-mode users can request QPU). Then surface V4L2-wrapper architecture decisions to the user.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+76
-5
@@ -195,15 +195,86 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
const uint16_t *tmp,
|
||||
size_t n_blocks, const daedalus_cdef_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
 * H.264 IDCT 4x4 + add — cycle 6 (CPU by recipe; QPU unused)
 *
 * Per H.264 §8.5.12.1, integer 4x4 inverse transform.  Each block is
 * COLUMN-major: block[c*4 + r] = coefficient at (row r, col c).
 * The block is destructively zeroed after the transform (FFmpeg
 * convention).
 *
 * `coeffs` is an array of n_blocks * 16 int16.  `dst_off` is the byte
 * offset into dst for each block.
 * ----------------------------------------------------------------- */

/* Per-block placement record taken by the H.264 IDCT dispatch calls.
 * One record per block; only dst_off is described by this header, the
 * remaining three words are padding that rounds the record up to four
 * 32-bit fields. */
typedef struct {
    uint32_t dst_off;   /* byte offset into dst for this block */
    uint32_t _pad0;     /* padding — NOTE(review): presumably ignored; confirm */
    uint32_t _pad1;     /* padding */
    uint32_t _pad2;     /* padding */
} daedalus_h264_block_meta;
|
||||
|
||||
/* H.264 IDCT 4x4 + add over a batch of blocks, on the substrate the
 * recipe selects for this kernel.
 *
 *   ctx        library context
 *   dst        destination pixel buffer the residual is added into
 *   dst_stride row pitch of dst — NOTE(review): presumably in bytes,
 *              matching the byte-based dst_off; confirm
 *   coeffs     n_blocks * 16 int16 coefficients, column-major per block;
 *              zeroed by the call
 *   n_blocks   number of blocks (and of meta records)
 *   meta       one record per block; meta[i].dst_off is that block's
 *              byte offset into dst
 *
 * Return value semantics are not visible in this header —
 * NOTE(review): presumably 0 on success, negative on failure; confirm
 * against src/daedalus_core.c. */
int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
                                        uint8_t *dst, size_t dst_stride,
                                        int16_t *coeffs, /* not const — destructively zeroed */
                                        size_t n_blocks, const daedalus_h264_block_meta *meta);
|
||||
|
||||
/* H.264 IDCT 4x4 + add over a batch of blocks, with the substrate
 * chosen by the caller through `sub` rather than by the recipe.
 *
 * The remaining parameters mirror daedalus_recipe_dispatch_h264_idct4:
 * `coeffs` holds n_blocks * 16 int16 values (column-major per block) and
 * is not const because the transform zeroes it; `meta` supplies one
 * byte offset into dst per block.
 *
 * NOTE(review): which `sub` values are accepted for this kernel, and
 * the return-value convention, are not visible in this header —
 * confirm against src/daedalus_core.c. */
int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
                                 uint8_t *dst, size_t dst_stride,
                                 int16_t *coeffs,
                                 size_t n_blocks, const daedalus_h264_block_meta *meta);
|
||||
|
||||
/* H.264 IDCT 8x8 + add — cycle 7 (CPU by recipe).
|
||||
* Per H.264 §8.5.13.2, integer 8x8 inverse transform.
|
||||
* `coeffs` is an array of n_blocks * 64 int16, column-major per block.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs,
|
||||
size_t n_blocks, const daedalus_h264_block_meta *meta);
|
||||
|
||||
/* H.264 IDCT 8x8 + add over a batch of blocks, with the substrate
 * chosen by the caller through `sub` rather than by the recipe.
 *
 * The remaining parameters mirror daedalus_recipe_dispatch_h264_idct8:
 * `coeffs` holds n_blocks * 64 int16 values, column-major per block,
 * and `meta` supplies one record per block.
 *
 * NOTE(review): which `sub` values are accepted for this kernel, and
 * the return-value convention, are not visible in this header —
 * confirm against src/daedalus_core.c. */
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
                                 uint8_t *dst, size_t dst_stride,
                                 int16_t *coeffs,
                                 size_t n_blocks, const daedalus_h264_block_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
 * H.264 luma "v_loop_filter" — cycle 8 (CPU primary; QPU opportunistic)
 *
 * Filter applied VERTICALLY across a HORIZONTAL edge (16 columns
 * wide; pix points to row 0 of the bottom block).  Non-intra
 * (bS < 4) variant.
 *
 * Each tile is 16 cols × 8 rows of context (rows -4..+3 around
 * the edge).  dst_off points to row 0 col 0 of the bottom block.
 *
 * Constraint: dst_off >= 4 * dst_stride (the kernel reads p3 at
 * -4*stride).  Caller must ensure this.
 * ----------------------------------------------------------------- */

/* Per-edge parameters for the luma-v deblock dispatch calls; one record
 * per 16-column edge. */
typedef struct {
    uint32_t dst_off;   /* offset of row 0, col 0 of the bottom block in dst */
    int32_t  alpha;     /* 0..63 typical, table-derived */
    int32_t  beta;      /* 0..63 typical */
    int8_t   tc0[4];    /* per-segment filter strength; -1 means skip */
} daedalus_h264_deblock_meta;
|
||||
|
||||
/* H.264 luma deblock across horizontal edges (non-intra, bS < 4), on
 * the substrate the recipe selects for this kernel.  Filters dst in
 * place.
 *
 *   ctx        library context
 *   dst        picture buffer containing the edges
 *   dst_stride row pitch of dst
 *   n_edges    number of edges (and of meta records)
 *   meta       one record per edge: location (dst_off), alpha/beta
 *              thresholds and the four tc0 segment strengths
 *
 * Every meta[i].dst_off must be >= 4 * dst_stride, because the kernel
 * reads four rows above the edge.
 *
 * Return value semantics are not visible in this header —
 * NOTE(review): confirm against src/daedalus_core.c. */
int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
                                                 uint8_t *dst, size_t dst_stride,
                                                 size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* H.264 luma deblock across horizontal edges (non-intra, bS < 4), with
 * the substrate chosen by the caller through `sub` rather than by the
 * recipe.  Filters dst in place; `meta` supplies one record per edge
 * and every meta[i].dst_off must be >= 4 * dst_stride.
 *
 * NOTE(review): which `sub` values are accepted for this kernel, and
 * the return-value convention, are not visible in this header —
 * confirm against src/daedalus_core.c. */
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                          uint8_t *dst, size_t dst_stride,
                                          size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
 * Recipe query — what does the API recommend for each kernel?
 * ----------------------------------------------------------------- */

/* Identifies one kernel exposed by the API.  The numeric values are
 * the cycle numbers used throughout the project (1..8), so they must
 * stay stable; each enumerator is declared exactly once. */
typedef enum {
    DAEDALUS_KERNEL_VP9_IDCT8       = 1,
    DAEDALUS_KERNEL_VP9_LPF4_INNER  = 2,
    DAEDALUS_KERNEL_VP9_MC_8H       = 3,
    DAEDALUS_KERNEL_VP9_LPF8_INNER  = 4,
    DAEDALUS_KERNEL_AV1_CDEF_8X8    = 5,
    DAEDALUS_KERNEL_H264_IDCT4      = 6,
    DAEDALUS_KERNEL_H264_IDCT8      = 7,
    DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
} daedalus_kernel;
|
||||
|
||||
/* Recipe query: returns the substrate the API recommends for kernel k. */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||
|
||||
Reference in New Issue
Block a user