Files
daedalus-fourier/include/daedalus.h
T
claude-noether 8fdef27a7d Phase 8c: H.264 luma qpel mc20 through public API
Extends daedalus-fourier with daedalus_recipe_dispatch_h264_qpel_mc20
so libavcodec.so can route H264QpelContext.put_h264_qpel_pixels_tab[1][2]
through the recipe layer instead of ff_put_h264_qpel8_mc20_neon directly.

API additions (header + library):
  - daedalus_h264_qpel_meta { dst_off, src_off }
  - daedalus_dispatch_h264_qpel_mc20(ctx, sub, dst, src, stride,
                                     n_blocks, meta)
  - daedalus_recipe_dispatch_h264_qpel_mc20(...)  (AUTO wrapper)
  - DAEDALUS_KERNEL_H264_QPEL_MC20 = 9 in the recipe-query enum
  - daedalus_recipe_substrate_for() returns CPU NEON for cycle 9

The 6-tap horizontal half-pel filter signature matches FFmpeg's
H264QpelContext convention exactly: dst and src share a single stride
and src already points at output column 0 (filter reads cols -2..+3).
Single-stride API to make the marfrit-packages FFmpeg shim a
straight pointer-pass; no buffer rearrangement.

Verdict per docs/k9_h264qpel_mc20.md: CPU NEON.  Per-block 7.6 ns
gives 135x margin over 30 fps 1080p; QPU dispatch floor at ~250 ns
makes any V3D shader strictly worse.  Recipe table reflects that —
the recipe_dispatch entry is a one-line forward to the CPU path.

CMakeLists changes:
  - h264qpel_neon.S added to the daedalus_core static lib (only the
    bench targets owned it before; now the public API needs it too)
  - tests/h264_qpel8_mc20_ref.c added to the test_api_h264 target

Phase 8a/8b smoke gains a 4th case (test_qpel_mc20): 1024/1024
bytes bit-exact via daedalus_recipe_dispatch_h264_qpel_mc20.

Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9.
2026-05-23 03:25:24 +02:00

320 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* daedalus-fourier — public C API.
*
* Stable surface for the integration layer (Phase 8 V4L2 shim,
* libva-v4l2-request-fourier consumer, or any future skin) to
* dispatch per-kernel work to the right substrate per the
* cycle 1-5 deployment recipe.
*
* Recipe (verdict at end of cycles 1-5, see docs/k*_phase7.md):
*
* VP9 IDCT 8x8 → V3D QPU (R=0.92 GREEN; M4 +7.2 %)
* VP9 LPF wd=4 inner → V3D QPU (R=0.41 ORANGE; M4 +6.9 %)
* VP9 MC 8-tap horiz → CPU NEON (R=0.067 RED; M4 -19.5 %)
* VP9 LPF wd=8 inner → V3D QPU (R=0.34 ORANGE; M4 +4.1 %)
* AV1 CDEF 8x8 luma → CPU NEON (R=0.116 ORANGE; QPU = opportunistic helper at 0.4 Mblock/s)
*
* The API exposes BOTH substrates for every kernel — the
* integration layer can override the recipe at runtime if it
* has scheduler knowledge the kernel-level R-band measurement
* didn't capture. The recommended path is to use
* `daedalus_recipe_dispatch_*` which picks the recipe substrate
* automatically.
*
* License: BSD-2-Clause. This header is part of the library API
* boundary; the implementation links against vendored
* LGPL-2.1+ FFmpeg snapshot and BSD-2-Clause dav1d snapshot.
*
* Threading: a `daedalus_ctx *` owns Vulkan + V3D state. A
* context is single-threaded; use one per worker thread if you
* need parallelism on the QPU side. NEON-side dispatch is
* stateless and re-entrant.
*
* ABI: pre-1.0 — no stability guarantees yet. The function names
* and signatures will become ABI-stable at v1.0; until then the
* integration layer should rebuild against the headers it links
* with.
*/
#ifndef DAEDALUS_FOURIER_H
#define DAEDALUS_FOURIER_H
#include <stdint.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/* -------------------------------------------------------------------
* Substrate selection
*
* Most callers should NOT specify a substrate — use the
* `daedalus_recipe_dispatch_*` family below, which picks the
* substrate per the cycles-1-5 verdict. Explicit substrate
* selection is for benchmarking, debugging, and future
* runtime-aware schedulers.
* ----------------------------------------------------------------- */
typedef enum {
DAEDALUS_SUBSTRATE_AUTO = 0, /* per recipe table */
DAEDALUS_SUBSTRATE_CPU = 1, /* force ARM NEON */
DAEDALUS_SUBSTRATE_QPU = 2, /* force V3D compute */
} daedalus_substrate;
/* -------------------------------------------------------------------
* Context lifecycle
* ----------------------------------------------------------------- */
typedef struct daedalus_ctx daedalus_ctx;
/* Create a context. Initialises V3D Vulkan device if available;
* NEON-only fallback OK if V3D init fails. Returns NULL on alloc
* failure. */
daedalus_ctx *daedalus_ctx_create(void);
/* Same but skip V3D init — for callers that know they want CPU
* only and want a fast-creating context. */
daedalus_ctx *daedalus_ctx_create_no_qpu(void);
/* Returns 1 if QPU dispatch is available on this context, 0 if
* NEON-only. Useful for the integration layer to short-circuit
* QPU dispatch attempts. */
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx);
void daedalus_ctx_destroy(daedalus_ctx *ctx);
/* -------------------------------------------------------------------
* VP9 IDCT 8x8 add — cycle 1 (QPU by recipe)
*
* For each of n_blocks: take 64 int16 coefficients, perform 8x8
* inverse DCT, add to dst[r,c] = clamp(dst[r,c] + ((q + 16)>>5)).
*
* `meta` is an array of (dst_byte_offset, block_x, block_y) for
* each block, where dst_byte_offset is byte offset into dst.
*
* Returns 0 on success, negative errno-like on failure.
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off; /* byte offset into dst */
uint32_t block_x; /* used only by QPU path for placement */
uint32_t block_y;
uint32_t _pad;
} daedalus_idct8_meta;
int daedalus_recipe_dispatch_vp9_idct8(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const int16_t *coeffs, size_t n_blocks,
const daedalus_idct8_meta *meta);
int daedalus_dispatch_vp9_idct8(
daedalus_ctx *ctx,
daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const int16_t *coeffs, size_t n_blocks,
const daedalus_idct8_meta *meta);
/* -------------------------------------------------------------------
* VP9 LPF wd=4 / wd=8 — cycles 2 and 4 (QPU by recipe)
*
* Loop filter at horizontal edge crossing pixel column 4 of an
* 8x8 block. Per-edge thresholds (E, I, H).
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off; /* byte offset into dst, at col 4 of edge */
int32_t E, I, H;
} daedalus_lpf_meta;
int daedalus_recipe_dispatch_vp9_lpf4(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
int daedalus_recipe_dispatch_vp9_lpf8(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
/* -------------------------------------------------------------------
* VP9 MC 8-tap horizontal — cycle 3 (CPU by recipe)
*
* Subpel-fractional 8-tap horizontal filter; mx selects filter
* row. CPU path is the high-performance default; QPU path is
* available but never recommended by the recipe.
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off;
uint32_t src_off; /* raw, no pre-advance — shader handles -3 internally */
int32_t mx;
uint32_t _pad;
} daedalus_mc_meta;
int daedalus_recipe_dispatch_vp9_mc_8h(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const uint8_t *src, size_t src_stride,
size_t n_blocks, const daedalus_mc_meta *meta);
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const uint8_t *src, size_t src_stride,
size_t n_blocks, const daedalus_mc_meta *meta);
/* -------------------------------------------------------------------
* AV1 CDEF 8x8 luma — cycle 5 (CPU by recipe; QPU opportunistic)
*
* tmp is an array of n_blocks * 192 uint16, with the padded-buffer
* layout that dav1d's NEON expects (stride 16, padding 2-rows-top +
* 2-cols-left + 2-cols-right + 2-rows-bottom). Caller supplies
* tmp populated with either source pixels (if all edges valid) or
* INT16_MIN sentinels at the boundary (if edge filtered out).
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off;
uint32_t tmp_off_u16; /* offset to block-origin in tmp[] (= padded_origin + 2*16+2) */
int32_t pri_strength; /* 1..7 */
int32_t sec_strength; /* 1..4 */
int32_t dir; /* 0..7 */
int32_t damping; /* 1..6 */
} daedalus_cdef_meta;
int daedalus_recipe_dispatch_cdef_8x8(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const uint16_t *tmp,
size_t n_blocks, const daedalus_cdef_meta *meta);
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const uint16_t *tmp,
size_t n_blocks, const daedalus_cdef_meta *meta);
/* -------------------------------------------------------------------
* H.264 IDCT 4x4 + add — cycle 6 (CPU by recipe; QPU unused)
*
* Per H.264 §8.5.12.1, integer 4x4 inverse transform. block is
* COLUMN-major: block[c*4 + r] = coefficient at (row r, col c).
* Block is destructively zeroed after the transform (FFmpeg
* convention).
*
* `coeffs` is an array of n_blocks * 16 int16. `dst_off` is byte
* offset into dst per block.
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off;
uint32_t _pad0, _pad1, _pad2;
} daedalus_h264_block_meta;
int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
int16_t *coeffs, /* not const — destructively zeroed */
size_t n_blocks, const daedalus_h264_block_meta *meta);
int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
int16_t *coeffs,
size_t n_blocks, const daedalus_h264_block_meta *meta);
/* H.264 IDCT 8x8 + add — cycle 7 (CPU by recipe).
* Per H.264 §8.5.13.2, integer 8x8 inverse transform.
* `coeffs` is an array of n_blocks * 64 int16, column-major per block.
*/
int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
int16_t *coeffs,
size_t n_blocks, const daedalus_h264_block_meta *meta);
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
int16_t *coeffs,
size_t n_blocks, const daedalus_h264_block_meta *meta);
/* -------------------------------------------------------------------
* H.264 luma "v_loop_filter" — cycle 8 (CPU primary; QPU opportunistic)
*
* Filter applied VERTICALLY across a HORIZONTAL edge (16 columns
* wide; pix points to row 0 of the bottom block). Non-intra
* (bS < 4) variant.
*
* Each tile is 16 cols × 8 rows of context (rows -4..+3 around
* the edge). dst_off points to row 0 col 0 of the bottom block.
*
* Constraint: dst_off >= 4 * dst_stride (the kernel reads p3 at
* -4*stride). Caller must ensure this.
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off;
int32_t alpha; /* 0..63 typical, table-derived */
int32_t beta; /* 0..63 typical */
int8_t tc0[4]; /* per-segment filter strength; -1 means skip */
} daedalus_h264_deblock_meta;
int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta);
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta);
/* -------------------------------------------------------------------
* H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
* (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
* docs/k9_h264qpel_mc20.md for the R-band rationale).
*
* Per H.264 §8.4.2.2.1, horizontal half-pel luma 6-tap filter:
* dst[r,c] = clip255((s[r,c-2] - 5*s[r,c-1] + 20*s[r,c]
* + 20*s[r,c+1] - 5*s[r,c+2] + s[r,c+3]
* + 16) >> 5)
*
* Single-stride: dst and src share `stride`; this matches FFmpeg's
* H264QpelContext.put_h264_qpel_pixels_tab[][] convention and the
* vendored ff_put_h264_qpel8_mc20_neon signature.
*
* `src + src_off` points at the leftmost OUTPUT column (col 0); the
* filter reads cols -2..+3, so the caller must guarantee src has at
* least 2 pixels of left context and 3 pixels of right context per
* row. (FFmpeg already maintains an edge-emulated buffer for the
* frame boundary; this matches that contract.)
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off; /* byte offset into dst (block top-left) */
uint32_t src_off; /* byte offset into src (col 0, row 0) */
} daedalus_h264_qpel_meta;
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
/* -------------------------------------------------------------------
* Recipe query — what does the API recommend for each kernel?
* ----------------------------------------------------------------- */
typedef enum {
DAEDALUS_KERNEL_VP9_IDCT8 = 1,
DAEDALUS_KERNEL_VP9_LPF4_INNER = 2,
DAEDALUS_KERNEL_VP9_MC_8H = 3,
DAEDALUS_KERNEL_VP9_LPF8_INNER = 4,
DAEDALUS_KERNEL_AV1_CDEF_8X8 = 5,
DAEDALUS_KERNEL_H264_IDCT4 = 6,
DAEDALUS_KERNEL_H264_IDCT8 = 7,
DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
DAEDALUS_KERNEL_H264_QPEL_MC20 = 9,
} daedalus_kernel;
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
#ifdef __cplusplus
}
#endif
#endif /* DAEDALUS_FOURIER_H */