/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * daedalus-decoder — public C API.
 *
 * Frame-level GPU H.264 decoder targeting V3D7 (Raspberry Pi 5).  Built
 * on daedalus-fourier's V3D compute primitives at frame granularity —
 * one Vulkan submit per frame, one fence wait per frame, encoded
 * bitstream in (via libavcodec's per-MB intercept), NV12 frame out.
 *
 * Per the 2026-05-24 Phase 1 design decisions:
 *   - libavcodec intercept is at macroblock-level (substitution-arc
 *     evolution): the caller is expected to drive the per-MB CABAC /
 *     CAVLC entropy decode and feed each macroblock's descriptor +
 *     coefficients via daedalus_decoder_append_mb().  flush_frame()
 *     builds the per-frame VkCommandBuffer and submits.
 *   - DPB is Vulkan-native VkImage with VK_KHR_external_memory_dma_buf
 *     export.  The caller can obtain the output frame's dmabuf fd
 *     via daedalus_decoder_export_dmabuf().
 *   - Daemon integration shape: this library is statically linked into
 *     daedalus_v4l2_daemon.  No IPC.
 *
 * STATUS: scaffold.  No GPU pipeline implemented yet; all functions
 * are stubs that compile but do not decode anything.  See DESIGN.md
 * for the architecture.
 *
 * ABI: pre-0.1 — every signature here may change.  Don't rely on
 * stability yet.
 */

#ifndef DAEDALUS_DECODER_H
#define DAEDALUS_DECODER_H

#include <stddef.h>   /* size_t */
#include <stdint.h>   /* uint8_t, uint16_t, int8_t, int16_t */

#ifdef __cplusplus
extern "C" {
#endif

/* -------------------------------------------------------------------
 * Opaque decoder context.  One per concurrent stream.
 * ----------------------------------------------------------------- */
typedef struct daedalus_decoder daedalus_decoder;

/* -------------------------------------------------------------------
 * Per-edge deblock metadata.  One entry per filter-edge; the caller
 * derives these from H.264 §8.7.2.1 boundary-strength rules.
 *
 * Coordinate convention:
 *   mb_x / mb_y — the MB whose top-left this edge sits on (the "right"
 *                 side for vertical edges, "bottom" side for horizontal
 *                 edges, in H.264 spec's q-side convention).
 *   edge_idx    — 0..3 within the MB:
 *                   luma:   edge 0 = MB boundary, edges 1..3 = internal
 *                           at cols/rows 4, 8, 12.
 *                   chroma: edge 0 = MB boundary, edge 1 = internal at
 *                           col/row 4.  edge_idx > 1 invalid for chroma.
 *                 Edges at frame boundaries (top row of MBs for H edges;
 *                 left column for V edges) MUST be bS=0 — the kernel
 *                 reads p3 at four samples beyond the edge.
 *   orient      — 0 = vertical edge (filtered horizontally across),
 *                 1 = horizontal.
 *   plane       — 0 = luma, 1 = chroma Cb, 2 = chroma Cr.  Cb and Cr
 *                 always share the same filter parameters per H.264
 *                 spec, but are listed separately so the caller can
 *                 omit one or the other if needed.
 *   bS          — 0 = skip this edge (no GPU work), 1..3 = bS<4 path
 *                 (uses tc0), 4 = bS=4 "intra" path (ignores tc0).
 *   alpha, beta — H.264 §8.7.2.2 table 8-16/8-17 values, both 0..255.
 *   tc0[4]      — per-4-cell segment strength along the edge (luma has
 *                 4 segments; chroma has 4 also, with 2 cells each).
 *                 IGNORED when bS == 4.
 * ----------------------------------------------------------------- */
struct daedalus_decoder_edge {
    uint16_t mb_x;
    uint16_t mb_y;
    uint8_t  edge_idx;
    uint8_t  orient;
    uint8_t  plane;
    uint8_t  bS;
    uint8_t  alpha;
    uint8_t  beta;
    int8_t   tc0[4];
};

/* -------------------------------------------------------------------
 * Per-macroblock input.  Mirrors §3 of DESIGN.md.  The caller's
 * libavcodec intercept populates this from the H264SliceContext
 * fields after ff_h264_decode_mb_cabac/cavlc returns and before
 * ff_h264_hl_decode_mb is supposed to run (we replace the latter).
 * ----------------------------------------------------------------- */
struct daedalus_decoder_mb_input {
    /* Frame coordinates (macroblock units). */
    uint16_t mb_x;
    uint16_t mb_y;

    /* Type + quantisation. */
    uint8_t  mb_type;            /* H.264 spec table 7-13/7-14/7-17/7-18 enum */
    uint8_t  mb_qp_y;
    uint8_t  mb_qp_uv;
    uint8_t  cbp;                /* coded block pattern, 0..47 */

    /* Intra prediction (used iff mb_type == I_NxN or I_16x16). */
    uint8_t  intra_4x4_modes[16];
    uint8_t  intra_16x16_mode;
    uint8_t  intra_chroma_mode;

    /* Inter motion / partitions (used iff P_* or B_*). */
    uint8_t  partition_mode;     /* P_16x16 / P_16x8 / P_8x16 / P_8x8 / etc. */
    int8_t   ref_idx_l0[4];      /* per partition; -1 = not used */
    int8_t   ref_idx_l1[4];      /* B only */
    int16_t  mv_l0[4][2];        /* qpel precision (1/4 sample); (x, y) */
    int16_t  mv_l1[4][2];

    /* Deblocking filter parameters. */
    uint8_t  deblock_disable;    /* 0 = enabled */
    int8_t   deblock_alpha_c0;
    int8_t   deblock_beta;

    /* High-profile 8x8 transform selector.
     *   0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks
     *       (16 coeffs each, raster sb_y*4+sb_x); the chroma section is
     *       always 4x4.
     *   1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs
     *       each, raster sb_y*2+sb_x).  Set per H.264's
     *       transform_8x8_size_flag.  Chroma remains 4x4 (4:2:0). */
    uint8_t  transform_8x8;

    /* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all
     * column-major within each 4x4 or 8x8 block (matches FFmpeg
     * convention).  Caller-owned; copied during append. */
    const int16_t *coeffs;       /* points at exactly 384 int16_t */

    /* Reconstructed predicted samples for this MB, planar order:
     *   [  0 .. 256) — 16×16 luma, ROW-MAJOR raster (row 0 cols 0..15,
     *                  row 1 cols 0..15, ..., row 15 cols 0..15)
     *   [256 .. 320) — 8×8 Cb, ROW-MAJOR raster
     *   [320 .. 384) — 8×8 Cr, ROW-MAJOR raster
     *
     * The caller (libavcodec's CPU intra-prediction kernels for Phase 1
     * I-frames; MC fallback for Phase 2 P-frames before GPU MC lands)
     * populates this from neighbour samples per H.264 §8.3 / §8.4.
     * `flush_frame()`'s reconstruction step is `clip255(predicted +
     * idct(coeffs))` — the IDCT shader reads dst, adds the inverse
     * transform, writes clipped — so a non-zero `predicted` here makes
     * the output pixel a valid H.264 reconstruction; zero means
     * residual-only (used by IDCT-isolation tests).
     *
     * NULL is legal and means "all-zero predicted samples" for this MB
     * (the per-frame predicted buffer is zeroed at flush time so a NULL
     * is indistinguishable from explicit zeros). */
    const uint8_t *predicted;    /* NULL or exactly 384 uint8_t */

    /* Per-MB deblock edges — caller-derived per H.264 §8.7.2.  Typical
     * count: 4 V-luma + 4 H-luma + 2 V-Cb + 2 H-Cb + 2 V-Cr + 2 H-Cr
     * = 16 edges per MB (omit zero-bS edges if preferred — frame
     * boundaries MUST be bS=0 since the kernels read p3 at four
     * samples beyond the edge).  daedalus_decoder routes each entry
     * to the appropriate luma/chroma × V/H × bS=4/<4 dispatch in
     * flush_frame and pays a single Vulkan submit per non-empty
     * (direction × bS-band) partition (≤8 deblock submits / frame
     * total) per the Q1 architecture decision (one-submit-per-kernel
     * for now; cmdbuf-builder deferred to Stage 4).
     *
     * NULL or n_edges == 0 → no deblock on this MB. */
    const struct daedalus_decoder_edge *edges;
    uint8_t  n_edges;
};

/* -------------------------------------------------------------------
 * Output frame format selector.
 * ----------------------------------------------------------------- */
typedef enum {
    DAEDALUS_DECODER_OUTPUT_NV12 = 0,   /* default; Stage 4 final */
    DAEDALUS_DECODER_OUTPUT_RGBA = 1,   /* Stage 5 opt-in */
} daedalus_decoder_output_format;

/* -------------------------------------------------------------------
 * Substrate selector.  Determines which backend daedalus-fourier
 * dispatches the per-frame compute through.
 *
 * AUTO is the only sensible choice for production — it picks per the
 * recipe table baked into daedalus-fourier (post 2026-05-23 decree:
 * QPU when a V3D shader exists, CPU NEON otherwise).  The explicit
 * options exist for testing:
 *
 *   - CPU forces the dispatch onto the NEON path even when V3D7 is
 *     available.  Lets the bit-exact ctests run on hosts without a
 *     working Vulkan/V3D stack (CI runners, dev x86 boxes via
 *     cross-build), and lets us cross-check the V3D shader output
 *     against the NEON reference path on hosts that DO have V3D.
 *   - QPU is the dual — force QPU even on a CPU-preferred kernel.
 *     Useful for benchmarking specific QPU paths in isolation.
 *
 * A non-AUTO selection on a host that can't satisfy it
 * (DAEDALUS_DECODER_SUBSTRATE_QPU on an x86 dev box) propagates a
 * dispatch failure back through flush_frame as -3.
 * ----------------------------------------------------------------- */
typedef enum {
    DAEDALUS_DECODER_SUBSTRATE_AUTO = 0,
    DAEDALUS_DECODER_SUBSTRATE_CPU  = 1,
    DAEDALUS_DECODER_SUBSTRATE_QPU  = 2,
} daedalus_decoder_substrate;

/* -------------------------------------------------------------------
 * Lifecycle
 * ----------------------------------------------------------------- */

/* Create a decoder context for the given **coded** frame dimensions.
 *
 * width, height: pixels of the H.264 coded picture, NOT the displayed
 * picture.  Both must be multiples of 16 (macroblock granularity).
 * For displayed 1080p (1920×1080), the coded frame is 1920×1088 with
 * the SPS's `frame_cropping_*` offsets cropping the bottom 8 rows.
 * The caller is responsible for translating from SPS dims + crop
 * rectangle to the values passed here; we decode the coded frame.
 *
 * Returns NULL on bad dimensions or allocation failure.  Returns a
 * usable context with daedalus_decoder_has_qpu() == 0 when Vulkan
 * init fails — callers that need GPU work should check has_qpu
 * before relying on it. */
daedalus_decoder *daedalus_decoder_create(int width, int height);

/* Free all resources.  Safe with NULL. */
void daedalus_decoder_destroy(daedalus_decoder *dec);

/* Switch output format BEFORE the first append_mb call of a frame.
 * Default is NV12.  Returns 0 on success, -1 if called mid-frame
 * (caller must flush first). */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                       daedalus_decoder_output_format fmt);

/* Override the dispatch substrate for subsequent flush_frame calls.
 * Default is AUTO.  Same mid-frame-change restriction as
 * set_output_format. */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
                                   daedalus_decoder_substrate sub);

/* -------------------------------------------------------------------
 * Per-frame submission
 * ----------------------------------------------------------------- */

/* Append one macroblock's data to the current frame's descriptor SSBO
 * + coefficient SSBO.  No GPU dispatch yet — just CPU-side writes.
 *
 * Must be called in raster order (mb_y * mb_width + mb_x) for the
 * intra-prediction wavefront to work correctly in Phase 1.
 *
 * Returns 0 on success, negative on bounds violation or OOM. */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
                               const struct daedalus_decoder_mb_input *mb);

/* End-of-frame flush: builds the per-frame VkCommandBuffer with all
 * pipeline stages, submits once, waits on a single fence, copies the
 * NV12 (or RGBA when opted in) output into the caller-provided
 * planes.
 *
 * For NV12:
 *   out_y  / y_stride:  Y plane (W*H bytes minimum, at the given stride)
 *   out_uv / uv_stride: interleaved UV plane (W*(H/2) bytes minimum)
 *
 * For RGBA: out_y receives 4*W*H bytes at y_stride; out_uv ignored.
 *
 * Returns 0 on success, negative on Vulkan failure or undecodable
 * frame.  After return, the decoder is ready for the next frame's
 * append calls. */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                 uint8_t *out_y, size_t y_stride,
                                 uint8_t *out_uv, size_t uv_stride);

/* Export the most-recently-decoded frame as a dma_buf fd.  The fd is
 * owned by the caller and must be closed when done.  Lets V4L2
 * consumers (daedalus_v4l2_daemon, libva-v4l2-request-fourier) attach
 * the GPU-decoded surface directly to a CAPTURE plane without a CPU
 * round-trip.
 *
 * NOTE(review): the meaning of `plane` is not specified in this
 * header — presumably a plane index of the output surface; confirm
 * against the implementation / DESIGN.md before relying on it.
 *
 * Returns the dmabuf fd on success, -1 on failure.  Must be called
 * AFTER flush_frame returns for the relevant frame. */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane);

/* -------------------------------------------------------------------
 * Diagnostics
 * ----------------------------------------------------------------- */

/* daedalus-decoder build version (semver string, e.g. "0.0.1+g0a1b2c3"). */
const char *daedalus_decoder_version(void);

/* Whether the underlying daedalus-fourier context picked up a working
 * V3D7 Vulkan instance.  Returns 0 if Vulkan init failed and the
 * decoder is operating in stub / failure mode. */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec);

#ifdef __cplusplus
}
#endif

#endif /* DAEDALUS_DECODER_H */