/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * daedalus-decoder — public C API.
 *
 * Frame-level GPU H.264 decoder targeting V3D7 (Raspberry Pi 5). Built
 * on daedalus-fourier's V3D compute primitives at frame granularity —
 * one Vulkan submit per frame, one fence wait per frame, encoded
 * bitstream in (via libavcodec's per-MB intercept), NV12 frame out.
 *
 * Per the 2026-05-24 Phase 1 design decisions:
 *   - libavcodec intercept is at macroblock-level (substitution-arc
 *     evolution): the caller is expected to drive the per-MB CABAC /
 *     CAVLC entropy decode and feed each macroblock's descriptor +
 *     coefficients via daedalus_decoder_append_mb(). flush_frame()
 *     builds the per-frame VkCommandBuffer and submits.
 *   - DPB is Vulkan-native VkImage with VK_EXT_external_memory_dma_buf
 *     export. The caller can obtain the output frame's dmabuf fd
 *     via daedalus_decoder_export_dmabuf().
 *   - Daemon integration shape: this library is statically linked into
 *     daedalus_v4l2_daemon. No IPC.
 *
 * STATUS: scaffold. No GPU pipeline implemented yet; all functions
 * are stubs that compile but do not decode anything. See DESIGN.md
 * for the architecture.
 *
 * ABI: pre-0.1 — every signature here may change. Don't rely on
 * stability yet.
 */

#ifndef DAEDALUS_DECODER_H
#define DAEDALUS_DECODER_H

#include <stddef.h> /* size_t (plane strides in flush_frame) */
#include <stdint.h> /* fixed-width integer types used throughout */

#ifdef __cplusplus
extern "C" {
#endif

/* -------------------------------------------------------------------
 * Opaque decoder context. One per concurrent stream.
 * ----------------------------------------------------------------- */

typedef struct daedalus_decoder daedalus_decoder;

/* -------------------------------------------------------------------
 * Per-macroblock input. Mirrors §3 of DESIGN.md. The caller's
 * libavcodec intercept populates this from the H264SliceContext
 * fields after ff_h264_decode_mb_cabac/cavlc returns and before
 * ff_h264_hl_decode_mb is supposed to run (we replace the latter).
 * ----------------------------------------------------------------- */

struct daedalus_decoder_mb_input {
    /* Frame coordinates (macroblock units). */
    uint16_t mb_x;
    uint16_t mb_y;

    /* Type + quantisation. */
    uint8_t mb_type;    /* H.264 spec table 7-13/7-14/7-17/7-18 enum */
    uint8_t mb_qp_y;
    uint8_t mb_qp_uv;
    uint8_t cbp;        /* coded block pattern, 0..47 */

    /* Intra prediction (used iff mb_type == I_NxN or I_16x16). */
    uint8_t intra_4x4_modes[16];
    uint8_t intra_16x16_mode;
    uint8_t intra_chroma_mode;

    /* Inter motion / partitions (used iff P_* or B_*). */
    uint8_t partition_mode; /* P_16x16 / P_16x8 / P_8x16 / P_8x8 / etc. */
    int8_t  ref_idx_l0[4];  /* per partition; -1 = not used */
    int8_t  ref_idx_l1[4];  /* B only */
    int16_t mv_l0[4][2];    /* qpel precision (1/4 sample); (x, y) */
    int16_t mv_l1[4][2];

    /* Deblocking filter parameters. */
    uint8_t deblock_disable; /* 0 = enabled */
    int8_t  deblock_alpha_c0;
    int8_t  deblock_beta;

    /* High-profile 8x8 transform selector.
     *   0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks
     *       (16 coeffs each, raster sb_y*4+sb_x); the chroma section is
     *       always 4x4.
     *   1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs
     *       each, raster sb_y*2+sb_x). Set per H.264's
     *       transform_size_8x8_flag. Chroma remains 4x4 (4:2:0). */
    uint8_t transform_8x8;

    /* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all
     * column-major within each 4x4 or 8x8 block (matches FFmpeg
     * convention). Caller-owned; copied during append. */
    const int16_t *coeffs; /* points at exactly 384 int16_t */
};

/* -------------------------------------------------------------------
 * Output frame format selector.
 * ----------------------------------------------------------------- */

typedef enum {
    DAEDALUS_DECODER_OUTPUT_NV12 = 0, /* default; Stage 4 final */
    DAEDALUS_DECODER_OUTPUT_RGBA = 1, /* Stage 5 opt-in */
} daedalus_decoder_output_format;

/* -------------------------------------------------------------------
 * Lifecycle
 * ----------------------------------------------------------------- */

/* Create a decoder context for the given **coded** frame dimensions.
 *
 * width, height: pixels of the H.264 coded picture, NOT the displayed
 * picture. Both must be multiples of 16 (macroblock granularity).
 * For displayed 1080p (1920×1080), the coded frame is 1920×1088 with
 * the SPS's `frame_crop_*_offset` values cropping the bottom 8 rows.
 * The caller is responsible for translating from SPS dims + crop
 * rectangle to the values passed here; we decode the coded frame.
 *
 * Returns NULL on bad dimensions or allocation failure. Returns a
 * usable context with daedalus_decoder_has_qpu() == 0 when Vulkan
 * init fails — callers that need GPU work should check has_qpu
 * before relying on it. */
daedalus_decoder *daedalus_decoder_create(int width, int height);

/* Free all resources. Safe with NULL. */
void daedalus_decoder_destroy(daedalus_decoder *dec);

/* Switch output format BEFORE the first append_mb call of a frame.
 * Default is NV12. Returns 0 on success, -1 if called mid-frame
 * (caller must flush first). */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                       daedalus_decoder_output_format fmt);

/* -------------------------------------------------------------------
 * Per-frame submission
 * ----------------------------------------------------------------- */

/* Append one macroblock's data to the current frame's descriptor SSBO
 * + coefficient SSBO. No GPU dispatch yet — just CPU-side writes.
 *
 * Must be called in raster order (mb_y * mb_width + mb_x) for the
 * intra-prediction wavefront to work correctly in Phase 1.
 *
 * Returns 0 on success, negative on bounds violation or OOM. */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
                               const struct daedalus_decoder_mb_input *mb);

/* End-of-frame flush: builds the per-frame VkCommandBuffer with all
 * pipeline stages, submits once, waits on a single fence, copies the
 * NV12 (or RGBA when opted in) output into the caller-provided
 * planes.
 *
 * For NV12:
 *   out_y  / y_stride:  Y plane (W*H bytes minimum, at the given stride)
 *   out_uv / uv_stride: interleaved UV plane (W*(H/2) bytes minimum)
 *
 * For RGBA: out_y receives 4*W*H bytes at y_stride; out_uv ignored.
 *
 * Returns 0 on success, negative on Vulkan failure or undecodable
 * frame. After return, the decoder is ready for the next frame's
 * append calls. */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                 uint8_t *out_y, size_t y_stride,
                                 uint8_t *out_uv, size_t uv_stride);

/* Export the most-recently-decoded frame as a dma_buf fd. The fd is
 * owned by the caller and must be closed when done. Lets V4L2
 * consumers (daedalus_v4l2_daemon, libva-v4l2-request-fourier) attach
 * the GPU-decoded surface directly to a CAPTURE plane without a CPU
 * round-trip.
 *
 * NOTE(review): the meaning of `plane` is not specified in this
 * header — presumably a plane index within the selected output format
 * (e.g. 0 = Y, 1 = UV for NV12); confirm against DESIGN.md and the
 * implementation.
 *
 * Returns the dmabuf fd on success, -1 on failure. Must be called
 * AFTER flush_frame returns for the relevant frame. */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane);

/* -------------------------------------------------------------------
 * Diagnostics
 * ----------------------------------------------------------------- */

/* daedalus-decoder build version (semver string, e.g. "0.0.1+g0a1b2c3"). */
const char *daedalus_decoder_version(void);

/* Whether the underlying daedalus-fourier context picked up a working
 * V3D7 Vulkan instance. Returns 0 if Vulkan init failed and the
 * decoder is operating in stub / failure mode. */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec);

#ifdef __cplusplus
}
#endif

#endif /* DAEDALUS_DECODER_H */