Files
daedalus-decoder/include/daedalus_decoder.h
2026-05-25 23:14:24 +02:00

301 lines
13 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* daedalus-decoder — public C API.
*
* Frame-level GPU H.264 decoder targeting V3D7 (Raspberry Pi 5). Built
* on daedalus-fourier's V3D compute primitives at frame granularity —
* one Vulkan submit per frame, one fence wait per frame, encoded
* bitstream in (via libavcodec's per-MB intercept), NV12 frame out.
*
* Per the 2026-05-24 Phase 1 design decisions:
* - libavcodec intercept is at macroblock-level (substitution-arc
* evolution): the caller is expected to drive the per-MB CABAC /
* CAVLC entropy decode and feed each macroblock's descriptor +
* coefficients via daedalus_decoder_append_mb(). flush_frame()
* builds the per-frame VkCommandBuffer and submits.
* - DPB is Vulkan-native VkImage with VK_EXT_external_memory_dma_buf
* export. The caller can obtain the output frame's dmabuf fd
* via daedalus_decoder_export_dmabuf().
* - Daemon integration shape: this library is statically linked into
* daedalus_v4l2_daemon. No IPC.
*
* STATUS: scaffold. No GPU pipeline implemented yet; all functions
* are stubs that compile but do not decode anything. See DESIGN.md
* for the architecture.
*
* ABI: pre-0.1 — every signature here may change. Don't rely on
* stability yet.
*/
#ifndef DAEDALUS_DECODER_H
#define DAEDALUS_DECODER_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/* -------------------------------------------------------------------
 * Opaque decoder context. One per concurrent stream.
 *
 * Only the struct tag is declared here; this header never defines the
 * members, so callers hold the context by pointer and hand it to the
 * daedalus_decoder_* functions declared below.
 * ----------------------------------------------------------------- */
typedef struct daedalus_decoder daedalus_decoder;
/* -------------------------------------------------------------------
* Per-edge deblock metadata. One entry per filter-edge; the caller
* derives these from H.264 §8.7.2.1 boundary-strength rules.
*
* Coordinate convention:
* mb_x / mb_y — the MB whose top-left this edge sits on (the "right"
* side for vertical edges, "bottom" side for horizontal
* edges, in H.264 spec's q-side convention).
* edge_idx — 0..3 within the MB:
* luma: edge 0 = MB boundary, edges 1..3 = internal
* at cols/rows 4, 8, 12.
* chroma: edge 0 = MB boundary, edge 1 = internal at
* col/row 4. edge_idx > 1 invalid for chroma.
* Edges at frame boundaries (top row of MBs for H edges;
* left column for V edges) MUST be bS=0 — the kernel
* reads p3 at four samples beyond the edge.
* orient — 0 = vertical edge (filtered horizontally across), 1 = horizontal.
* plane — 0 = luma, 1 = chroma Cb, 2 = chroma Cr. Cb and Cr
* always share the same filter parameters per H.264
* spec, but are listed separately so the caller can
* omit one or the other if needed.
* bS — 0 = skip this edge (no GPU work), 1..3 = bS<4 path
* (uses tc0), 4 = bS=4 "intra" path (ignores tc0).
* alpha, beta — H.264 §8.7.2.2 table 8-16/8-17 values, both 0..255.
* tc0[4] — per-4-cell segment strength along the edge (luma has
* 4 segments; chroma has 4 also, with 2 cells each).
* IGNORED when bS == 4.
* ----------------------------------------------------------------- */
struct daedalus_decoder_edge {
    /* Macroblock (in MB units) whose top-left this edge sits on, i.e. the
     * q-side MB: right of a vertical edge, below a horizontal one. */
    uint16_t mb_x, mb_y;

    uint8_t edge_idx;     /* 0 = MB boundary, 1..3 = internal; chroma allows 0..1 only */
    uint8_t orient;       /* 0 = vertical edge, 1 = horizontal edge */
    uint8_t plane;        /* 0 = luma, 1 = chroma Cb, 2 = chroma Cr */
    uint8_t bS;           /* 0 = skip edge, 1..3 = bS<4 path (uses tc0), 4 = intra path */
    uint8_t alpha, beta;  /* filter thresholds, each 0..255 */

    /* Per-segment strength along the edge (4 segments); ignored when bS == 4. */
    int8_t tc0[4];
};
/* -------------------------------------------------------------------
* Per-macroblock input. Mirrors §3 of DESIGN.md. The caller's
* libavcodec intercept populates this from the H264SliceContext
* fields after ff_h264_decode_mb_cabac/cavlc returns and before
* ff_h264_hl_decode_mb is supposed to run (we replace the latter).
* ----------------------------------------------------------------- */
struct daedalus_decoder_mb_input {
    /* ---- Position within the frame, in macroblock units. ---- */
    uint16_t mb_x, mb_y;

    /* ---- Macroblock type and quantisation. ---- */
    /* Enum per H.264 spec tables 7-13/7-14/7-17/7-18.
     * NOTE(review): the I-slice mb_type table (7-11) is not listed here —
     * confirm the intended encoding for intra macroblocks. */
    uint8_t mb_type;
    uint8_t mb_qp_y, mb_qp_uv;
    uint8_t cbp;                  /* coded block pattern, range 0..47 */

    /* ---- Intra prediction; used iff mb_type == I_NxN or I_16x16. ---- */
    uint8_t intra_4x4_modes[16];
    uint8_t intra_16x16_mode, intra_chroma_mode;

    /* ---- Inter motion / partitions; used iff mb_type is P_* or B_*. ---- */
    uint8_t partition_mode;       /* P_16x16 / P_16x8 / P_8x16 / P_8x8 / etc. */
    int8_t  ref_idx_l0[4];        /* one per partition; -1 = not used */
    int8_t  ref_idx_l1[4];        /* B only */
    int16_t mv_l0[4][2];          /* (x, y) pairs at qpel (1/4 sample) precision */
    int16_t mv_l1[4][2];

    /* ---- Deblocking filter parameters. ---- */
    uint8_t deblock_disable;      /* 0 = enabled */
    int8_t  deblock_alpha_c0, deblock_beta;

    /* ---- High-profile 8x8 transform selector. ----
     * Set per H.264's transform_8x8_size_flag; it decides how the
     * 256-int16 luma section of coeffs[] is laid out:
     *   0 -> 16 blocks of 4x4 (16 coeffs each, raster sb_y*4+sb_x)
     *   1 ->  4 blocks of 8x8 (64 coeffs each, raster sb_y*2+sb_x)
     * The chroma section is always 4x4 (4:2:0) in either mode. */
    uint8_t transform_8x8;

    /* ---- Transform coefficients. ----
     * Exactly 384 int16_t: 256 luma, then 64 Cb, then 64 Cr. Within each
     * 4x4 or 8x8 block the coefficients are column-major (matches FFmpeg
     * convention). The buffer stays caller-owned; it is copied during
     * append. */
    const int16_t *coeffs;

    /* ---- Predicted samples for this MB. ----
     * Either NULL or exactly 384 uint8_t, planar, every plane ROW-MAJOR
     * raster:
     *   [  0 .. 256)  16×16 luma (row 0 cols 0..15, row 1 cols 0..15,
     *                 ..., row 15 cols 0..15)
     *   [256 .. 320)  8×8 Cb
     *   [320 .. 384)  8×8 Cr
     *
     * The caller fills this from neighbour samples per H.264 §8.3 / §8.4:
     * libavcodec's CPU intra-prediction kernels for Phase 1 I-frames, and
     * the MC fallback for Phase 2 P-frames until GPU MC lands.
     *
     * `flush_frame()` reconstructs each pixel as
     * `clip255(predicted + idct(coeffs))` — the IDCT shader reads dst,
     * adds the inverse transform, and writes the clipped result. Non-zero
     * `predicted` therefore yields a valid H.264 reconstruction, while
     * zero yields residual-only output (used by IDCT-isolation tests).
     *
     * NULL is legal and means "all-zero predicted samples" for this MB:
     * the per-frame predicted buffer is zeroed at flush time, so NULL is
     * indistinguishable from explicit zeros. */
    const uint8_t *predicted;

    /* ---- Per-MB deblock edges, caller-derived per H.264 §8.7.2. ----
     * Typical count is 16 per MB:
     *   4 V-luma + 4 H-luma + 2 V-Cb + 2 H-Cb + 2 V-Cr + 2 H-Cr.
     * Zero-bS edges may be omitted if preferred. Edges on frame
     * boundaries MUST be bS=0, because the kernels read p3 at four
     * samples beyond the edge.
     *
     * In flush_frame, daedalus_decoder routes each entry to the matching
     * luma/chroma × V/H × bS=4/<4 dispatch and pays a single Vulkan
     * submit per non-empty (direction × bS-band) partition — ≤8 deblock
     * submits / frame total. This follows the Q1 architecture decision:
     * one-submit-per-kernel for now, with the cmdbuf-builder deferred to
     * Stage 4.
     *
     * edges == NULL or n_edges == 0 → no deblock on this MB. */
    const struct daedalus_decoder_edge *edges;
    uint8_t n_edges;
};
/* -------------------------------------------------------------------
* Output frame format selector.
* ----------------------------------------------------------------- */
typedef enum {
    /* Default output format; the Stage 4 final. */
    DAEDALUS_DECODER_OUTPUT_NV12 = 0,
    /* Opt-in output format, Stage 5. */
    DAEDALUS_DECODER_OUTPUT_RGBA = 1
} daedalus_decoder_output_format;
/* -------------------------------------------------------------------
* Substrate selector. Determines which backend daedalus-fourier
* dispatches the per-frame compute through.
*
* AUTO is the only sensible choice for production — it picks per the
* recipe table baked into daedalus-fourier (post 2026-05-23 decree:
* QPU when a V3D shader exists, CPU NEON otherwise). The explicit
* options exist for testing:
*
* - CPU forces the dispatch onto the NEON path even when V3D7 is
* available. Lets the bit-exact ctests run on hosts without a
* working Vulkan/V3D stack (CI runners, dev x86 boxes via
* cross-build), and lets us cross-check the V3D shader output
* against the NEON reference path on hosts that DO have V3D.
* - QPU is the dual — force QPU even on a CPU-preferred kernel.
* Useful for benchmarking specific QPU paths in isolation.
*
* A non-AUTO selection on a host that can't satisfy it
* (DAEDALUS_DECODER_SUBSTRATE_QPU on an x86 dev box) propagates a
* dispatch failure back through flush_frame as -3.
* ----------------------------------------------------------------- */
typedef enum {
    /* Pick per daedalus-fourier's recipe table: QPU when a V3D shader
     * exists, CPU NEON otherwise. The production choice. */
    DAEDALUS_DECODER_SUBSTRATE_AUTO = 0,
    /* Testing: force the NEON path even when V3D7 is available. */
    DAEDALUS_DECODER_SUBSTRATE_CPU  = 1,
    /* Testing: force the QPU even on a CPU-preferred kernel. */
    DAEDALUS_DECODER_SUBSTRATE_QPU  = 2
} daedalus_decoder_substrate;
/* -------------------------------------------------------------------
* Lifecycle
* ----------------------------------------------------------------- */
/* Create a decoder context for the given **coded** frame dimensions.
 *
 * width, height: size in pixels of the H.264 coded picture, NOT the
 * displayed picture. Both must be multiples of 16 (macroblock
 * granularity). For displayed 1080p (1920×1080), the coded frame is
 * 1920×1088, and the SPS frame-cropping offsets (`frame_crop_*_offset`)
 * crop the bottom 8 rows. The caller is responsible for translating
 * from SPS dims + crop rectangle to the values passed here; we decode
 * the coded frame.
 *
 * Returns NULL on bad dimensions or allocation failure. If Vulkan init
 * fails the call still returns a usable context, but
 * daedalus_decoder_has_qpu() reports 0 for it — callers that need GPU
 * work should check has_qpu before relying on it.
 *
 * Release the returned context with daedalus_decoder_destroy().
 */
daedalus_decoder *daedalus_decoder_create(int width, int height);
/* Free all resources held by `dec`; the counterpart to
 * daedalus_decoder_create(). Safe with NULL. */
void daedalus_decoder_destroy(daedalus_decoder *dec);
/* Switch the output format. Default is NV12.
 *
 * Must be called BEFORE the first append_mb call of a frame; once
 * macroblocks have been appended the caller must flush first.
 *
 * dec: decoder context.
 * fmt: one of the daedalus_decoder_output_format values.
 *
 * Returns 0 on success, -1 if called mid-frame.
 */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
daedalus_decoder_output_format fmt);
/* Override the dispatch substrate for subsequent flush_frame calls.
 * Default is AUTO.
 *
 * Same mid-frame-change restriction as set_output_format: call it
 * before the first append_mb of a frame, or flush first.
 *
 * dec: decoder context.
 * sub: one of the daedalus_decoder_substrate values.
 *
 * NOTE(review): the return contract is not spelled out here; presumably
 * it mirrors set_output_format (0 on success, -1 when called
 * mid-frame) — confirm against the implementation.
 */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
daedalus_decoder_substrate sub);
/* -------------------------------------------------------------------
* Per-frame submission
* ----------------------------------------------------------------- */
/* Append one macroblock's data to the current frame's descriptor SSBO
 * + coefficient SSBO. No GPU dispatch yet — just CPU-side writes.
 *
 * dec: decoder context.
 * mb:  the macroblock to append. See struct daedalus_decoder_mb_input
 *      for the per-field contract, including which pointers may be
 *      NULL and that coeffs is copied during the append.
 *
 * Must be called in raster order (mb_y * mb_width + mb_x) for the
 * intra-prediction wavefront to work correctly in Phase 1.
 *
 * Returns 0 on success, negative on bounds violation or OOM.
 */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
const struct daedalus_decoder_mb_input *mb);
/* End-of-frame flush: builds the per-frame VkCommandBuffer with all
 * pipeline stages, submits once, waits on a single fence, copies the
 * NV12 (or RGBA when opted in) output into the caller-provided
 * planes.
 *
 * NOTE(review): the `edges` comment in struct daedalus_decoder_mb_input
 * describes up to 8 separate deblock submits per frame "for now",
 * which does not match the single-submit wording above — confirm which
 * one reflects the current implementation.
 *
 * dec: decoder context holding the macroblocks appended for this frame.
 *
 * For NV12:
 *   out_y  / y_stride:  Y plane (W*H bytes minimum, at the given stride)
 *   out_uv / uv_stride: interleaved UV plane (W*(H/2) bytes minimum)
 *
 * For RGBA: out_y receives 4*W*H bytes at y_stride; out_uv ignored.
 *
 * W and H are presumably the coded width/height given to
 * daedalus_decoder_create(), and the strides presumably byte counts —
 * TODO confirm.
 *
 * Returns 0 on success, negative on Vulkan failure or undecodable
 * frame. A forced (non-AUTO) substrate that the host cannot satisfy
 * is reported here as -3. After return, the decoder is ready for the
 * next frame's append calls.
 */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
uint8_t *out_y, size_t y_stride,
uint8_t *out_uv, size_t uv_stride);
/* Export the most-recently-decoded frame as a dma_buf fd. The fd is
 * owned by the caller and must be closed when done. Lets V4L2
 * consumers (daedalus_v4l2_daemon, libva-v4l2-request-fourier) attach
 * the GPU-decoded surface directly to a CAPTURE plane without a CPU
 * round-trip.
 *
 * dec:   decoder context.
 * plane: selects which plane of the frame to export.
 *        NOTE(review): the valid values are not documented in this
 *        header; presumably 0 = Y and 1 = UV for NV12 — confirm.
 *
 * Returns the dmabuf fd on success, -1 on failure. Must be called
 * AFTER flush_frame returns for the relevant frame.
 */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane);
/* -------------------------------------------------------------------
* Diagnostics
* ----------------------------------------------------------------- */
/* daedalus-decoder build version (semver string, e.g. "0.0.1+g0a1b2c3"). */
const char *daedalus_decoder_version(void);
/* Whether the underlying daedalus-fourier context picked up a working
 * V3D7 Vulkan instance.
 *
 * dec: decoder context to query.
 *
 * Returns 0 if Vulkan init failed and the decoder is operating in
 * stub / failure mode; presumably non-zero otherwise (the exact value
 * is not specified in this header). */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec);
#ifdef __cplusplus
}
#endif
#endif /* DAEDALUS_DECODER_H */