/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * daedalus-decoder — public C API.
 *
 * Frame-level GPU H.264 decoder targeting V3D7 (Raspberry Pi 5). Built
 * on daedalus-fourier's V3D compute primitives at frame granularity —
 * one Vulkan submit per frame, one fence wait per frame, encoded
 * bitstream in (via libavcodec's per-MB intercept), NV12 frame out.
 *
 * Per the 2026-05-24 Phase 1 design decisions:
 *   - libavcodec intercept is at macroblock-level (substitution-arc
 *     evolution): the caller is expected to drive the per-MB CABAC /
 *     CAVLC entropy decode and feed each macroblock's descriptor +
 *     coefficients via daedalus_decoder_append_mb(). flush_frame()
 *     builds the per-frame VkCommandBuffer and submits.
 *   - DPB is Vulkan-native VkImage with VK_KHR_external_memory_dma_buf
 *     export. The caller can obtain the output frame's dmabuf fd
 *     via daedalus_decoder_export_dmabuf().
 *   - Daemon integration shape: this library is statically linked into
 *     daedalus_v4l2_daemon. No IPC.
 *
 * STATUS: scaffold. No GPU pipeline implemented yet; all functions
 * are stubs that compile but do not decode anything. See DESIGN.md
 * for the architecture.
 *
 * ABI: pre-0.1 — every signature here may change. Don't rely on
 * stability yet.
 */
#ifndef DAEDALUS_DECODER_H
#define DAEDALUS_DECODER_H

#include <stddef.h>   /* size_t (stride parameters of flush_frame) */
#include <stdint.h>   /* fixed-width integer types used by the structs */

/* Give the declarations C linkage when included from C++. */
#ifdef __cplusplus
extern "C" {
#endif

/* -------------------------------------------------------------------
 * Opaque decoder context. One per concurrent stream.
 *
 * The struct is only forward-declared in this header, so callers can
 * hold and pass pointers to it but cannot inspect or allocate it
 * themselves. Instances come from daedalus_decoder_create() and are
 * released with daedalus_decoder_destroy().
 * ----------------------------------------------------------------- */
typedef struct daedalus_decoder daedalus_decoder;

/* -------------------------------------------------------------------
 * Per-edge deblock metadata. One entry per filter-edge; the caller
 * derives these from H.264 §8.7.2.1 boundary-strength rules.
 *
 * Coordinate convention:
 *   mb_x / mb_y — the MB whose top-left this edge sits on (the "right"
 *                 side for vertical edges, "bottom" side for horizontal
 *                 edges, in H.264 spec's q-side convention).
 *   edge_idx    — 0..3 within the MB:
 *                   luma:   edge 0 = MB boundary, edges 1..3 = internal
 *                           at cols/rows 4, 8, 12.
 *                   chroma: edge 0 = MB boundary, edge 1 = internal at
 *                           col/row 4. edge_idx > 1 invalid for chroma.
 *                 Edges at frame boundaries (top row of MBs for H edges;
 *                 left column for V edges) MUST be bS=0 — the kernel
 *                 reads p3 at four samples beyond the edge.
 *   orient      — 0 = vertical edge (filtered horizontally across),
 *                 1 = horizontal.
 *   plane       — 0 = luma, 1 = chroma Cb, 2 = chroma Cr. Cb and Cr
 *                 always share the same filter parameters per H.264
 *                 spec, but are listed separately so the caller can
 *                 omit one or the other if needed.
 *   bS          — 0 = skip this edge (no GPU work), 1..3 = bS<4 path
 *                 (uses tc0), 4 = bS=4 "intra" path (ignores tc0).
 *   alpha, beta — H.264 §8.7.2.2 table 8-16/8-17 values, both 0..255.
 *   tc0[4]      — per-4-cell segment strength along the edge (luma has
 *                 4 segments; chroma has 4 also, with 2 cells each).
 *                 IGNORED when bS == 4.
 * ----------------------------------------------------------------- */
struct daedalus_decoder_edge {
    uint16_t mb_x;      /* MB column of the q-side macroblock */
    uint16_t mb_y;      /* MB row of the q-side macroblock */
    uint8_t  edge_idx;  /* 0..3 luma, 0..1 chroma */
    uint8_t  orient;    /* 0 = vertical edge, 1 = horizontal edge */
    uint8_t  plane;     /* 0 = luma, 1 = Cb, 2 = Cr */
    uint8_t  bS;        /* 0 = skip, 1..3 = tc0 path, 4 = intra path */
    uint8_t  alpha;
    uint8_t  beta;
    int8_t   tc0[4];    /* ignored when bS == 4 */
};

/* -------------------------------------------------------------------
 * Per-macroblock input. Mirrors §3 of DESIGN.md. The caller's
 * libavcodec intercept populates this from the H264SliceContext
 * fields after ff_h264_decode_mb_cabac/cavlc returns and before
 * ff_h264_hl_decode_mb is supposed to run (we replace the latter).
 * ----------------------------------------------------------------- */
struct daedalus_decoder_mb_input {
    /* Frame coordinates (macroblock units). */
    uint16_t mb_x;
    uint16_t mb_y;

    /* Type + quantisation. */
    uint8_t  mb_type;   /* H.264 spec table 7-13/7-14/7-17/7-18 enum */
    uint8_t  mb_qp_y;
    uint8_t  mb_qp_uv;
    uint8_t  cbp;       /* coded block pattern, 0..47 */

    /* Intra prediction (used iff mb_type == I_NxN or I_16x16). */
    uint8_t  intra_4x4_modes[16];
    uint8_t  intra_16x16_mode;
    uint8_t  intra_chroma_mode;

    /* Inter motion / partitions (used iff P_* or B_*). */
    uint8_t  partition_mode;  /* P_16x16 / P_16x8 / P_8x16 / P_8x8 / etc. */
    int8_t   ref_idx_l0[4];   /* per partition; -1 = not used */
    int8_t   ref_idx_l1[4];   /* B only */
    int16_t  mv_l0[4][2];     /* qpel precision (1/4 sample); (x, y) */
    int16_t  mv_l1[4][2];

    /* Deblocking filter parameters. */
    uint8_t  deblock_disable; /* 0 = enabled */
    int8_t   deblock_alpha_c0;
    int8_t   deblock_beta;

    /* High-profile 8x8 transform selector.
     *   0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks
     *       (16 coeffs each, raster sb_y*4+sb_x); the chroma section is
     *       always 4x4.
     *   1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs
     *       each, raster sb_y*2+sb_x). Set per H.264's
     *       transform_size_8x8_flag. Chroma remains 4x4 (4:2:0).
     */
    uint8_t  transform_8x8;

    /* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all
     * column-major within each 4x4 or 8x8 block (matches FFmpeg
     * convention). Caller-owned; copied during append. */
    const int16_t *coeffs;    /* points at exactly 384 int16_t */

    /* Reconstructed predicted samples for this MB, planar order:
     *   [  0 .. 256) — 16×16 luma, ROW-MAJOR raster (row 0 cols 0..15,
     *                  row 1 cols 0..15, ..., row 15 cols 0..15)
     *   [256 .. 320) — 8×8 Cb, ROW-MAJOR raster
     *   [320 .. 384) — 8×8 Cr, ROW-MAJOR raster
     *
     * The caller (libavcodec's CPU intra-prediction kernels for Phase 1
     * I-frames; MC fallback for Phase 2 P-frames before GPU MC lands)
     * populates this from neighbour samples per H.264 §8.3 / §8.4.
     * `flush_frame()`'s reconstruction step is `clip255(predicted +
     * idct(coeffs))` — the IDCT shader reads dst, adds the inverse
     * transform, writes clipped — so a non-zero `predicted` here makes
     * the output pixel a valid H.264 reconstruction; zero means
     * residual-only (used by IDCT-isolation tests).
     *
     * NULL is legal and means "all-zero predicted samples" for this MB
     * (the per-frame predicted buffer is zeroed at flush time so a NULL
     * is indistinguishable from explicit zeros). */
    const uint8_t *predicted; /* NULL or exactly 384 uint8_t */

    /* Per-MB deblock edges — caller-derived per H.264 §8.7.2. Typical
     * count: 4 V-luma + 4 H-luma + 2 V-Cb + 2 H-Cb + 2 V-Cr + 2 H-Cr
     * = 16 edges per MB (omit zero-bS edges if preferred — frame
     * boundaries MUST be bS=0 since the kernels read p3 at four
     * samples beyond the edge). daedalus_decoder routes each entry
     * to the appropriate luma/chroma × V/H × bS=4/<4 dispatch in
     * flush_frame and pays a single Vulkan submit per non-empty
     * (direction × bS-band) partition (≤8 deblock submits / frame
     * total) per the Q1 architecture decision (one-submit-per-kernel
     * for now; cmdbuf-builder deferred to Stage 4).
     *
     * NULL or n_edges == 0 → no deblock on this MB. */
    const struct daedalus_decoder_edge *edges;
    uint8_t  n_edges;         /* entries in edges[]; 8-bit, so at most 255 */
};

/* -------------------------------------------------------------------
 * Output frame format selector. Passed to
 * daedalus_decoder_set_output_format(); NV12 is the default.
 * ----------------------------------------------------------------- */
typedef enum {
    DAEDALUS_DECODER_OUTPUT_NV12 = 0,  /* default; Stage 4 final */
    DAEDALUS_DECODER_OUTPUT_RGBA = 1,  /* Stage 5 opt-in */
} daedalus_decoder_output_format;

/* -------------------------------------------------------------------
 * Substrate selector. Determines which backend daedalus-fourier
 * dispatches the per-frame compute through.
 *
 * AUTO is the only sensible choice for production — it picks per the
 * recipe table baked into daedalus-fourier (post 2026-05-23 decree:
 * QPU when a V3D shader exists, CPU NEON otherwise). The explicit
 * options exist for testing:
 *
 *   - CPU forces the dispatch onto the NEON path even when V3D7 is
 *     available. Lets the bit-exact ctests run on hosts without a
 *     working Vulkan/V3D stack (CI runners, dev x86 boxes via
 *     cross-build), and lets us cross-check the V3D shader output
 *     against the NEON reference path on hosts that DO have V3D.
 *   - QPU is the dual — force QPU even on a CPU-preferred kernel.
 *     Useful for benchmarking specific QPU paths in isolation.
 *
 * A non-AUTO selection on a host that can't satisfy it
 * (DAEDALUS_DECODER_SUBSTRATE_QPU on an x86 dev box) propagates a
 * dispatch failure back through flush_frame as -3.
 * ----------------------------------------------------------------- */
typedef enum {
    DAEDALUS_DECODER_SUBSTRATE_AUTO = 0,  /* default; recipe-table choice */
    DAEDALUS_DECODER_SUBSTRATE_CPU  = 1,  /* force the NEON path */
    DAEDALUS_DECODER_SUBSTRATE_QPU  = 2,  /* force the V3D path */
} daedalus_decoder_substrate;

/* -------------------------------------------------------------------
 * Lifecycle
 * ----------------------------------------------------------------- */

/* Create a decoder context for the given **coded** frame dimensions.
|
||
*
|
||
* width, height: pixels of the H.264 coded picture, NOT the displayed
|
||
* picture. Both must be multiples of 16 (macroblock granularity).
|
||
* For displayed 1080p (1920×1080), the coded frame is 1920×1088 with
|
||
* the SPS's `frame_cropping_*` offsets cropping the bottom 8 rows.
|
||
* The caller is responsible for translating from SPS dims + crop
|
||
* rectangle to the values passed here; we decode the coded frame.
|
||
*
|
||
* Returns NULL on bad dimensions or allocation failure. Returns a
|
||
* usable context with daedalus_decoder_has_qpu() == 0 when Vulkan
|
||
* init fails — callers that need GPU work should check has_qpu
|
||
* before relying on it.
|
||
*/
|
||
daedalus_decoder *daedalus_decoder_create(int width, int height);
|
||
|
||
/* Free all resources. Safe with NULL. */
|
||
void daedalus_decoder_destroy(daedalus_decoder *dec);
|
||
|
||
/* Switch output format BEFORE the first append_mb call of a frame.
|
||
* Default is NV12. Returns 0 on success, -1 if called mid-frame
|
||
* (caller must flush first). */
|
||
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
|
||
daedalus_decoder_output_format fmt);
|
||
|
||
/* Override the dispatch substrate for subsequent flush_frame calls.
|
||
* Default is AUTO. Same mid-frame-change restriction as
|
||
* set_output_format. */
|
||
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
|
||
daedalus_decoder_substrate sub);
|
||
|
||
/* -------------------------------------------------------------------
 * Per-frame submission
 * ----------------------------------------------------------------- */

/* Append one macroblock's data to the current frame's descriptor SSBO
|
||
* + coefficient SSBO. No GPU dispatch yet — just CPU-side writes.
|
||
*
|
||
* Must be called in raster order (mb_y * mb_width + mb_x) for the
|
||
* intra-prediction wavefront to work correctly in Phase 1.
|
||
*
|
||
* Returns 0 on success, negative on bounds violation or OOM.
|
||
*/
|
||
int daedalus_decoder_append_mb(daedalus_decoder *dec,
|
||
const struct daedalus_decoder_mb_input *mb);
|
||
|
||
/* End-of-frame flush: builds the per-frame VkCommandBuffer with all
|
||
* pipeline stages, submits once, waits on a single fence, copies the
|
||
* NV12 (or RGBA when opted in) output into the caller-provided
|
||
* planes.
|
||
*
|
||
* For NV12:
|
||
* out_y / y_stride: Y plane (W*H bytes minimum, at the given stride)
|
||
* out_uv / uv_stride: interleaved UV plane (W*(H/2) bytes minimum)
|
||
*
|
||
* For RGBA: out_y receives 4*W*H bytes at y_stride; out_uv ignored.
|
||
*
|
||
* Returns 0 on success, negative on Vulkan failure or undecodable
|
||
* frame. After return, the decoder is ready for the next frame's
|
||
* append calls.
|
||
*/
|
||
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
||
uint8_t *out_y, size_t y_stride,
|
||
uint8_t *out_uv, size_t uv_stride);
|
||
|
||
/* Export the most-recently-decoded frame as a dma_buf fd. The fd is
|
||
* owned by the caller and must be closed when done. Lets V4L2
|
||
* consumers (daedalus_v4l2_daemon, libva-v4l2-request-fourier) attach
|
||
* the GPU-decoded surface directly to a CAPTURE plane without a CPU
|
||
* round-trip.
|
||
*
|
||
* Returns the dmabuf fd on success, -1 on failure. Must be called
|
||
* AFTER flush_frame returns for the relevant frame.
|
||
*/
|
||
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane);
|
||
|
||
/* -------------------------------------------------------------------
 * Diagnostics
 * ----------------------------------------------------------------- */

/* daedalus-decoder build version (semver string, e.g. "0.0.1+g0a1b2c3").
 * Takes no decoder context, so it can be called before any context is
 * created. */
const char *daedalus_decoder_version(void);

/* Whether the underlying daedalus-fourier context picked up a working
|
||
* V3D7 Vulkan instance. Returns 0 if Vulkan init failed and the
|
||
* decoder is operating in stub / failure mode. */
|
||
int daedalus_decoder_has_qpu(const daedalus_decoder *dec);
|
||
|
||
#ifdef __cplusplus
}  /* extern "C" */
#endif

#endif /* DAEDALUS_DECODER_H */