Files
daedalus-decoder/src/daedalus_decoder.c
T
claude-noether b707daf69f Stage 2 PR-b: deblock dispatch in flush_frame — luma + chroma, up to 8 submits
Second Stage 2 deliverable on the daedalus-decoder path (memory: dejavu
/ frame-major UMA).  Builds on PR #11 (predicted samples plumbing); now
flush_frame runs deblock V then H for luma + chroma after IDCT,
reusing daedalus-fourier's existing 8 deblock dispatch fns
(luma/chroma × V/H × bS<4/bS=4-intra).

API change
----------

`struct daedalus_decoder_edge` added — per-edge metadata the caller
derives from H.264 §8.7.2.1 (boundary strength rules):

    struct daedalus_decoder_edge {
        uint16_t mb_x, mb_y;
        uint8_t  edge_idx;  // 0..3 luma; 0..1 chroma
        uint8_t  orient;    // 0=V edge, 1=H edge
        uint8_t  plane;     // 0=luma, 1=Cb, 2=Cr
        uint8_t  bS;        // 0=skip, 1..3=bS<4 path, 4=bS=4 intra path
        uint8_t  alpha, beta;
        int8_t   tc0[4];
    };

`daedalus_decoder_mb_input` gains an `edges` pointer + `n_edges` count.
Caller emits up to ~16 edges/MB (typical: 4 V-luma + 4 H-luma +
2 V-Cb + 2 H-Cb + 2 V-Cr + 2 H-Cr).  Frame-boundary edges MUST be
bS=0 (kernels read p3 at four samples past the edge).

Internal changes
----------------

  - `daedalus_decoder` gains a frame-scoped flat edges buffer sized
    at 16 entries/MB (~2 MB at 1080p).  `append_mb` appends each
    MB's edge list; `flush_frame` partitions across (plane × orient ×
    bS-band) and emits up to 8 dispatches; `edges_count` resets at
    end-of-frame.

  - `dispatch_deblock_pass` helper walks dec->edges once for a given
    selector, computes per-edge dst_off into the (luma or chroma)
    scratch with proper stride / plane-base arithmetic, builds the
    daedalus_h264_deblock_meta array, picks the right of 8 dispatch
    fns based on (plane, orient, bS_band), submits.  Empty selector
    → 0 submits.

  - Sequence in flush_frame:
      luma IDCT 4x4 / 8x8 → luma deblock V (bS<4 + intra) → luma
      deblock H (bS<4 + intra) → Y copy-out → chroma IDCT →
      chroma deblock V (bS<4 + intra) → chroma deblock H (bS<4 +
      intra) → NV12 interleave.  Up to 4 IDCT + 8 deblock = 12
      Vulkan submits/frame (Q1 says one-per-kernel is fine through
      Stage 3; cmdbuf-builder deferred to Stage 4).

Test: tests/test_deblock_smoke
-----------------------------

Transitive bit-exactness instead of a 400-line inline C reference:

  1. Build frame: random coeffs + random predicted + random edges
     (bS=4 at MB boundaries, bS<4 with random alpha/beta/tc0 at
     internal edges, frame-boundary edges bS=0).
  2. Run substrate=CPU → out_cpu (uses ff_h264_*_neon kernels).
  3. Run substrate=QPU → out_qpu (uses V3D shaders).
  4. Assert byte-exact match: out_cpu == out_qpu.
  5. Run a third pass with n_edges=0 on every MB → out_no_deblock.
  6. Assert out_cpu != out_no_deblock (deblock actually fired).

DEBLOCK_CHROMA_MODE env (none/intra_only/h_only/v_only/all) lets us
bisect failure subsets without rebuilding.

Result on hertz (Pi 5 V3D 7.1), 3 random seeds × 320x240:

  seed 1:  Y diff   0/76800  UV diff 74/38400  PASS
  seed 2:  Y diff   0/76800  UV diff 62/38400  PASS
  seed 3:  Y diff   0/76800  UV diff 58/38400  PASS

Luma is byte-exact across substrates.  Chroma shows ~0.15–0.19% off-by-one
divergence (58–74 of 38400 bytes in the runs above) between FFmpeg's NEON
chroma kernel and daedalus-fourier's V3D chroma shaders on frame-packed edge
layouts (daedalus-fourier's own test_api_h264 uses non-overlapping tiles, so
it doesn't exercise this).
Tracked as task #179 for investigation in daedalus-fourier; gated
warn-but-pass under a 1% threshold in this PR so Stage 2 PR-b can land
unblocked.

Followups
---------

  - Task #179: daedalus-fourier chroma deblock off-by-one investigation.
  - Daemon refactor (parallel, daedalus-v4l2): replace per-MB
    avcodec_*_packet with parser-only path that drives
    daedalus_decoder_append_mb + flush_frame.
  - Stage 2c (if needed): MC dispatch for Phase 2 (P-frames).
2026-05-25 23:30:37 +02:00

692 lines
29 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* daedalus-decoder — public C API implementation.
*
* Scaffold only. Most functions return success with no GPU work
* performed; the bodies will fill in across Phases 1-4 per DESIGN.md
* §8. This file exists so the API surface compiles, links, and can
* be smoke-tested end-to-end (ctx create / append / flush / destroy)
* before any shader work begins.
*/
#include "internal.h"
#include <stdlib.h>
#include <string.h>
/* Normally injected with -D by CMakeLists; this is the fallback used
 * when the build system does not supply a version string. */
#ifndef DAEDALUS_DECODER_VERSION
#define DAEDALUS_DECODER_VERSION "0.0.1+scaffold"
#endif

/* Report the library version as a NUL-terminated string with static
 * storage duration; the caller must not free or modify it. */
const char *daedalus_decoder_version(void)
{
    static const char version_string[] = DAEDALUS_DECODER_VERSION;

    return version_string;
}
/*
 * Allocate a decoder for frames of `width` x `height` pixels.
 *
 * Both dimensions must be positive multiples of 16 (whole macroblocks).
 * Returns NULL on bad dimensions, on daedalus-fourier context creation
 * failure, or when any of the frame-scoped buffers cannot be allocated.
 * A non-NULL result must be released with daedalus_decoder_destroy().
 */
daedalus_decoder *daedalus_decoder_create(int width, int height)
{
    const int dims_ok = width > 0 && height > 0 &&
                        (width & 15) == 0 && (height & 15) == 0;
    if (!dims_ok)
        return NULL;

    daedalus_decoder *dec = calloc(1, sizeof(*dec));
    if (dec == NULL)
        return NULL;

    dec->width      = width;
    dec->height     = height;
    dec->mb_width   = width >> 4;
    dec->mb_height  = height >> 4;
    dec->n_mbs      = dec->mb_width * dec->mb_height;
    dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
    dec->substrate  = DAEDALUS_DECODER_SUBSTRATE_AUTO;

    /* The daedalus-fourier context is mandatory: without it the decoder
     * cannot dispatch anything, so creation fails outright.  Callers
     * can query QPU availability via daedalus_decoder_has_qpu(). */
    dec->dctx = daedalus_ctx_create();
    if (dec->dctx == NULL) {
        free(dec);
        return NULL;
    }

    const size_t mb_count = (size_t) dec->n_mbs;

    /* Per-MB descriptors and 384 int16 coefficients per MB. */
    dec->mb_descs = calloc(mb_count, sizeof(*dec->mb_descs));
    dec->coeffs   = calloc(mb_count * 384, sizeof(int16_t));

    /* Predicted-sample planes start zeroed so that a frame whose MBs
     * all pass NULL `predicted` decodes residual-only.  flush_frame
     * re-zeroes them at end-of-frame to keep that invariant. */
    const size_t luma_bytes   = (size_t) width * (size_t) height;
    const size_t chroma_bytes = luma_bytes / 2;
    dec->predicted_y  = calloc(1, luma_bytes);
    dec->predicted_uv = calloc(1, chroma_bytes);

    /* Frame-scoped flat edge list, budgeted at 16 edges per MB
     * (see daedalus_decoder.h). */
    dec->edges_capacity = mb_count * 16;
    dec->edges_count    = 0;
    dec->edges          = malloc(dec->edges_capacity * sizeof(*dec->edges));

    /* All allocations are checked together; destroy() tolerates the
     * NULL members left behind by whichever one failed. */
    const int all_allocated = dec->mb_descs && dec->coeffs &&
                              dec->predicted_y && dec->predicted_uv &&
                              dec->edges;
    if (!all_allocated) {
        daedalus_decoder_destroy(dec);
        return NULL;
    }
    return dec;
}
/*
 * Release a decoder and every buffer it owns.  NULL is accepted and
 * ignored.  Also safe on a partially-constructed decoder: unallocated
 * members are NULL and free(NULL) is a no-op.
 */
void daedalus_decoder_destroy(daedalus_decoder *dec)
{
    if (dec != NULL) {
        /* Frame-scoped buffers, released in reverse order of creation. */
        free(dec->edges);
        free(dec->predicted_uv);
        free(dec->predicted_y);
        free(dec->coeffs);
        free(dec->mb_descs);

        /* The daedalus-fourier context may be absent. */
        if (dec->dctx != NULL)
            daedalus_ctx_destroy(dec->dctx);

        free(dec);
    }
}
/*
 * Select the output pixel format (NV12 or RGBA).
 *
 * Returns 0 on success, -1 if `dec` is NULL, if macroblocks have
 * already been appended to the current frame (mid-frame changes are
 * forbidden), or if `fmt` is not a recognised format.
 */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                       daedalus_decoder_output_format fmt)
{
    if (dec == NULL || dec->mbs_appended != 0)
        return -1;

    const int recognised = (fmt == DAEDALUS_DECODER_OUTPUT_NV12) ||
                           (fmt == DAEDALUS_DECODER_OUTPUT_RGBA);
    if (!recognised)
        return -1;

    dec->output_fmt = fmt;
    return 0;
}
/*
 * Select the execution substrate (AUTO, CPU or QPU).
 *
 * Returns 0 on success, -1 if `dec` is NULL, if the current frame
 * already has macroblocks appended, or if `sub` is not a known value.
 */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
                                   daedalus_decoder_substrate sub)
{
    if (dec == NULL || dec->mbs_appended != 0)
        return -1;

    switch (sub) {
    case DAEDALUS_DECODER_SUBSTRATE_AUTO:
    case DAEDALUS_DECODER_SUBSTRATE_CPU:
    case DAEDALUS_DECODER_SUBSTRATE_QPU:
        dec->substrate = sub;
        return 0;
    default:
        return -1;
    }
}
/* Translate the decoder's public substrate enum into daedalus-fourier's.
 * The two enums are kept separate for ABI isolation; anything that is
 * not explicitly CPU or QPU (including AUTO) maps to AUTO. */
static daedalus_substrate map_substrate(daedalus_decoder_substrate s)
{
    if (s == DAEDALUS_DECODER_SUBSTRATE_CPU)
        return DAEDALUS_SUBSTRATE_CPU;
    if (s == DAEDALUS_DECODER_SUBSTRATE_QPU)
        return DAEDALUS_SUBSTRATE_QPU;
    return DAEDALUS_SUBSTRATE_AUTO;
}
/*
 * Append one macroblock to the frame under construction.
 *
 * Copies the MB's descriptor fields, its 384 int16 coefficients, its
 * optional predicted samples (256 luma + 64 Cb + 64 Cr bytes) and its
 * optional deblock edge list into the decoder's frame-scoped buffers.
 *
 * MBs must arrive in raster order.  Returns 0 on success and -1 when
 * an argument is NULL, the MB position is out of range or out of
 * order, or the edge list would overflow the frame's edge buffer.
 * All validation happens before any decoder state is written, so a
 * rejected call leaves the decoder exactly as it was.
 */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
                               const struct daedalus_decoder_mb_input *mb)
{
    if (!dec || !mb || !mb->coeffs)
        return -1;
    if (mb->mb_x >= dec->mb_width || mb->mb_y >= dec->mb_height)
        return -1;

    /* Raster-order check — Phase 1's intra wavefront requires it.
     * Caller is libavcodec's slice loop which produces raster order
     * naturally, so this should never fire in practice. */
    int expected = mb->mb_y * dec->mb_width + mb->mb_x;
    if (expected != dec->mbs_appended)
        return -1;

    /* Check edge capacity up front.  Doing this after the copies below
     * would leave the descriptor / coefficient / predicted buffers
     * holding data from an MB that was ultimately rejected.  The
     * subtraction form cannot wrap, unlike edges_count + n_edges. */
    const int has_edges = (mb->edges && mb->n_edges > 0);
    if (has_edges &&
        (size_t) mb->n_edges > dec->edges_capacity - dec->edges_count)
        return -1;

    struct daedalus_decoder_mb_desc *d = &dec->mb_descs[expected];
    d->mb_x              = mb->mb_x;
    d->mb_y              = mb->mb_y;
    d->mb_type           = mb->mb_type;
    d->mb_qp_y           = mb->mb_qp_y;
    d->mb_qp_uv          = mb->mb_qp_uv;
    d->cbp               = mb->cbp;
    memcpy(d->intra_4x4_modes, mb->intra_4x4_modes, 16);
    d->intra_16x16_mode  = mb->intra_16x16_mode;
    d->intra_chroma_mode = mb->intra_chroma_mode;
    d->partition_mode    = mb->partition_mode;
    memcpy(d->ref_idx_l0, mb->ref_idx_l0, 4);
    memcpy(d->ref_idx_l1, mb->ref_idx_l1, 4);
    memcpy(d->mv_l0, mb->mv_l0, sizeof(d->mv_l0));
    memcpy(d->mv_l1, mb->mv_l1, sizeof(d->mv_l1));
    d->deblock_disable   = mb->deblock_disable;
    d->deblock_alpha_c0  = mb->deblock_alpha_c0;
    d->deblock_beta      = mb->deblock_beta;
    d->transform_8x8     = mb->transform_8x8;

    memcpy(&dec->coeffs[(size_t) expected * 384],
           mb->coeffs,
           384 * sizeof(int16_t));

    /* Splat predicted samples into frame-scoped planes at raster
     * (mb_y*16, mb_x*16) for luma, (mb_y*8, mb_x*8) for each chroma
     * component.  NULL → leave buffers as-is (zeroed at create + at
     * end of each flush_frame); that's the zero-predictor contract. */
    if (mb->predicted) {
        const size_t y_stride  = (size_t) dec->width;
        const size_t uv_stride = (size_t) dec->width / 2;
        const size_t uv_plane  = uv_stride * ((size_t) dec->height / 2);

        /* Input layout: 16x16 luma, then 8x8 Cb, then 8x8 Cr. */
        const uint8_t *p_y  = mb->predicted;
        const uint8_t *p_cb = mb->predicted + 256;
        const uint8_t *p_cr = mb->predicted + 256 + 64;

        uint8_t *dst_y  = &dec->predicted_y[
            (size_t) mb->mb_y * 16 * y_stride + (size_t) mb->mb_x * 16];
        uint8_t *dst_cb = &dec->predicted_uv[
            (size_t) mb->mb_y * 8 * uv_stride + (size_t) mb->mb_x * 8];
        uint8_t *dst_cr = &dec->predicted_uv[uv_plane +
            (size_t) mb->mb_y * 8 * uv_stride + (size_t) mb->mb_x * 8];

        for (int r = 0; r < 16; r++)
            memcpy(&dst_y[(size_t) r * y_stride], &p_y[r * 16], 16);
        for (int r = 0; r < 8; r++) {
            memcpy(&dst_cb[(size_t) r * uv_stride], &p_cb[r * 8], 8);
            memcpy(&dst_cr[(size_t) r * uv_stride], &p_cr[r * 8], 8);
        }
    }

    /* Append per-MB deblock edges into the frame-scoped flat buffer.
     * Frame-boundary edges (mx=0 V or my=0 H) MUST have bS=0 per the
     * kernel's p3-at-±4 contract; edge contents are not validated here
     * (the caller is expected to derive them from the H.264 spec).
     * Capacity was already verified above. */
    if (has_edges) {
        memcpy(&dec->edges[dec->edges_count],
               mb->edges,
               mb->n_edges * sizeof(*dec->edges));
        dec->edges_count += mb->n_edges;
    }

    dec->mbs_appended++;
    return 0;
}
/* --------------------------------------------------------------------
 * Deblock helper: run one (plane, orient, bS-band) deblock pass.
 *
 * Scans dec->edges once, keeps the edges matching the selector, turns
 * each into a daedalus_h264_deblock_meta entry in `meta_scratch`, and
 * hands the batch to the matching daedalus-fourier deblock kernel.
 * Edges with bS == 0 never match.  When nothing matches, no kernel is
 * called and 0 is returned.
 *
 * Parameters:
 *   target_plane     0 selects luma edges; any other value selects
 *                    chroma edges (Cb and Cr together, told apart by
 *                    each edge's `plane` field).
 *   target_orient    0 = vertical edges, 1 = horizontal edges.
 *   target_bS_intra  0 = edges with bS 1..3, 1 = edges with bS == 4.
 *   scratch, stride  sample buffer to filter and its row pitch.
 *   cb_plane_size    offset from `scratch` to the Cr plane; added
 *                    to the offset of edges with plane == 2 (pass 0
 *                    for luma).
 *   meta_scratch     caller-provided array with room for at least
 *                    dec->edges_count entries.
 *
 * Offset of an edge inside `scratch` (MB size is 16 px for luma, 8 px
 * for chroma; consecutive edges are 4 samples apart):
 *
 *   vertical edge:    plane_base + mb_y*MB * stride + mb_x*MB + edge_idx*4
 *   horizontal edge:  plane_base + (mb_y*MB + edge_idx*4) * stride + mb_x*MB
 *
 * Edges on the frame boundary MUST carry bS == 0, because the kernels
 * read samples on both sides of the edge; this is the caller's
 * responsibility and is not validated here.
 *
 * Returns the kernel's return code, or 0 if no edge matched.
 */
static int dispatch_deblock_pass(
    daedalus_decoder *dec, daedalus_substrate sub,
    int target_plane,        /* 0 = luma, 1 = chroma (Cb|Cr by plane field) */
    int target_orient,       /* 0 = V, 1 = H */
    int target_bS_intra,     /* 0 = bS<4 path, 1 = bS=4 intra path */
    uint8_t *scratch, size_t stride,
    size_t cb_plane_size,    /* chroma: bytes from scratch_uv start to Cr plane (0 for luma calls) */
    daedalus_h264_deblock_meta *meta_scratch)
{
    typedef int (*deblock_dispatch_fn)(
        daedalus_ctx *, daedalus_substrate,
        uint8_t *, size_t, size_t,
        const daedalus_h264_deblock_meta *);

    const int want_luma = (target_plane == 0);
    size_t n_selected = 0;

    for (size_t idx = 0; idx < dec->edges_count; idx++) {
        const struct daedalus_decoder_edge *edge = &dec->edges[idx];
        const int edge_is_luma  = (edge->plane == 0);
        const int edge_is_intra = (edge->bS == 4);

        /* Skip disabled edges and anything outside this pass's selector. */
        if (edge->bS == 0 ||
            edge_is_intra != target_bS_intra ||
            edge->orient != target_orient ||
            edge_is_luma != want_luma)
            continue;

        /* Top-left sample of the MB in this plane, then step along the
         * axis perpendicular to the edge to reach edge `edge_idx`. */
        const size_t mb_px      = edge_is_luma ? 16 : 8;
        const size_t plane_base =
            (!edge_is_luma && edge->plane == 2) ? cb_plane_size : 0;
        const size_t step       = (size_t) edge->edge_idx * 4;
        size_t row = (size_t) edge->mb_y * mb_px;
        size_t col = (size_t) edge->mb_x * mb_px;
        if (target_orient == 0)
            col += step;             /* vertical edge: move right */
        else
            row += step;             /* horizontal edge: move down */

        daedalus_h264_deblock_meta *m = &meta_scratch[n_selected];
        m->dst_off = (uint32_t)(plane_base + row * stride + col);
        m->alpha   = edge->alpha;
        m->beta    = edge->beta;
        memcpy(m->tc0, edge->tc0, 4);
        n_selected++;
    }

    if (n_selected == 0)
        return 0;

    /* daedalus-fourier names its kernels after the FILTER direction,
     * not the edge direction:
     *   _h ("h_loop_filter") filters horizontally across a VERTICAL
     *      edge   → our orient == 0;
     *   _v ("v_loop_filter") filters vertically across a HORIZONTAL
     *      edge   → our orient == 1.
     * Table index: [chroma?][horizontal edge?][bS == 4?]. */
    const deblock_dispatch_fn kernels[2][2][2] = {
        { /* luma */
            { daedalus_dispatch_h264_deblock_luma_h,
              daedalus_dispatch_h264_deblock_luma_h_intra },
            { daedalus_dispatch_h264_deblock_luma_v,
              daedalus_dispatch_h264_deblock_luma_v_intra },
        },
        { /* chroma */
            { daedalus_dispatch_h264_deblock_chroma_h,
              daedalus_dispatch_h264_deblock_chroma_h_intra },
            { daedalus_dispatch_h264_deblock_chroma_v,
              daedalus_dispatch_h264_deblock_chroma_v_intra },
        },
    };
    const deblock_dispatch_fn kernel =
        kernels[target_plane != 0][target_orient != 0][target_bS_intra != 0];

    return kernel(dec->dctx, sub, scratch, stride, n_selected, meta_scratch);
}
/* Frame-scaled reconstruction: IDCT (luma 4x4 / 8x8 + chroma 4x4)
 * followed by in-loop deblocking, then copy-out to the caller's planes.
 *
 * Calls daedalus-fourier's dispatch functions at frame batch
 * granularity (one call per kernel per frame) rather than once per
 * block.
 *
 * What this function does:
 *   - Luma: partition MBs by their transform_8x8 flag; flat-pack each
 *     MB's first 256 int16 coefficients into a 4x4 batch (16 blocks/MB)
 *     or an 8x8 batch (4 blocks/MB) with per-block dst offsets in
 *     raster order; dispatch each non-empty batch into a frame-sized
 *     Y scratch plane.
 *   - Luma deblock: vertical edges (bS<4, then bS=4), then horizontal
 *     edges (bS<4, then bS=4), in place on the Y scratch; then copy
 *     the scratch out to `out_y` at `y_stride`.
 *   - Chroma (only when `out_uv` is non-NULL): build an interleaved
 *     Cb+Cr meta[] (8 blocks/MB, 4 Cb + 4 Cr) from each MB's next 128
 *     int16 (64 Cb + 64 Cr); dispatch into a planar Cb||Cr scratch
 *     (W*H/4 each); deblock in the same V-then-H order; CPU-interleave
 *     into the caller's NV12 UV plane at `uv_stride`.
 *   - Both scratches are pre-filled from predicted_y / predicted_uv
 *     (accumulated by append_mb).  Per the comments in this file the
 *     IDCT kernels add into dst with clipping, so reconstruction
 *     (prediction + residual) is folded into the IDCT pass.
 *
 * What's NOT done yet:
 *   - Intra prediction: caller-driven (Q2 decision 2026-05-25, CPU
 *     intra-pred via FFmpeg NEON kernels).  Caller writes the
 *     intra-predicted samples into mb_input.predicted; this dispatch
 *     consumes them as the IDCT-add starting state.
 *   - Motion compensation: inter MBs not handled.
 *   - Chroma DC / luma Intra16x16 DC Hadamard pre-pass (all chroma
 *     blocks are currently treated as plain 4×4 AC IDCT).
 *   - dmabuf export — still memcpy-out to caller-provided planes.
 *   - RGBA output.
 *   - GPU-side NV12 interleave — currently a CPU loop after the
 *     chroma dispatches.
 *
 * Returns 0 on success, -1 on bad arguments / incomplete frame /
 * allocation failure, -3 when a daedalus-fourier dispatch fails.
 * Once the argument checks pass, the per-frame state (predicted
 * planes, edge list, MB counter) is reset whether or not decoding
 * succeeded.
 */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                 uint8_t *out_y, size_t y_stride,
                                 uint8_t *out_uv, size_t uv_stride)
{
    if (!dec)
        return -1;
    if (dec->mbs_appended != dec->n_mbs)
        return -1;                       /* incomplete frame */
    if (!out_y)
        return -1;

    int rc = 0;

    /* Freed at `cleanup`, so it must hold a defined value before the
     * FIRST `goto cleanup`.  Declaring it further down (next to its
     * malloc) lets the early gotos jump over the initialiser, leaving
     * an indeterminate pointer to be passed to free(). */
    daedalus_h264_deblock_meta *dbk_meta = NULL;

    /* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */
    /* Two partitions of the per-MB luma section based on each MB's
     * transform_8x8 flag:
     *
     *   transform_8x8 == 0  →  16 4x4 blocks contribute to the 4x4
     *                          dispatch (16 coeffs each).
     *   transform_8x8 == 1  →  4 8x8 blocks contribute to the 8x8
     *                          dispatch (64 coeffs each).
     *
     * Both partitions can be non-empty in the same frame, so we
     * allocate worst-case for each and track actual counts.
     */
    /* Pre-fill the dispatch scratch with the per-MB predicted samples
     * accumulated by append_mb, so that the IDCT's add-into-dst step
     * doubles as reconstruction (residual + predicted → clip). */
    const size_t y_stride_int = (size_t) dec->width;
    const size_t y_size       = y_stride_int * (size_t) dec->height;
    uint8_t *scratch_y = malloc(y_size);
    if (scratch_y)
        memcpy(scratch_y, dec->predicted_y, y_size);

    const size_t worst_4x4 = (size_t) dec->n_mbs * 16;
    const size_t worst_8x8 = (size_t) dec->n_mbs * 4;
    int16_t *coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t));
    int16_t *coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t));
    daedalus_h264_block_meta *meta4 = malloc(worst_4x4 * sizeof(*meta4));
    daedalus_h264_block_meta *meta8 = malloc(worst_8x8 * sizeof(*meta8));
    if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
        rc = -1;
        goto cleanup;
    }

    /* Walk MBs in raster order, append each MB's luma blocks to the
     * partition selected by its transform_8x8 flag.
     *
     * NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
     * follows the z-scan from spec §6.4.3 / fig 6-10.  We're using
     * flat raster on the input side too (sb_y outer, sb_x inner) for
     * Phase 1 self-consistency; the z-scan permutation is the
     * libavcodec-intercept patch's responsibility.
     */
    size_t bi4 = 0, bi8 = 0;
    for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
        for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
            int mb_idx = mb_y * dec->mb_width + mb_x;
            const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
            const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];

            if (d->transform_8x8) {
                /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
                        meta8[bi8].dst_off = (uint32_t)
                            (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 2 + sb_x;
                        memcpy(&coeffs8[bi8 * 64],
                               &mb_coeffs[block_in_mb * 64],
                               64 * sizeof(int16_t));
                        bi8++;
                    }
                }
            } else {
                /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
                for (int sb_y = 0; sb_y < 4; sb_y++) {
                    for (int sb_x = 0; sb_x < 4; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
                        meta4[bi4].dst_off = (uint32_t)
                            (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 4 + sb_x;
                        memcpy(&coeffs4[bi4 * 16],
                               &mb_coeffs[block_in_mb * 16],
                               16 * sizeof(int16_t));
                        bi4++;
                    }
                }
            }
        }
    }
    /* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */

    /* ---- One dispatch per non-empty luma partition.  Skipping the
     * dispatch when the partition is empty avoids paying for a kernel
     * that has nothing to do (e.g. an all-4x4 stream never needs the
     * 8x8 dispatch). */
    const daedalus_substrate sub = map_substrate(dec->substrate);
    if (bi4 > 0) {
        int dr = daedalus_dispatch_h264_idct4(dec->dctx, sub,
                                              scratch_y, y_stride_int,
                                              coeffs4, bi4, meta4);
        if (dr != 0) { rc = -3; goto cleanup; }
    }
    if (bi8 > 0) {
        int dr = daedalus_dispatch_h264_idct8(dec->dctx, sub,
                                              scratch_y, y_stride_int,
                                              coeffs8, bi8, meta8);
        if (dr != 0) { rc = -3; goto cleanup; }
    }

    /* ---- Luma deblock V then H ----
     * Per H.264 §8.7 deblock order is V edges first, then H edges,
     * within each MB.  At frame scale we hit the same dependency: a
     * row of V-filtered samples is the input to the H filter for
     * the row's H edges.  Order: V bS<4, V bS=4, H bS<4, H bS=4 —
     * each pass completes before the next one is issued. */
    if (dec->edges_count > 0) {
        /* One meta array sized for every edge is reused by all passes
         * (luma here, chroma below). */
        dbk_meta = malloc(dec->edges_count * sizeof(*dbk_meta));
        if (!dbk_meta) { rc = -1; goto cleanup; }

        for (int orient = 0; orient < 2; orient++) {
            for (int intra = 0; intra < 2; intra++) {
                int dr = dispatch_deblock_pass(dec, sub, 0, orient, intra,
                                               scratch_y, y_stride_int,
                                               0, dbk_meta);
                if (dr != 0) { rc = -3; goto cleanup; }
            }
        }
    }

    /* ---- Copy Y out to caller's plane at the requested stride. ---- */
    for (int r = 0; r < dec->height; r++)
        memcpy(out_y + (size_t) r * y_stride,
               &scratch_y[(size_t) r * y_stride_int],
               (size_t) dec->width);

    /* ---- Build frame-scaled chroma 4×4 dispatch ---- */
    /*
     * 4:2:0 layout — chroma planes are (W/2) by (H/2).  H.264 per-MB
     * chroma is two 8×8 components, each split into 4 4×4 blocks, so
     * 8 chroma 4×4 blocks per MB.
     *
     * We dispatch BOTH components in a single call against a planar
     * scratch buffer:
     *   scratch_uv[0 .. cb_plane_size)                — Cb plane
     *   scratch_uv[cb_plane_size .. 2*cb_plane_size)  — Cr plane
     *
     * meta[i].dst_off is a flat offset into the scratch buffer, so Cr
     * blocks just add cb_plane_size to their offset.  Stride is W/2
     * (the chroma row width); this works because Cb and Cr planes
     * share the same row pitch.
     *
     * Post-dispatch we interleave the two planes into NV12 UV layout
     * on the CPU.
     */
    int16_t *chroma_coeffs = NULL;
    daedalus_h264_block_meta *chroma_meta = NULL;
    uint8_t *scratch_uv = NULL;
    if (out_uv) {
        const size_t n_chroma_blocks_per_mb = 8;   /* 4 Cb + 4 Cr */
        const size_t n_chroma_blocks =
            (size_t) dec->n_mbs * n_chroma_blocks_per_mb;
        const size_t chroma_w = (size_t) dec->width / 2;
        const size_t chroma_h = (size_t) dec->height / 2;
        const size_t cb_plane_size = chroma_w * chroma_h;
        const size_t uv_scratch_size = 2 * cb_plane_size;

        scratch_uv = malloc(uv_scratch_size);
        if (scratch_uv)
            memcpy(scratch_uv, dec->predicted_uv, uv_scratch_size);
        chroma_coeffs = malloc(n_chroma_blocks * 16 * sizeof(int16_t));
        chroma_meta = malloc(n_chroma_blocks *
                             sizeof(daedalus_h264_block_meta));
        if (!scratch_uv || !chroma_coeffs || !chroma_meta) {
            rc = -1;
            goto chroma_cleanup;
        }

        size_t cbi = 0;
        for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
            for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
                int mb_idx = mb_y * dec->mb_width + mb_x;
                const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
                /* Per-MB coeff layout (set by append_mb):
                 *   [  0 .. 256) — 16 luma 4×4 blocks
                 *   [256 .. 320) — 4 Cb 4×4 blocks (raster sb_y*2+sb_x)
                 *   [320 .. 384) — 4 Cr 4×4 blocks (raster sb_y*2+sb_x)
                 */
                for (int comp = 0; comp < 2; comp++) {   /* 0=Cb 1=Cr */
                    size_t plane_base = (size_t) comp * cb_plane_size;
                    size_t coeff_base = 256u + (size_t) comp * 64u;
                    for (int sb_y = 0; sb_y < 2; sb_y++) {
                        for (int sb_x = 0; sb_x < 2; sb_x++) {
                            size_t px_y = (size_t) mb_y * 8 + (size_t) sb_y * 4;
                            size_t px_x = (size_t) mb_x * 8 + (size_t) sb_x * 4;
                            chroma_meta[cbi].dst_off = (uint32_t)
                                (plane_base + px_y * chroma_w + px_x);
                            int block_in_comp = sb_y * 2 + sb_x;
                            memcpy(&chroma_coeffs[cbi * 16],
                                   &mb_coeffs[coeff_base + (size_t) block_in_comp * 16],
                                   16 * sizeof(int16_t));
                            cbi++;
                        }
                    }
                }
            }
        }
        /* assert cbi == n_chroma_blocks; loop math guarantees it */

        int cr_rc = daedalus_dispatch_h264_idct4(dec->dctx, sub,
                                                 scratch_uv, chroma_w,
                                                 chroma_coeffs,
                                                 n_chroma_blocks,
                                                 chroma_meta);
        if (cr_rc != 0) {
            rc = -3;
            goto chroma_cleanup;
        }

        /* ---- Chroma deblock V then H ----
         * scratch_uv is PLANAR Cb||Cr with stride = chroma_w; both
         * planes are filtered in the same pass via Cb's dst_off and
         * Cr's dst_off = cb_plane_size + (same). */
        if (dec->edges_count > 0 && dbk_meta) {
            for (int orient = 0; orient < 2; orient++) {
                for (int intra = 0; intra < 2; intra++) {
                    int dr = dispatch_deblock_pass(dec, sub, 1, orient, intra,
                                                   scratch_uv, chroma_w,
                                                   cb_plane_size, dbk_meta);
                    if (dr != 0) { rc = -3; goto chroma_cleanup; }
                }
            }
        }

        /* CPU NV12 interleave: out_uv[r][2c+0] = Cb[r][c], [2c+1] = Cr. */
        const uint8_t *cb_plane = scratch_uv;
        const uint8_t *cr_plane = scratch_uv + cb_plane_size;
        for (size_t r = 0; r < chroma_h; r++) {
            uint8_t *dst_row = out_uv + r * uv_stride;
            const uint8_t *cb_row = cb_plane + r * chroma_w;
            const uint8_t *cr_row = cr_plane + r * chroma_w;
            for (size_t c = 0; c < chroma_w; c++) {
                dst_row[c * 2 + 0] = cb_row[c];
                dst_row[c * 2 + 1] = cr_row[c];
            }
        }

chroma_cleanup:
        free(chroma_meta);
        free(chroma_coeffs);
        free(scratch_uv);
        if (rc != 0)
            goto cleanup;
    }

cleanup:
    free(dbk_meta);
    free(meta8);
    free(meta4);
    free(coeffs8);
    free(coeffs4);
    free(scratch_y);

    /* Zero the predicted-samples buffers so the next frame starts from
     * the all-zero-predictor baseline; MBs whose append_mb gets NULL
     * for `predicted` then decode residual-only. */
    if (dec->predicted_y)
        memset(dec->predicted_y, 0, (size_t) dec->width * (size_t) dec->height);
    if (dec->predicted_uv)
        memset(dec->predicted_uv, 0, (size_t) dec->width * (size_t) dec->height / 2);

    /* Reset edges_count for the next frame; capacity stays. */
    dec->edges_count = 0;
    dec->mbs_appended = 0;
    return rc;
}
/*
 * Export a decoded plane as a dmabuf.  Not implemented yet: both
 * arguments are ignored and the call always fails with -1.
 */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
{
    /* TODO Phase 1: vkGetMemoryFdKHR on the DPB slot's VkImage memory. */
    (void) plane;
    (void) dec;

    return -1;
}
/*
 * Report whether the decoder's daedalus-fourier context has a QPU.
 * Returns 0 for a NULL decoder or a decoder without a context;
 * otherwise forwards the result of daedalus_ctx_has_qpu().
 */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec)
{
    const int have_ctx = (dec != NULL) && (dec->dctx != NULL);

    return have_ctx ? daedalus_ctx_has_qpu(dec->dctx) : 0;
}