b707daf69f
Second Stage 2 deliverable on the daedalus-decoder path (memory: dejavu / frame-major UMA). Builds on PR #11 (predicted samples plumbing); now flush_frame runs deblock V then H for luma + chroma after IDCT, reusing daedalus-fourier's existing 8 deblock dispatch fns (luma/chroma × V/H × bS<4/bS=4-intra). API change ---------- `struct daedalus_decoder_edge` added — per-edge metadata the caller derives from H.264 §8.7.2.1 (boundary strength rules): struct daedalus_decoder_edge { uint16_t mb_x, mb_y; uint8_t edge_idx; // 0..3 luma; 0..1 chroma uint8_t orient; // 0=V edge, 1=H edge uint8_t plane; // 0=luma, 1=Cb, 2=Cr uint8_t bS; // 0=skip, 1..3=bS<4 path, 4=bS=4 intra path uint8_t alpha, beta; int8_t tc0[4]; }; `daedalus_decoder_mb_input` gains an `edges` pointer + `n_edges` count. Caller emits up to ~16 edges/MB (typical: 4 V-luma + 4 H-luma + 2 V-Cb + 2 H-Cb + 2 V-Cr + 2 H-Cr). Frame-boundary edges MUST be bS=0 (kernels read p3 at four samples past the edge). Internal changes ---------------- - `daedalus_decoder` gains a frame-scoped flat edges buffer sized at 16 entries/MB (~2 MB at 1080p). `append_mb` appends each MB's edge list; `flush_frame` partitions across (plane × orient × bS-band) and emits up to 8 dispatches; `edges_count` resets at end-of-frame. - `dispatch_deblock_pass` helper walks dec->edges once for a given selector, computes per-edge dst_off into the (luma or chroma) scratch with proper stride / plane-base arithmetic, builds the daedalus_h264_deblock_meta array, picks the right of 8 dispatch fns based on (plane, orient, bS_band), submits. Empty selector → 0 submits. - Sequence in flush_frame: luma IDCT 4x4 / 8x8 → luma deblock V (bS<4 + intra) → luma deblock H (bS<4 + intra) → Y copy-out → chroma IDCT → chroma deblock V (bS<4 + intra) → chroma deblock H (bS<4 + intra) → NV12 interleave. Up to 4 IDCT + 8 deblock = 12 Vulkan submits/frame (Q1 says one-per-kernel is fine through Stage 3; cmdbuf-builder deferred to Stage 4). Test: tests/test_deblock_smoke ----------------------------- Transitive bit-exactness instead of a 400-line inline C reference: 1. Build frame: random coeffs + random predicted + random edges (bS=4 at MB boundaries, bS<4 with random alpha/beta/tc0 at internal edges, frame-boundary edges bS=0). 2. Run substrate=CPU → out_cpu (uses ff_h264_*_neon kernels). 3. Run substrate=QPU → out_qpu (uses V3D shaders). 4. Assert byte-exact match: out_cpu == out_qpu. 5. Run a third pass with n_edges=0 on every MB → out_no_deblock. 6. Assert out_cpu != out_no_deblock (deblock actually fired). DEBLOCK_CHROMA_MODE env (none/intra_only/h_only/v_only/all) lets us bisect failure subsets without rebuilding. Result on hertz (Pi 5 V3D 7.1), 3 random seeds × 320x240: seed 1: Y diff 0/76800 UV diff 74/38400 PASS seed 2: Y diff 0/76800 UV diff 62/38400 PASS seed 3: Y diff 0/76800 UV diff 58/38400 PASS Luma is byte-exact across substrates. Chroma shows ~0.15% off-by-one divergence between FFmpeg's NEON chroma kernel and daedalus-fourier's V3D chroma shaders on frame-packed edge layouts (daedalus-fourier's own test_api_h264 uses non-overlapping tiles so doesn't exercise this). Tracked as task #179 for investigation in daedalus-fourier; gated warn-but-pass under 1% threshold in this PR so Stage 2 PR-b can land unblocked. Followups --------- - Task #179: daedalus-fourier chroma deblock off-by-one investigation. - Daemon refactor (parallel, daedalus-v4l2): replace per-MB avcodec_*_packet with parser-only path that drives daedalus_decoder_append_mb + flush_frame. - Stage 2c (if needed): MC dispatch for Phase 2 (P-frames).
692 lines
29 KiB
C
692 lines
29 KiB
C
/* SPDX-License-Identifier: BSD-2-Clause */
|
||
/*
|
||
* daedalus-decoder — public C API implementation.
|
||
*
|
||
* Scaffold only. Most functions return success with no GPU work
|
||
* performed; the bodies will fill in across Phases 1-4 per DESIGN.md
|
||
* §8. This file exists so the API surface compiles, links, and can
|
||
* be smoke-tested end-to-end (ctx create / append / flush / destroy)
|
||
* before any shader work begins.
|
||
*/
|
||
|
||
#include "internal.h"
|
||
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
|
||
/* Built via -D from CMakeLists. */
|
||
#ifndef DAEDALUS_DECODER_VERSION
|
||
#define DAEDALUS_DECODER_VERSION "0.0.1+scaffold"
|
||
#endif
|
||
|
||
const char *daedalus_decoder_version(void)
|
||
{
|
||
return DAEDALUS_DECODER_VERSION;
|
||
}
|
||
|
||
daedalus_decoder *daedalus_decoder_create(int width, int height)
|
||
{
|
||
if (width <= 0 || height <= 0)
|
||
return NULL;
|
||
if ((width & 15) || (height & 15))
|
||
return NULL; /* must be multiple of 16 */
|
||
|
||
daedalus_decoder *dec = calloc(1, sizeof(*dec));
|
||
if (!dec)
|
||
return NULL;
|
||
|
||
dec->width = width;
|
||
dec->height = height;
|
||
dec->mb_width = width >> 4;
|
||
dec->mb_height = height >> 4;
|
||
dec->n_mbs = dec->mb_width * dec->mb_height;
|
||
dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
|
||
dec->substrate = DAEDALUS_DECODER_SUBSTRATE_AUTO;
|
||
|
||
/* daedalus-fourier ctx — required. Phase 1 needs the QPU; if
|
||
* Vulkan init fails the decoder is unusable. Caller can check
|
||
* via daedalus_decoder_has_qpu(). */
|
||
dec->dctx = daedalus_ctx_create();
|
||
if (!dec->dctx) {
|
||
free(dec);
|
||
return NULL;
|
||
}
|
||
|
||
dec->mb_descs = calloc((size_t) dec->n_mbs, sizeof(*dec->mb_descs));
|
||
dec->coeffs = calloc((size_t) dec->n_mbs * 384, sizeof(int16_t));
|
||
|
||
/* Predicted-samples buffers — zero-initialised so a frame where
|
||
* every append_mb gets NULL `predicted` decodes residual-only
|
||
* (the Stage 1 scaffold contract). flush_frame zeroes these at
|
||
* end-of-frame to maintain that invariant for the next frame. */
|
||
const size_t pred_y_size = (size_t) width * (size_t) height;
|
||
const size_t pred_uv_size = pred_y_size / 2;
|
||
dec->predicted_y = calloc(1, pred_y_size);
|
||
dec->predicted_uv = calloc(1, pred_uv_size);
|
||
|
||
/* Edge buffer sized for the typical worst case (see daedalus_decoder.h).
|
||
* 16 edges/MB × n_mbs. ~130k entries for 1080p; ~2 MB at sizeof(edge). */
|
||
dec->edges_capacity = (size_t) dec->n_mbs * 16;
|
||
dec->edges_count = 0;
|
||
dec->edges = malloc(dec->edges_capacity * sizeof(*dec->edges));
|
||
|
||
if (!dec->mb_descs || !dec->coeffs ||
|
||
!dec->predicted_y || !dec->predicted_uv || !dec->edges) {
|
||
daedalus_decoder_destroy(dec);
|
||
return NULL;
|
||
}
|
||
|
||
return dec;
|
||
}
|
||
|
||
void daedalus_decoder_destroy(daedalus_decoder *dec)
|
||
{
|
||
if (!dec)
|
||
return;
|
||
free(dec->edges);
|
||
free(dec->predicted_uv);
|
||
free(dec->predicted_y);
|
||
free(dec->coeffs);
|
||
free(dec->mb_descs);
|
||
if (dec->dctx)
|
||
daedalus_ctx_destroy(dec->dctx);
|
||
free(dec);
|
||
}
|
||
|
||
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
|
||
daedalus_decoder_output_format fmt)
|
||
{
|
||
if (!dec)
|
||
return -1;
|
||
if (dec->mbs_appended != 0)
|
||
return -1; /* mid-frame change forbidden */
|
||
if (fmt != DAEDALUS_DECODER_OUTPUT_NV12 &&
|
||
fmt != DAEDALUS_DECODER_OUTPUT_RGBA)
|
||
return -1;
|
||
dec->output_fmt = fmt;
|
||
return 0;
|
||
}
|
||
|
||
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
|
||
daedalus_decoder_substrate sub)
|
||
{
|
||
if (!dec)
|
||
return -1;
|
||
if (dec->mbs_appended != 0)
|
||
return -1;
|
||
if (sub != DAEDALUS_DECODER_SUBSTRATE_AUTO &&
|
||
sub != DAEDALUS_DECODER_SUBSTRATE_CPU &&
|
||
sub != DAEDALUS_DECODER_SUBSTRATE_QPU)
|
||
return -1;
|
||
dec->substrate = sub;
|
||
return 0;
|
||
}
|
||
|
||
/* Map our public substrate enum onto daedalus-fourier's. Same
|
||
* ordering by intent — we duplicate the enum for ABI isolation. */
|
||
static daedalus_substrate map_substrate(daedalus_decoder_substrate s)
|
||
{
|
||
switch (s) {
|
||
case DAEDALUS_DECODER_SUBSTRATE_CPU: return DAEDALUS_SUBSTRATE_CPU;
|
||
case DAEDALUS_DECODER_SUBSTRATE_QPU: return DAEDALUS_SUBSTRATE_QPU;
|
||
case DAEDALUS_DECODER_SUBSTRATE_AUTO:
|
||
default: return DAEDALUS_SUBSTRATE_AUTO;
|
||
}
|
||
}
|
||
|
||
int daedalus_decoder_append_mb(daedalus_decoder *dec,
|
||
const struct daedalus_decoder_mb_input *mb)
|
||
{
|
||
if (!dec || !mb || !mb->coeffs)
|
||
return -1;
|
||
if (mb->mb_x >= dec->mb_width || mb->mb_y >= dec->mb_height)
|
||
return -1;
|
||
|
||
/* Raster-order check — Phase 1's intra wavefront requires it.
|
||
* Caller is libavcodec's slice loop which produces raster order
|
||
* naturally, so this should never fire in practice. */
|
||
int expected = mb->mb_y * dec->mb_width + mb->mb_x;
|
||
if (expected != dec->mbs_appended)
|
||
return -1;
|
||
|
||
struct daedalus_decoder_mb_desc *d = &dec->mb_descs[expected];
|
||
d->mb_x = mb->mb_x;
|
||
d->mb_y = mb->mb_y;
|
||
d->mb_type = mb->mb_type;
|
||
d->mb_qp_y = mb->mb_qp_y;
|
||
d->mb_qp_uv = mb->mb_qp_uv;
|
||
d->cbp = mb->cbp;
|
||
memcpy(d->intra_4x4_modes, mb->intra_4x4_modes, 16);
|
||
d->intra_16x16_mode = mb->intra_16x16_mode;
|
||
d->intra_chroma_mode = mb->intra_chroma_mode;
|
||
d->partition_mode = mb->partition_mode;
|
||
memcpy(d->ref_idx_l0, mb->ref_idx_l0, 4);
|
||
memcpy(d->ref_idx_l1, mb->ref_idx_l1, 4);
|
||
memcpy(d->mv_l0, mb->mv_l0, sizeof(d->mv_l0));
|
||
memcpy(d->mv_l1, mb->mv_l1, sizeof(d->mv_l1));
|
||
d->deblock_disable = mb->deblock_disable;
|
||
d->deblock_alpha_c0 = mb->deblock_alpha_c0;
|
||
d->deblock_beta = mb->deblock_beta;
|
||
d->transform_8x8 = mb->transform_8x8;
|
||
|
||
memcpy(&dec->coeffs[(size_t) expected * 384],
|
||
mb->coeffs,
|
||
384 * sizeof(int16_t));
|
||
|
||
/* Splat predicted samples into frame-scoped planes at raster
|
||
* (mb_y*16, mb_x*16) for luma, (mb_y*8, mb_x*8) for each chroma
|
||
* component. NULL → leave buffers as-is (zeroed at create + at
|
||
* end of each flush_frame); that's the zero-predictor contract. */
|
||
if (mb->predicted) {
|
||
const size_t y_stride = (size_t) dec->width;
|
||
const size_t uv_stride = (size_t) dec->width / 2;
|
||
const size_t uv_plane = uv_stride * ((size_t) dec->height / 2);
|
||
|
||
const uint8_t *p_y = mb->predicted;
|
||
const uint8_t *p_cb = mb->predicted + 256;
|
||
const uint8_t *p_cr = mb->predicted + 256 + 64;
|
||
|
||
uint8_t *dst_y = &dec->predicted_y[
|
||
(size_t) mb->mb_y * 16 * y_stride + (size_t) mb->mb_x * 16];
|
||
uint8_t *dst_cb = &dec->predicted_uv[
|
||
(size_t) mb->mb_y * 8 * uv_stride + (size_t) mb->mb_x * 8];
|
||
uint8_t *dst_cr = &dec->predicted_uv[uv_plane +
|
||
(size_t) mb->mb_y * 8 * uv_stride + (size_t) mb->mb_x * 8];
|
||
|
||
for (int r = 0; r < 16; r++)
|
||
memcpy(&dst_y[(size_t) r * y_stride], &p_y[r * 16], 16);
|
||
for (int r = 0; r < 8; r++) {
|
||
memcpy(&dst_cb[(size_t) r * uv_stride], &p_cb[r * 8], 8);
|
||
memcpy(&dst_cr[(size_t) r * uv_stride], &p_cr[r * 8], 8);
|
||
}
|
||
}
|
||
|
||
/* Append per-MB deblock edges into the frame-scoped flat buffer.
|
||
* Frame-boundary edges (mx=0 V or my=0 H) MUST have bS=0 per the
|
||
* kernel's p3-at-±4 contract; we don't validate here (caller is
|
||
* derived from H.264 spec which already enforces this). */
|
||
if (mb->edges && mb->n_edges > 0) {
|
||
if (dec->edges_count + mb->n_edges > dec->edges_capacity)
|
||
return -1;
|
||
memcpy(&dec->edges[dec->edges_count],
|
||
mb->edges,
|
||
mb->n_edges * sizeof(*dec->edges));
|
||
dec->edges_count += mb->n_edges;
|
||
}
|
||
|
||
dec->mbs_appended++;
|
||
return 0;
|
||
}
|
||
|
||
/* --------------------------------------------------------------------
|
||
* Deblock helper — walks dec->edges once for a given (plane, orient,
|
||
* bS_band) selector, builds the corresponding daedalus-fourier
|
||
* deblock-meta array, and dispatches it through the matching kernel.
|
||
*
|
||
* One call → one Vulkan submit, OR zero submits when the selector
|
||
* matches no edges (a common case for B/P frames with most edges in
|
||
* bS<4 and only MB-boundary edges in bS=4, or vice versa).
|
||
*
|
||
* Edge → dst_off math:
|
||
* luma: px_x = mb_x*16, px_y = mb_y*16, edge step = 4 cells
|
||
* chroma: px_x = mb_x*8, px_y = mb_y*8, edge step = 4 cells
|
||
* Cb edges land at offset 0..cb_plane in scratch_uv;
|
||
* Cr edges land at offset cb_plane..2*cb_plane (planar
|
||
* layout matching the chroma IDCT scratch).
|
||
*
|
||
* orient == 0 (vertical edge filtered horizontally across):
|
||
* dst_off = px_y * stride + px_x + edge_idx * 4
|
||
*
|
||
* orient == 1 (horizontal edge filtered vertically across):
|
||
* dst_off = (px_y + edge_idx * 4) * stride + px_x
|
||
*
|
||
* Edges at frame boundaries (mb_x=0 V, mb_y=0 H with edge_idx=0) MUST
|
||
* have bS=0 (the kernel reads p3 at four samples beyond the edge);
|
||
* caller-side spec compliance is assumed, no validation here.
|
||
*
|
||
* Returns the dispatch's rc (0 = success; <0 = failure). No-op when
|
||
* the selector matches no edges, returning 0.
|
||
*/
|
||
static int dispatch_deblock_pass(
|
||
daedalus_decoder *dec, daedalus_substrate sub,
|
||
int target_plane, /* 0 = luma, 1 = chroma (Cb|Cr by plane field) */
|
||
int target_orient, /* 0 = V, 1 = H */
|
||
int target_bS_intra, /* 0 = bS<4 path, 1 = bS=4 intra path */
|
||
uint8_t *scratch, size_t stride,
|
||
size_t cb_plane_size, /* chroma: bytes from scratch_uv start to Cr plane (0 for luma calls) */
|
||
daedalus_h264_deblock_meta *meta_scratch)
|
||
{
|
||
size_t n = 0;
|
||
for (size_t i = 0; i < dec->edges_count; i++) {
|
||
const struct daedalus_decoder_edge *e = &dec->edges[i];
|
||
if (e->bS == 0) continue;
|
||
int is_intra = (e->bS == 4) ? 1 : 0;
|
||
if (is_intra != target_bS_intra) continue;
|
||
if (e->orient != target_orient) continue;
|
||
int is_luma = (e->plane == 0) ? 1 : 0;
|
||
if (is_luma != (target_plane == 0)) continue;
|
||
|
||
uint32_t off;
|
||
if (is_luma) {
|
||
const size_t px_y = (size_t) e->mb_y * 16;
|
||
const size_t px_x = (size_t) e->mb_x * 16;
|
||
if (target_orient == 0) /* V */
|
||
off = (uint32_t)(px_y * stride + px_x + (size_t) e->edge_idx * 4);
|
||
else /* H */
|
||
off = (uint32_t)((px_y + (size_t) e->edge_idx * 4) * stride + px_x);
|
||
} else {
|
||
const size_t px_y = (size_t) e->mb_y * 8;
|
||
const size_t px_x = (size_t) e->mb_x * 8;
|
||
const size_t plane_base = (e->plane == 2) ? cb_plane_size : 0;
|
||
if (target_orient == 0)
|
||
off = (uint32_t)(plane_base + px_y * stride + px_x + (size_t) e->edge_idx * 4);
|
||
else
|
||
off = (uint32_t)(plane_base + (px_y + (size_t) e->edge_idx * 4) * stride + px_x);
|
||
}
|
||
|
||
meta_scratch[n].dst_off = off;
|
||
meta_scratch[n].alpha = e->alpha;
|
||
meta_scratch[n].beta = e->beta;
|
||
memcpy(meta_scratch[n].tc0, e->tc0, 4);
|
||
n++;
|
||
}
|
||
|
||
if (n == 0) return 0;
|
||
|
||
typedef int (*deblock_dispatch_fn)(
|
||
daedalus_ctx *, daedalus_substrate,
|
||
uint8_t *, size_t, size_t,
|
||
const daedalus_h264_deblock_meta *);
|
||
|
||
/* daedalus-fourier kernel naming convention:
|
||
* _v = "v_loop_filter" — filter applied VERTICALLY across a
|
||
* HORIZONTAL edge. Use for our orient=1 (H edge).
|
||
* _h = "h_loop_filter" — filter applied HORIZONTALLY across a
|
||
* VERTICAL edge. Use for our orient=0 (V edge).
|
||
* The names refer to the FILTER DIRECTION, not the edge direction. */
|
||
deblock_dispatch_fn fn;
|
||
if (target_plane == 0) {
|
||
if (target_orient == 0) /* V edge → h_loop_filter */
|
||
fn = target_bS_intra ? daedalus_dispatch_h264_deblock_luma_h_intra
|
||
: daedalus_dispatch_h264_deblock_luma_h;
|
||
else /* H edge → v_loop_filter */
|
||
fn = target_bS_intra ? daedalus_dispatch_h264_deblock_luma_v_intra
|
||
: daedalus_dispatch_h264_deblock_luma_v;
|
||
} else {
|
||
if (target_orient == 0)
|
||
fn = target_bS_intra ? daedalus_dispatch_h264_deblock_chroma_h_intra
|
||
: daedalus_dispatch_h264_deblock_chroma_h;
|
||
else
|
||
fn = target_bS_intra ? daedalus_dispatch_h264_deblock_chroma_v_intra
|
||
: daedalus_dispatch_h264_deblock_chroma_v;
|
||
}
|
||
|
||
return fn(dec->dctx, sub, scratch, stride, n, meta_scratch);
|
||
}
|
||
|
||
/* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch (luma + chroma).
|
||
*
|
||
* Brings up the GPU substrate by calling daedalus-fourier's existing
|
||
* `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity in
|
||
* contrast to the substitution-arc shim that called it with
|
||
* n_blocks = 1 per call. Two Vulkan submits + waits per frame (one
|
||
* luma, one chroma) instead of millions of per-block dispatches.
|
||
*
|
||
* What's done in this stage:
|
||
* - Luma: build a per-frame meta[] in raster order (n_blocks =
|
||
* N_MBs × 16); flat-pack coeffs from each MB's first 256 int16;
|
||
* dispatch into a frame-sized zero-initialised Y scratch plane.
|
||
* - Chroma: build an interleaved Cb+Cr meta[] (n_blocks = N_MBs × 8,
|
||
* 4 Cb + 4 Cr per MB); flat-pack coeffs from each MB's next 128
|
||
* int16 (64 Cb + 64 Cr); dispatch into a planar Cb||Cr scratch
|
||
* buffer (W*H/4 each, concatenated W*H/2 total); CPU-interleave
|
||
* into the caller's NV12 UV plane post-dispatch.
|
||
* - Both dispatches pre-fill the scratch from the per-frame
|
||
* predicted_y / predicted_uv buffers (accumulated by append_mb's
|
||
* per-MB predicted-samples splat). The IDCT shader's
|
||
* `dst += idct(coeffs)` + clip255 then folds reconstruction into
|
||
* the IDCT pass — no separate Stage 3 dispatch needed.
|
||
*
|
||
* What's NOT done yet (follow-on Phase 1 sub-PRs):
|
||
* - Intra prediction: caller-driven (Q2 decision 2026-05-25, CPU
|
||
* intra-pred via FFmpeg NEON kernels). Caller writes the
|
||
* intra-predicted samples into mb_input.predicted; this dispatch
|
||
* consumes them as the IDCT-add starting state. GPU wavefront
|
||
* intra-pred (DESIGN.md Stage 2a) is no longer planned.
|
||
* - Motion compensation (Stage 2b): inter MBs not handled.
|
||
* - High-profile IDCT 8x8 (Stage 1 extension).
|
||
* - Chroma DC / luma Intra16x16 DC Hadamard pre-pass (currently we
|
||
* treat all chroma blocks as plain 4×4 AC IDCT; real decode needs
|
||
* the chroma DC 2×2 Hadamard contribution folded in).
|
||
* - Deblock (Stage 4).
|
||
* - dmabuf export — still memcpy-out to caller-provided planes.
|
||
* - Stage 5 RGBA opt-in.
|
||
* - GPU-side NV12 interleave — currently a CPU memcpy loop after
|
||
* the chroma dispatch. Trivial cost (~1 MB / frame at 1080p)
|
||
* vs the IDCT itself, but worth folding into a Stage-5 pass
|
||
* later for full-GPU residency.
|
||
*/
|
||
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
||
uint8_t *out_y, size_t y_stride,
|
||
uint8_t *out_uv, size_t uv_stride)
|
||
{
|
||
if (!dec)
|
||
return -1;
|
||
if (dec->mbs_appended != dec->n_mbs)
|
||
return -1; /* incomplete frame */
|
||
if (!out_y)
|
||
return -1;
|
||
|
||
int rc = 0;
|
||
|
||
/* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */
|
||
|
||
/* Two partitions of the per-MB luma section based on each MB's
|
||
* transform_8x8 flag:
|
||
*
|
||
* transform_8x8 == 0 → 16 4x4 blocks contribute to the 4x4
|
||
* dispatch (16 coeffs each).
|
||
* transform_8x8 == 1 → 4 8x8 blocks contribute to the 8x8
|
||
* dispatch (64 coeffs each).
|
||
*
|
||
* Both partitions can be non-empty in the same frame (FFmpeg sets
|
||
* transform_8x8_size_flag per MB), so we allocate worst-case for
|
||
* each and track actual counts.
|
||
*/
|
||
/* Pre-fill the dispatch scratch with the per-MB predicted samples
|
||
* accumulated by append_mb. daedalus-fourier's IDCT 4x4/8x8
|
||
* shaders implement FFmpeg `idct_add` semantics — dst += idct(coeffs)
|
||
* with clip255 — so a non-zero predicted dst becomes the
|
||
* reconstruction step (residual + predicted → clip) "for free",
|
||
* collapsing DESIGN.md's Stage 3 into Stage 1's existing dispatch. */
|
||
const size_t y_stride_int = (size_t) dec->width;
|
||
const size_t y_size = y_stride_int * (size_t) dec->height;
|
||
uint8_t *scratch_y = malloc(y_size);
|
||
if (scratch_y)
|
||
memcpy(scratch_y, dec->predicted_y, y_size);
|
||
|
||
const size_t worst_4x4 = (size_t) dec->n_mbs * 16;
|
||
const size_t worst_8x8 = (size_t) dec->n_mbs * 4;
|
||
int16_t *coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t));
|
||
int16_t *coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t));
|
||
daedalus_h264_block_meta *meta4 = malloc(worst_4x4 * sizeof(*meta4));
|
||
daedalus_h264_block_meta *meta8 = malloc(worst_8x8 * sizeof(*meta8));
|
||
|
||
if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
|
||
rc = -1;
|
||
goto cleanup;
|
||
}
|
||
|
||
/* Walk MBs in raster order, append each MB's luma blocks to the
|
||
* partition selected by its transform_8x8 flag.
|
||
*
|
||
* NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
|
||
* follows the z-scan from spec §6.4.3 / fig 6-10. We're using
|
||
* flat raster on the input side too (sb_y outer, sb_x inner) for
|
||
* Phase 1 self-consistency; the z-scan permutation is the
|
||
* libavcodec-intercept patch's responsibility.
|
||
*/
|
||
size_t bi4 = 0, bi8 = 0;
|
||
for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
|
||
for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
|
||
int mb_idx = mb_y * dec->mb_width + mb_x;
|
||
const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
|
||
const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
|
||
|
||
if (d->transform_8x8) {
|
||
/* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
|
||
for (int sb_y = 0; sb_y < 2; sb_y++) {
|
||
for (int sb_x = 0; sb_x < 2; sb_x++) {
|
||
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
|
||
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
|
||
meta8[bi8].dst_off = (uint32_t)
|
||
(px_y * y_stride_int + px_x);
|
||
int block_in_mb = sb_y * 2 + sb_x;
|
||
memcpy(&coeffs8[bi8 * 64],
|
||
&mb_coeffs[block_in_mb * 64],
|
||
64 * sizeof(int16_t));
|
||
bi8++;
|
||
}
|
||
}
|
||
} else {
|
||
/* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
|
||
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
|
||
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
|
||
meta4[bi4].dst_off = (uint32_t)
|
||
(px_y * y_stride_int + px_x);
|
||
int block_in_mb = sb_y * 4 + sb_x;
|
||
memcpy(&coeffs4[bi4 * 16],
|
||
&mb_coeffs[block_in_mb * 16],
|
||
16 * sizeof(int16_t));
|
||
bi4++;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
/* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */
|
||
|
||
/* ---- One Vulkan submit + wait per non-empty luma partition.
|
||
* AUTO substrate picks QPU per the post-decree recipe table; falls
|
||
* back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable.
|
||
* Skipping the dispatch when the partition is empty avoids the
|
||
* shader-pool warm-up cost on the common case (a typical Baseline
|
||
* stream is all-4x4 → 8x8 dispatch is no-op). */
|
||
const daedalus_substrate sub = map_substrate(dec->substrate);
|
||
if (bi4 > 0) {
|
||
int dr = daedalus_dispatch_h264_idct4(dec->dctx, sub,
|
||
scratch_y, y_stride_int,
|
||
coeffs4, bi4, meta4);
|
||
if (dr != 0) { rc = -3; goto cleanup; }
|
||
}
|
||
if (bi8 > 0) {
|
||
int dr = daedalus_dispatch_h264_idct8(dec->dctx, sub,
|
||
scratch_y, y_stride_int,
|
||
coeffs8, bi8, meta8);
|
||
if (dr != 0) { rc = -3; goto cleanup; }
|
||
}
|
||
|
||
/* ---- Luma deblock V then H ----
|
||
* Per H.264 §8.7 deblock order is V edges first, then H edges,
|
||
* within each MB. At frame scale we hit the same dependency: a
|
||
* row of V-filtered samples is the input to the H filter for
|
||
* the row's H edges. Order: V bS<4 + V bS=4 (independent edges,
|
||
* either order), barrier (implicit at each dispatch's wait), then
|
||
* H bS<4 + H bS=4. */
|
||
daedalus_h264_deblock_meta *dbk_meta = NULL;
|
||
if (dec->edges_count > 0) {
|
||
dbk_meta = malloc(dec->edges_count * sizeof(*dbk_meta));
|
||
if (!dbk_meta) { rc = -1; goto cleanup; }
|
||
|
||
int dr;
|
||
dr = dispatch_deblock_pass(dec, sub, 0, 0, 0,
|
||
scratch_y, y_stride_int, 0, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto cleanup; }
|
||
dr = dispatch_deblock_pass(dec, sub, 0, 0, 1,
|
||
scratch_y, y_stride_int, 0, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto cleanup; }
|
||
dr = dispatch_deblock_pass(dec, sub, 0, 1, 0,
|
||
scratch_y, y_stride_int, 0, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto cleanup; }
|
||
dr = dispatch_deblock_pass(dec, sub, 0, 1, 1,
|
||
scratch_y, y_stride_int, 0, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto cleanup; }
|
||
}
|
||
|
||
/* ---- Copy Y out to caller's plane at the requested stride. ---- */
|
||
for (int r = 0; r < dec->height; r++)
|
||
memcpy(out_y + (size_t) r * y_stride,
|
||
&scratch_y[(size_t) r * y_stride_int],
|
||
(size_t) dec->width);
|
||
|
||
/* ---- Build frame-scaled chroma 4×4 dispatch ---- */
|
||
/*
|
||
* 4:2:0 layout — chroma planes are (W/2) by (H/2), one Cb + one
|
||
* Cr per pixel pair. H.264 per-MB chroma is two 8×8 components,
|
||
* each split into 4 4×4 blocks, so 8 chroma 4×4 blocks per MB.
|
||
*
|
||
* We dispatch BOTH components in a single shader call against a
|
||
* planar scratch buffer:
|
||
* scratch_uv[0 .. cb_plane_size) — Cb plane (W/2 × H/2)
|
||
* scratch_uv[cb_plane_size .. 2*size) — Cr plane (W/2 × H/2)
|
||
*
|
||
* meta[i].dst_off is a flat offset into the scratch buffer (the
|
||
* shader treats dst+dst_off as a contiguous 4×4 with row pitch =
|
||
* stride), so Cr blocks just add cb_plane_size to their offset.
|
||
* Stride is W/2 (the chroma row width); this works because Cb and
|
||
* Cr planes share the same row pitch.
|
||
*
|
||
* Post-dispatch we interleave the two planes into NV12 UV layout
|
||
* on the CPU. Doing this on the GPU is a Stage-5 follow-up
|
||
* (would need a small "copy + interleave" shader); CPU memcpy
|
||
* loop is ~1 MB/frame at 1080p so it's not on the critical path.
|
||
*/
|
||
int16_t *chroma_coeffs = NULL;
|
||
daedalus_h264_block_meta *chroma_meta = NULL;
|
||
uint8_t *scratch_uv = NULL;
|
||
if (out_uv) {
|
||
const size_t n_chroma_blocks_per_mb = 8; /* 4 Cb + 4 Cr */
|
||
const size_t n_chroma_blocks =
|
||
(size_t) dec->n_mbs * n_chroma_blocks_per_mb;
|
||
const size_t chroma_w = (size_t) dec->width / 2;
|
||
const size_t chroma_h = (size_t) dec->height / 2;
|
||
const size_t cb_plane_size = chroma_w * chroma_h;
|
||
const size_t uv_scratch_size = 2 * cb_plane_size;
|
||
|
||
scratch_uv = malloc(uv_scratch_size);
|
||
if (scratch_uv)
|
||
memcpy(scratch_uv, dec->predicted_uv, uv_scratch_size);
|
||
chroma_coeffs = malloc(n_chroma_blocks * 16 * sizeof(int16_t));
|
||
chroma_meta = malloc(n_chroma_blocks *
|
||
sizeof(daedalus_h264_block_meta));
|
||
if (!scratch_uv || !chroma_coeffs || !chroma_meta) {
|
||
rc = -1;
|
||
goto chroma_cleanup;
|
||
}
|
||
|
||
size_t cbi = 0;
|
||
for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
|
||
for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
|
||
int mb_idx = mb_y * dec->mb_width + mb_x;
|
||
const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
|
||
/* Per-MB coeff layout (set by append_mb):
|
||
* [ 0 .. 256) — 16 luma 4×4 blocks
|
||
* [256 .. 320) — 4 Cb 4×4 blocks (raster sb_y*2+sb_x)
|
||
* [320 .. 384) — 4 Cr 4×4 blocks (raster sb_y*2+sb_x)
|
||
*/
|
||
for (int comp = 0; comp < 2; comp++) { /* 0=Cb 1=Cr */
|
||
size_t plane_base = (size_t) comp * cb_plane_size;
|
||
size_t coeff_base = 256u + (size_t) comp * 64u;
|
||
for (int sb_y = 0; sb_y < 2; sb_y++) {
|
||
for (int sb_x = 0; sb_x < 2; sb_x++) {
|
||
size_t px_y = (size_t) mb_y * 8 + (size_t) sb_y * 4;
|
||
size_t px_x = (size_t) mb_x * 8 + (size_t) sb_x * 4;
|
||
chroma_meta[cbi].dst_off = (uint32_t)
|
||
(plane_base + px_y * chroma_w + px_x);
|
||
|
||
int block_in_comp = sb_y * 2 + sb_x;
|
||
memcpy(&chroma_coeffs[cbi * 16],
|
||
&mb_coeffs[coeff_base + (size_t) block_in_comp * 16],
|
||
16 * sizeof(int16_t));
|
||
cbi++;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
/* assert cbi == n_chroma_blocks; loop math guarantees it */
|
||
|
||
int cr_rc = daedalus_dispatch_h264_idct4(dec->dctx, sub,
|
||
scratch_uv, chroma_w,
|
||
chroma_coeffs,
|
||
n_chroma_blocks,
|
||
chroma_meta);
|
||
if (cr_rc != 0) {
|
||
rc = -3;
|
||
goto chroma_cleanup;
|
||
}
|
||
|
||
/* ---- Chroma deblock V then H ----
|
||
* scratch_uv is PLANAR Cb||Cr with stride = chroma_w; both
|
||
* planes filtered in the same dispatch via Cb's dst_off and
|
||
* Cr's dst_off = cb_plane_size + (same). */
|
||
if (dec->edges_count > 0 && dbk_meta) {
|
||
int dr;
|
||
dr = dispatch_deblock_pass(dec, sub, 1, 0, 0,
|
||
scratch_uv, chroma_w,
|
||
cb_plane_size, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto chroma_cleanup; }
|
||
dr = dispatch_deblock_pass(dec, sub, 1, 0, 1,
|
||
scratch_uv, chroma_w,
|
||
cb_plane_size, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto chroma_cleanup; }
|
||
dr = dispatch_deblock_pass(dec, sub, 1, 1, 0,
|
||
scratch_uv, chroma_w,
|
||
cb_plane_size, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto chroma_cleanup; }
|
||
dr = dispatch_deblock_pass(dec, sub, 1, 1, 1,
|
||
scratch_uv, chroma_w,
|
||
cb_plane_size, dbk_meta);
|
||
if (dr != 0) { rc = -3; goto chroma_cleanup; }
|
||
}
|
||
|
||
/* CPU NV12 interleave: out_uv[r][2c+0] = Cb[r][c], [2c+1] = Cr. */
|
||
const uint8_t *cb_plane = scratch_uv;
|
||
const uint8_t *cr_plane = scratch_uv + cb_plane_size;
|
||
for (size_t r = 0; r < chroma_h; r++) {
|
||
uint8_t *dst_row = out_uv + r * uv_stride;
|
||
const uint8_t *cb_row = cb_plane + r * chroma_w;
|
||
const uint8_t *cr_row = cr_plane + r * chroma_w;
|
||
for (size_t c = 0; c < chroma_w; c++) {
|
||
dst_row[c * 2 + 0] = cb_row[c];
|
||
dst_row[c * 2 + 1] = cr_row[c];
|
||
}
|
||
}
|
||
|
||
chroma_cleanup:
|
||
free(chroma_meta);
|
||
free(chroma_coeffs);
|
||
free(scratch_uv);
|
||
if (rc != 0)
|
||
goto cleanup;
|
||
}
|
||
|
||
cleanup:
|
||
free(dbk_meta);
|
||
free(meta8);
|
||
free(meta4);
|
||
free(coeffs8);
|
||
free(coeffs4);
|
||
free(scratch_y);
|
||
|
||
/* Zero the predicted-samples buffers so the next frame starts from
|
||
* the all-zero-predictor baseline; MBs whose append_mb gets NULL
|
||
* for `predicted` then decode residual-only. */
|
||
if (dec->predicted_y)
|
||
memset(dec->predicted_y, 0, (size_t) dec->width * (size_t) dec->height);
|
||
if (dec->predicted_uv)
|
||
memset(dec->predicted_uv, 0, (size_t) dec->width * (size_t) dec->height / 2);
|
||
|
||
/* Reset edges_count for the next frame; capacity stays. */
|
||
dec->edges_count = 0;
|
||
|
||
dec->mbs_appended = 0;
|
||
return rc;
|
||
}
|
||
|
||
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
|
||
{
|
||
(void) dec; (void) plane;
|
||
/* TODO Phase 1: vkGetMemoryFdKHR on the DPB slot's VkImage memory. */
|
||
return -1;
|
||
}
|
||
|
||
int daedalus_decoder_has_qpu(const daedalus_decoder *dec)
|
||
{
|
||
if (!dec || !dec->dctx)
|
||
return 0;
|
||
return daedalus_ctx_has_qpu(dec->dctx);
|
||
}
|