Files
daedalus-decoder/src/daedalus_decoder.c
T
claude-noether adaabb1f63 phase1: IDCT 8x8 dispatch (High profile transform_8x8_size_flag)
Adds the High-profile 8x8 luma transform path alongside the existing
4x4 dispatch.  flush_frame now partitions macroblocks by each MB's
transform_8x8 flag and issues a separate luma dispatch per partition:

  - mb.transform_8x8 == 0 (Baseline/Main) → coeffs[0..256) interpreted
    as 16 4x4 blocks, fed to daedalus_recipe_dispatch_h264_idct4
    (existing behaviour, unchanged).
  - mb.transform_8x8 == 1 (High)          → coeffs[0..256) interpreted
    as 4 8x8 blocks (64 int16 each, column-major), fed to the new
    daedalus_recipe_dispatch_h264_idct8 call.

Both luma partitions can be non-empty in the same frame (FFmpeg sets
the flag per-MB).  Each non-empty partition costs one
vkQueueSubmit + vkQueueWaitIdle; empty partitions are skipped
(common case: Baseline streams skip the 8x8 dispatch entirely).

Chroma is unchanged — 4:2:0 chroma always uses the 4x4 transform.

API surface:
  - New uint8_t `transform_8x8` field in `struct daedalus_decoder_mb_input`
    (after deblock_*).  Backwards-compatible at the source level
    because the field defaults to 0 with C99 designated initialisers
    or {0} struct zeroing, both of which select the existing 4x4
    path.  ABI is pre-0.1 (per the header doc) so structural change
    is fine.
  - Mirrored in `struct daedalus_decoder_mb_desc` (internal layout).

Test changes:

  - test_idct_bitexact now exercises a mixed-mode frame: every odd
    raster MB uses 8x8, every even uses 4x4 (so flush_frame's
    partitioning is also under test, not just the underlying shaders).
  - 8x8 C reference (h264_idct8_butterfly + ref_idct8_add)
    transcribed from daedalus-fourier tests/h264_idct8_ref.c per
    H.264 §8.5.13.2.  Block layout column-major; +32 >> 6 rounding;
    add-to-predicted; clip255.
  - Reference luma compute branches per MB on the same mb_8x8[]
    array used to set the input flag.

Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0):

  $ ./build/test_idct_bitexact
  test_idct_bitexact: 320x240 (300 MBs), seed=0xfeedface5a5a5a5a
  MB mix: 150 4x4 MBs, 150 8x8 MBs
  Y bytes total:  76800
  Y bytes diff:   0 (0.0000%)
  Cb bytes total: 19200  diff: 0 (0.0000%)
  Cr bytes total: 19200  diff: 0 (0.0000%)
  BIT-EXACT PASS (Y + Cb + Cr)

  $ ctest --test-dir build
  100% tests passed, 0 tests failed out of 2

Bit-exact PASS first try for the 8x8 path — 150 8x8 MBs × 4 blocks =
600 8x8 IDCTs against the spec C reference, identical output.
Validates both the daedalus-fourier IDCT 8x8 shader (already gated
by its own cycle-7 bit-exact test, now also gated end-to-end through
our flush_frame), and our 8x8 layout assumptions (column-major coeffs,
raster sb_y*2+sb_x block order, top-left = mb*16 + sb*8).

What's NOT covered yet (deferred):

  - Z-scan permutation for FFmpeg compatibility (libavcodec intercept
    patch's concern; both 4x4 and 8x8 z-scans differ).
  - Chroma DC / luma Intra16x16 DC Hadamard pre-pass.
  - Mixed intra/inter MB handling — currently all MBs treated as
    residual-only (predicted=0).

Closes the "IDCT 8x8 (High profile)" item from PR #3's deferred list.
2026-05-24 22:41:05 +02:00

418 lines
16 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* daedalus-decoder — public C API implementation.
*
* Phase 1 work in progress. Frame flush already dispatches the luma
* (4x4 / 8x8) and chroma (4x4) IDCT through daedalus-fourier; dmabuf
* export is still a stub that returns -1. The remaining bodies will
* fill in across Phases 1-4 per DESIGN.md §8. The API surface
* compiles, links, and can be smoke-tested end-to-end (ctx create /
* append / flush / destroy).
*/
#include "internal.h"
#include <stdlib.h>
#include <string.h>
/* Normally injected with -D by CMakeLists; this fallback keeps a bare
 * compile of the file working. */
#if !defined(DAEDALUS_DECODER_VERSION)
#  define DAEDALUS_DECODER_VERSION "0.0.1+scaffold"
#endif

/* Returns the version string this translation unit was compiled with.
 * The pointer refers to a string literal; the caller must not free it. */
const char *daedalus_decoder_version(void)
{
    const char *version = DAEDALUS_DECODER_VERSION;

    return version;
}
/*
 * Allocate a decoder for frames of `width` x `height` luma pixels.
 *
 * Requirements on the dimensions:
 *   - both positive and a multiple of 16 (whole macroblocks only);
 *   - width * height must fit in 32 bits, because flush_frame stores
 *     each block's destination pixel offset in a uint32_t.  The same
 *     bound keeps the int product mb_width * mb_height from overflowing.
 *
 * The output format defaults to NV12.  Per-MB descriptor and coefficient
 * storage (384 int16 per MB) is allocated zeroed for the whole frame.
 *
 * Returns NULL when the dimensions are rejected, when the
 * daedalus-fourier context cannot be created, or when an allocation
 * fails.  Release the result with daedalus_decoder_destroy().
 */
daedalus_decoder *daedalus_decoder_create(int width, int height)
{
    if (width <= 0 || height <= 0)
        return NULL;
    if ((width & 15) || (height & 15))
        return NULL;                /* must be multiple of 16 */
    if ((uint64_t) width * (uint64_t) height > UINT32_MAX)
        return NULL;                /* pixel offsets must fit uint32_t */

    daedalus_decoder *dec = calloc(1, sizeof(*dec));
    if (!dec)
        return NULL;

    dec->width      = width;
    dec->height     = height;
    dec->mb_width   = width >> 4;
    dec->mb_height  = height >> 4;
    dec->n_mbs      = dec->mb_width * dec->mb_height;
    dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;

    /* daedalus-fourier ctx — required.  Without it the decoder is
     * unusable, so creation fails outright.  Whether the ctx that was
     * obtained is QPU-capable can be queried afterwards via
     * daedalus_decoder_has_qpu(). */
    dec->dctx = daedalus_ctx_create();
    if (!dec->dctx) {
        free(dec);
        return NULL;
    }

    dec->mb_descs = calloc((size_t) dec->n_mbs, sizeof(*dec->mb_descs));
    dec->coeffs   = calloc((size_t) dec->n_mbs * 384, sizeof(int16_t));
    if (!dec->mb_descs || !dec->coeffs) {
        /* destroy() tolerates the half-built state: free(NULL) is a no-op. */
        daedalus_decoder_destroy(dec);
        return NULL;
    }
    return dec;
}
/*
 * Tear down a decoder made by daedalus_decoder_create().
 *
 * Accepts NULL (no-op) and partially constructed decoders: the buffer
 * pointers may be NULL, and the daedalus-fourier context is only
 * destroyed when one is present.
 */
void daedalus_decoder_destroy(daedalus_decoder *dec)
{
    if (dec == NULL) {
        return;
    }

    free(dec->coeffs);
    free(dec->mb_descs);

    if (dec->dctx != NULL) {
        daedalus_ctx_destroy(dec->dctx);
    }

    free(dec);
}
/*
 * Choose the output pixel format for subsequent frames.
 *
 * Only allowed between frames, i.e. while no macroblock has been
 * appended since the last flush.  Accepts NV12 and RGBA.
 *
 * Returns 0 on success, -1 for a NULL decoder, a mid-frame call, or an
 * unrecognised format.
 */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                       daedalus_decoder_output_format fmt)
{
    if (dec == NULL) {
        return -1;
    }
    if (dec->mbs_appended != 0) {
        return -1;              /* mid-frame change forbidden */
    }

    switch (fmt) {
    case DAEDALUS_DECODER_OUTPUT_NV12:
    case DAEDALUS_DECODER_OUTPUT_RGBA:
        dec->output_fmt = fmt;
        return 0;
    default:
        return -1;
    }
}
/*
 * Queue one macroblock for the frame currently being assembled.
 *
 * The MB's header fields are copied into the decoder's internal
 * descriptor array and its 384 int16 coefficients into the frame-wide
 * coefficient buffer, so the caller's `mb` need not outlive the call.
 *
 * Macroblocks must arrive in strict raster order: the MB at
 * (mb_x, mb_y) is accepted only when exactly mb_y * mb_width + mb_x
 * macroblocks have been appended before it.
 *
 * Returns 0 on success, -1 on NULL arguments, out-of-range coordinates,
 * or an out-of-order macroblock.
 */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
                               const struct daedalus_decoder_mb_input *mb)
{
    if (dec == NULL || mb == NULL || mb->coeffs == NULL) {
        return -1;
    }
    if (mb->mb_x >= dec->mb_width || mb->mb_y >= dec->mb_height) {
        return -1;
    }

    /* Raster-order gate — Phase 1's intra wavefront depends on it.
     * libavcodec's slice loop emits raster order naturally, so a
     * rejection here points at a caller bug. */
    int slot = mb->mb_y * dec->mb_width + mb->mb_x;
    if (slot != dec->mbs_appended) {
        return -1;
    }

    struct daedalus_decoder_mb_desc *desc = &dec->mb_descs[slot];

    /* Position, type and quantisers. */
    desc->mb_x     = mb->mb_x;
    desc->mb_y     = mb->mb_y;
    desc->mb_type  = mb->mb_type;
    desc->mb_qp_y  = mb->mb_qp_y;
    desc->mb_qp_uv = mb->mb_qp_uv;
    desc->cbp      = mb->cbp;

    /* Intra prediction modes. */
    memcpy(desc->intra_4x4_modes, mb->intra_4x4_modes, 16);
    desc->intra_16x16_mode  = mb->intra_16x16_mode;
    desc->intra_chroma_mode = mb->intra_chroma_mode;

    /* Inter partitioning, reference indices and motion vectors. */
    desc->partition_mode = mb->partition_mode;
    memcpy(desc->ref_idx_l0, mb->ref_idx_l0, 4);
    memcpy(desc->ref_idx_l1, mb->ref_idx_l1, 4);
    memcpy(desc->mv_l0, mb->mv_l0, sizeof(desc->mv_l0));
    memcpy(desc->mv_l1, mb->mv_l1, sizeof(desc->mv_l1));

    /* Deblocking parameters and the luma transform-size flag. */
    desc->deblock_disable  = mb->deblock_disable;
    desc->deblock_alpha_c0 = mb->deblock_alpha_c0;
    desc->deblock_beta     = mb->deblock_beta;
    desc->transform_8x8    = mb->transform_8x8;

    /* 384 coefficients per MB, stored at this MB's raster slot. */
    memcpy(&dec->coeffs[(size_t) slot * 384],
           mb->coeffs,
           384 * sizeof(int16_t));

    dec->mbs_appended++;
    return 0;
}
/* Phase 1 stage 1 — frame-scaled IDCT dispatch (luma 4x4 + 8x8, chroma 4x4).
 *
 * Brings up the GPU substrate by calling daedalus-fourier's
 * `daedalus_recipe_dispatch_h264_idct4` / `_idct8` at frame batch
 * granularity, in contrast to the substitution-arc shim that called
 * idct4 with n_blocks = 1 per call.  At most three Vulkan submits +
 * waits per frame (luma 4x4, luma 8x8, chroma 4x4; an empty luma
 * partition is skipped) instead of millions of per-block dispatches.
 *
 * What's done in this stage:
 *   - Luma: MBs are partitioned by their transform_8x8 flag.  4x4 MBs
 *     contribute 16 blocks of 16 coeffs, 8x8 MBs contribute 4 blocks of
 *     64 coeffs, both taken from the MB's first 256 int16.  Each
 *     non-empty partition is dispatched into the same frame-sized
 *     zero-initialised Y scratch plane.
 *   - Chroma: build an interleaved Cb+Cr meta[] (n_blocks = N_MBs × 8,
 *     4 Cb + 4 Cr per MB); flat-pack coeffs from each MB's next 128
 *     int16 (64 Cb + 64 Cr); dispatch into a planar Cb||Cr scratch
 *     buffer (W*H/4 each, concatenated W*H/2 total); CPU-interleave
 *     into the caller's NV12 UV plane post-dispatch.
 *   - All dispatches use predicted=0 (the scratch buffers are
 *     calloc'd); the shader does clip255(predicted + idct(coeffs)).
 *
 * What's NOT done yet (follow-on Phase 1 sub-PRs):
 *   - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
 *     so output pixels are residual-only and not a valid frame decode.
 *     Sufficient for Vulkan round-trip validation, not for bit-exact
 *     against FFmpeg.
 *   - Motion compensation (Stage 2b): inter MBs not handled.
 *   - Z-scan permutation of the per-MB block order (left to the
 *     libavcodec-intercept patch; blocks are flat raster here).
 *   - Chroma DC / luma Intra16x16 DC Hadamard pre-pass (currently we
 *     treat all chroma blocks as plain 4×4 AC IDCT; real decode needs
 *     the chroma DC 2×2 Hadamard contribution folded in).
 *   - Deblock (Stage 4).
 *   - dmabuf export — still memcpy-out to caller-provided planes.
 *   - Stage 5 RGBA opt-in.
 *   - GPU-side NV12 interleave — currently a CPU memcpy loop after
 *     the chroma dispatch.  Trivial cost (~1 MB / frame at 1080p)
 *     vs the IDCT itself, but worth folding into a Stage-5 pass
 *     later for full-GPU residency.
 *
 * Parameters:
 *   dec       decoder holding a complete frame (every MB appended).
 *   out_y     destination luma plane, required.  `height` rows of
 *             `width` bytes are written, rows `y_stride` bytes apart.
 *   y_stride  luma row pitch in bytes; must be >= width.
 *   out_uv    destination interleaved CbCr (NV12) plane, or NULL to
 *             skip the chroma dispatch and write no chroma.
 *   uv_stride chroma row pitch in bytes; must be >= width when out_uv
 *             is given (each chroma row holds width/2 Cb/Cr pairs).
 *
 * Returns 0 on success, -1 on invalid arguments, an incomplete frame or
 * allocation failure, -3 when a daedalus-fourier dispatch reports
 * failure.  Argument/completeness rejections leave the appended-MB
 * count untouched; once work has started the count is reset to 0 on
 * every exit path, success or failure.  If the chroma stage fails, the
 * luma plane has already been written to out_y.
 */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                 uint8_t *out_y, size_t y_stride,
                                 uint8_t *out_uv, size_t uv_stride)
{
    if (!dec)
        return -1;
    if (dec->mbs_appended != dec->n_mbs)
        return -1;              /* incomplete frame */
    if (!out_y)
        return -1;
    /* A pitch narrower than one row would make the copy-out loops below
     * overlap rows and run past a buffer sized stride * rows. */
    if (y_stride < (size_t) dec->width)
        return -1;
    if (out_uv && uv_stride < (size_t) dec->width)
        return -1;

    int rc = 0;

    /* Every per-frame buffer starts NULL so the single cleanup label can
     * free them all unconditionally, whichever stage bailed out. */
    uint8_t *scratch_y = NULL;
    uint8_t *scratch_uv = NULL;
    int16_t *coeffs4 = NULL;
    int16_t *coeffs8 = NULL;
    int16_t *chroma_coeffs = NULL;
    daedalus_h264_block_meta *meta4 = NULL;
    daedalus_h264_block_meta *meta8 = NULL;
    daedalus_h264_block_meta *chroma_meta = NULL;

    /* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */
    /* Two partitions of the per-MB luma section based on each MB's
     * transform_8x8 flag:
     *
     *   transform_8x8 == 0 → 16 4x4 blocks contribute to the 4x4
     *                        dispatch (16 coeffs each).
     *   transform_8x8 != 0 → 4 8x8 blocks contribute to the 8x8
     *                        dispatch (64 coeffs each).
     *
     * Both partitions can be non-empty in the same frame (FFmpeg sets
     * transform_8x8_size_flag per MB), so we allocate worst-case for
     * each and track actual counts.
     */
    const size_t y_stride_int = (size_t) dec->width;
    const size_t y_size = y_stride_int * (size_t) dec->height;
    const size_t worst_4x4 = (size_t) dec->n_mbs * 16;
    const size_t worst_8x8 = (size_t) dec->n_mbs * 4;

    scratch_y = calloc(1, y_size);
    coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t));
    coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t));
    /* Only dst_off is filled in below; zeroing keeps any other meta
     * field deterministic instead of malloc garbage. */
    meta4 = calloc(worst_4x4, sizeof(*meta4));
    meta8 = calloc(worst_8x8, sizeof(*meta8));
    if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
        rc = -1;
        goto cleanup;
    }

    /* Walk MBs in raster order, append each MB's luma blocks to the
     * partition selected by its transform_8x8 flag.
     *
     * NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
     * follows the z-scan from spec §6.4.3 / fig 6-10.  We're using
     * flat raster on the input side too (sb_y outer, sb_x inner) for
     * Phase 1 self-consistency; the z-scan permutation is the
     * libavcodec-intercept patch's responsibility.
     */
    size_t bi4 = 0, bi8 = 0;
    for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
        for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
            int mb_idx = mb_y * dec->mb_width + mb_x;
            const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
            const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];

            if (d->transform_8x8) {
                /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
                        meta8[bi8].dst_off = (uint32_t)
                            (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 2 + sb_x;
                        memcpy(&coeffs8[bi8 * 64],
                               &mb_coeffs[block_in_mb * 64],
                               64 * sizeof(int16_t));
                        bi8++;
                    }
                }
            } else {
                /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
                for (int sb_y = 0; sb_y < 4; sb_y++) {
                    for (int sb_x = 0; sb_x < 4; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
                        meta4[bi4].dst_off = (uint32_t)
                            (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 4 + sb_x;
                        memcpy(&coeffs4[bi4 * 16],
                               &mb_coeffs[block_in_mb * 16],
                               16 * sizeof(int16_t));
                        bi4++;
                    }
                }
            }
        }
    }
    /* bi4 + bi8*4 == n_mbs*16 here; the loop math guarantees it. */

    /* ---- One Vulkan submit + wait per non-empty luma partition.
     * AUTO substrate picks QPU per the post-decree recipe table; falls
     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable.
     * Skipping the dispatch when the partition is empty avoids the
     * shader-pool warm-up cost on the common case (a typical Baseline
     * stream is all-4x4 → 8x8 dispatch is no-op). */
    if (bi4 > 0) {
        int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
                                                     scratch_y, y_stride_int,
                                                     coeffs4, bi4, meta4);
        if (dr != 0) {
            rc = -3;
            goto cleanup;
        }
    }
    if (bi8 > 0) {
        int dr = daedalus_recipe_dispatch_h264_idct8(dec->dctx,
                                                     scratch_y, y_stride_int,
                                                     coeffs8, bi8, meta8);
        if (dr != 0) {
            rc = -3;
            goto cleanup;
        }
    }

    /* ---- Copy Y out to caller's plane at the requested stride. ---- */
    for (int r = 0; r < dec->height; r++)
        memcpy(out_y + (size_t) r * y_stride,
               &scratch_y[(size_t) r * y_stride_int],
               (size_t) dec->width);

    /* ---- Build frame-scaled chroma 4×4 dispatch ---- */
    /*
     * 4:2:0 layout — chroma planes are (W/2) by (H/2), one Cb + one
     * Cr per pixel pair.  H.264 per-MB chroma is two 8×8 components,
     * each split into 4 4×4 blocks, so 8 chroma 4×4 blocks per MB.
     *
     * We dispatch BOTH components in a single shader call against a
     * planar scratch buffer:
     *   scratch_uv[0 .. cb_plane_size)         — Cb plane (W/2 × H/2)
     *   scratch_uv[cb_plane_size .. 2*size)    — Cr plane (W/2 × H/2)
     *
     * meta[i].dst_off is a flat offset into the scratch buffer (the
     * shader treats dst+dst_off as a contiguous 4×4 with row pitch =
     * stride), so Cr blocks just add cb_plane_size to their offset.
     * Stride is W/2 (the chroma row width); this works because Cb and
     * Cr planes share the same row pitch.
     *
     * Post-dispatch we interleave the two planes into NV12 UV layout
     * on the CPU.  Doing this on the GPU is a Stage-5 follow-up
     * (would need a small "copy + interleave" shader); CPU memcpy
     * loop is ~1 MB/frame at 1080p so it's not on the critical path.
     */
    if (out_uv) {
        const size_t n_chroma_blocks_per_mb = 8;   /* 4 Cb + 4 Cr */
        const size_t n_chroma_blocks =
            (size_t) dec->n_mbs * n_chroma_blocks_per_mb;
        const size_t chroma_w = (size_t) dec->width / 2;
        const size_t chroma_h = (size_t) dec->height / 2;
        const size_t cb_plane_size = chroma_w * chroma_h;
        const size_t uv_scratch_size = 2 * cb_plane_size;

        scratch_uv = calloc(1, uv_scratch_size);
        chroma_coeffs = malloc(n_chroma_blocks * 16 * sizeof(int16_t));
        chroma_meta = calloc(n_chroma_blocks, sizeof(*chroma_meta));
        if (!scratch_uv || !chroma_coeffs || !chroma_meta) {
            rc = -1;
            goto cleanup;
        }

        size_t cbi = 0;
        for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
            for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
                int mb_idx = mb_y * dec->mb_width + mb_x;
                const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
                /* Per-MB coeff layout (set by append_mb):
                 *   [  0 .. 256) — luma: 16 4×4 blocks, or 4 8×8 blocks
                 *                  when the MB's transform_8x8 is set
                 *   [256 .. 320) — 4 Cb 4×4 blocks (raster sb_y*2+sb_x)
                 *   [320 .. 384) — 4 Cr 4×4 blocks (raster sb_y*2+sb_x)
                 */
                for (int comp = 0; comp < 2; comp++) {   /* 0=Cb 1=Cr */
                    size_t plane_base = (size_t) comp * cb_plane_size;
                    size_t coeff_base = 256u + (size_t) comp * 64u;
                    for (int sb_y = 0; sb_y < 2; sb_y++) {
                        for (int sb_x = 0; sb_x < 2; sb_x++) {
                            size_t px_y = (size_t) mb_y * 8 + (size_t) sb_y * 4;
                            size_t px_x = (size_t) mb_x * 8 + (size_t) sb_x * 4;
                            chroma_meta[cbi].dst_off = (uint32_t)
                                (plane_base + px_y * chroma_w + px_x);
                            int block_in_comp = sb_y * 2 + sb_x;
                            memcpy(&chroma_coeffs[cbi * 16],
                                   &mb_coeffs[coeff_base +
                                              (size_t) block_in_comp * 16],
                                   16 * sizeof(int16_t));
                            cbi++;
                        }
                    }
                }
            }
        }
        /* cbi == n_chroma_blocks here; the loop math guarantees it. */

        int cr_rc = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
                                                        scratch_uv, chroma_w,
                                                        chroma_coeffs,
                                                        n_chroma_blocks,
                                                        chroma_meta);
        if (cr_rc != 0) {
            rc = -3;
            goto cleanup;
        }

        /* CPU NV12 interleave: out_uv[r][2c+0] = Cb[r][c], [2c+1] = Cr. */
        const uint8_t *cb_plane = scratch_uv;
        const uint8_t *cr_plane = scratch_uv + cb_plane_size;
        for (size_t r = 0; r < chroma_h; r++) {
            uint8_t *dst_row = out_uv + r * uv_stride;
            const uint8_t *cb_row = cb_plane + r * chroma_w;
            const uint8_t *cr_row = cr_plane + r * chroma_w;
            for (size_t c = 0; c < chroma_w; c++) {
                dst_row[c * 2 + 0] = cb_row[c];
                dst_row[c * 2 + 1] = cr_row[c];
            }
        }
    }

cleanup:
    free(chroma_meta);
    free(chroma_coeffs);
    free(scratch_uv);
    free(meta8);
    free(meta4);
    free(coeffs8);
    free(coeffs4);
    free(scratch_y);
    /* The frame is consumed whether or not the flush succeeded; the
     * caller starts appending the next frame from MB 0. */
    dec->mbs_appended = 0;
    return rc;
}
/*
 * Export one output plane as a dmabuf.  Not implemented yet: both
 * arguments are ignored and the call always returns -1.
 */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
{
    /* TODO Phase 1: vkGetMemoryFdKHR on the DPB slot's VkImage memory. */
    (void) plane;
    (void) dec;

    return -1;
}
int daedalus_decoder_has_qpu(const daedalus_decoder *dec)
{
if (!dec || !dec->dctx)
return 0;
return daedalus_ctx_has_qpu(dec->dctx);
}