58848bd162
Replaces the chroma placeholder (memset 128) with a real frame-scaled
4x4 IDCT dispatch for the Cb and Cr components. Two Vulkan submits +
waits per frame now (one luma, one chroma) instead of one + memset.
Implementation:
- One combined planar scratch buffer (W*H/2 bytes) holds Cb then Cr;
a single `daedalus_recipe_dispatch_h264_idct4` call processes both
components by setting meta[].dst_off accordingly (Cr blocks add
cb_plane_size).
- Stride = W/2 (chroma row pitch); shared between Cb and Cr since
they have identical geometry.
- Per-MB coeff layout already had [256..320) for Cb and [320..384)
for Cr (4 raster-order 4x4 blocks per component) from the original
daedalus_decoder_append_mb design — no header-side changes.
- Post-dispatch CPU memcpy loop interleaves Cb[r][c] and Cr[r][c]
into NV12 UV at out_uv[r][2c..2c+1]. ~1 MB/frame at 1080p, well
off the critical path; a GPU-side interleave shader is a Stage-5
optimisation.
- Chroma dispatch is gated on out_uv != NULL so callers that only
want luma (e.g. the bit-exact test before this PR) still pay
nothing.
Test changes:
- tests/test_idct_bitexact.c extended with parallel reference IDCT
for Cb and Cr planes (W/2 x H/2 each), then deinterleaves NV12 UV
back into Cb/Cr for the compare. Random coeffs in [-512, 511] for
all 384 per-MB int16 slots (previously only luma was randomised).
- tests/test_smoke.c UV expectation flipped from "all 128 placeholder"
to "all 0" (real dispatch with zero coeffs). Sentinel 0xcd
pre-fill stays — same purpose: catches read-then-write bugs.
Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0):
$ ctest --test-dir build --output-on-failure
Start 1: smoke
1/2 Test #1: smoke ............................ Passed 1.27 sec
Start 2: idct_bitexact
2/2 Test #2: idct_bitexact .................... Passed 0.05 sec
100% tests passed, 0 tests failed out of 2
$ ./build/test_idct_bitexact
test_idct_bitexact: 320x240 (300 MBs), seed=0xfeedface5a5a5a5a
Y bytes total: 76800
Y bytes diff: 0 (0.0000%)
Cb bytes total: 19200 diff: 0 (0.0000%)
Cr bytes total: 19200 diff: 0 (0.0000%)
BIT-EXACT PASS (Y + Cb + Cr)
$ ./build/test_smoke
daedalus-decoder version: 0.0.1
ctx created: 1920x1088, has_qpu=1
appended 8160 MBs (120x68)
flush_frame rc=0
Y non-zero bytes: 0 / 2088960
UV non-zero bytes: 0 / 1044480
smoke OK
(Smoke's 1.27s includes the 1080p frame: 8160 MBs * 16 = 130,560 luma
blocks + 8160 * 8 = 65,280 chroma blocks across two dispatches —
shader pool warm-up dominates the wall time, not the IDCT work.)
What's NOT covered yet (deferred):
- Chroma DC (2x2 Hadamard) / Intra16x16 luma DC (4x4 Hadamard) pre-pass. Real H.264
chroma puts the per-block DC coefficient through a Hadamard before
it's added to the AC block; we currently treat all chroma blocks as
plain 4x4 AC. Will land alongside the libavcodec intercept patch,
since CABAC/CAVLC is where the DC vs AC distinction is exposed.
- Z-scan permutation for FFmpeg compatibility — only matters at the
intercept boundary, not here.
- IDCT 8x8 (High profile).
Closes the "chroma is a stub" item from PR #3's "what's NOT done" list.
166 lines
6.0 KiB
C
166 lines
6.0 KiB
C
/* SPDX-License-Identifier: BSD-2-Clause */
|
|
/*
 * Smoke test — verifies the daedalus-decoder library links cleanly
 * against daedalus-fourier, that the lifecycle entry points reject bad
 * input without crashing, and that a full-frame flush with all-zero
 * coefficients round-trips and produces all-zero Y and UV planes.
 *
 * Returns 0 on success (or when no usable decoder context can be
 * created, which is treated as a skip), non-zero on any unexpected
 * behaviour.
 */
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "daedalus_decoder.h"
|
|
|
|
/*
 * EXPECT(cond, msg) — test expectation for use inside a function that
 * returns int.
 *
 * If `cond` evaluates to false, prints
 *     EXPECT FAIL <file>:<line>: <msg>
 * to stderr and makes the enclosing function return 1. If `cond` is true
 * the macro does nothing. `cond` is evaluated exactly once.
 *
 * The do { ... } while (0) wrapper makes the expansion a single statement,
 * so the macro is safe in an unbraced if/else.
 */
#define EXPECT(cond, msg) do {                                                  \
    if (!(cond)) {                                                              \
        fprintf(stderr, "EXPECT FAIL %s:%d: %s\n", __FILE__, __LINE__, (msg));  \
        return 1;                                                               \
    }                                                                           \
} while (0)
|
|
|
|
int main(void)
|
|
{
|
|
printf("daedalus-decoder version: %s\n", daedalus_decoder_version());
|
|
|
|
/* Create / destroy null is a no-op. */
|
|
daedalus_decoder_destroy(NULL);
|
|
|
|
/* Bad dimensions rejected. */
|
|
EXPECT(daedalus_decoder_create(0, 0 ) == NULL, "zero dims must reject");
|
|
EXPECT(daedalus_decoder_create(1919, 1088) == NULL, "non-16-multiple width must reject");
|
|
EXPECT(daedalus_decoder_create(1920, 1079) == NULL, "non-16-multiple height must reject");
|
|
|
|
/* Valid 1088p create. */
|
|
daedalus_decoder *dec = daedalus_decoder_create(1920, 1088);
|
|
if (!dec) {
|
|
/* Vulkan init failure on this host — degrades to skip, not fail.
|
|
* (CI runners without V3D7 will hit this; the smoke test
|
|
* shouldn't gate on hardware presence.) */
|
|
fprintf(stderr, "SKIP: daedalus_decoder_create returned NULL "
|
|
"(Vulkan / V3D7 unavailable on this host)\n");
|
|
return 0;
|
|
}
|
|
|
|
printf("ctx created: 1920x1088, has_qpu=%d\n",
|
|
daedalus_decoder_has_qpu(dec));
|
|
|
|
/* set_output_format mid-frame on virgin ctx is allowed
|
|
* (mbs_appended == 0). */
|
|
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_RGBA) == 0,
|
|
"switch to RGBA on virgin ctx");
|
|
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_NV12) == 0,
|
|
"switch back to NV12");
|
|
|
|
/* Append rejects out-of-bounds + null inputs. */
|
|
int16_t coeffs[384] = {0};
|
|
struct daedalus_decoder_mb_input mb = {0};
|
|
mb.coeffs = coeffs;
|
|
|
|
mb.mb_x = 0; mb.mb_y = 0;
|
|
EXPECT(daedalus_decoder_append_mb(dec, NULL) == -1, "null mb rejects");
|
|
{
|
|
struct daedalus_decoder_mb_input mb2 = mb;
|
|
mb2.coeffs = NULL;
|
|
EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1, "null coeffs rejects");
|
|
}
|
|
{
|
|
struct daedalus_decoder_mb_input mb2 = mb;
|
|
mb2.mb_x = 9999; mb2.mb_y = 9999;
|
|
EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1, "OOB coords reject");
|
|
}
|
|
|
|
/* Append first MB at raster index 0 — should succeed. */
|
|
EXPECT(daedalus_decoder_append_mb(dec, &mb) == 0, "append (0,0)");
|
|
|
|
/* Skipping (0,1) and appending (1,0) violates raster order — reject. */
|
|
{
|
|
struct daedalus_decoder_mb_input mb2 = mb;
|
|
mb2.mb_x = 0; mb2.mb_y = 1;
|
|
EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1,
|
|
"out-of-raster-order rejects");
|
|
}
|
|
|
|
/* In-order: (1,0). */
|
|
mb.mb_x = 1; mb.mb_y = 0;
|
|
EXPECT(daedalus_decoder_append_mb(dec, &mb) == 0, "append (1,0)");
|
|
|
|
/* Flush an incomplete frame: should fail because mbs_appended != n_mbs. */
|
|
EXPECT(daedalus_decoder_flush_frame(dec, NULL, 0, NULL, 0) == -1,
|
|
"incomplete-frame flush rejects");
|
|
|
|
/* set_output_format mid-frame (mbs_appended > 0) must reject. */
|
|
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_RGBA) == -1,
|
|
"mid-frame format change rejects");
|
|
|
|
daedalus_decoder_destroy(dec);
|
|
|
|
/* ---- Full-frame round-trip with all-zero coefficients.
|
|
* Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT
|
|
* dispatch and a successful GPU round-trip returns 0. IDCT of
|
|
* all-zero coefficients with zero-initialised predicted = all-zero
|
|
* output pixels. */
|
|
dec = daedalus_decoder_create(1920, 1088);
|
|
if (!dec) {
|
|
fprintf(stderr, "SKIP roundtrip: ctx create failed\n");
|
|
return 0;
|
|
}
|
|
|
|
static int16_t zero_coeffs[384] = {0};
|
|
struct daedalus_decoder_mb_input zmb = {0};
|
|
zmb.coeffs = zero_coeffs;
|
|
|
|
int mb_width = 1920 / 16; /* 120 */
|
|
int mb_height = 1088 / 16; /* 68 */
|
|
int n_mbs = mb_width * mb_height;
|
|
|
|
for (int mby = 0; mby < mb_height; mby++) {
|
|
for (int mbx = 0; mbx < mb_width; mbx++) {
|
|
zmb.mb_x = (uint16_t) mbx;
|
|
zmb.mb_y = (uint16_t) mby;
|
|
if (daedalus_decoder_append_mb(dec, &zmb) != 0) {
|
|
fprintf(stderr, "append (%d, %d) failed\n", mbx, mby);
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height);
|
|
|
|
size_t y_size = (size_t) 1920 * 1088;
|
|
size_t uv_size = (size_t) 1920 * 1088 / 2;
|
|
uint8_t *out_y = malloc(y_size);
|
|
uint8_t *out_uv = malloc(uv_size);
|
|
/* Pre-fill with sentinel so any read-then-write bug becomes visible. */
|
|
memset(out_y, 0xab, y_size);
|
|
memset(out_uv, 0xcd, uv_size);
|
|
|
|
int frc = daedalus_decoder_flush_frame(dec, out_y, 1920, out_uv, 1920);
|
|
printf("flush_frame rc=%d\n", frc);
|
|
EXPECT(frc == 0, "flush succeeds on full frame");
|
|
|
|
/* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */
|
|
int y_nz = 0;
|
|
for (size_t i = 0; i < y_size; i++)
|
|
if (out_y[i] != 0) y_nz++;
|
|
printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
|
|
EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");
|
|
|
|
/* UV plane should be all zero now (real chroma IDCT runs with
|
|
* zero coeffs → zero residual → clip255(0+0) = 0). Previously a
|
|
* 128 placeholder when chroma was a memset stub; this PR replaced
|
|
* that with the real dispatch. Sentinel 0xcd above guarantees we
|
|
* are observing post-dispatch writes, not the leftover memset. */
|
|
int uv_nz = 0;
|
|
for (size_t i = 0; i < uv_size; i++)
|
|
if (out_uv[i] != 0) uv_nz++;
|
|
printf("UV non-zero bytes: %d / %zu\n", uv_nz, uv_size);
|
|
EXPECT(uv_nz == 0, "UV plane all zero for zero-coeff frame");
|
|
|
|
free(out_y);
|
|
free(out_uv);
|
|
daedalus_decoder_destroy(dec);
|
|
|
|
printf("smoke OK\n");
|
|
return 0;
|
|
}
|