a7a0d56ecd
First concrete deliverable on the daedalus-decoder Stage 2 path post
the 2026-05-25 architecture re-pin (memory: dejavu / frame-major UMA).
Q2 decision: CPU intra prediction. libavcodec's existing NEON intra
prediction kernels generate predicted samples per MB; daedalus-decoder
accepts those samples through the API and uses them as the IDCT-add
starting state. FFmpeg's `idct_add` semantics — dst += idct(coeffs);
clip255 — fold DESIGN.md's Stage 3 reconstruction into the existing
Stage 1 IDCT dispatch for free. No new GPU work.
API change
----------
`daedalus_decoder_mb_input` gains a `const uint8_t *predicted` field:
predicted [ 0 .. 256) — 16×16 luma, row-major raster
predicted [256 .. 320) — 8×8 Cb, row-major raster
predicted [320 .. 384) — 8×8 Cr, row-major raster
NULL is legal and equivalent to all-zero predicted samples — preserves
the existing IDCT-isolation test contract.
Internal changes
----------------
- `daedalus_decoder` gains predicted_y (W×H) and predicted_uv (planar
Cb||Cr, W×H/2) buffers allocated at create, zeroed at end of every
flush_frame so NULL `mb->predicted` is indistinguishable from
explicit zeros from one frame to the next.
- `append_mb` splats mb->predicted into predicted_y/_uv at raster
(mb_y*16, mb_x*16) for luma and (mb_y*8, mb_x*8) for each chroma
component.
- `flush_frame` replaces `calloc(scratch_y)` and `calloc(scratch_uv)`
with `malloc + memcpy from predicted_y/_uv` — the IDCT dispatch
then writes residual on top, clip-adding to the predicted samples
in place.
Test
----
`test_idct_bitexact` extended:
- Generates random predicted samples (uint8_t) per MB alongside the
existing random coeffs.
- Pre-fills the reference ref_y / ref_cb / ref_cr planes with those
same predicted samples at the corresponding raster positions
BEFORE applying ref_idct4_add / ref_idct8_add per block.
- Compares GPU output to reference byte-for-byte.
Result on hertz (Pi 5 V3D 7.1), all three substrates:
test_idct_bitexact 320 240 0xfeedface5a5a5a5a {cpu, qpu, auto}
Y bytes diff: 0/76800 (0.0000%)
Cb bytes diff: 0/19200 (0.0000%)
Cr bytes diff: 0/19200 (0.0000%)
BIT-EXACT PASS on all three substrates
Catches any silent drift between substrates and any predicted-samples
plumbing mistake on either the API or the dispatch side.
Followups
---------
- Stage 2 PR-b: deblock dispatch in flush_frame.
- Stage 2 daemon refactor (parallel, daedalus-v4l2 daemon): replace
avcodec_send_packet/receive_frame with a libavcodec-parser-only
path that drives daedalus_decoder_append_mb in raster order +
flush_frame at slice boundary.
447 lines
18 KiB
C
447 lines
18 KiB
C
/* SPDX-License-Identifier: BSD-2-Clause */
|
||
/*
|
||
* test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-
|
||
* scaled luma + chroma IDCT 4×4 / 8×8 dispatch + Stage 2 predicted-
|
||
* samples plumbing.
|
||
*
|
||
* Generates a frame of random coefficients AND random predicted
|
||
* samples per MB, runs daedalus_decoder (which writes the predicted
|
||
* samples into its frame-scoped predicted_y/_uv buffers via
|
||
* append_mb, then pre-fills the IDCT dispatch scratch from them in
|
||
* flush_frame), and compares every output byte against an inline C
|
||
* reference that mirrors the H.264 §8.5.12.1 1D butterfly applied
|
||
* to the same predicted+coeffs inputs.
|
||
*
|
||
* Why "bit-exact": the GPU shader and the C reference apply the same
|
||
* integer arithmetic. Any rounding / sign / overflow disagreement is
|
||
* a bug. Pass = every output byte matches.
|
||
*
|
||
* Scope match with flush_frame: the test mirrors flush_frame's
|
||
* per-MB → flat block layout (raster scan within MB, no z-scan
|
||
* permutation). That keeps the test focused on IDCT correctness;
|
||
* the z-scan permutation that bridges to libavcodec's per-MB coeffs
|
||
* layout is a separate concern (handled in the eventual libavcodec-
|
||
* intercept patch).
|
||
*
|
||
* Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved).
|
||
* Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use
|
||
* transform_8x8=0 (16 luma 4x4 blocks); both partitions are
|
||
* exercised in the same frame so the flush_frame partitioning logic
|
||
* is also under test, not just the underlying shaders. Random coeffs
|
||
* for all components; reference IDCT applied per block. The chroma
|
||
* compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
|
||
*
|
||
* Not in scope (covered by other tests / future PRs):
|
||
* - Chroma DC / Intra16x16 DC Hadamard pre-pass
|
||
* - bit-exactness against real H.264 streams (test-vector PR)
|
||
* - deblock (lands in Stage 2 PR-b after this one)
|
||
*/
|
||
|
||
#include "daedalus_decoder.h"
|
||
|
||
#include <stdint.h>
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
|
||
/* Plain xorshift64 (shift triple 13/7/17, no output multiplier — i.e. NOT the
 * xorshift64* variant) used for deterministic random test-data generation.
 * The generator state lives in xs64_state; seed it by assigning to it. */
static uint64_t xs64_state;

/* Advance the generator one step and return the new 64-bit state.
 *
 * An all-zero state is a fixed point of the xorshift recurrence (it would
 * return 0 forever), so a zero state is replaced by a fixed non-zero constant
 * before stepping. Non-zero states are stepped exactly as before. */
static uint64_t xs64(void)
{
    uint64_t x = xs64_state;

    if (x == 0)
        x = 0x9e3779b97f4a7c15ULL;

    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;

    xs64_state = x;
    return x;
}
|
||
|
||
/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
|
||
* then column pass; +32 rounding, >>6, add to the predicted samples already in dst,
|
||
* clip to u8. Bit-exact-equivalent transcription of daedalus-fourier
|
||
* tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
|
||
* fair-use for test purposes — same algorithm, no copy of code). */
|
||
/* Saturate v to the unsigned 8-bit sample range [0, 255]. */
static int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||
|
||
/* One 1D pass of the 4-point inverse-transform butterfly.
 *
 * d   : the four input values of one row or column.
 * out : receives the four transformed values.
 *
 * The halving terms use >> on possibly negative ints; like the code this
 * mirrors, that relies on the compiler implementing an arithmetic
 * (sign-propagating) right shift. */
static void h264_idct4_butterfly(const int d[4], int out[4])
{
    const int even_sum  = d[0] + d[2];
    const int even_diff = d[0] - d[2];
    const int odd_lo    = (d[1] >> 1) - d[3];
    const int odd_hi    = d[1] + (d[3] >> 1);

    out[0] = even_sum  + odd_hi;
    out[1] = even_diff + odd_lo;
    out[2] = even_diff - odd_lo;
    out[3] = even_sum  - odd_hi;
}
|
||
|
||
/* 1D 8-point butterfly per H.264 §8.5.13.2. Transcribed from
 * daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+ in the original —
 * algorithm reproduced here for test purposes, no copy of code).
 *
 * d : the eight input values of one row or column.
 * g : receives the eight transformed values.
 *
 * Three stages: d -> e (first-stage sums/differences), e -> f (second
 * stage), f -> g (final output pairing). The >> 1 and >> 2 terms operate on
 * possibly negative ints and rely on an arithmetic right shift. */
static void h264_idct8_butterfly(const int d[8], int g[8])
{
    int e[8];
    int f[8];

    /* Stage 1: even-index entries come from the even inputs, odd-index
     * entries from the odd inputs. */
    e[0] = d[0] + d[4];
    e[1] = -d[3] + d[5] - d[7] - (d[7] >> 1);
    e[2] = d[0] - d[4];
    e[3] = d[1] + d[7] - d[3] - (d[3] >> 1);
    e[4] = (d[2] >> 1) - d[6];
    e[5] = -d[1] + d[7] + d[5] + (d[5] >> 1);
    e[6] = d[2] + (d[6] >> 1);
    e[7] = d[3] + d[5] + d[1] + (d[1] >> 1);

    /* Stage 2. */
    f[0] = e[0] + e[6];
    f[1] = e[1] + (e[7] >> 2);
    f[2] = e[2] + e[4];
    f[3] = e[3] + (e[5] >> 2);
    f[4] = e[2] - e[4];
    f[5] = (e[3] >> 2) - e[5];
    f[6] = e[0] - e[6];
    f[7] = e[7] - (e[1] >> 2);

    /* Stage 3: outputs k and 7-k are the sum and difference of one f pair. */
    g[0] = f[0] + f[7];
    g[1] = f[2] + f[5];
    g[2] = f[4] + f[3];
    g[3] = f[6] + f[1];
    g[4] = f[6] - f[1];
    g[5] = f[4] - f[3];
    g[6] = f[2] - f[5];
    g[7] = f[0] - f[7];
}
|
||
|
||
/* Reference 8x8 inverse transform with add-and-clip into dst.
 *
 * dst    : top-left sample of the 8x8 destination region; on entry it holds
 *          the predicted samples, on return the reconstructed samples.
 * stride : distance in bytes between consecutive destination rows.
 * block  : 64 coefficients, COLUMN-MAJOR: block[c*8 + r] is the coefficient
 *          at (row=r, col=c).
 *
 * Runs the 1D butterfly across each row, then down each column of the
 * intermediate, rounds with (+32) >> 6, adds to dst and clips to [0, 255]. */
static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
    int row_pass[8][8];
    int result[8][8];

    /* Row pass: gather row r from the column-major block. */
    for (int r = 0; r < 8; r++) {
        int in[8];
        int out[8];
        for (int c = 0; c < 8; c++)
            in[c] = block[c * 8 + r];
        h264_idct8_butterfly(in, out);
        for (int c = 0; c < 8; c++)
            row_pass[r][c] = out[c];
    }

    /* Column pass over the row-pass intermediate. */
    for (int c = 0; c < 8; c++) {
        int in[8];
        int out[8];
        for (int r = 0; r < 8; r++)
            in[r] = row_pass[r][c];
        h264_idct8_butterfly(in, out);
        for (int r = 0; r < 8; r++)
            result[r][c] = out[r];
    }

    /* Round, add to the samples already in dst, clip to u8. */
    for (int r = 0; r < 8; r++) {
        uint8_t *line = dst + r * stride;
        for (int c = 0; c < 8; c++) {
            int residual = (result[r][c] + 32) >> 6;
            line[c] = (uint8_t) clip_u8(line[c] + residual);
        }
    }
}
|
||
|
||
/* Reference 4x4 inverse transform with add-and-clip into dst.
 *
 * dst    : top-left sample of the 4x4 destination region; on entry it holds
 *          the predicted samples, on return the reconstructed samples.
 * stride : distance in bytes between consecutive destination rows.
 * block  : 16 coefficients, COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
 *          block[c*4 + r] is the coefficient at (row=r, col=c).
 *
 * Runs the 1D butterfly across each row, then down each column of the
 * intermediate, rounds with (+32) >> 6, adds to dst and clips to [0, 255]. */
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
    int row_pass[4][4];
    int result[4][4];

    /* Row pass: gather in[c] = block[c*4 + r] for fixed r. */
    for (int r = 0; r < 4; r++) {
        int in[4];
        int out[4];
        for (int c = 0; c < 4; c++)
            in[c] = block[c * 4 + r];
        h264_idct4_butterfly(in, out);
        for (int c = 0; c < 4; c++)
            row_pass[r][c] = out[c];
    }

    /* Column pass: gather in[r] = row_pass[r][c] for fixed c. */
    for (int c = 0; c < 4; c++) {
        int in[4];
        int out[4];
        for (int r = 0; r < 4; r++)
            in[r] = row_pass[r][c];
        h264_idct4_butterfly(in, out);
        for (int r = 0; r < 4; r++)
            result[r][c] = out[r];
    }

    /* Round, add to the predicted samples already in dst, clip to u8. */
    for (int r = 0; r < 4; r++) {
        uint8_t *line = dst + r * stride;
        for (int c = 0; c < 4; c++) {
            int residual = (result[r][c] + 32) >> 6;
            line[c] = (uint8_t) clip_u8(line[c] + residual);
        }
    }
}
|
||
|
||
int main(int argc, char **argv)
|
||
{
|
||
/* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
|
||
* the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs). */
|
||
int width = argc > 1 ? atoi(argv[1]) : 320;
|
||
int height = argc > 2 ? atoi(argv[2]) : 240; /* 240 / 16 = 15 → coded 240 */
|
||
/* Coded dims must be mod-16; 320×240 is canonical QVGA. */
|
||
|
||
uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
|
||
xs64_state = seed;
|
||
|
||
/* Optional 4th argv: "auto" (default) / "cpu" / "qpu" to pin the
|
||
* dispatch substrate. Both substrates must produce IDENTICAL
|
||
* output (the V3D shaders are bit-exact gates against the same
|
||
* spec the NEON path implements); the ctest suite runs the QVGA
|
||
* test once per substrate to catch any silent drift. */
|
||
daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
|
||
const char *sub_name = "auto";
|
||
if (argc > 4) {
|
||
if (!strcmp(argv[4], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; }
|
||
else if (!strcmp(argv[4], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; }
|
||
else if (!strcmp(argv[4], "auto")) { /* default */ }
|
||
else {
|
||
fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[4]);
|
||
return 1;
|
||
}
|
||
}
|
||
|
||
int mb_w = width / 16;
|
||
int mb_h = height / 16;
|
||
int n_mbs = mb_w * mb_h;
|
||
printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%lx\n",
|
||
width, height, n_mbs, (unsigned long) seed);
|
||
|
||
daedalus_decoder *dec = daedalus_decoder_create(width, height);
|
||
if (!dec) {
|
||
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
|
||
return 0;
|
||
}
|
||
if (daedalus_decoder_set_substrate(dec, sub) != 0) {
|
||
fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
|
||
return 1;
|
||
}
|
||
printf("substrate: %s\n", sub_name);
|
||
|
||
/* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of
|
||
* random coeffs in [-512, 511] — same range as the daedalus-fourier
|
||
* cycle-6 M1 gate uses. Plus random predicted samples (uint8 each)
|
||
* to exercise the Stage 2 predicted-samples plumbing — when this
|
||
* is non-zero, flush_frame must pre-fill the IDCT-dispatch scratch
|
||
* from dec->predicted_y / dec->predicted_uv (Stage 2 PR-a) rather
|
||
* than from calloc-zero (the Stage 1 scaffold contract). The
|
||
* reference path mirrors this by pre-filling ref_y / ref_cb / ref_cr
|
||
* from the same predicted bytes BEFORE the per-block ref_idct*_add
|
||
* calls — so the test catches any mismatch between caller-supplied
|
||
* predicted and what reaches the GPU's IDCT-add starting state. */
|
||
int16_t (*per_mb_coeffs)[384] = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
|
||
uint8_t (*per_mb_predicted)[384] = malloc((size_t) n_mbs * sizeof(*per_mb_predicted));
|
||
if (!per_mb_coeffs || !per_mb_predicted) { fprintf(stderr, "alloc fail\n"); return 1; }
|
||
|
||
for (int mb = 0; mb < n_mbs; mb++) {
|
||
for (int i = 0; i < 384; i++) {
|
||
/* Random coeffs in [-512, 511] for all of luma + Cb + Cr. */
|
||
per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
|
||
/* Random predicted samples in [0, 255]. */
|
||
per_mb_predicted[mb][i] = (uint8_t)(xs64() & 0xff);
|
||
}
|
||
}
|
||
|
||
/* Per-MB transform mode (deterministic split: every odd raster MB
|
||
* is 8x8, every even is 4x4 — exercises BOTH partitions in the
|
||
* same frame so the flush_frame partitioning logic is under test). */
|
||
uint8_t *mb_8x8 = malloc((size_t) n_mbs);
|
||
if (!mb_8x8) { fprintf(stderr, "alloc fail\n"); return 1; }
|
||
for (int i = 0; i < n_mbs; i++) mb_8x8[i] = (i & 1) ? 1 : 0;
|
||
|
||
/* Append in raster order. */
|
||
struct daedalus_decoder_mb_input mb = {0};
|
||
int n_8x8_mbs = 0, n_4x4_mbs = 0;
|
||
for (int my = 0; my < mb_h; my++) {
|
||
for (int mx = 0; mx < mb_w; mx++) {
|
||
int idx = my * mb_w + mx;
|
||
mb.mb_x = (uint16_t) mx;
|
||
mb.mb_y = (uint16_t) my;
|
||
mb.coeffs = per_mb_coeffs[idx];
|
||
mb.predicted = per_mb_predicted[idx];
|
||
mb.transform_8x8 = mb_8x8[idx];
|
||
if (mb_8x8[idx]) n_8x8_mbs++; else n_4x4_mbs++;
|
||
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
|
||
fprintf(stderr, "append (%d,%d) failed\n", mx, my);
|
||
return 1;
|
||
}
|
||
}
|
||
}
|
||
printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs);
|
||
|
||
/* Flush — exercise BOTH the luma path (out_y) and the chroma path
|
||
* (out_uv set to non-NULL so flush_frame runs the chroma dispatch
|
||
* + NV12 interleave). */
|
||
size_t y_size = (size_t) width * height;
|
||
size_t uv_size = (size_t) width * height / 2;
|
||
uint8_t *gpu_y = calloc(1, y_size);
|
||
uint8_t *gpu_uv = calloc(1, uv_size);
|
||
if (!gpu_y || !gpu_uv) return 1;
|
||
int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
|
||
gpu_uv, (size_t) width);
|
||
if (frc != 0) {
|
||
fprintf(stderr, "flush_frame rc=%d\n", frc);
|
||
return 1;
|
||
}
|
||
|
||
/* Compute the reference output: same per-MB → flat raster block
|
||
* layout as flush_frame uses. Branch per MB on transform_8x8.
|
||
*
|
||
* ref_y is pre-filled with each MB's 16×16 luma predicted samples
|
||
* at raster (my*16, mx*16), then ref_idct4_add/8_add overlay the
|
||
* residual via FFmpeg `idct_add` semantics (dst += idct(coeffs);
|
||
* clip255). This mirrors what flush_frame does on the GPU side:
|
||
* scratch_y starts from dec->predicted_y, IDCT-add writes back. */
|
||
uint8_t *ref_y = malloc(y_size);
|
||
if (!ref_y) return 1;
|
||
for (int my = 0; my < mb_h; my++) {
|
||
for (int mx = 0; mx < mb_w; mx++) {
|
||
int mb_idx = my * mb_w + mx;
|
||
const uint8_t *p_y = per_mb_predicted[mb_idx]; /* [0..256) */
|
||
for (int r = 0; r < 16; r++) {
|
||
memcpy(&ref_y[((size_t) my * 16 + r) * (size_t) width
|
||
+ (size_t) mx * 16],
|
||
&p_y[r * 16], 16);
|
||
}
|
||
}
|
||
}
|
||
int16_t block_scratch[64]; /* large enough for 8x8 */
|
||
for (int my = 0; my < mb_h; my++) {
|
||
for (int mx = 0; mx < mb_w; mx++) {
|
||
int mb_idx = my * mb_w + mx;
|
||
if (mb_8x8[mb_idx]) {
|
||
/* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
|
||
for (int sb_y = 0; sb_y < 2; sb_y++) {
|
||
for (int sb_x = 0; sb_x < 2; sb_x++) {
|
||
int block_in_mb = sb_y * 2 + sb_x;
|
||
memcpy(block_scratch,
|
||
&per_mb_coeffs[mb_idx][block_in_mb * 64],
|
||
64 * sizeof(int16_t));
|
||
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 8;
|
||
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 8;
|
||
ref_idct8_add(&ref_y[px_y * (size_t) width + px_x],
|
||
width, block_scratch);
|
||
}
|
||
}
|
||
} else {
|
||
/* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
|
||
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||
int block_in_mb = sb_y * 4 + sb_x;
|
||
memcpy(block_scratch,
|
||
&per_mb_coeffs[mb_idx][block_in_mb * 16],
|
||
16 * sizeof(int16_t));
|
||
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
|
||
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
|
||
ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
|
||
width, block_scratch);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Build the chroma reference: separate planar Cb and Cr (W/2 by
|
||
* H/2), each block IDCT'd into its plane. Chroma per-MB layout
|
||
* matches flush_frame: 4 Cb blocks then 4 Cr blocks, raster order
|
||
* within each component (sb_y * 2 + sb_x). */
|
||
size_t chroma_w = (size_t) width / 2;
|
||
size_t chroma_h = (size_t) height / 2;
|
||
size_t chroma_plane_size = chroma_w * chroma_h;
|
||
uint8_t *ref_cb = malloc(chroma_plane_size);
|
||
uint8_t *ref_cr = malloc(chroma_plane_size);
|
||
if (!ref_cb || !ref_cr) return 1;
|
||
/* Pre-fill ref_cb / ref_cr with per-MB 8x8 chroma predicted samples
|
||
* (mirrors the predicted-samples plumbing on the chroma path). */
|
||
for (int my = 0; my < mb_h; my++) {
|
||
for (int mx = 0; mx < mb_w; mx++) {
|
||
int mb_idx = my * mb_w + mx;
|
||
const uint8_t *p_cb = per_mb_predicted[mb_idx] + 256;
|
||
const uint8_t *p_cr = per_mb_predicted[mb_idx] + 256 + 64;
|
||
for (int r = 0; r < 8; r++) {
|
||
memcpy(&ref_cb[((size_t) my * 8 + r) * chroma_w + (size_t) mx * 8],
|
||
&p_cb[r * 8], 8);
|
||
memcpy(&ref_cr[((size_t) my * 8 + r) * chroma_w + (size_t) mx * 8],
|
||
&p_cr[r * 8], 8);
|
||
}
|
||
}
|
||
}
|
||
for (int my = 0; my < mb_h; my++) {
|
||
for (int mx = 0; mx < mb_w; mx++) {
|
||
int mb_idx = my * mb_w + mx;
|
||
for (int comp = 0; comp < 2; comp++) {
|
||
uint8_t *plane = (comp == 0) ? ref_cb : ref_cr;
|
||
size_t coeff_base = 256u + (size_t) comp * 64u;
|
||
for (int sb_y = 0; sb_y < 2; sb_y++) {
|
||
for (int sb_x = 0; sb_x < 2; sb_x++) {
|
||
int block_in_comp = sb_y * 2 + sb_x;
|
||
memcpy(block_scratch,
|
||
&per_mb_coeffs[mb_idx][coeff_base +
|
||
(size_t) block_in_comp * 16],
|
||
16 * sizeof(int16_t));
|
||
size_t px_y = (size_t) my * 8 + (size_t) sb_y * 4;
|
||
size_t px_x = (size_t) mx * 8 + (size_t) sb_x * 4;
|
||
ref_idct4_add(&plane[px_y * chroma_w + px_x],
|
||
(ptrdiff_t) chroma_w, block_scratch);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Y compare. */
|
||
size_t y_diffs = 0, y_first_diff = 0;
|
||
for (size_t i = 0; i < y_size; i++) {
|
||
if (gpu_y[i] != ref_y[i]) {
|
||
if (y_diffs == 0) y_first_diff = i;
|
||
y_diffs++;
|
||
}
|
||
}
|
||
printf("Y bytes total: %zu\n", y_size);
|
||
printf("Y bytes diff: %zu (%.4f%%)\n", y_diffs, 100.0 * y_diffs / y_size);
|
||
if (y_diffs) {
|
||
printf("Y first diff at offset %zu: gpu=%u ref=%u\n",
|
||
y_first_diff, gpu_y[y_first_diff], ref_y[y_first_diff]);
|
||
}
|
||
|
||
/* UV compare — deinterleave NV12 back into Cb/Cr and compare. */
|
||
size_t cb_diffs = 0, cr_diffs = 0;
|
||
size_t cb_first = 0, cr_first = 0;
|
||
for (size_t r = 0; r < chroma_h; r++) {
|
||
const uint8_t *gpu_row = gpu_uv + r * (size_t) width;
|
||
const uint8_t *cb_row = ref_cb + r * chroma_w;
|
||
const uint8_t *cr_row = ref_cr + r * chroma_w;
|
||
for (size_t c = 0; c < chroma_w; c++) {
|
||
uint8_t gpu_cb = gpu_row[c * 2 + 0];
|
||
uint8_t gpu_cr = gpu_row[c * 2 + 1];
|
||
if (gpu_cb != cb_row[c]) {
|
||
if (cb_diffs == 0) cb_first = r * chroma_w + c;
|
||
cb_diffs++;
|
||
}
|
||
if (gpu_cr != cr_row[c]) {
|
||
if (cr_diffs == 0) cr_first = r * chroma_w + c;
|
||
cr_diffs++;
|
||
}
|
||
}
|
||
}
|
||
printf("Cb bytes total: %zu diff: %zu (%.4f%%)\n",
|
||
chroma_plane_size, cb_diffs,
|
||
100.0 * cb_diffs / chroma_plane_size);
|
||
printf("Cr bytes total: %zu diff: %zu (%.4f%%)\n",
|
||
chroma_plane_size, cr_diffs,
|
||
100.0 * cr_diffs / chroma_plane_size);
|
||
if (cb_diffs) {
|
||
size_t r = cb_first / chroma_w, c = cb_first % chroma_w;
|
||
printf("Cb first diff at (%zu,%zu): gpu=%u ref=%u\n",
|
||
r, c, gpu_uv[r * (size_t) width + c * 2 + 0], ref_cb[cb_first]);
|
||
}
|
||
if (cr_diffs) {
|
||
size_t r = cr_first / chroma_w, c = cr_first % chroma_w;
|
||
printf("Cr first diff at (%zu,%zu): gpu=%u ref=%u\n",
|
||
r, c, gpu_uv[r * (size_t) width + c * 2 + 1], ref_cr[cr_first]);
|
||
}
|
||
|
||
free(ref_cr);
|
||
free(ref_cb);
|
||
free(ref_y);
|
||
free(gpu_uv);
|
||
free(gpu_y);
|
||
free(mb_8x8);
|
||
free(per_mb_predicted);
|
||
free(per_mb_coeffs);
|
||
daedalus_decoder_destroy(dec);
|
||
|
||
if (y_diffs == 0 && cb_diffs == 0 && cr_diffs == 0) {
|
||
printf("BIT-EXACT PASS (Y + Cb + Cr)\n");
|
||
return 0;
|
||
}
|
||
fprintf(stderr, "BIT-EXACT FAIL\n");
|
||
return 1;
|
||
}
|