/* daedalus-decoder/tools/daedalus_decode_h264.c */

/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * daedalus_decode_h264 — option A standalone test harness for
 * daedalus-decoder against real H.264 streams.
 *
 * Decodes an H.264 file via stock libavcodec (the reference), AND
 * in parallel runs the same frame through daedalus-decoder in
 * identity-passthrough mode (predicted = libavcodec's reconstructed
 * frame, coeffs = 0, no deblock edges).  Writes both outputs as
 * NV12 YUV, then byte-exact diffs.
 *
 * PR-A1b purpose: validate the daedalus-decoder data path / API
 * contract at real-stream frame sizes (16k+ MBs at 1080p, real
 * H.264-decoded predicted-sample distributions), without yet
 * requiring per-MB internal state extraction from libavcodec.
 * Follow-up PRs (A2+) extend this harness to feed REAL per-MB
 * state (residual coeffs, pre-residual predicted, deblock edges)
 * via the per-MB inspection callback added in marfrit-packages
 * patch 0016 (PR #106).
 *
 * Identity-passthrough math:
 *   - mb_input.predicted = AVFrame pixels at this MB's raster pos
 *   - mb_input.coeffs    = 384 int16's, all zero
 *   - mb_input.edges     = NULL, n_edges = 0
 *   Then flush_frame:
 *     scratch_y/_uv pre-fill from predicted_y/_uv = AVFrame pixels
 *     IDCT dispatches with all-zero coeffs add 0 (no-op)
 *     No deblock dispatches (no edges)
 *     copy-out to caller's planes
 *   Result MUST equal AVFrame pixels byte-for-byte.
 *
 * Invoke:
 *   daedalus_decode_h264 [--substrate cpu|qpu|auto]
 *                        [--max-frames N]
 *                        <input.h264> <output_dadec.yuv> <output_ref.yuv>
 *
 * Exit status:
 *   0 — bit-exact match across all decoded frames
 *   1 — argument / setup error
 *   2 — decode error from libavcodec
 *   3 — daedalus-decoder error (ctx, append, flush)
 *   4 — bit-exact comparison failed (diff > 0 bytes)
 */

#define _POSIX_C_SOURCE 200809L

#include "daedalus_decoder.h"

#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/imgutils.h>

/* Per-MB inspection callback API — provided by the patched FFmpeg
 * fork via marfrit-packages patches 0016 + 0017.
 *
 * When DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS is defined (CMake sets it
 * alongside DAEDALUS_FFMPEG_SRC), we include libavcodec's INTERNAL
 * h264dec.h header to dereference H264Context fields — specifically
 * h->mb_inspect_coeffs (the 0017 side buffer holding pre-IDCT-
 * destruction sl->mb), h->cur_pic.f (pre-deblock reconstructed pixels),
 * and h->cur_pic.mb_type[mb_xy] for the mb-type gate.  The same
 * configure-time config.h that built the static libavcodec.a is
 * picked up via -DHAVE_AV_CONFIG_H + -I path; ABI match is automatic.
 *
 * When only DAEDALUS_HAVE_H264_MB_INSPECT_CB is defined (no source
 * tree available — e.g. building against a distro-shipped patched
 * libavcodec), the H264Context stays opaque and we fall back to
 * identity-passthrough across all MBs.
 *
 * When neither is defined: stock libavcodec, no callback, identity-
 * passthrough only (PR-A1b behaviour). */
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
#  include "libavcodec/h264dec.h"
#  include "libavcodec/h264.h"   /* IS_INTRA4x4 / IS_8x8DCT / IS_INTRA_PCM */
#elif defined(DAEDALUS_HAVE_H264_MB_INSPECT_CB)
struct H264Context;
#endif

#if defined(DAEDALUS_HAVE_H264_MB_INSPECT_CB) || defined(DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS)
/* Per-MB inspection callback type.  `opaque` is the pointer handed to
 * ff_h264_set_mb_inspect_cb(); `h` is the decoder's H264Context (opaque
 * unless the internal h264dec.h header was included above); mb_x / mb_y
 * are the macroblock coordinates the callback is being invoked for. */
typedef void (*ff_h264_mb_inspect_cb)(void *opaque,
                                       const struct H264Context *h,
                                       int mb_x, int mb_y);
/* Declared here because the patched libavcodec exports the symbol but this
 * file does not pull in a public header for it.  Presumably installs `cb`
 * (with `opaque`) on `avctx` — confirm against marfrit-packages patch 0016. */
void ff_h264_set_mb_inspect_cb(AVCodecContext *avctx,
                                ff_h264_mb_inspect_cb cb, void *opaque);
#endif

#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Command-line settings; both are overwritten by parse_args(). */
/* Raw --substrate argument; main() converts it with parse_substrate(). */
static const char *substrate_str = "auto";
/* --max-frames argument; stays at -1 when the option is not given. */
static int   max_frames = -1;

/* Inspection-callback state: per-frame counter + "each MB seen exactly
 * once" check.  Bitmap, not raster-order — libavcodec's MB threading +
 * multi-slice frames mean MBs reach the callback out of strict order;
 * contract is "every MB fires the callback exactly once per frame".
 *
 * When real-coeff extraction is compiled in (PR-A3+), we ALSO maintain
 * a per-MB capture buffer (real-coeffs path) so the main loop can
 * drive daedalus_decoder_append_mb with REAL pre-residual P + real
 * coefficients for MBs that satisfy the gate (Intra_4x4, no 8x8 DCT,
 * no PCM).  Other MBs stay on identity-passthrough. */
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
/* Per-macroblock record written by inspect_cb() in real-coeffs builds.
 * All pixel arrays are tightly packed rasters (stride = block width). */
struct mb_capture {
    int     valid;              /* 1 = real-coeffs IDCT path, 0 = identity (predicted = pre_deblock_snap) */
    int16_t coeffs[256];        /* luma: 16 blocks in raster block order, 16 values per block, copied unchanged from h->mb_inspect_coeffs */
    uint8_t predicted[256];     /* luma 16×16 P recovered = clip(pre_deblock - ((IDCT(C) + 32) >> 6)); only written when valid == 1 */
    uint8_t pre_deblock_snap_y[256];  /* luma 16×16 pre-deblock at callback time */
    uint8_t pre_deblock_snap_cb[64];  /* Cb 8×8 pre-deblock */
    uint8_t pre_deblock_snap_cr[64];  /* Cr 8×8 pre-deblock */
    int     qp_y;               /* QP_Y for this MB, read from h->cur_pic.qscale_table[mb_xy] at callback time */
    int     mb_type_intra;      /* 1 if MB is intra (any flavour, per IS_INTRA), 0 otherwise */
    int     transform_8x8;      /* 1 if 8×8 DCT per IS_8x8DCT (affects which internal edges fire) */
};

/* State shared between main() and inspect_cb() through the callback's
 * opaque pointer.  main() zero-initialises it and sizes the per-MB arrays
 * from the stream's MB-rounded dimensions before the first packet is sent. */
struct inspect_state {
    int       n_cbs_this_frame; /* callbacks received, including out-of-bounds ones */
    int       mb_w, mb_h;       /* MB grid size used for bounds checks and indexing */
    uint8_t  *seen;             /* mb_w * mb_h entries, one byte per MB: 1 once the callback fired for it */
    int       duplicate_mbs;    /* callbacks that hit an MB already marked in seen[] */
    int       out_of_bounds;    /* callbacks whose (mb_x, mb_y) fell outside mb_w × mb_h */
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
    struct mb_capture *captures;        /* mb_w * mb_h entries */
    int       real_coeffs_mbs;          /* count of MBs in real-coeffs IDCT path this frame */
    int       skipped_intra16x16;       /* gate rejections: Intra_16x16 MBs */
    int       skipped_8x8dct;           /* gate rejections: Intra_4x4-typed MBs flagged 8x8 DCT */
    int       skipped_other;            /* gate rejections: everything else (inter, PCM, ...) */
    /* Slice-level deblock params, re-read from h->slice_ctx[0] on every
     * callback (the latest values win).  Per H.264 spec these are constant
     * per slice; we assume single-slice frames in our test stream. */
    int       slice_alpha_c0_offset;
    int       slice_beta_offset;
    int       slice_deblock_disable;    /* raw sl->deblocking_filter; despite the name this is FFmpeg's
                                         * internal value, where 0 means deblocking is OFF (see the
                                         * deblock_off derivation in main) */
#endif
};

#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
/* H.264 §8.7.2.2/8.7.2.3 deblock filter tables — transcribed verbatim
 * from FFmpeg libavcodec/h264_loopfilter.c (LGPL-2.1+; algorithm + table
 * values come from the H.264 spec which is normative and unpatented).
 * Tables are size 52*3 — FFmpeg's trick to absorb slice_alpha_c0_offset +
 * slice_beta_offset (in -12..+12) into the index without bounds-clamping.
 * Usage: alpha = alpha_table[qp + a]  where a = 52 + slice_alpha_c0_offset
 * (8-bit only; high-bit-depth subtracts qp_bd_offset). */
static const uint8_t alpha_table[52*3] = {
    /* [0, 52): low pad — index falls below the real table, alpha = 0 */
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,

    /* [52, 104): the real table, indexA 0..51 */
      0,   0,   0,   0,   0,   0,   0,   0,   /* indexA  0.. 7 */
      0,   0,   0,   0,   0,   0,   0,   0,   /* indexA  8..15 */
      4,   4,   5,   6,   7,   8,   9,  10,   /* indexA 16..23 */
     12,  13,  15,  17,  20,  22,  25,  28,   /* indexA 24..31 */
     32,  36,  40,  45,  50,  56,  63,  71,   /* indexA 32..39 */
     80,  90, 101, 113, 127, 144, 162, 182,   /* indexA 40..47 */
    203, 226, 255, 255,                       /* indexA 48..51 */

    /* [104, 156): high pad — index runs past the real table, alpha = 255 */
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255,
};
static const uint8_t beta_table[52*3] = {
    /* [0, 52): low pad — index falls below the real table, beta = 0 */
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,

    /* [52, 104): the real table, indexB 0..51 */
     0,  0,  0,  0,  0,  0,  0,  0,   /* indexB  0.. 7 */
     0,  0,  0,  0,  0,  0,  0,  0,   /* indexB  8..15 */
     2,  2,  2,  3,  3,  3,  3,  4,   /* indexB 16..23 */
     4,  4,  6,  6,  7,  7,  8,  8,   /* indexB 24..31 */
     9,  9, 10, 10, 11, 11, 12, 12,   /* indexB 32..39 */
    13, 13, 14, 14, 15, 15, 16, 16,   /* indexB 40..47 */
    17, 17, 18, 18,                   /* indexB 48..51 */

    /* [104, 156): high pad — index runs past the real table, beta = 18 */
    18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18,
};
static const int8_t tc0_table[52*3][4] = {
    /* Each entry is { -1, tc0 for bS 1, tc0 for bS 2, tc0 for bS 3 }. */

    /* [0, 52): low pad — index falls below the real table */
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},

    /* [52, 104): the real table, indexA 0..51 */
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},   /* indexA  0.. 3 */
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},   /* indexA  4.. 7 */
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},   /* indexA  8..11 */
    {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0}, {-1, 0, 0, 0},   /* indexA 12..15 */
    {-1, 0, 0, 0}, {-1, 0, 0, 1}, {-1, 0, 0, 1}, {-1, 0, 0, 1},   /* indexA 16..19 */
    {-1, 0, 0, 1}, {-1, 0, 1, 1}, {-1, 0, 1, 1}, {-1, 1, 1, 1},   /* indexA 20..23 */
    {-1, 1, 1, 1}, {-1, 1, 1, 1}, {-1, 1, 1, 1}, {-1, 1, 1, 2},   /* indexA 24..27 */
    {-1, 1, 1, 2}, {-1, 1, 1, 2}, {-1, 1, 1, 2}, {-1, 1, 2, 3},   /* indexA 28..31 */
    {-1, 1, 2, 3}, {-1, 2, 2, 3}, {-1, 2, 2, 4}, {-1, 2, 3, 4},   /* indexA 32..35 */
    {-1, 2, 3, 4}, {-1, 3, 3, 5}, {-1, 3, 4, 6}, {-1, 3, 4, 6},   /* indexA 36..39 */
    {-1, 4, 5, 7}, {-1, 4, 5, 8}, {-1, 4, 6, 9}, {-1, 5, 7,10},   /* indexA 40..43 */
    {-1, 6, 8,11}, {-1, 6, 8,13}, {-1, 7,10,14}, {-1, 8,11,16},   /* indexA 44..47 */
    {-1, 9,12,18}, {-1,10,13,20}, {-1,11,15,23}, {-1,13,17,25},   /* indexA 48..51 */

    /* [104, 156): high pad — index runs past the real table */
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
    {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25}, {-1,13,17,25},
};

/* H.264 §8.5.11 / Table 8-11: qP_y → qP_chroma mapping for chroma_qp_index_offset == 0.
 * For qP_y < 30, qP_c = qP_y.  Above that, the spec table compresses. */
static const uint8_t chroma_qp_table[52] = {
    /* qP_y  0.. 9 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
    /* qP_y 10..19 */ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    /* qP_y 20..29 */ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    /* qP_y 30..39 */ 29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
    /* qP_y 40..49 */ 36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
    /* qP_y 50..51 */ 39, 39,
};

/* libavcodec's sl->mb stores coefficients in RASTER (row-major) order,
 * not zig-zag scan order — h264_cavlc.c does
 *     block[*scantable] = (level * qmul[*scantable] + 32) >> 6
 * where *scantable advances through ff_zigzag_scan[] which contains
 * RASTER positions (row*4 + col).  So sl->mb[i] = coef at raster
 * position i = (i/4, i%4) = (row, col).  No inverse-zigzag is needed,
 * and no transpose either: inspect_cb passes each block's 16 values on
 * unchanged (an earlier row-major → column-major transpose changed the
 * IDCT result and was dropped). */

/* H.264 §6.4.3 4x4 luma block scan within MB (z-scan).
 * Maps raster-block-idx (sb_y*4+sb_x) → libavcodec sl->mb's z-scan idx.
 * Z-scan happens to be its own inverse (symmetric mapping). */
static const uint8_t raster_to_zscan[16] = {
    /* sb_y = 0 */  0,  1,  4,  5,
    /* sb_y = 1 */  2,  3,  6,  7,
    /* sb_y = 2 */  8,  9, 12, 13,
    /* sb_y = 3 */ 10, 11, 14, 15,
};

/* H.264 4x4 IDCT — transcribed from daedalus-fourier
 * tests/test_idct_bitexact.c (which itself mirrors h264_idct4_ref.c).
 * Outputs row-major 16-element residual; clip + shift happens in
 * the consumer. */
/* One 4-point pass of the 4x4 inverse transform: reads d[0..3] and writes
 * out[0..3].  No rounding or scaling happens here.  All inputs are read
 * before any output is stored. */
static void h264_idct4_butterfly(const int d[4], int out[4]) {
    const int even_sum  = d[0] + d[2];
    const int even_diff = d[0] - d[2];
    const int odd_lo    = (d[1] >> 1) - d[3];
    const int odd_hi    = d[1] + (d[3] >> 1);

    /* Outer pair uses (even_sum, odd_hi); inner pair uses (even_diff, odd_lo). */
    out[0] = even_sum  + odd_hi;
    out[3] = even_sum  - odd_hi;
    out[1] = even_diff + odd_lo;
    out[2] = even_diff - odd_lo;
}
static void ref_idct4_compute(const int16_t block[16], int out[16]) {
    /* Indexing used below: the first pass treats block[c*4 + 0..3] as one
     * 4-vector (labelled "column c", i.e. block[c*4+r] = coef at row r,
     * col c); the second pass runs across each row of the intermediate.
     *
     * The pass order — first pass over block[c*4 + ...], then rows — is
     * chosen to match FFmpeg's h264idct_template.c.  Order matters because
     * the butterfly uses `>>1` on signed values (rounding toward -inf for
     * odd negatives); swapping the passes can shift an intermediate by one
     * unit, which then reaches the final residual.
     *
     * (daedalus-fourier's tests/h264_idct4_ref.c runs ROW-first.  That is
     * bit-exact with its own NEON kernel + GPU shader but DIVERGES from
     * FFmpeg's IDCT for some inputs; PR-A3b surfaces the divergence and
     * the fix is tracked as a daedalus-fourier follow-up — task #184.) */
    int stage[16];                      /* intermediate, stage[r*4 + c] */

    /* First pass: transform each group of four consecutive inputs and
     * scatter the result down column `col` of the intermediate. */
    for (int col = 0; col < 4; col++) {
        const int16_t *src = &block[col * 4];
        int in[4];
        int res[4];
        for (int k = 0; k < 4; k++)
            in[k] = src[k];
        h264_idct4_butterfly(in, res);
        for (int row = 0; row < 4; row++)
            stage[row * 4 + col] = res[row];
    }

    /* Second pass: each intermediate row is contiguous, and so is each
     * output row, so the butterfly can read and write them in place. */
    for (int row = 0; row < 4; row++)
        h264_idct4_butterfly(&stage[row * 4], &out[row * 4]);
}
#endif  /* DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS */

/* Per-macroblock inspection callback registered with the patched decoder.
 *
 * Always: bounds-checks (mb_x, mb_y) against the grid in the inspect_state,
 * marks the MB as seen (counting duplicates) and bumps the per-frame
 * callback counter.
 *
 * Real-coeffs builds (DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS) additionally:
 *   - refresh the slice-level deblock parameters from h->slice_ctx[0];
 *   - record QP / intra / 8x8-transform flags and snapshot the MB's Y, Cb
 *     and Cr pixels from h->cur_pic.f into the MB's mb_capture;
 *   - for Intra_4x4 MBs without 8x8 transform, copy the 16 luma blocks'
 *     coefficients from h->mb_inspect_coeffs and recover the predictor
 *     P = clip(pixel - ((IDCT + 32) >> 6)), then mark the capture valid.
 *
 * opaque  struct inspect_state * supplied at registration time.
 * h       decoder context; only dereferenced in real-coeffs builds.
 * mb_x/y  macroblock coordinates reported by the decoder. */
static void inspect_cb(void *opaque,
                        const struct H264Context *h,
                        int mb_x, int mb_y)
{
    struct inspect_state *st = opaque;
#ifndef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
    (void) h;
#endif

    /* Out-of-grid callbacks are counted but touch no per-MB storage. */
    if (mb_x < 0 || mb_x >= st->mb_w || mb_y < 0 || mb_y >= st->mb_h) {
        st->out_of_bounds++;
        st->n_cbs_this_frame++;
        return;
    }

    const size_t idx = (size_t) mb_y * st->mb_w + (size_t) mb_x;
    if (st->seen[idx]) st->duplicate_mbs++;
    st->seen[idx] = 1;
    st->n_cbs_this_frame++;

#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
    /* Refresh slice-level deblock params on every callback.  Per spec
     * they're constant per slice; for our single-slice test streams we
     * just keep the latest values seen. */
    {
        const H264SliceContext *sl = &h->slice_ctx[0];
        st->slice_alpha_c0_offset = sl->slice_alpha_c0_offset;
        st->slice_beta_offset     = sl->slice_beta_offset;
        st->slice_deblock_disable = sl->deblocking_filter;
    }

    /* Real-coeffs path: extract per-MB state for daedalus-decoder
     * IDCT validation on this MB.  Gate: only Intra_4x4 + 4x4 transform
     * + non-PCM is supported in PR-A3b — other MB flavours fall back
     * to identity-passthrough in the main loop. */
    struct mb_capture *cap = &st->captures[idx];
    cap->valid = 0;  /* default to passthrough */

    /* The decoder's own MB index uses h->mb_stride, which is not the same
     * indexing as `idx` above (that one uses st->mb_w). */
    const int mb_xy = mb_y * h->mb_stride + mb_x;
    const uint32_t mb_type = h->cur_pic.mb_type[mb_xy];

    /* Capture state needed for deblock edge derivation, regardless
     * of whether this MB takes the real-coeffs IDCT path. */
    cap->qp_y           = h->cur_pic.qscale_table[mb_xy];
    cap->mb_type_intra  = IS_INTRA(mb_type) ? 1 : 0;
    cap->transform_8x8  = IS_8x8DCT(mb_type) ? 1 : 0;

    /* Snapshot pre-deblock pixels for all 3 planes at this MB's position.
     * Chroma is addressed as 8x8 per MB with one stride (linesize[1]) for
     * both Cb and Cr — assumes planar 4:2:0 with equal chroma strides. */
    {
        const int y_stride  = h->cur_pic.f->linesize[0];
        const int uv_stride = h->cur_pic.f->linesize[1];
        const uint8_t *mb_y_px = h->cur_pic.f->data[0]
            + (ptrdiff_t) mb_y * 16 * y_stride + mb_x * 16;
        const uint8_t *mb_cb_px = h->cur_pic.f->data[1]
            + (ptrdiff_t) mb_y * 8 * uv_stride + mb_x * 8;
        const uint8_t *mb_cr_px = h->cur_pic.f->data[2]
            + (ptrdiff_t) mb_y * 8 * uv_stride + mb_x * 8;
        for (int r = 0; r < 16; r++)
            memcpy(&cap->pre_deblock_snap_y[r * 16], &mb_y_px[r * y_stride], 16);
        for (int r = 0; r < 8; r++) {
            memcpy(&cap->pre_deblock_snap_cb[r * 8], &mb_cb_px[r * uv_stride], 8);
            memcpy(&cap->pre_deblock_snap_cr[r * 8], &mb_cr_px[r * uv_stride], 8);
        }
    }

    /* Gate.  Anything rejected here keeps cap->valid == 0 and only bumps
     * the matching skip counter. */
    if (!IS_INTRA4x4(mb_type)) {
        if (IS_INTRA16x16(mb_type))  st->skipped_intra16x16++;
        else                          st->skipped_other++;
        return;
    }
    if (IS_8x8DCT(mb_type)) { st->skipped_8x8dct++; return; }
    if (IS_INTRA_PCM(mb_type)) { st->skipped_other++; return; }

    /* Pointer to this MB's luma inside cur_pic; read in place (not copied)
     * by the predictor recovery and the diagnostic dump below. */
    const uint8_t *luma_plane = h->cur_pic.f->data[0];
    const int luma_stride = h->cur_pic.f->linesize[0];
    const uint8_t *mb_pixels = luma_plane + (ptrdiff_t) mb_y * 16 * luma_stride
                                          + mb_x * 16;

    /* (pre_deblock_snap_y is already populated above for all 3 planes;
     * we use it later in the main loop as the daedalus predicted input.) */

    /* Coefficients are in sl->mb at end of entropy decode but zeroed by
     * the time the callback fires (IDCT-add consumed them).  Patch 0017
     * preserves them in h->mb_inspect_coeffs[16 * 48] BEFORE IDCT runs,
     * so we read from there. */
    const int16_t *zz_mb = h->mb_inspect_coeffs;  /* layout matches sl->mb 8-bit half */

    /* Walk the 16 luma 4x4 blocks in raster order; the side buffer is
     * indexed in z-scan order, hence the raster_to_zscan lookup. */
    for (int r_block = 0; r_block < 16; r_block++) {
        const int z_block = raster_to_zscan[r_block];
        const int16_t *block_raw = &zz_mb[z_block * 16];

        /* sl->mb stores 16 int16 per block.  Empirical finding (via
         * /tmp/idct_compare.c, 2026-05-26): daedalus-fourier's C ref
         * IDCT and FFmpeg's C ref IDCT produce IDENTICAL output for
         * the same input array — the "column-major vs row-major"
         * labelling is decoration; both functions implement the same
         * H.264 spec IDCT on a 16-int16 input.  So we feed daedalus
         * the raw sl->mb data unchanged.  Previous attempt to
         * transpose row-major→column-major was wrong — the transpose
         * changed the IDCT result. */
        int16_t col[16];   /* plain copy of block_raw despite the name — no transpose */
        memcpy(col, block_raw, 16 * sizeof(int16_t));

        memcpy(&cap->coeffs[r_block * 16], col, 16 * sizeof(int16_t));

        /* IDCT → row-major 16-int residual. */
        int idct_row[16];
        ref_idct4_compute(col, idct_row);

        /* P = clip(pre_deblock - ((IDCT + 32) >> 6)) for each pixel.
         * Symmetric: daedalus IDCT-add will undo the subtract, including
         * for saturating cases (where the same shift puts the value back
         * at the same clip boundary). */
        const int sb_y = r_block >> 2;
        const int sb_x = r_block & 3;
        for (int r = 0; r < 4; r++) {
            for (int c = 0; c < 4; c++) {
                const int pre_db = mb_pixels[(sb_y * 4 + r) * luma_stride + sb_x * 4 + c];
                const int shift  = (idct_row[r * 4 + c] + 32) >> 6;
                int p = pre_db - shift;
                if (p < 0)   p = 0;
                if (p > 255) p = 255;
                cap->predicted[(sb_y * 4 + r) * 16 + (sb_x * 4 + c)] = (uint8_t) p;
            }
        }
    }
    cap->valid = 1;
    st->real_coeffs_mbs++;

    /* One-shot diagnostic enabled by DAEDALUS_DUMP_MB_3_0 env var.  Dumps
     * block 1 of MB(3,0) only.  NOTE: the "col_major ... (after transpose)"
     * label below is historical — cap->coeffs holds the same untransposed
     * values as the side buffer. */
    if (mb_x == 3 && mb_y == 0 && getenv("DAEDALUS_DUMP_MB_3_0")) {
        const int16_t *zz = &zz_mb[1 * 16];   /* z_block = raster_block = 1 */
        const struct mb_capture *capdiag = &st->captures[mb_y * st->mb_w + mb_x];
        fprintf(stderr, "  MB(3,0) block z=1 raster coeffs (sl->mb):");
        for (int p = 0; p < 16; p++) fprintf(stderr, " %d", (int) zz[p]);
        fprintf(stderr, "\n");
        fprintf(stderr, "  MB(3,0) block z=1 col_major coeffs (after transpose):");
        for (int i = 0; i < 16; i++) fprintf(stderr, " %d", (int) capdiag->coeffs[1 * 16 + i]);
        fprintf(stderr, "\n");
        /* Recompute IDCT for this block (already done in the loop above but
         * print here for visibility). */
        int idct_print[16];
        ref_idct4_compute(&capdiag->coeffs[1 * 16], idct_print);
        fprintf(stderr, "  MB(3,0) block z=1 IDCT row-major (raw, pre-shift):");
        for (int i = 0; i < 16; i++) fprintf(stderr, " %d", idct_print[i]);
        fprintf(stderr, "\n");
        fprintf(stderr, "  MB(3,0) block z=1 IDCT (+32)>>6:");
        for (int i = 0; i < 16; i++) fprintf(stderr, " %d", (idct_print[i] + 32) >> 6);
        fprintf(stderr, "\n");
        const uint8_t *bpix = mb_pixels + 0 * luma_stride + 4;  /* sb_y=0, sb_x=1 → cols 4..7 within MB */
        fprintf(stderr, "  MB(3,0) block z=1 pre_deblock pixels:\n");
        for (int r = 0; r < 4; r++) {
            fprintf(stderr, "   ");
            for (int c = 0; c < 4; c++)
                fprintf(stderr, " %3u", bpix[r * luma_stride + c]);
            fprintf(stderr, "\n");
        }
        fprintf(stderr, "  MB(3,0) block z=1 P_rec (= pre_deblock - shift):\n");
        for (int r = 0; r < 4; r++) {
            fprintf(stderr, "   ");
            for (int c = 0; c < 4; c++)
                fprintf(stderr, " %3u", capdiag->predicted[(0*4+r) * 16 + (1*4+c)]);
            fprintf(stderr, "\n");
        }
        /* And what daedalus_decoder SHOULD produce: clip(P_rec + shift). */
        fprintf(stderr, "  MB(3,0) block z=1 expected daedalus output = clip(P_rec + shift):\n");
        for (int r = 0; r < 4; r++) {
            fprintf(stderr, "   ");
            for (int c = 0; c < 4; c++) {
                int p_rec = capdiag->predicted[(0*4+r) * 16 + (1*4+c)];
                int sh = (idct_print[r*4+c] + 32) >> 6;
                int e = p_rec + sh;
                if (e < 0) e = 0; if (e > 255) e = 255;
                fprintf(stderr, " %3d", e);
            }
            fprintf(stderr, "\n");
        }
    }
#endif
}
#endif

/* Extract one MB's predicted-samples block from an AVFrame (stock
 * libavcodec) and pack it into the 384-byte mb_input.predicted layout:
 * 16x16 luma raster, then 8x8 Cb raster, then 8x8 Cr raster.
 *
 * fr     source frame; AV_PIX_FMT_YUV420P (separate U / V planes) and
 *        AV_PIX_FMT_NV12 (interleaved UV in data[1]) are handled.
 * mb_x   macroblock column, in units of 16 luma pixels.
 * mb_y   macroblock row, in units of 16 luma pixels.
 * out    receives 256 luma + 64 Cb + 64 Cr bytes.
 *
 * Any other pixel format gets its chroma zeroed so the comparison fails
 * loudly instead of silently reading the wrong plane layout.
 *
 * Each plane is addressed with its own linesize, and all offsets are
 * computed in ptrdiff_t so large frames cannot overflow int arithmetic. */
static void pack_mb_predicted(const AVFrame *fr, int mb_x, int mb_y,
                               uint8_t out[384])
{
    /* Luma: 16 rows × 16 cols */
    const ptrdiff_t y_ls = fr->linesize[0];
    const uint8_t *y_src = fr->data[0]
        + (ptrdiff_t) mb_y * 16 * y_ls + (ptrdiff_t) mb_x * 16;
    for (int r = 0; r < 16; r++)
        memcpy(&out[r * 16], y_src + r * y_ls, 16);

    /* Chroma: 8 rows × 8 cols per component */
    uint8_t *out_cb = &out[256];
    uint8_t *out_cr = &out[256 + 64];

    if (fr->format == AV_PIX_FMT_YUV420P) {
        /* Cb and Cr are separate planes; each uses its OWN stride for both
         * the MB-row offset and the per-row step. */
        const ptrdiff_t cb_ls = fr->linesize[1];
        const ptrdiff_t cr_ls = fr->linesize[2];
        const uint8_t *cb_src = fr->data[1]
            + (ptrdiff_t) mb_y * 8 * cb_ls + (ptrdiff_t) mb_x * 8;
        const uint8_t *cr_src = fr->data[2]
            + (ptrdiff_t) mb_y * 8 * cr_ls + (ptrdiff_t) mb_x * 8;
        for (int r = 0; r < 8; r++) {
            memcpy(&out_cb[r * 8], cb_src + r * cb_ls, 8);
            memcpy(&out_cr[r * 8], cr_src + r * cr_ls, 8);
        }
    } else if (fr->format == AV_PIX_FMT_NV12) {
        /* NV12: interleaved UV plane (16 bytes per MB row), deinterleave
         * into the Cb / Cr halves. */
        const ptrdiff_t uv_ls = fr->linesize[1];
        const uint8_t *uv_src = fr->data[1]
            + (ptrdiff_t) mb_y * 8 * uv_ls + (ptrdiff_t) mb_x * 16;
        for (int r = 0; r < 8; r++) {
            const uint8_t *row = uv_src + r * uv_ls;
            for (int c = 0; c < 8; c++) {
                out_cb[r * 8 + c] = row[c * 2 + 0];
                out_cr[r * 8 + c] = row[c * 2 + 1];
            }
        }
    } else {
        /* Unsupported pixel format — zero out chroma (test will fail loud) */
        memset(out_cb, 0, 128);
    }
}

/* Convert an AVFrame (YUV420P or NV12) to NV12 in caller-provided
 * planes.  Used to write the reference YUV file.
 *
 * fr         source frame.
 * out_y      destination luma plane, `height` rows of `y_stride` bytes.
 * out_uv     destination interleaved UV plane, `height / 2` rows of
 *            `uv_stride` bytes.
 * width      bytes copied per luma row (and per UV row).
 * height     number of luma rows.
 *
 * For any other pixel format the UV plane is filled with 0x80 (neutral
 * chroma) so the output is deterministic rather than whatever the caller's
 * buffer happened to contain.
 *
 * Source rows are addressed with ptrdiff_t offsets, so a frame with
 * negative linesize is walked correctly. */
static void avframe_to_nv12(const AVFrame *fr, uint8_t *out_y, size_t y_stride,
                             uint8_t *out_uv, size_t uv_stride,
                             int width, int height)
{
    const int chroma_rows = height / 2;

    /* Y plane: row-major copy from src linesize to dst stride */
    for (int r = 0; r < height; r++)
        memcpy(out_y + (size_t) r * y_stride,
               fr->data[0] + (ptrdiff_t) r * fr->linesize[0],
               (size_t) width);

    if (fr->format == AV_PIX_FMT_NV12) {
        /* Already interleaved: straight row copy. */
        for (int r = 0; r < chroma_rows; r++)
            memcpy(out_uv + (size_t) r * uv_stride,
                   fr->data[1] + (ptrdiff_t) r * fr->linesize[1],
                   (size_t) width);
    } else if (fr->format == AV_PIX_FMT_YUV420P) {
        /* Interleave U+V → NV12 UV */
        const int chroma_cols = width / 2;
        for (int r = 0; r < chroma_rows; r++) {
            const uint8_t *u_row = fr->data[1] + (ptrdiff_t) r * fr->linesize[1];
            const uint8_t *v_row = fr->data[2] + (ptrdiff_t) r * fr->linesize[2];
            uint8_t *dst = out_uv + (size_t) r * uv_stride;
            for (int c = 0; c < chroma_cols; c++) {
                dst[(size_t) c * 2 + 0] = u_row[c];
                dst[(size_t) c * 2 + 1] = v_row[c];
            }
        }
    } else {
        /* Unsupported pixel format: emit a defined, recognisable plane. */
        for (int r = 0; r < chroma_rows; r++)
            memset(out_uv + (size_t) r * uv_stride, 0x80, (size_t) width);
    }
}

static int parse_args(int argc, char **argv,
                       const char **in_path,
                       const char **out_dadec_path,
                       const char **out_ref_path)
{
    int i = 1;
    while (i < argc && argv[i][0] == '-') {
        if (!strcmp(argv[i], "--substrate") && i + 1 < argc) {
            substrate_str = argv[++i];
        } else if (!strcmp(argv[i], "--max-frames") && i + 1 < argc) {
            max_frames = atoi(argv[++i]);
        } else {
            fprintf(stderr, "unknown option: %s\n", argv[i]);
            return -1;
        }
        i++;
    }
    if (argc - i != 3) {
        fprintf(stderr,
            "usage: %s [--substrate cpu|qpu|auto] [--max-frames N] "
            "<input.h264> <output_dadec.yuv> <output_ref.yuv>\n", argv[0]);
        return -1;
    }
    *in_path        = argv[i + 0];
    *out_dadec_path = argv[i + 1];
    *out_ref_path   = argv[i + 2];
    return 0;
}

/* Map a --substrate argument to the decoder's substrate enum.
 *
 * "cpu" and "qpu" select the matching substrate; "auto" selects AUTO.
 * Any other string also yields AUTO, but a warning is printed to stderr so
 * a mistyped name does not pass unnoticed. */
static daedalus_decoder_substrate parse_substrate(const char *s)
{
    if (!strcmp(s, "cpu"))  return DAEDALUS_DECODER_SUBSTRATE_CPU;
    if (!strcmp(s, "qpu"))  return DAEDALUS_DECODER_SUBSTRATE_QPU;
    if (strcmp(s, "auto") != 0)
        fprintf(stderr, "warning: unknown substrate '%s', using auto\n", s);
    return DAEDALUS_DECODER_SUBSTRATE_AUTO;
}

int main(int argc, char **argv)
{
    const char *in_path, *out_dadec_path, *out_ref_path;
    if (parse_args(argc, argv, &in_path, &out_dadec_path, &out_ref_path) != 0)
        return 1;

    /* ---- Open input via libavformat (so we get NAL framing for free
     * from the raw .h264 elementary stream demuxer). ---- */
    AVFormatContext *fmt = NULL;
    if (avformat_open_input(&fmt, in_path, NULL, NULL) < 0) {
        fprintf(stderr, "avformat_open_input(%s) failed\n", in_path);
        return 2;
    }
    if (avformat_find_stream_info(fmt, NULL) < 0) {
        fprintf(stderr, "avformat_find_stream_info failed\n");
        avformat_close_input(&fmt); return 2;
    }
    int vstream = -1;
    for (unsigned s = 0; s < fmt->nb_streams; s++)
        if (fmt->streams[s]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
            vstream = (int) s; break;
        }
    if (vstream < 0) {
        fprintf(stderr, "no video stream in %s\n", in_path);
        avformat_close_input(&fmt); return 2;
    }

    /* ---- Open H.264 decoder ---- */
    const AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_H264);
    AVCodecContext *avctx = avcodec_alloc_context3(codec);
    avcodec_parameters_to_context(avctx, fmt->streams[vstream]->codecpar);

#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
    /* Patch 0017's coefficient side buffer lives in H264Context (single
     * per-stream); multi-threaded slice decode would race on it. */
    avctx->thread_count     = 1;
    avctx->thread_type      = 0;
    /* PR-A6: keep libavcodec's deblock ON so AVFrame is the post-deblock
     * reference we validate daedalus against.  Per-MB pre_deblock
     * snapshots taken in the inspection callback (before deblock crosses
     * into this MB's region) provide daedalus with pre-deblock input. */
#endif

    if (avcodec_open2(avctx, codec, NULL) < 0) {
        fprintf(stderr, "avcodec_open2 failed\n");
        avformat_close_input(&fmt); return 2;
    }

    AVPacket *pkt = av_packet_alloc();
    AVFrame  *fr  = av_frame_alloc();

    /* ---- Allocate output buffers + state needed before first decode ---- */
    daedalus_decoder *dec = NULL;
    uint8_t *out_y_dadec = NULL, *out_uv_dadec = NULL;
    uint8_t *out_y_ref   = NULL, *out_uv_ref   = NULL;
    size_t y_size = 0, uv_size = 0;
    FILE *out_dadec_f = NULL, *out_ref_f = NULL;
    int rc = 0;
    int n_frames = 0;
    size_t total_y_diffs = 0, total_uv_diffs = 0;

#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
    /* Init inspect state BEFORE the first avcodec_send_packet — the
     * callback fires from inside send_packet (i.e. before the first
     * receive_frame ever returns), so lazy-init after-the-fact
     * would miss the entire first frame.  Use codecpar dims; round
     * up to MB granularity (H.264 codes 1080 height as 1088). */
    struct inspect_state inspect_st = {0};
    {
        const AVCodecParameters *cp = fmt->streams[vstream]->codecpar;
        const int W_round = (cp->width  + 15) & ~15;
        const int H_round = (cp->height + 15) & ~15;
        inspect_st.mb_w = W_round / 16;
        inspect_st.mb_h = H_round / 16;
        inspect_st.seen = calloc(1, (size_t) inspect_st.mb_w * inspect_st.mb_h);
        if (!inspect_st.seen) { rc = 1; goto cleanup; }
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
        inspect_st.captures = calloc((size_t) inspect_st.mb_w * inspect_st.mb_h,
                                      sizeof(*inspect_st.captures));
        if (!inspect_st.captures) { rc = 1; goto cleanup; }
#endif
    }
    ff_h264_set_mb_inspect_cb(avctx, inspect_cb, &inspect_st);
    int inspect_total_cbs       = 0;
    int inspect_total_duplicate = 0;
    int inspect_total_oob       = 0;
    int inspect_total_missing   = 0;
#endif

    /* ---- daedalus_decoder is lazy-created on the first AVFrame
     * (coded width/height come from the bitstream's SPS via
     * libavcodec). ---- */

    while (av_read_frame(fmt, pkt) >= 0) {
        if (pkt->stream_index != vstream) { av_packet_unref(pkt); continue; }

        if (avcodec_send_packet(avctx, pkt) < 0) {
            fprintf(stderr, "send_packet failed\n");
            rc = 2; goto cleanup;
        }
        av_packet_unref(pkt);

        for (;;) {
            int ret = avcodec_receive_frame(avctx, fr);
            if (ret == AVERROR(EAGAIN)) break;
            if (ret < 0) {
                fprintf(stderr, "receive_frame failed: %d\n", ret);
                rc = 2; goto cleanup;
            }

            /* Lazily create the daedalus_decoder + output planes on
             * the first frame so the SPS-derived coded width/height
             * are known. */
            if (!dec) {
                /* Coded (= MB-aligned) dimensions are on AVCodecContext,
                 * not AVFrame (which carries the cropped display size). */
                const int W = avctx->coded_width  ? avctx->coded_width  : fr->width;
                const int H = avctx->coded_height ? avctx->coded_height : fr->height;
                if ((W & 15) || (H & 15)) {
                    fprintf(stderr, "coded dims %dx%d not mod-16; skip\n", W, H);
                    rc = 2; goto cleanup;
                }
                dec = daedalus_decoder_create(W, H);
                if (!dec) {
                    fprintf(stderr, "daedalus_decoder_create failed\n");
                    rc = 3; goto cleanup;
                }
                daedalus_decoder_set_substrate(dec, parse_substrate(substrate_str));
                y_size  = (size_t) W * (size_t) H;
                uv_size = y_size / 2;
                out_y_dadec  = malloc(y_size);
                out_uv_dadec = malloc(uv_size);
                out_y_ref    = malloc(y_size);
                out_uv_ref   = malloc(uv_size);
                out_dadec_f  = fopen(out_dadec_path, "wb");
                out_ref_f    = fopen(out_ref_path,   "wb");
                if (!out_y_dadec || !out_uv_dadec || !out_y_ref || !out_uv_ref ||
                    !out_dadec_f || !out_ref_f) {
                    fprintf(stderr, "alloc / fopen failed\n");
                    rc = 1; goto cleanup;
                }
                printf("daedalus_decode_h264: %dx%d, substrate=%s\n",
                       W, H, substrate_str);
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
                printf("  inspection callback: ACTIVE (patched libavcodec); "
                       "mb-grid %dx%d\n", inspect_st.mb_w, inspect_st.mb_h);
#else
                printf("  inspection callback: not built in (stock libavcodec)\n");
#endif
            }

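            /* Pack each MB's predicted samples from the AVFrame.
             * Default path (DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS not
             * defined): coeffs = 0 and no edges, so daedalus_decoder
             * reproduces exactly the AVFrame pixels.  When that macro is
             * defined, the per-MB captures further down override the
             * predicted samples, the luma coeffs and the deblock edges.
             * Use coded_width/coded_height for MB-grid alignment
             * (e.g. 1920x1088 for 1080p display). */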
            const int coded_w = avctx->coded_width  ? avctx->coded_width  : avctx->width;
            const int coded_h = avctx->coded_height ? avctx->coded_height : avctx->height;
            const int mb_w = coded_w / 16;
            const int mb_h = coded_h / 16;
            uint8_t mb_pred[384];
            int16_t mb_coeffs[384] = {0};
            struct daedalus_decoder_edge mb_edges[16];
            struct daedalus_decoder_mb_input mb = {0};
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
            /* PR-A6 edge derivation: a = 52 + slice_alpha_c0_offset,
             * b = 52 + slice_beta_offset (per FFmpeg loopfilter.c
             * convention; absorbs the offset into the tripled tables). */
            const int slice_a = 52 + inspect_st.slice_alpha_c0_offset;
            const int slice_b = 52 + inspect_st.slice_beta_offset;
            /* FFmpeg's h264_slice.c inverts the spec's disable_deblocking_filter_idc
             * via `sl->deblocking_filter ^= 1` (line ~1901).  Internal convention:
             *   0 = disabled       (spec = 1)
             *   1 = enabled        (spec = 0)
             *   2 = enabled-but-not-across-slice-boundaries  (unchanged)
             * So deblock is OFF iff sl->deblocking_filter == 0. */
            const int deblock_off = inspect_st.slice_deblock_disable == 0;
#endif
            for (int my = 0; my < mb_h; my++) {
                for (int mx = 0; mx < mb_w; mx++) {
                    /* Default: identity-passthrough — luma from AVFrame,
                     * chroma from AVFrame, coeffs all zero, no edges. */
                    pack_mb_predicted(fr, mx, my, mb_pred);
                    memset(mb_coeffs, 0, sizeof(mb_coeffs));
                    int n_edges = 0;

#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
                    /* PR-A6: feed daedalus pre-deblock pixels from the
                     * per-MB snapshots taken in the callback (AVFrame is
                     * now post-deblock — used as reference, not as input). */
                    const int mb_idx = my * mb_w + mx;
                    const struct mb_capture *cap = &inspect_st.captures[mb_idx];

                    /* Luma: P_rec for real-coeffs MBs, raw pre-deblock snap
                     * otherwise (with zero coeffs).  Both produce the same
                     * pre-deblock state after daedalus IDCT-add. */
                    if (cap->valid) {
                        memcpy(mb_pred, cap->predicted, 256);
                        for (int i = 0; i < 256; i++)
                            mb_coeffs[i] = cap->coeffs[i];
                    } else {
                        memcpy(mb_pred, cap->pre_deblock_snap_y, 256);
                    }
                    /* Chroma: always identity-passthrough from snap.
                     * Chroma DC Hadamard + chroma residual extraction is
                     * a follow-up (PR-A4). */
                    memcpy(mb_pred + 256,       cap->pre_deblock_snap_cb, 64);
                    memcpy(mb_pred + 256 + 64,  cap->pre_deblock_snap_cr, 64);

                    /* Derive deblock edges for this MB.  Spec §8.7.2:
                     * - Frame-boundary edges: skip (bS=0 — kernel reads p3 at -4).
                     * - MB-boundary edges with intra neighbour: bS=4.
                     * - Internal MB edges within intra MB: bS=3.
                     * - 8x8 DCT MBs: internal edges only at col/row 8 (the
                     *   single 8x8-block boundary inside the MB).
                     * For non-intra MB types in mixed streams the bS rules
                     * differ; we'd need cbp/MV/ref info from sl context for
                     * those.  Our test stream is all-intra, so simplified. */
                    if (!deblock_off && cap->mb_type_intra && !getenv("DAEDALUS_SKIP_EDGES")) {
                        const int qp_self  = cap->qp_y;
                        const int qp_left  = (mx > 0)
                            ? inspect_st.captures[mb_idx - 1].qp_y : qp_self;
                        const int qp_top   = (my > 0)
                            ? inspect_st.captures[mb_idx - mb_w].qp_y : qp_self;
                        const int qpc_self = chroma_qp_table[qp_self];
                        const int qpc_left = chroma_qp_table[qp_left];
                        const int qpc_top  = chroma_qp_table[qp_top];
                        const int qp_avg_left  = (qp_self  + qp_left  + 1) >> 1;
                        const int qp_avg_top   = (qp_self  + qp_top   + 1) >> 1;
                        const int qpc_avg_left = (qpc_self + qpc_left + 1) >> 1;
                        const int qpc_avg_top  = (qpc_self + qpc_top  + 1) >> 1;

                        /* Helper macro to emit one edge.  bS=0 (skip)
                         * edges are still emitted with bS=0 — daedalus's
                         * partitioner filters them out. */
                        #define EMIT_EDGE(orient_, plane_, edge_idx_, bS_, qp_) do { \
                            if (n_edges >= 16) break;                                \
                            struct daedalus_decoder_edge *e = &mb_edges[n_edges++];  \
                            e->mb_x     = (uint16_t) mx;                             \
                            e->mb_y     = (uint16_t) my;                             \
                            e->edge_idx = (uint8_t)  (edge_idx_);                    \
                            e->orient   = (uint8_t)  (orient_);                      \
                            e->plane    = (uint8_t)  (plane_);                       \
                            e->bS       = (uint8_t)  (bS_);                          \
                            e->alpha    = alpha_table[(qp_) + slice_a];              \
                            e->beta     = beta_table [(qp_) + slice_b];              \
                            const int8_t *tc = tc0_table[(qp_) + slice_a];           \
                            e->tc0[0] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
                            e->tc0[1] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
                            e->tc0[2] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
                            e->tc0[3] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
                        } while (0)

                        /* Luma V edges: 4 at col 0, 4, 8, 12.  Internal
                         * edges at 4/12 are skipped for 8x8 DCT MBs. */
                        EMIT_EDGE(0, 0, 0, (mx > 0) ? 4 : 0, qp_avg_left);
                        if (!cap->transform_8x8) EMIT_EDGE(0, 0, 1, 3, qp_self);
                        EMIT_EDGE(0, 0, 2, 3, qp_self);
                        if (!cap->transform_8x8) EMIT_EDGE(0, 0, 3, 3, qp_self);

                        /* Luma H edges: 4 at row 0, 4, 8, 12. */
                        EMIT_EDGE(1, 0, 0, (my > 0) ? 4 : 0, qp_avg_top);
                        if (!cap->transform_8x8) EMIT_EDGE(1, 0, 1, 3, qp_self);
                        EMIT_EDGE(1, 0, 2, 3, qp_self);
                        if (!cap->transform_8x8) EMIT_EDGE(1, 0, 3, 3, qp_self);

                        /* Chroma V edges: 2 per plane (Cb=1, Cr=2). */
                        EMIT_EDGE(0, 1, 0, (mx > 0) ? 4 : 0, qpc_avg_left);
                        if (!cap->transform_8x8) EMIT_EDGE(0, 1, 1, 3, qpc_self);
                        EMIT_EDGE(0, 2, 0, (mx > 0) ? 4 : 0, qpc_avg_left);
                        if (!cap->transform_8x8) EMIT_EDGE(0, 2, 1, 3, qpc_self);

                        /* Chroma H edges. */
                        EMIT_EDGE(1, 1, 0, (my > 0) ? 4 : 0, qpc_avg_top);
                        if (!cap->transform_8x8) EMIT_EDGE(1, 1, 1, 3, qpc_self);
                        EMIT_EDGE(1, 2, 0, (my > 0) ? 4 : 0, qpc_avg_top);
                        if (!cap->transform_8x8) EMIT_EDGE(1, 2, 1, 3, qpc_self);

                        #undef EMIT_EDGE
                    }
#endif

                    mb.mb_x        = (uint16_t) mx;
                    mb.mb_y        = (uint16_t) my;
                    mb.transform_8x8 = 0;
                    mb.coeffs      = mb_coeffs;
                    mb.predicted   = mb_pred;
                    mb.edges       = (n_edges > 0) ? mb_edges : NULL;
                    mb.n_edges     = (uint8_t) n_edges;
                    if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                        fprintf(stderr, "append_mb (%d,%d) failed\n", mx, my);
                        rc = 3; goto cleanup;
                    }
                }
            }

            int frc = daedalus_decoder_flush_frame(dec,
                                                    out_y_dadec,  (size_t) coded_w,
                                                    out_uv_dadec, (size_t) coded_w);
            if (frc != 0) {
                fprintf(stderr, "flush_frame frame %d rc=%d\n", n_frames, frc);
                rc = 3; goto cleanup;
            }

            /* Build the reference NV12 from the AVFrame for comparison. */
            avframe_to_nv12(fr, out_y_ref,  (size_t) coded_w,
                                out_uv_ref, (size_t) coded_w,
                                coded_w, coded_h);

            /* (PR-A3b's pre_deblock vs AVFrame DIAG check is removed in
             * PR-A6: with libavcodec's deblock now ENABLED, AVFrame is
             * post-deblock and intentionally differs from the per-MB
             * pre_deblock snapshots taken in the callback.) */

            /* Byte-exact compare + first-diff diagnostic. */
            size_t y_diffs = 0, uv_diffs = 0;
            size_t y_first_diff = (size_t) -1;
            for (size_t i = 0; i < y_size; i++)
                if (out_y_dadec[i] != out_y_ref[i]) {
                    if (y_first_diff == (size_t) -1) y_first_diff = i;
                    y_diffs++;
                }
            for (size_t i = 0; i < uv_size; i++)
                if (out_uv_dadec[i] != out_uv_ref[i]) uv_diffs++;
            if (y_diffs && y_first_diff != (size_t) -1) {
                const size_t row = y_first_diff / (size_t) avctx->width;
                const size_t col = y_first_diff % (size_t) avctx->width;
                const size_t mb_x = col / 16;
                const size_t mb_y = row / 8;  /* not row/16 — chroma row uses /8 so use raw row here */
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
                const int mb_idx = (int)(row / 16) * mb_w + (int) mb_x;
                const int real = (mb_idx >= 0 && mb_idx < mb_w * mb_h)
                                  ? inspect_st.captures[mb_idx].valid : -1;
                printf("    first Y diff @ byte %zu = (row %zu, col %zu) in MB(%zu,%zu) [real-coeffs=%d]; "
                       "dadec=%u ref=%u\n",
                       y_first_diff, row, col, mb_x, row / 16,
                       real, out_y_dadec[y_first_diff], out_y_ref[y_first_diff]);
#else
                (void) mb_x; (void) mb_y;
                printf("    first Y diff @ byte %zu = (row %zu, col %zu); dadec=%u ref=%u\n",
                       y_first_diff, row, col,
                       out_y_dadec[y_first_diff], out_y_ref[y_first_diff]);
#endif
            }
            total_y_diffs  += y_diffs;
            total_uv_diffs += uv_diffs;
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
            {
                const int expected = mb_w * mb_h;
                /* Count MBs that fired the callback. */
                int seen_count = 0;
                for (int i = 0; i < expected; i++)
                    if (inspect_st.seen[i]) seen_count++;
                int missing = expected - seen_count;
                if (missing || inspect_st.duplicate_mbs || inspect_st.out_of_bounds) {
                    fprintf(stderr,
                        "  frame %d: callback invariants: fired=%d expected=%d "
                        "missing=%d duplicates=%d oob=%d\n",
                        n_frames, inspect_st.n_cbs_this_frame, expected,
                        missing, inspect_st.duplicate_mbs, inspect_st.out_of_bounds);
                    rc = 4;
                }
                inspect_total_cbs       += inspect_st.n_cbs_this_frame;
                inspect_total_duplicate += inspect_st.duplicate_mbs;
                inspect_total_oob       += inspect_st.out_of_bounds;
                inspect_total_missing   += missing;
                /* Reset for next frame. */
                inspect_st.n_cbs_this_frame = 0;
                inspect_st.duplicate_mbs    = 0;
                inspect_st.out_of_bounds    = 0;
                memset(inspect_st.seen, 0, (size_t) expected);

#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
                printf("  frame %d: real-coeffs path %d MBs, "
                       "skipped intra16x16=%d 8x8dct=%d other=%d\n",
                       n_frames, inspect_st.real_coeffs_mbs,
                       inspect_st.skipped_intra16x16,
                       inspect_st.skipped_8x8dct,
                       inspect_st.skipped_other);
                inspect_st.real_coeffs_mbs      = 0;
                inspect_st.skipped_intra16x16   = 0;
                inspect_st.skipped_8x8dct       = 0;
                inspect_st.skipped_other        = 0;
                memset(inspect_st.captures, 0,
                       (size_t) expected * sizeof(*inspect_st.captures));
#endif
            }
#endif
            printf("  frame %d: Y diff %zu/%zu  UV diff %zu/%zu%s\n",
                   n_frames, y_diffs, y_size, uv_diffs, uv_size,
                   (y_diffs || uv_diffs) ? "  ***" : "");

            /* Write both YUVs to disk. */
            fwrite(out_y_dadec,  1, y_size,  out_dadec_f);
            fwrite(out_uv_dadec, 1, uv_size, out_dadec_f);
            fwrite(out_y_ref,    1, y_size,  out_ref_f);
            fwrite(out_uv_ref,   1, uv_size, out_ref_f);

            n_frames++;
            if (max_frames > 0 && n_frames >= max_frames) goto drained;
        }
    }
    /* Flush libavcodec for any remaining buffered frames. */
    avcodec_send_packet(avctx, NULL);
    for (;;) {
        int ret = avcodec_receive_frame(avctx, fr);
        if (ret < 0) break;
        (void) ret;
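        /* Same loop body as above would go here; omitted for brevity —
         * stock libavcodec rarely buffers I-only streams.  NOTE: frames
         * drained here are still counted in n_frames, but they are not
         * run through daedalus-decoder, not compared, and not written
         * to either output file. */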
        n_frames++;
    }

drained:
    printf("\n%d frames decoded; total Y diff %zu, UV diff %zu\n",
           n_frames, total_y_diffs, total_uv_diffs);
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
    printf("inspection callback: %d total invocations, %d missing, %d duplicates, %d oob\n",
           inspect_total_cbs, inspect_total_missing, inspect_total_duplicate, inspect_total_oob);
    if (inspect_total_missing || inspect_total_duplicate || inspect_total_oob)
        rc = 4;
#endif
    if (rc == 0 && (total_y_diffs || total_uv_diffs)) {
        printf("FAIL: daedalus-decoder output does NOT match libavcodec reference byte-for-byte\n");
        rc = 4;
    } else if (rc == 0) {
        printf("PASS: byte-exact identity-passthrough across %d frames\n", n_frames);
    } else {
        printf("FAIL: %s\n",
               (total_y_diffs || total_uv_diffs) ? "byte-exact comparison failed"
                                                   : "inspection callback invariants violated");
    }

cleanup:
    if (out_dadec_f) fclose(out_dadec_f);
    if (out_ref_f)   fclose(out_ref_f);
    free(out_uv_ref);  free(out_y_ref);
    free(out_uv_dadec);free(out_y_dadec);
#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
    free(inspect_st.seen);
#  ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
    free(inspect_st.captures);
#  endif
#endif
    if (dec)   daedalus_decoder_destroy(dec);
    av_frame_free(&fr);
    av_packet_free(&pkt);
    avcodec_free_context(&avctx);
    avformat_close_input(&fmt);
    return rc;
}