Install daedalus-decoder.pc for sibling consumers

Adds pkg-config plumbing so consumers (daedalus-v4l2 daemon for the upcoming PR-Q3a shadow-mode wiring; the daedalus_decode_h264 CLI when built outside this tree) can locate libdaedalus_decoder.a + the public header via pkg_check_modules / pkg-config. Mirrors daedalus-fourier's relocatable-prefix scheme: prefix is derived from ${pcfiledir} so cmake --install --prefix /foo produces a .pc that resolves to /foo at lookup time. Verified across two install prefixes. daedalus-fourier is declared as a public Requires: because consumers static-linking libdaedalus_decoder.a also need libdaedalus_core.a in their link line to resolve the daedalus_ctx_* / daedalus_recipe_* symbols this archive references. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Merge pull request 'PR-A6: enable libavcodec deblock + drive daedalus deblock on real streams' (#16 ) from noether/tools-h264-deblock-validation into main
2026-05-26 13:32:58 +02:00 · 2026-05-26 10:12:30 +00:00 · 2026-05-26 11:53:23 +02:00
2 changed files with 282 additions and 66 deletions
@@ -237,12 +237,48 @@ endif()

 # ---- Install ------------------------------------------------------
 #
-# Library + public header.  Stage 2/3 will add a pkg-config file and
-# CMake config exports once the API stabilises; pre-0.1 the scaffold
-# install just gives the static archive a home.
+# Installs:
+#   - libdaedalus_decoder.a       → ${CMAKE_INSTALL_LIBDIR}
+#   - include/daedalus_decoder.h  → ${CMAKE_INSTALL_INCLUDEDIR}
+#   - daedalus-decoder.pc         → ${CMAKE_INSTALL_LIBDIR}/pkgconfig
+#
+# The .pc lets sibling consumers (daedalus-v4l2 daemon, the
+# daedalus_decode_h264 CLI when built externally) discover the static
+# archive + headers via pkg-config.  daedalus-fourier is declared as a
+# public `Requires:` because the consumer (which static-links
+# libdaedalus_decoder.a) also needs daedalus-fourier in its own link
+# line to resolve the daedalus_ctx_* / daedalus_recipe_* symbols this
+# archive references.
+#
+# Relocatable-prefix scheme mirrors daedalus-fourier's .pc generation:
+# `prefix` is derived from ${pcfiledir} so `cmake --install --prefix /foo`
+# produces a .pc that resolves prefix=/foo at lookup time, regardless of
+# what CMAKE_INSTALL_PREFIX was at configure time.

 include(GNUInstallDirs)
 install(TARGETS daedalus_decoder
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
 install(FILES include/daedalus_decoder.h
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+# Compute the prefix/libdir/includedir values written into the .pc.
+#
+# Normal case (relative CMAKE_INSTALL_LIBDIR / CMAKE_INSTALL_INCLUDEDIR):
+# `prefix` is expressed relative to ${pcfiledir}, so the installed .pc is
+# relocatable.
+#
+# GNUInstallDirs also permits absolute CMAKE_INSTALL_<dir> values.  In
+# that case a path relative to the prefix is meaningless (and
+# "${prefix}/<absolute path>" would be wrong), so fall back to the
+# absolute CMAKE_INSTALL_FULL_<dir> paths.  That .pc is not relocatable.
+if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}" OR IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
+    set(PKGCONFIG_PREFIX     "${CMAKE_INSTALL_PREFIX}")
+    set(PKGCONFIG_LIBDIR     "${CMAKE_INSTALL_FULL_LIBDIR}")
+    set(PKGCONFIG_INCLUDEDIR "${CMAKE_INSTALL_FULL_INCLUDEDIR}")
+else()
+    file(RELATIVE_PATH PKGCONFIG_PCDIR_TO_PREFIX
+        "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig"
+        "${CMAKE_INSTALL_PREFIX}")
+    # \${...} keeps the reference literal so pkg-config (not CMake) expands it.
+    set(PKGCONFIG_PREFIX     "\${pcfiledir}/${PKGCONFIG_PCDIR_TO_PREFIX}")
+    set(PKGCONFIG_LIBDIR     "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
+    set(PKGCONFIG_INCLUDEDIR "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
+
+# The .pc is generated in the build tree at configure time, then installed.
+set(PKGCONFIG_OUT "${CMAKE_CURRENT_BINARY_DIR}/daedalus-decoder.pc")
+file(WRITE "${PKGCONFIG_OUT}"
+"prefix=${PKGCONFIG_PREFIX}
+exec_prefix=\${prefix}
+libdir=${PKGCONFIG_LIBDIR}
+includedir=${PKGCONFIG_INCLUDEDIR}
+
+Name: daedalus-decoder
+Description: Frame-major H.264 decoder on V3D7 via daedalus-fourier primitives
+Version: ${PROJECT_VERSION}
+Libs: -L\${libdir} -ldaedalus_decoder
+Requires: daedalus-fourier
+Cflags: -I\${includedir}
+")
+install(FILES "${PKGCONFIG_OUT}"
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig
+)
@@ -105,12 +105,15 @@ static int   max_frames = -1;
 * no PCM).  Other MBs stay on identity-passthrough. */
 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
 struct mb_capture {
-    int     valid;              /* 1 = real-coeffs path, 0 = identity passthrough */
-    int16_t coeffs[256];        /* luma, column-major within 4x4, raster block order */
+    int     valid;              /* 1 = real-coeffs IDCT path, 0 = identity (main loop feeds pre_deblock_snap_y as predicted, zero coeffs) */
+    int16_t coeffs[256];        /* luma, raster block order, raw sl->mb layout */
    uint8_t predicted[256];     /* luma P recovered = pre_deblock - clipped IDCT(C) */
-    uint8_t pre_deblock_snap[256]; /* DIAGNOSTIC: pre_deblock at callback time;
-                                    * compared against AVFrame post-receive_frame
-                                    * to detect h->cur_pic.f vs AVFrame divergence */
+    uint8_t pre_deblock_snap_y[256];  /* luma 16×16 pre-deblock at callback time; packed rows, stride 16 */
+    uint8_t pre_deblock_snap_cb[64];  /* Cb 8×8 pre-deblock at callback time; packed rows, stride 8 */
+    uint8_t pre_deblock_snap_cr[64];  /* Cr 8×8 pre-deblock at callback time; packed rows, stride 8 */
+    int     qp_y;               /* QP_Y for this MB (h->cur_pic.qscale_table[mb_xy] at callback time) */
+    int     mb_type_intra;      /* 1 if MB is intra (any flavour), 0 otherwise; edges are only derived for intra MBs */
+    int     transform_8x8;      /* 1 if 8×8 DCT (affects which internal edges fire) */
 };

 struct inspect_state {
@@ -121,14 +124,98 @@ struct inspect_state {
    int       out_of_bounds;
 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
    struct mb_capture *captures;        /* mb_w * mb_h entries */
-    int       real_coeffs_mbs;          /* count of MBs in real-coeffs path this frame */
+    int       real_coeffs_mbs;          /* count of MBs in real-coeffs IDCT path this frame */
    int       skipped_intra16x16;
    int       skipped_8x8dct;
    int       skipped_other;
+    /* Slice-level deblock params, re-read from slice_ctx[0] on every
+     * callback (last value wins).  Per H.264 spec these are constant per
+     * slice; we assume single-slice frames in our test stream. */
+    int       slice_alpha_c0_offset;
+    int       slice_beta_offset;
+    int       slice_deblock_disable;    /* raw sl->deblocking_filter (FFmpeg-internal, NOT the spec idc): 0 = deblock off despite the name */
 #endif
 };

 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+/* H.264 §8.7.2.2/8.7.2.3 deblock filter tables — transcribed verbatim
+ * from FFmpeg libavcodec/h264_loopfilter.c (LGPL-2.1+; algorithm + table
+ * values come from the H.264 spec which is normative and unpatented).
+ * Tables are size 52*3 — FFmpeg's trick to absorb slice_alpha_c0_offset +
+ * slice_beta_offset (in -12..+12) into the index without bounds-clamping.
+ * Usage: alpha = alpha_table[qp + a]  where a = 52 + slice_alpha_c0_offset
+ * (8-bit only; high-bit-depth subtracts qp_bd_offset). */
+static const uint8_t alpha_table[52*3] = {  /* read as alpha_table[qp + 52 + slice_alpha_c0_offset] */
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* entries 0..51: low-side padding, all 0 */
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* entries 52..103: the 52 real values, ten per row */
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+   255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,  /* entries 104..155: high-side padding, saturated at 255 */
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+};
+static const uint8_t beta_table[52*3] = {  /* read as beta_table[qp + 52 + slice_beta_offset] */
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* entries 0..51: low-side padding, all 0 */
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* entries 52..103: the 52 real values, ten per row */
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,  /* entries 104..155: high-side padding, saturated at 18 */
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+};
+static const int8_t tc0_table[52*3][4] = {  /* row = qp + 52 + slice_alpha_c0_offset; column = bS (column 0 is always -1) */
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },  /* rows 0..51: low-side padding */
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },  /* rows 52..103: the 52 real rows */
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
+    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
+    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
+    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
+    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
+    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
+    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },  /* rows 104..155: high-side padding, repeats the last real row */
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+};
+
+/* qP_y → qP_chroma mapping for chroma_qp_index_offset == 0 (cited here as
+ * H.264 §8.5.11 / Table 8-11).
+ * NOTE(review): recent editions of the spec appear to number the QPc-vs-qPI
+ * table as Table 8-15 in §8.5.8 — confirm the citation against the edition in use.
+ * For qP_y < 30, qP_c = qP_y.  Above that, the spec table compresses.
+ * The edge-derivation code indexes this table directly with the luma QP, so
+ * a stream with a non-zero chroma QP offset would need the offset applied
+ * (and the result clamped to 0..51) before the lookup. */
+static const uint8_t chroma_qp_table[52] = {
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
+    31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
+    39, 39, 39, 39,
+};
+
 /* libavcodec's sl->mb stores coefficients in RASTER (row-major) order,
 * not zig-zag scan order — h264_cavlc.c does
 *     block[*scantable] = (level * qmul[*scantable] + 32) >> 6
@@ -212,6 +299,16 @@ static void inspect_cb(void *opaque,
    st->n_cbs_this_frame++;

 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+    /* Capture slice-level deblock params.  This runs on every callback, so
+     * the stored values are simply the latest ones seen; per spec they're
+     * constant per slice, which is enough for our single-slice test streams. */
+    {
+        const H264SliceContext *sl = &h->slice_ctx[0];
+        st->slice_alpha_c0_offset = sl->slice_alpha_c0_offset;
+        st->slice_beta_offset     = sl->slice_beta_offset;
+        st->slice_deblock_disable = sl->deblocking_filter;
+    }
+
    /* Real-coeffs path: extract per-MB state for daedalus-decoder
     * IDCT validation on this MB.  Gate: only Intra_4x4 + 4x4 transform
     * + non-PCM is supported in PR-A3b — other MB flavours fall back
@@ -222,6 +319,30 @@ static void inspect_cb(void *opaque,
    const int mb_xy = mb_y * h->mb_stride + mb_x;
    const uint32_t mb_type = h->cur_pic.mb_type[mb_xy];

+    /* Capture state needed for deblock edge derivation, regardless
+     * of whether this MB takes the real-coeffs IDCT path. */
+    cap->qp_y           = h->cur_pic.qscale_table[mb_xy];
+    cap->mb_type_intra  = IS_INTRA(mb_type) ? 1 : 0;
+    cap->transform_8x8  = IS_8x8DCT(mb_type) ? 1 : 0;
+
+    /* Snapshot pre-deblock pixels for all 3 planes at this MB's position. */
+    {
+        const int y_stride  = h->cur_pic.f->linesize[0];
+        const int uv_stride = h->cur_pic.f->linesize[1];
+        const uint8_t *mb_y_px = h->cur_pic.f->data[0]
+            + (ptrdiff_t) mb_y * 16 * y_stride + mb_x * 16;
+        const uint8_t *mb_cb_px = h->cur_pic.f->data[1]
+            + (ptrdiff_t) mb_y * 8 * uv_stride + mb_x * 8;
+        const uint8_t *mb_cr_px = h->cur_pic.f->data[2]
+            + (ptrdiff_t) mb_y * 8 * uv_stride + mb_x * 8;
+        for (int r = 0; r < 16; r++)
+            memcpy(&cap->pre_deblock_snap_y[r * 16], &mb_y_px[r * y_stride], 16);
+        for (int r = 0; r < 8; r++) {
+            memcpy(&cap->pre_deblock_snap_cb[r * 8], &mb_cb_px[r * uv_stride], 8);
+            memcpy(&cap->pre_deblock_snap_cr[r * 8], &mb_cr_px[r * uv_stride], 8);
+        }
+    }
+
    if (!IS_INTRA4x4(mb_type)) {
        if (IS_INTRA16x16(mb_type))  st->skipped_intra16x16++;
        else                          st->skipped_other++;
@@ -236,11 +357,8 @@ static void inspect_cb(void *opaque,
    const uint8_t *mb_pixels = luma_plane + (ptrdiff_t) mb_y * 16 * luma_stride
                                          + mb_x * 16;

-    /* Diagnostic snapshot: capture the 16x16 luma block as we see it in
-     * cur_pic at callback time.  Compared against AVFrame contents after
-     * receive_frame returns; mismatch points at a buffer-divergence bug. */
-    for (int r = 0; r < 16; r++)
-        memcpy(&cap->pre_deblock_snap[r * 16], &mb_pixels[r * luma_stride], 16);
+    /* (The pre-deblock snapshots for all 3 planes are already populated above;
+     * the main loop later feeds them to daedalus as predicted input.) */

    /* Coefficients are in sl->mb at end of entropy decode but zeroed by
     * the time the callback fires (IDCT-add consumed them).  Patch 0017
@@ -485,12 +603,13 @@ int main(int argc, char **argv)

 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
    /* Patch 0017's coefficient side buffer lives in H264Context (single
-     * per-stream); multi-threaded slice decode would race on it.  Force
-     * single-thread.  Also disable libavcodec's deblock so AVFrame is
-     * pre-deblock and the P-recovery math is exact. */
+     * per-stream); multi-threaded slice decode would race on it. */
    avctx->thread_count     = 1;
    avctx->thread_type      = 0;
-    avctx->skip_loop_filter = AVDISCARD_ALL;
+    /* PR-A6: keep libavcodec's deblock ON so AVFrame is the post-deblock
+     * reference we validate daedalus against.  Per-MB pre_deblock
+     * snapshots taken in the inspection callback (before deblock crosses
+     * into this MB's region) provide daedalus with pre-deblock input. */
 #endif

    if (avcodec_open2(avctx, codec, NULL) < 0) {
@@ -611,26 +730,123 @@ int main(int argc, char **argv)
            const int mb_h = coded_h / 16;
            uint8_t mb_pred[384];
            int16_t mb_coeffs[384] = {0};
+            struct daedalus_decoder_edge mb_edges[16];
            struct daedalus_decoder_mb_input mb = {0};
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+            /* PR-A6 edge derivation: a = 52 + slice_alpha_c0_offset,
+             * b = 52 + slice_beta_offset (per FFmpeg loopfilter.c
+             * convention; absorbs the offset into the tripled tables). */
+            const int slice_a = 52 + inspect_st.slice_alpha_c0_offset;
+            const int slice_b = 52 + inspect_st.slice_beta_offset;
+            /* FFmpeg's h264_slice.c inverts the spec's disable_deblocking_filter_idc
+             * via `sl->deblocking_filter ^= 1` (line ~1901).  Internal convention:
+             *   0 = disabled       (spec = 1)
+             *   1 = enabled        (spec = 0)
+             *   2 = enabled-but-not-across-slice-boundaries  (unchanged)
+             * So deblock is OFF iff sl->deblocking_filter == 0. */
+            const int deblock_off = inspect_st.slice_deblock_disable == 0;
+#endif
            for (int my = 0; my < mb_h; my++) {
                for (int mx = 0; mx < mb_w; mx++) {
                    /* Default: identity-passthrough — luma from AVFrame,
-                     * chroma from AVFrame, coeffs all zero. */
+                     * chroma from AVFrame, coeffs all zero, no edges. */
                    pack_mb_predicted(fr, mx, my, mb_pred);
                    memset(mb_coeffs, 0, sizeof(mb_coeffs));
+                    int n_edges = 0;

 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
-                    /* Real-coeffs path: if the callback captured this MB
-                     * as Intra_4x4 / 4x4-DCT, override luma predicted
-                     * with the recovered P and use the real luma coeffs.
-                     * Chroma stays identity-passthrough (PR-A3b scope —
-                     * chroma DC Hadamard + 8x8 transform follow-ups). */
+                    /* PR-A6: feed daedalus pre-deblock pixels from the
+                     * per-MB snapshots taken in the callback (AVFrame is
+                     * now post-deblock — used as reference, not as input). */
                    const int mb_idx = my * mb_w + mx;
                    const struct mb_capture *cap = &inspect_st.captures[mb_idx];
+
+                    /* Luma: P_rec for real-coeffs MBs, raw pre-deblock snap
+                     * otherwise (with zero coeffs).  Both produce the same
+                     * pre-deblock state after daedalus IDCT-add. */
                    if (cap->valid) {
                        memcpy(mb_pred, cap->predicted, 256);
                        for (int i = 0; i < 256; i++)
                            mb_coeffs[i] = cap->coeffs[i];
+                    } else {
+                        memcpy(mb_pred, cap->pre_deblock_snap_y, 256);
+                    }
+                    /* Chroma: always identity-passthrough from snap.
+                     * Chroma DC Hadamard + chroma residual extraction is
+                     * a follow-up (PR-A4). */
+                    memcpy(mb_pred + 256,       cap->pre_deblock_snap_cb, 64);
+                    memcpy(mb_pred + 256 + 64,  cap->pre_deblock_snap_cr, 64);
+
+                    /* Derive deblock edges for this MB.  Spec §8.7.2:
+                     * - Frame-boundary edges: skip (bS=0 — kernel reads p3 at -4).
+                     * - MB-boundary edges with intra neighbour: bS=4.
+                     * - Internal MB edges within intra MB: bS=3.
+                     * - 8x8 DCT MBs: LUMA internal edges only at col/row 8 (the
+                     *   single 8x8-block boundary inside the MB).
+                     * - Chroma (8x8 per plane, i.e. 4:2:0): the internal edge at
+                     *   chroma col/row 4 is emitted regardless of transform_8x8.
+                     *   It coincides with luma col/row 8, and libavcodec (the
+                     *   reference we compare against) filters it for 8x8-DCT MBs
+                     *   as well; the 8x8 flag only removes the luma 4-sample edges.
+                     * For non-intra MB types in mixed streams the bS rules
+                     * differ; we'd need cbp/MV/ref info from sl context for
+                     * those.  Our test stream is all-intra, so simplified.
+                     *
+                     * Setting DAEDALUS_SKIP_EDGES in the environment suppresses
+                     * edge emission entirely. */
+                    if (!deblock_off && cap->mb_type_intra && !getenv("DAEDALUS_SKIP_EDGES")) {
+                        /* Luma QPs: this MB and its left/top neighbours (falling
+                         * back to self at the frame boundary, where bS is 0). */
+                        const int qp_self  = cap->qp_y;
+                        const int qp_left  = (mx > 0)
+                            ? inspect_st.captures[mb_idx - 1].qp_y : qp_self;
+                        const int qp_top   = (my > 0)
+                            ? inspect_st.captures[mb_idx - mb_w].qp_y : qp_self;
+                        /* Chroma QPs are mapped per MB first, then averaged. */
+                        const int qpc_self = chroma_qp_table[qp_self];
+                        const int qpc_left = chroma_qp_table[qp_left];
+                        const int qpc_top  = chroma_qp_table[qp_top];
+                        /* MB-boundary edges use the rounded average of both sides. */
+                        const int qp_avg_left  = (qp_self  + qp_left  + 1) >> 1;
+                        const int qp_avg_top   = (qp_self  + qp_top   + 1) >> 1;
+                        const int qpc_avg_left = (qpc_self + qpc_left + 1) >> 1;
+                        const int qpc_avg_top  = (qpc_self + qpc_top  + 1) >> 1;
+
+                        /* Helper macro to emit one edge.  bS=0 (skip)
+                         * edges are still emitted with bS=0 — daedalus's
+                         * partitioner filters them out.  Silently drops the
+                         * edge if mb_edges[] (16 entries) is already full.
+                         * tc0 is taken from column bS for bS <= 3 and from
+                         * column 0 (-1) for bS == 4; bS_ is evaluated once. */
+                        #define EMIT_EDGE(orient_, plane_, edge_idx_, bS_, qp_) do { \
+                            if (n_edges >= 16) break;                                \
+                            struct daedalus_decoder_edge *e = &mb_edges[n_edges++];  \
+                            const int    bs_ = (bS_);                                \
+                            const int8_t tc_ =                                       \
+                                tc0_table[(qp_) + slice_a][bs_ <= 3 ? bs_ : 0];      \
+                            e->mb_x     = (uint16_t) mx;                             \
+                            e->mb_y     = (uint16_t) my;                             \
+                            e->edge_idx = (uint8_t)  (edge_idx_);                    \
+                            e->orient   = (uint8_t)  (orient_);                      \
+                            e->plane    = (uint8_t)  (plane_);                       \
+                            e->bS       = (uint8_t)  bs_;                            \
+                            e->alpha    = alpha_table[(qp_) + slice_a];              \
+                            e->beta     = beta_table [(qp_) + slice_b];              \
+                            e->tc0[0] = tc_;                                         \
+                            e->tc0[1] = tc_;                                         \
+                            e->tc0[2] = tc_;                                         \
+                            e->tc0[3] = tc_;                                         \
+                        } while (0)
+
+                        /* Luma V edges: 4 at col 0, 4, 8, 12.  Internal
+                         * edges at 4/12 are skipped for 8x8 DCT MBs. */
+                        EMIT_EDGE(0, 0, 0, (mx > 0) ? 4 : 0, qp_avg_left);
+                        if (!cap->transform_8x8) EMIT_EDGE(0, 0, 1, 3, qp_self);
+                        EMIT_EDGE(0, 0, 2, 3, qp_self);
+                        if (!cap->transform_8x8) EMIT_EDGE(0, 0, 3, 3, qp_self);
+
+                        /* Luma H edges: 4 at row 0, 4, 8, 12. */
+                        EMIT_EDGE(1, 0, 0, (my > 0) ? 4 : 0, qp_avg_top);
+                        if (!cap->transform_8x8) EMIT_EDGE(1, 0, 1, 3, qp_self);
+                        EMIT_EDGE(1, 0, 2, 3, qp_self);
+                        if (!cap->transform_8x8) EMIT_EDGE(1, 0, 3, 3, qp_self);
+
+                        /* Chroma V edges: 2 per plane (Cb=1, Cr=2).  The
+                         * internal edge is NOT gated on transform_8x8. */
+                        EMIT_EDGE(0, 1, 0, (mx > 0) ? 4 : 0, qpc_avg_left);
+                        EMIT_EDGE(0, 1, 1, 3, qpc_self);
+                        EMIT_EDGE(0, 2, 0, (mx > 0) ? 4 : 0, qpc_avg_left);
+                        EMIT_EDGE(0, 2, 1, 3, qpc_self);
+
+                        /* Chroma H edges: same layout as the V edges. */
+                        EMIT_EDGE(1, 1, 0, (my > 0) ? 4 : 0, qpc_avg_top);
+                        EMIT_EDGE(1, 1, 1, 3, qpc_self);
+                        EMIT_EDGE(1, 2, 0, (my > 0) ? 4 : 0, qpc_avg_top);
+                        EMIT_EDGE(1, 2, 1, 3, qpc_self);
+
+                        #undef EMIT_EDGE
                    }
 #endif

@@ -639,8 +855,8 @@ int main(int argc, char **argv)
                    mb.transform_8x8 = 0;
                    mb.coeffs      = mb_coeffs;
                    mb.predicted   = mb_pred;
-                    mb.edges       = NULL;
-                    mb.n_edges     = 0;
+                    mb.edges       = (n_edges > 0) ? mb_edges : NULL;
+                    mb.n_edges     = (uint8_t) n_edges;
                    if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                        fprintf(stderr, "append_mb (%d,%d) failed\n", mx, my);
                        rc = 3; goto cleanup;
@@ -661,46 +877,10 @@ int main(int argc, char **argv)
                                out_uv_ref, (size_t) coded_w,
                                coded_w, coded_h);

-#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
-            /* Diagnostic: for each real-coeffs MB, compare the callback's
-             * pre_deblock snapshot against AVFrame at the same position.
-             * If they differ, h->cur_pic.f at callback time isn't the
-             * eventual AVFrame buffer (or deblock ran despite
-             * skip_loop_filter=AVDISCARD_ALL). */
-            int snap_mismatches = 0;
-            int first_snap_mismatch_mb = -1;
-            for (int my2 = 0; my2 < mb_h; my2++) {
-                for (int mx2 = 0; mx2 < mb_w; mx2++) {
-                    const int idx2 = my2 * mb_w + mx2;
-                    if (!inspect_st.captures[idx2].valid) continue;
-                    const uint8_t *avf_mb = fr->data[0]
-                        + (ptrdiff_t) my2 * 16 * fr->linesize[0]
-                        + mx2 * 16;
-                    for (int r = 0; r < 16; r++) {
-                        for (int c = 0; c < 16; c++) {
-                            if (avf_mb[r * fr->linesize[0] + c] !=
-                                inspect_st.captures[idx2].pre_deblock_snap[r * 16 + c]) {
-                                if (first_snap_mismatch_mb < 0)
-                                    first_snap_mismatch_mb = idx2;
-                                snap_mismatches++;
-                            }
-                        }
-                    }
-                }
-            }
-            if (snap_mismatches > 0) {
-                const int mmb_x = first_snap_mismatch_mb % mb_w;
-                const int mmb_y = first_snap_mismatch_mb / mb_w;
-                fprintf(stderr,
-                    "  DIAG: callback's pre_deblock differs from AVFrame in "
-                    "%d bytes across real-coeffs MBs; first mismatch at MB(%d, %d)\n",
-                    snap_mismatches, mmb_x, mmb_y);
-                rc = 4;
-            }
-            /* Silent on match — the invariant must hold for the
-             * P-recovery math to be valid; we'd want to know if it
-             * ever broke, but no need to confirm it every frame. */
-#endif
+            /* (PR-A3b's pre_deblock vs AVFrame DIAG check is removed in
+             * PR-A6: with libavcodec's deblock now ENABLED, AVFrame is
+             * post-deblock and intentionally differs from the per-MB
+             * pre_deblock snapshots taken in the callback.) */

            /* Byte-exact compare + first-diff diagnostic. */
            size_t y_diffs = 0, uv_diffs = 0;