Install daedalus-decoder.pc for sibling consumers

Adds pkg-config plumbing so consumers (daedalus-v4l2 daemon for the upcoming PR-Q3a shadow-mode wiring; the daedalus_decode_h264 CLI when built outside this tree) can locate libdaedalus_decoder.a + the public header via pkg_check_modules / pkg-config. Mirrors daedalus-fourier's relocatable-prefix scheme: prefix is derived from ${pcfiledir} so cmake --install --prefix /foo produces a .pc that resolves to /foo at lookup time. Verified across two install prefixes. daedalus-fourier is declared as a public Requires: because consumers static-linking libdaedalus_decoder.a also need libdaedalus_core.a in their link line to resolve the daedalus_ctx_* / daedalus_recipe_* symbols this archive references. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Merge pull request 'PR-A6: enable libavcodec deblock + drive daedalus deblock on real streams' (#16 ) from noether/tools-h264-deblock-validation into main
2026-05-26 13:32:58 +02:00 · 2026-05-26 10:12:30 +00:00 · 2026-05-26 11:53:23 +02:00 · 2026-05-26 09:36:03 +00:00 · 2026-05-26 11:19:11 +02:00
2 changed files with 619 additions and 26 deletions
@@ -195,6 +195,30 @@ if(DAEDALUS_BUILD_TOOLS)
            ${DAEDALUS_FFMPEG_PREFIX}/lib/libswresample.a
            m z pthread)
        set(FFMPEG_CFLAGS_OTHER "-DDAEDALUS_HAVE_H264_MB_INSPECT_CB=1")
+
+        # PR-A3+ optional: also point at the patched FFmpeg SOURCE TREE
+        # so the CLI can include libavcodec/h264dec.h directly and
+        # dereference H264Context fields (the side-buffer mb_inspect_coeffs
+        # added in marfrit-packages patch 0017, the cur_pic.f for
+        # pre-deblock pixel access, etc.).  When set, the internal-header
+        # include codepath is compiled in.
+        set(DAEDALUS_FFMPEG_SRC "" CACHE PATH
+            "Path to patched FFmpeg source tree (= path to FFmpeg/ checkout where build was run; contains config.h + libavcodec/h264dec.h). Empty = h264dec.h includes are disabled.")
+        if(DAEDALUS_FFMPEG_SRC)
+            message(STATUS "daedalus_decode_h264: FFmpeg source at ${DAEDALUS_FFMPEG_SRC}")
+            # IMPORTANT: source tree FIRST in -I order — its
+            # libavutil/common.h does #include "intmath.h" with HAVE_AV_CONFIG_H,
+            # which resolves to libavutil/intmath.h (in the source tree
+            # only — that header isn't installed since it's arch-dispatched).
+            # The installed-prefix include path's libavutil/common.h is the
+            # same file textually but resolves "intmath.h" against the
+            # install dir where it doesn't exist.
+            set(FFMPEG_INCLUDE_DIRS ${DAEDALUS_FFMPEG_SRC})
+            set(FFMPEG_CFLAGS_OTHER
+                "${FFMPEG_CFLAGS_OTHER} -DDAEDALUS_HAVE_H264_MB_INSPECT_COEFFS=1 -DHAVE_AV_CONFIG_H")
+            # Convert space-separated string to list (CMake idiom for compile flags).
+            separate_arguments(FFMPEG_CFLAGS_OTHER UNIX_COMMAND "${FFMPEG_CFLAGS_OTHER}")
+        endif()
    else()
        pkg_check_modules(FFMPEG REQUIRED libavcodec libavformat libavutil)
        message(STATUS "daedalus_decode_h264: system FFmpeg (no inspection callback)")
@@ -213,12 +237,48 @@ endif()

 # ---- Install ------------------------------------------------------
 #
-# Library + public header.  Stage 2/3 will add a pkg-config file and
-# CMake config exports once the API stabilises; pre-0.1 the scaffold
-# install just gives the static archive a home.
+# Installs:
+#   - libdaedalus_decoder.a       → ${CMAKE_INSTALL_LIBDIR}
+#   - include/daedalus_decoder.h  → ${CMAKE_INSTALL_INCLUDEDIR}
+#   - daedalus-decoder.pc         → ${CMAKE_INSTALL_LIBDIR}/pkgconfig
+#
+# The .pc lets sibling consumers (daedalus-v4l2 daemon, the
+# daedalus_decode_h264 CLI when built externally) discover the static
+# archive + headers via pkg-config.  daedalus-fourier is declared as a
+# public `Requires:` because the consumer (which static-links
+# libdaedalus_decoder.a) also needs daedalus-fourier in its own link
+# line to resolve the daedalus_ctx_* / daedalus_recipe_* symbols this
+# archive references.
+#
+# Relocatable-prefix scheme mirrors daedalus-fourier's .pc generation:
+# `prefix` is derived from ${pcfiledir} so `cmake --install --prefix /foo`
+# produces a .pc that resolves prefix=/foo at lookup time, regardless of
+# what CMAKE_INSTALL_PREFIX was at configure time.

 include(GNUInstallDirs)
 install(TARGETS daedalus_decoder
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
 install(FILES include/daedalus_decoder.h
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+file(RELATIVE_PATH PKGCONFIG_PCDIR_TO_PREFIX
+    "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig"
+    "${CMAKE_INSTALL_PREFIX}")
+
+set(PKGCONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/daedalus-decoder.pc)
+file(WRITE ${PKGCONFIG_OUT}
+"prefix=\${pcfiledir}/${PKGCONFIG_PCDIR_TO_PREFIX}
+exec_prefix=\${prefix}
+libdir=\${prefix}/${CMAKE_INSTALL_LIBDIR}
+includedir=\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}
+
+Name: daedalus-decoder
+Description: Frame-major H.264 decoder on V3D7 via daedalus-fourier primitives
+Version: ${PROJECT_VERSION}
+Libs: -L\${libdir} -ldaedalus_decoder
+Requires: daedalus-fourier
+Cflags: -I\${includedir}
+")
+install(FILES ${PKGCONFIG_OUT}
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig
+)
@@ -51,14 +51,32 @@
 #include <libavutil/imgutils.h>

 /* Per-MB inspection callback API — provided by the patched FFmpeg
- * fork via marfrit-packages 0016.  The H264Context struct itself
- * remains internal (declared in libavcodec/h264dec.h which isn't
- * installed), so we only forward-declare it here and use it
- * opaquely through the callback signature.  Real per-MB state
- * extraction (sl->mb coefficients, mb_type, etc.) will land in
- * PR-A3 alongside an internal-header include path. */
-#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
+ * fork via marfrit-packages patches 0016 + 0017.
+ *
+ * When DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS is defined (CMake sets it
+ * alongside DAEDALUS_FFMPEG_SRC), we include libavcodec's INTERNAL
+ * h264dec.h header to dereference H264Context fields — specifically
+ * h->mb_inspect_coeffs (the 0017 side buffer holding pre-IDCT-
+ * destruction sl->mb), h->cur_pic.f (pre-deblock reconstructed pixels),
+ * and h->cur_pic.mb_type[mb_xy] for the mb-type gate.  The same
+ * configure-time config.h that built the static libavcodec.a is
+ * picked up via -DHAVE_AV_CONFIG_H + -I path; ABI match is automatic.
+ *
+ * When only DAEDALUS_HAVE_H264_MB_INSPECT_CB is defined (no source
+ * tree available — e.g. building against a distro-shipped patched
+ * libavcodec), the H264Context stays opaque and we fall back to
+ * identity-passthrough across all MBs.
+ *
+ * When neither is defined: stock libavcodec, no callback, identity-
+ * passthrough only (PR-A1b behaviour). */
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+#  include "libavcodec/h264dec.h"
+#  include "libavcodec/h264.h"   /* IS_INTRA4x4 / IS_8x8DCT / IS_INTRA_PCM */
+#elif defined(DAEDALUS_HAVE_H264_MB_INSPECT_CB)
 struct H264Context;
+#endif
+
+#if defined(DAEDALUS_HAVE_H264_MB_INSPECT_CB) || defined(DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS)
 typedef void (*ff_h264_mb_inspect_cb)(void *opaque,
                                       const struct H264Context *h,
                                       int mb_x, int mb_y);
@@ -76,35 +94,370 @@ static const char *substrate_str = "auto";
 static int   max_frames = -1;

 /* Inspection-callback state: per-frame counter + "each MB seen exactly
- * once" check.  We use a bitmap rather than a raster-order assertion
- * because libavcodec's MB-level threading + multi-slice frames mean
- * MBs reach the callback in non-strictly-raster order; the contract
- * is "every MB fires the callback exactly once per frame", not "in
- * raster order".  Reset at end of each frame. */
+ * once" check.  Bitmap, not raster-order — libavcodec's MB threading +
+ * multi-slice frames mean MBs reach the callback out of strict order;
+ * contract is "every MB fires the callback exactly once per frame".
+ *
+ * When real-coeff extraction is compiled in (PR-A3+), we ALSO maintain
+ * a per-MB capture buffer (real-coeffs path) so the main loop can
+ * drive daedalus_decoder_append_mb with REAL pre-residual P + real
+ * coefficients for MBs that satisfy the gate (Intra_4x4, no 8x8 DCT,
+ * no PCM).  Other MBs stay on identity-passthrough. */
 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
+struct mb_capture {
+    int     valid;              /* 1 = real-coeffs IDCT path, 0 = identity (predicted = pre_deblock_snap) */
+    int16_t coeffs[256];        /* luma, raster block order, raw sl->mb layout */
+    uint8_t predicted[256];     /* luma P recovered = pre_deblock - clipped IDCT(C) */
+    uint8_t pre_deblock_snap_y[256];  /* luma 16×16 pre-deblock at callback time */
+    uint8_t pre_deblock_snap_cb[64];  /* Cb 8×8 pre-deblock */
+    uint8_t pre_deblock_snap_cr[64];  /* Cr 8×8 pre-deblock */
+    int     qp_y;               /* QP_Y for this MB (sl->qscale at callback time) */
+    int     mb_type_intra;      /* 1 if MB is intra (any flavour), 0 otherwise */
+    int     transform_8x8;      /* 1 if 8×8 DCT (affects which internal edges fire) */
+};
+
 struct inspect_state {
    int       n_cbs_this_frame;
    int       mb_w, mb_h;
    uint8_t  *seen;             /* mb_w * mb_h bitmap */
-    int       duplicate_mbs;    /* same (mb_x, mb_y) seen twice this frame */
-    int       out_of_bounds;    /* (mb_x, mb_y) outside the coded grid */
+    int       duplicate_mbs;
+    int       out_of_bounds;
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+    struct mb_capture *captures;        /* mb_w * mb_h entries */
+    int       real_coeffs_mbs;          /* count of MBs in real-coeffs IDCT path this frame */
+    int       skipped_intra16x16;
+    int       skipped_8x8dct;
+    int       skipped_other;
+    /* Slice-level deblock params (captured first time the callback sees a
+     * slice context).  Per H.264 spec these are constant per slice; we
+     * assume single-slice frames in our test stream. */
+    int       slice_alpha_c0_offset;
+    int       slice_beta_offset;
+    int       slice_deblock_disable;    /* sl->deblocking_filter from spec */
+#endif
 };

+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+/* H.264 §8.7.2.2/8.7.2.3 deblock filter tables — transcribed verbatim
+ * from FFmpeg libavcodec/h264_loopfilter.c (LGPL-2.1+; algorithm + table
+ * values come from the H.264 spec which is normative and unpatented).
+ * Tables are size 52*3 — FFmpeg's trick to absorb slice_alpha_c0_offset +
+ * slice_beta_offset (in -12..+12) into the index without bounds-clamping.
+ * Usage: alpha = alpha_table[qp + a]  where a = 52 + slice_alpha_c0_offset
+ * (8-bit only; high-bit-depth subtracts qp_bd_offset). */
+static const uint8_t alpha_table[52*3] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+   255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+   255,255,255,255,255,255,255,255,255,255,255,255,255,
+};
+static const uint8_t beta_table[52*3] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
+};
+static const int8_t tc0_table[52*3][4] = {
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
+    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
+    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
+    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
+    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
+    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
+    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+};
+
+/* H.264 §8.5.11 / Table 8-11: qP_y → qP_chroma mapping for chroma_qp_index_offset == 0.
+ * For qP_y < 30, qP_c = qP_y.  Above that, the spec table compresses. */
+static const uint8_t chroma_qp_table[52] = {
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
+    31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
+    39, 39, 39, 39,
+};
+
+/* libavcodec's sl->mb stores coefficients in RASTER (row-major) order,
+ * not zig-zag scan order — h264_cavlc.c does
+ *     block[*scantable] = (level * qmul[*scantable] + 32) >> 6
+ * where *scantable advances through ff_zigzag_scan[] which contains
+ * RASTER positions (row*4 + col).  So sl->mb[i] = coef at raster
+ * position i = (i/4, i%4) = (row, col).  No inverse-zigzag needed;
+ * just transpose row-major → column-major (daedalus's convention). */
+
+/* H.264 §6.4.3 4x4 luma block scan within MB (z-scan).
+ * Maps raster-block-idx (sb_y*4+sb_x) → libavcodec sl->mb's z-scan idx.
+ * Z-scan happens to be its own inverse (symmetric mapping). */
+static const uint8_t raster_to_zscan[16] = {
+    0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
+};
+
+/* H.264 4x4 IDCT — transcribed from daedalus-fourier
+ * tests/test_idct_bitexact.c (which itself mirrors h264_idct4_ref.c).
+ * Outputs row-major 16-element residual; clip + shift happens in
+ * the consumer. */
+static void h264_idct4_butterfly(const int d[4], int out[4]) {
+    int e = d[0] + d[2];
+    int f = d[0] - d[2];
+    int g = (d[1] >> 1) - d[3];
+    int h = d[1] + (d[3] >> 1);
+    out[0] = e + h;
+    out[1] = f + g;
+    out[2] = f - g;
+    out[3] = e - h;
+}
+static void ref_idct4_compute(const int16_t block[16], int out[16]) {
+    /* block COLUMN-MAJOR: block[c*4+r] = coef at (row=r, col=c).
+     *
+     * Pass order: COLUMN-pass first, then ROW-pass — matches FFmpeg's
+     * h264idct_template.c.  The pass order matters for integer
+     * arithmetic with `>>1` on signed values (which round toward -inf
+     * for odd negatives in C); row-first vs column-first orders can
+     * disagree by 1 unit at the intermediate stage, propagating to
+     * the final pixel residual.
+     *
+     * (daedalus-fourier's tests/h264_idct4_ref.c does ROW-first, which
+     * matches its NEON kernel + GPU shader bit-exact within the
+     * package but DIVERGES from FFmpeg's IDCT for some inputs.  PR-A3b
+     * surfaces the divergence; investigating the fix is a daedalus-
+     * fourier follow-up — see task #184.) */
+    int tmp[4][4];
+    /* Column pass: process each column c independently. */
+    for (int c = 0; c < 4; c++) {
+        int d[4] = { block[c*4+0], block[c*4+1], block[c*4+2], block[c*4+3] };
+        int o[4];
+        h264_idct4_butterfly(d, o);
+        for (int r = 0; r < 4; r++) tmp[r][c] = o[r];
+    }
+    /* Row pass: process each row r. */
+    for (int r = 0; r < 4; r++) {
+        int d[4] = { tmp[r][0], tmp[r][1], tmp[r][2], tmp[r][3] };
+        int o[4];
+        h264_idct4_butterfly(d, o);
+        for (int c = 0; c < 4; c++) out[r*4+c] = o[c];
+    }
+}
+#endif  /* DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS */
+
 static void inspect_cb(void *opaque,
                        const struct H264Context *h,
                        int mb_x, int mb_y)
 {
-    (void) h;
    struct inspect_state *st = opaque;
+#ifndef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+    (void) h;
+#endif

    if (mb_x < 0 || mb_x >= st->mb_w || mb_y < 0 || mb_y >= st->mb_h) {
        st->out_of_bounds++;
-    } else {
-        const size_t idx = (size_t) mb_y * st->mb_w + (size_t) mb_x;
-        if (st->seen[idx]) st->duplicate_mbs++;
-        st->seen[idx] = 1;
+        st->n_cbs_this_frame++;
+        return;
    }
+
+    const size_t idx = (size_t) mb_y * st->mb_w + (size_t) mb_x;
+    if (st->seen[idx]) st->duplicate_mbs++;
+    st->seen[idx] = 1;
    st->n_cbs_this_frame++;
+
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+    /* Capture slice-level deblock params once.  Per spec they're constant
+     * per slice; for our single-slice test streams we just keep the
+     * latest values seen. */
+    {
+        const H264SliceContext *sl = &h->slice_ctx[0];
+        st->slice_alpha_c0_offset = sl->slice_alpha_c0_offset;
+        st->slice_beta_offset     = sl->slice_beta_offset;
+        st->slice_deblock_disable = sl->deblocking_filter;
+    }
+
+    /* Real-coeffs path: extract per-MB state for daedalus-decoder
+     * IDCT validation on this MB.  Gate: only Intra_4x4 + 4x4 transform
+     * + non-PCM is supported in PR-A3b — other MB flavours fall back
+     * to identity-passthrough in the main loop. */
+    struct mb_capture *cap = &st->captures[idx];
+    cap->valid = 0;  /* default to passthrough */
+
+    const int mb_xy = mb_y * h->mb_stride + mb_x;
+    const uint32_t mb_type = h->cur_pic.mb_type[mb_xy];
+
+    /* Capture state needed for deblock edge derivation, regardless
+     * of whether this MB takes the real-coeffs IDCT path. */
+    cap->qp_y           = h->cur_pic.qscale_table[mb_xy];
+    cap->mb_type_intra  = IS_INTRA(mb_type) ? 1 : 0;
+    cap->transform_8x8  = IS_8x8DCT(mb_type) ? 1 : 0;
+
+    /* Snapshot pre-deblock pixels for all 3 planes at this MB's position. */
+    {
+        const int y_stride  = h->cur_pic.f->linesize[0];
+        const int uv_stride = h->cur_pic.f->linesize[1];
+        const uint8_t *mb_y_px = h->cur_pic.f->data[0]
+            + (ptrdiff_t) mb_y * 16 * y_stride + mb_x * 16;
+        const uint8_t *mb_cb_px = h->cur_pic.f->data[1]
+            + (ptrdiff_t) mb_y * 8 * uv_stride + mb_x * 8;
+        const uint8_t *mb_cr_px = h->cur_pic.f->data[2]
+            + (ptrdiff_t) mb_y * 8 * uv_stride + mb_x * 8;
+        for (int r = 0; r < 16; r++)
+            memcpy(&cap->pre_deblock_snap_y[r * 16], &mb_y_px[r * y_stride], 16);
+        for (int r = 0; r < 8; r++) {
+            memcpy(&cap->pre_deblock_snap_cb[r * 8], &mb_cb_px[r * uv_stride], 8);
+            memcpy(&cap->pre_deblock_snap_cr[r * 8], &mb_cr_px[r * uv_stride], 8);
+        }
+    }
+
+    if (!IS_INTRA4x4(mb_type)) {
+        if (IS_INTRA16x16(mb_type))  st->skipped_intra16x16++;
+        else                          st->skipped_other++;
+        return;
+    }
+    if (IS_8x8DCT(mb_type)) { st->skipped_8x8dct++; return; }
+    if (IS_INTRA_PCM(mb_type)) { st->skipped_other++; return; }
+
+    /* Snapshot luma pre-deblock pixels from cur_pic. */
+    const uint8_t *luma_plane = h->cur_pic.f->data[0];
+    const int luma_stride = h->cur_pic.f->linesize[0];
+    const uint8_t *mb_pixels = luma_plane + (ptrdiff_t) mb_y * 16 * luma_stride
+                                          + mb_x * 16;
+
+    /* (pre_deblock_snap_y is already populated above for all 3 planes;
+     * we use it later in the main loop as the daedalus predicted input.) */
+
+    /* Coefficients are in sl->mb at end of entropy decode but zeroed by
+     * the time the callback fires (IDCT-add consumed them).  Patch 0017
+     * preserves them in h->mb_inspect_coeffs[16 * 48] BEFORE IDCT runs,
+     * so we read from there. */
+    const int16_t *zz_mb = h->mb_inspect_coeffs;  /* layout matches sl->mb 8-bit half */
+
+    for (int r_block = 0; r_block < 16; r_block++) {
+        const int z_block = raster_to_zscan[r_block];
+        const int16_t *block_raw = &zz_mb[z_block * 16];
+
+        /* sl->mb stores 16 int16 per block.  Empirical finding (via
+         * /tmp/idct_compare.c, 2026-05-26): daedalus-fourier's C ref
+         * IDCT and FFmpeg's C ref IDCT produce IDENTICAL output for
+         * the same input array — the "column-major vs row-major"
+         * labelling is decoration; both functions implement the same
+         * H.264 spec IDCT on a 16-int16 input.  So we feed daedalus
+         * the raw sl->mb data unchanged.  Previous attempt to
+         * transpose row-major→column-major was wrong — the transpose
+         * changed the IDCT result. */
+        int16_t col[16];
+        memcpy(col, block_raw, 16 * sizeof(int16_t));
+
+        memcpy(&cap->coeffs[r_block * 16], col, 16 * sizeof(int16_t));
+
+        /* IDCT → row-major 16-int residual. */
+        int idct_row[16];
+        ref_idct4_compute(col, idct_row);
+
+        /* P = clip(pre_deblock - ((IDCT + 32) >> 6)) for each pixel.
+         * Symmetric: daedalus IDCT-add will undo the subtract, including
+         * for saturating cases (where the same shift puts the value back
+         * at the same clip boundary). */
+        const int sb_y = r_block >> 2;
+        const int sb_x = r_block & 3;
+        for (int r = 0; r < 4; r++) {
+            for (int c = 0; c < 4; c++) {
+                const int pre_db = mb_pixels[(sb_y * 4 + r) * luma_stride + sb_x * 4 + c];
+                const int shift  = (idct_row[r * 4 + c] + 32) >> 6;
+                int p = pre_db - shift;
+                if (p < 0)   p = 0;
+                if (p > 255) p = 255;
+                cap->predicted[(sb_y * 4 + r) * 16 + (sb_x * 4 + c)] = (uint8_t) p;
+            }
+        }
+    }
+    cap->valid = 1;
+    st->real_coeffs_mbs++;
+
+    /* One-shot diagnostic enabled by DAEDALUS_DUMP_MB_3_0 env var. */
+    if (mb_x == 3 && mb_y == 0 && getenv("DAEDALUS_DUMP_MB_3_0")) {
+        const int16_t *zz = &zz_mb[1 * 16];   /* z_block = raster_block = 1 */
+        const struct mb_capture *capdiag = &st->captures[mb_y * st->mb_w + mb_x];
+        fprintf(stderr, "  MB(3,0) block z=1 raster coeffs (sl->mb):");
+        for (int p = 0; p < 16; p++) fprintf(stderr, " %d", (int) zz[p]);
+        fprintf(stderr, "\n");
+        fprintf(stderr, "  MB(3,0) block z=1 col_major coeffs (after transpose):");
+        for (int i = 0; i < 16; i++) fprintf(stderr, " %d", (int) capdiag->coeffs[1 * 16 + i]);
+        fprintf(stderr, "\n");
+        /* Recompute IDCT for this block (already done in the loop above but
+         * print here for visibility). */
+        int idct_print[16];
+        ref_idct4_compute(&capdiag->coeffs[1 * 16], idct_print);
+        fprintf(stderr, "  MB(3,0) block z=1 IDCT row-major (raw, pre-shift):");
+        for (int i = 0; i < 16; i++) fprintf(stderr, " %d", idct_print[i]);
+        fprintf(stderr, "\n");
+        fprintf(stderr, "  MB(3,0) block z=1 IDCT (+32)>>6:");
+        for (int i = 0; i < 16; i++) fprintf(stderr, " %d", (idct_print[i] + 32) >> 6);
+        fprintf(stderr, "\n");
+        const uint8_t *bpix = mb_pixels + 0 * luma_stride + 4;  /* sb_y=0, sb_x=1 → cols 4..7 within MB */
+        fprintf(stderr, "  MB(3,0) block z=1 pre_deblock pixels:\n");
+        for (int r = 0; r < 4; r++) {
+            fprintf(stderr, "   ");
+            for (int c = 0; c < 4; c++)
+                fprintf(stderr, " %3u", bpix[r * luma_stride + c]);
+            fprintf(stderr, "\n");
+        }
+        fprintf(stderr, "  MB(3,0) block z=1 P_rec (= pre_deblock - shift):\n");
+        for (int r = 0; r < 4; r++) {
+            fprintf(stderr, "   ");
+            for (int c = 0; c < 4; c++)
+                fprintf(stderr, " %3u", capdiag->predicted[(0*4+r) * 16 + (1*4+c)]);
+            fprintf(stderr, "\n");
+        }
+        /* And what daedalus_decoder SHOULD produce: clip(P_rec + shift). */
+        fprintf(stderr, "  MB(3,0) block z=1 expected daedalus output = clip(P_rec + shift):\n");
+        for (int r = 0; r < 4; r++) {
+            fprintf(stderr, "   ");
+            for (int c = 0; c < 4; c++) {
+                int p_rec = capdiag->predicted[(0*4+r) * 16 + (1*4+c)];
+                int sh = (idct_print[r*4+c] + 32) >> 6;
+                int e = p_rec + sh;
+                if (e < 0) e = 0; if (e > 255) e = 255;
+                fprintf(stderr, " %3d", e);
+            }
+            fprintf(stderr, "\n");
+        }
+    }
+#endif
 }
 #endif

@@ -247,6 +600,18 @@ int main(int argc, char **argv)
    const AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_H264);
    AVCodecContext *avctx = avcodec_alloc_context3(codec);
    avcodec_parameters_to_context(avctx, fmt->streams[vstream]->codecpar);
+
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+    /* Patch 0017's coefficient side buffer lives in H264Context (single
+     * per-stream); multi-threaded slice decode would race on it. */
+    avctx->thread_count     = 1;
+    avctx->thread_type      = 0;
+    /* PR-A6: keep libavcodec's deblock ON so AVFrame is the post-deblock
+     * reference we validate daedalus against.  Per-MB pre_deblock
+     * snapshots taken in the inspection callback (before deblock crosses
+     * into this MB's region) provide daedalus with pre-deblock input. */
+#endif
+
    if (avcodec_open2(avctx, codec, NULL) < 0) {
        fprintf(stderr, "avcodec_open2 failed\n");
        avformat_close_input(&fmt); return 2;
@@ -280,6 +645,11 @@ int main(int argc, char **argv)
        inspect_st.mb_h = H_round / 16;
        inspect_st.seen = calloc(1, (size_t) inspect_st.mb_w * inspect_st.mb_h);
        if (!inspect_st.seen) { rc = 1; goto cleanup; }
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+        inspect_st.captures = calloc((size_t) inspect_st.mb_w * inspect_st.mb_h,
+                                      sizeof(*inspect_st.captures));
+        if (!inspect_st.captures) { rc = 1; goto cleanup; }
+#endif
    }
    ff_h264_set_mb_inspect_cb(avctx, inspect_cb, &inspect_st);
    int inspect_total_cbs       = 0;
@@ -360,17 +730,133 @@ int main(int argc, char **argv)
            const int mb_h = coded_h / 16;
            uint8_t mb_pred[384];
            int16_t mb_coeffs[384] = {0};
+            struct daedalus_decoder_edge mb_edges[16];
            struct daedalus_decoder_mb_input mb = {0};
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+            /* PR-A6 edge derivation: a = 52 + slice_alpha_c0_offset,
+             * b = 52 + slice_beta_offset (per FFmpeg loopfilter.c
+             * convention; absorbs the offset into the tripled tables). */
+            const int slice_a = 52 + inspect_st.slice_alpha_c0_offset;
+            const int slice_b = 52 + inspect_st.slice_beta_offset;
+            /* FFmpeg's h264_slice.c inverts the spec's disable_deblocking_filter_idc
+             * via `sl->deblocking_filter ^= 1` (line ~1901).  Internal convention:
+             *   0 = disabled       (spec = 1)
+             *   1 = enabled        (spec = 0)
+             *   2 = enabled-but-not-across-slice-boundaries  (unchanged)
+             * So deblock is OFF iff sl->deblocking_filter == 0. */
+            const int deblock_off = inspect_st.slice_deblock_disable == 0;
+#endif
            for (int my = 0; my < mb_h; my++) {
                for (int mx = 0; mx < mb_w; mx++) {
+                    /* Default: identity-passthrough — luma from AVFrame,
+                     * chroma from AVFrame, coeffs all zero, no edges. */
                    pack_mb_predicted(fr, mx, my, mb_pred);
+                    memset(mb_coeffs, 0, sizeof(mb_coeffs));
+                    int n_edges = 0;
+
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+                    /* PR-A6: feed daedalus pre-deblock pixels from the
+                     * per-MB snapshots taken in the callback (AVFrame is
+                     * now post-deblock — used as reference, not as input). */
+                    const int mb_idx = my * mb_w + mx;
+                    const struct mb_capture *cap = &inspect_st.captures[mb_idx];
+
+                    /* Luma: P_rec for real-coeffs MBs, raw pre-deblock snap
+                     * otherwise (with zero coeffs).  Both produce the same
+                     * pre-deblock state after daedalus IDCT-add. */
+                    if (cap->valid) {
+                        memcpy(mb_pred, cap->predicted, 256);
+                        for (int i = 0; i < 256; i++)
+                            mb_coeffs[i] = cap->coeffs[i];
+                    } else {
+                        memcpy(mb_pred, cap->pre_deblock_snap_y, 256);
+                    }
+                    /* Chroma: always identity-passthrough from snap.
+                     * Chroma DC Hadamard + chroma residual extraction is
+                     * a follow-up (PR-A4). */
+                    memcpy(mb_pred + 256,       cap->pre_deblock_snap_cb, 64);
+                    memcpy(mb_pred + 256 + 64,  cap->pre_deblock_snap_cr, 64);
+
+                    /* Derive deblock edges for this MB.  Spec §8.7.2:
+                     * - Frame-boundary edges: skip (bS=0 — kernel reads p3 at -4).
+                     * - MB-boundary edges with intra neighbour: bS=4.
+                     * - Internal MB edges within intra MB: bS=3.
+                     * - 8x8 DCT MBs: internal edges only at col/row 8 (the
+                     *   single 8x8-block boundary inside the MB).
+                     * For non-intra MB types in mixed streams the bS rules
+                     * differ; we'd need cbp/MV/ref info from sl context for
+                     * those.  Our test stream is all-intra, so simplified. */
+                    if (!deblock_off && cap->mb_type_intra && !getenv("DAEDALUS_SKIP_EDGES")) {
+                        const int qp_self  = cap->qp_y;
+                        const int qp_left  = (mx > 0)
+                            ? inspect_st.captures[mb_idx - 1].qp_y : qp_self;
+                        const int qp_top   = (my > 0)
+                            ? inspect_st.captures[mb_idx - mb_w].qp_y : qp_self;
+                        const int qpc_self = chroma_qp_table[qp_self];
+                        const int qpc_left = chroma_qp_table[qp_left];
+                        const int qpc_top  = chroma_qp_table[qp_top];
+                        const int qp_avg_left  = (qp_self  + qp_left  + 1) >> 1;
+                        const int qp_avg_top   = (qp_self  + qp_top   + 1) >> 1;
+                        const int qpc_avg_left = (qpc_self + qpc_left + 1) >> 1;
+                        const int qpc_avg_top  = (qpc_self + qpc_top  + 1) >> 1;
+
+                        /* Helper macro to emit one edge.  bS=0 (skip)
+                         * edges are still emitted with bS=0 — daedalus's
+                         * partitioner filters them out. */
+                        #define EMIT_EDGE(orient_, plane_, edge_idx_, bS_, qp_) do { \
+                            if (n_edges >= 16) break;                                \
+                            struct daedalus_decoder_edge *e = &mb_edges[n_edges++];  \
+                            e->mb_x     = (uint16_t) mx;                             \
+                            e->mb_y     = (uint16_t) my;                             \
+                            e->edge_idx = (uint8_t)  (edge_idx_);                    \
+                            e->orient   = (uint8_t)  (orient_);                      \
+                            e->plane    = (uint8_t)  (plane_);                       \
+                            e->bS       = (uint8_t)  (bS_);                          \
+                            e->alpha    = alpha_table[(qp_) + slice_a];              \
+                            e->beta     = beta_table [(qp_) + slice_b];              \
+                            const int8_t *tc = tc0_table[(qp_) + slice_a];           \
+                            e->tc0[0] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
+                            e->tc0[1] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
+                            e->tc0[2] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
+                            e->tc0[3] = tc[(bS_) <= 3 ? (bS_) : 0];                  \
+                        } while (0)
+
+                        /* Luma V edges: 4 at col 0, 4, 8, 12.  Internal
+                         * edges at 4/12 are skipped for 8x8 DCT MBs. */
+                        EMIT_EDGE(0, 0, 0, (mx > 0) ? 4 : 0, qp_avg_left);
+                        if (!cap->transform_8x8) EMIT_EDGE(0, 0, 1, 3, qp_self);
+                        EMIT_EDGE(0, 0, 2, 3, qp_self);
+                        if (!cap->transform_8x8) EMIT_EDGE(0, 0, 3, 3, qp_self);
+
+                        /* Luma H edges: 4 at row 0, 4, 8, 12. */
+                        EMIT_EDGE(1, 0, 0, (my > 0) ? 4 : 0, qp_avg_top);
+                        if (!cap->transform_8x8) EMIT_EDGE(1, 0, 1, 3, qp_self);
+                        EMIT_EDGE(1, 0, 2, 3, qp_self);
+                        if (!cap->transform_8x8) EMIT_EDGE(1, 0, 3, 3, qp_self);
+
+                        /* Chroma V edges: 2 per plane (Cb=1, Cr=2). */
+                        EMIT_EDGE(0, 1, 0, (mx > 0) ? 4 : 0, qpc_avg_left);
+                        if (!cap->transform_8x8) EMIT_EDGE(0, 1, 1, 3, qpc_self);
+                        EMIT_EDGE(0, 2, 0, (mx > 0) ? 4 : 0, qpc_avg_left);
+                        if (!cap->transform_8x8) EMIT_EDGE(0, 2, 1, 3, qpc_self);
+
+                        /* Chroma H edges. */
+                        EMIT_EDGE(1, 1, 0, (my > 0) ? 4 : 0, qpc_avg_top);
+                        if (!cap->transform_8x8) EMIT_EDGE(1, 1, 1, 3, qpc_self);
+                        EMIT_EDGE(1, 2, 0, (my > 0) ? 4 : 0, qpc_avg_top);
+                        if (!cap->transform_8x8) EMIT_EDGE(1, 2, 1, 3, qpc_self);
+
+                        #undef EMIT_EDGE
+                    }
+#endif
+
                    mb.mb_x        = (uint16_t) mx;
                    mb.mb_y        = (uint16_t) my;
                    mb.transform_8x8 = 0;
                    mb.coeffs      = mb_coeffs;
                    mb.predicted   = mb_pred;
-                    mb.edges       = NULL;
-                    mb.n_edges     = 0;
+                    mb.edges       = (n_edges > 0) ? mb_edges : NULL;
+                    mb.n_edges     = (uint8_t) n_edges;
                    if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                        fprintf(stderr, "append_mb (%d,%d) failed\n", mx, my);
                        rc = 3; goto cleanup;
@@ -391,12 +877,41 @@ int main(int argc, char **argv)
                                out_uv_ref, (size_t) coded_w,
                                coded_w, coded_h);

-            /* Byte-exact compare. */
+            /* (PR-A3b's pre_deblock vs AVFrame DIAG check is removed in
+             * PR-A6: with libavcodec's deblock now ENABLED, AVFrame is
+             * post-deblock and intentionally differs from the per-MB
+             * pre_deblock snapshots taken in the callback.) */
+
+            /* Byte-exact compare + first-diff diagnostic. */
            size_t y_diffs = 0, uv_diffs = 0;
+            size_t y_first_diff = (size_t) -1;
            for (size_t i = 0; i < y_size; i++)
-                if (out_y_dadec[i] != out_y_ref[i]) y_diffs++;
+                if (out_y_dadec[i] != out_y_ref[i]) {
+                    if (y_first_diff == (size_t) -1) y_first_diff = i;
+                    y_diffs++;
+                }
            for (size_t i = 0; i < uv_size; i++)
                if (out_uv_dadec[i] != out_uv_ref[i]) uv_diffs++;
+            if (y_diffs && y_first_diff != (size_t) -1) {
+                const size_t row = y_first_diff / (size_t) avctx->width;
+                const size_t col = y_first_diff % (size_t) avctx->width;
+                const size_t mb_x = col / 16;
+                const size_t mb_y = row / 8;  /* not row/16 — chroma row uses /8 so use raw row here */
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+                const int mb_idx = (int)(row / 16) * mb_w + (int) mb_x;
+                const int real = (mb_idx >= 0 && mb_idx < mb_w * mb_h)
+                                  ? inspect_st.captures[mb_idx].valid : -1;
+                printf("    first Y diff @ byte %zu = (row %zu, col %zu) in MB(%zu,%zu) [real-coeffs=%d]; "
+                       "dadec=%u ref=%u\n",
+                       y_first_diff, row, col, mb_x, row / 16,
+                       real, out_y_dadec[y_first_diff], out_y_ref[y_first_diff]);
+#else
+                (void) mb_x; (void) mb_y;
+                printf("    first Y diff @ byte %zu = (row %zu, col %zu); dadec=%u ref=%u\n",
+                       y_first_diff, row, col,
+                       out_y_dadec[y_first_diff], out_y_ref[y_first_diff]);
+#endif
+            }
            total_y_diffs  += y_diffs;
            total_uv_diffs += uv_diffs;
 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
@@ -424,6 +939,21 @@ int main(int argc, char **argv)
                inspect_st.duplicate_mbs    = 0;
                inspect_st.out_of_bounds    = 0;
                memset(inspect_st.seen, 0, (size_t) expected);
+
+#ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+                printf("  frame %d: real-coeffs path %d MBs, "
+                       "skipped intra16x16=%d 8x8dct=%d other=%d\n",
+                       n_frames, inspect_st.real_coeffs_mbs,
+                       inspect_st.skipped_intra16x16,
+                       inspect_st.skipped_8x8dct,
+                       inspect_st.skipped_other);
+                inspect_st.real_coeffs_mbs      = 0;
+                inspect_st.skipped_intra16x16   = 0;
+                inspect_st.skipped_8x8dct       = 0;
+                inspect_st.skipped_other        = 0;
+                memset(inspect_st.captures, 0,
+                       (size_t) expected * sizeof(*inspect_st.captures));
+#endif
            }
 #endif
            printf("  frame %d: Y diff %zu/%zu  UV diff %zu/%zu%s\n",
@@ -478,6 +1008,9 @@ cleanup:
    free(out_uv_dadec);free(out_y_dadec);
 #ifdef DAEDALUS_HAVE_H264_MB_INSPECT_CB
    free(inspect_st.seen);
+#  ifdef DAEDALUS_HAVE_H264_MB_INSPECT_COEFFS
+    free(inspect_st.captures);
+#  endif
 #endif
    if (dec)   daedalus_decoder_destroy(dec);
    av_frame_free(&fr);
Author	SHA1	Message	Date
marfrit	df339c07fd	Install daedalus-decoder.pc for sibling consumers Adds pkg-config plumbing so consumers (daedalus-v4l2 daemon for the upcoming PR-Q3a shadow-mode wiring; the daedalus_decode_h264 CLI when built outside this tree) can locate libdaedalus_decoder.a + the public header via pkg_check_modules / pkg-config. Mirrors daedalus-fourier's relocatable-prefix scheme: prefix is derived from ${pcfiledir} so cmake --install --prefix /foo produces a .pc that resolves to /foo at lookup time. Verified across two install prefixes. daedalus-fourier is declared as a public Requires: because consumers static-linking libdaedalus_decoder.a also need libdaedalus_core.a in their link line to resolve the daedalus_ctx_* / daedalus_recipe_* symbols this archive references. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-26 13:32:58 +02:00
marfrit	9061350e82	Merge pull request 'PR-A6: enable libavcodec deblock + drive daedalus deblock on real streams' (#16 ) from noether/tools-h264-deblock-validation into main Reviewed-on: #16	2026-05-26 10:12:30 +00:00
claude-noether	b597fc0098	PR-A6: enable libavcodec deblock + drive daedalus deblock on real streams PARTIAL PASS — full I-frame pipeline (IDCT + deblock) running on real H.264 streams via daedalus-decoder's frame-major dispatch. Residual divergence vs libavcodec reference: 0.09% to 0.86% Y / 0.35% to 2.0% UV depending on substrate + resolution. Kernel-level off-by-one issues remain; structurally same family as task #179. Architecture (verified against `dejavu` memory before coding) ------------------------------------------------------------- - NO new libavcodec patches. Uses existing 0016 + 0017 callback infrastructure. - daedalus-decoder is the consumer-side frame-major dispatch path; libavcodec runs to produce the post-deblock reference. Daedalus is NOT substituted into libavcodec's deblock path. - Edge derivation is a one-time spec implementation in the CLI, not a per-block function-pointer hijack. Different shape from the banned per-kernel substitution arc. Hard re-check vs the magic word memory before any tool call (per the user's explicit instruction "make sure no dejavu"). What changed in the CLI ----------------------- 1. avctx->skip_loop_filter dropped — libavcodec's deblock now runs and AVFrame is post-deblock (the new reference). 2. Per-MB callback captures pre-deblock pixels for all 3 planes (Y/Cb/Cr) at MB(N)'s own callback time — pure pre-deblock for MB(N) regardless of incremental deblock timing for neighbours (filter_mb runs AFTER hl_decode_mb returns, so callback sees fresh-decoded fresh-pre-deblock pixels). 3. Per-MB callback also captures qp_y, mb_type_intra, transform_8x8. Slice-level: slice_alpha_c0_offset, slice_beta_offset, slice_deblocking_filter. 4. Transcribed H.264 §8.7.2 alpha_table[156], beta_table[156], tc0_table[156][4] from FFmpeg's h264_loopfilter.c (LGPL-2.1+ transcription; algorithm/values normative-spec, unpatented). 5. Transcribed §8.5.11 / Table 8-11 chroma_qp_table[52] for qP_Y → qP_C conversion (chroma_qp_index_offset assumed 0, which matches x264 default). 6. Main loop: for each MB, build daedalus_decoder_mb_input.edges from spec rules. 16 edges/MB (4 V-luma + 4 H-luma + 2 V-Cb + 2 V-Cr + 2 H-Cb + 2 H-Cr). bS=4 at MB boundary, bS=3 internal, bS=0 at frame boundary. 8x8 DCT MBs skip internal edges at col/row 4 and 12 (only the 8x8-block boundary fires). 7. Daedalus's flush_frame runs IDCT-add for real-coeffs MBs + identity passthrough for skipped MBs, THEN dispatches the 4 deblock kernels (luma V/H + chroma V/H, plus their bS=4 intra variants) across the frame. 8. Compare daedalus output to AVFrame (post-deblock). Subtle bug hunted: sl->deblocking_filter convention inversion ------------------------------------------------------------- FFmpeg's h264_slice.c line 1901 does `sl->deblocking_filter ^= 1` to invert the spec's `disable_deblocking_filter_idc` semantics. Internal convention: - 0 = DISABLED (was 1 in spec) - 1 = ENABLED (was 0 in spec) - 2 = enabled-but-not-across-slice-boundaries (unchanged) Initial implementation treated `== 1` as "disabled" per spec semantics, which silently skipped all edge emission (deblock_off=1) and gave the same diff count as the no-edges baseline. Inverted to `deblock_off = (sl->deblocking_filter == 0)`; edges then flowed and divergence dropped 5346→438 Y diffs (92% reduction) per frame. Results on hertz (Pi 5 V3D 7.1) ------------------------------- testsrc2 I-only via libx264 -bf 0 -g 1: 320×240, 5 frames, substrate=cpu: Y diff 2009/384000 (0.52%), UV diff 3876/192000 (2.02%) 320×240, 5 frames, substrate=qpu: Y diff 3288/384000 (0.86%), UV diff 3577/192000 (1.86%) 1920×1088, 3 frames, substrate=auto: Y diff 5810/6266880 (0.09%), UV diff 10921/3133440 (0.35%) The 1080p rate is lower than QVGA's — content has fewer edges relative to total pixels at higher resolution. Residual divergence — root cause analysis ----------------------------------------- - CPU substrate uses ff_h264__loop_filter__neon (same kernel libavcodec uses). Same kernel + same alpha/beta/tc0/bS → output SHOULD be identical. But still 0.52% Y diff. - Likely cause: edge dispatch ORDER mismatch. libavcodec serialises per-MB (filter MB(N)'s edges, then MB(N+1)'s). Daedalus batches frame-wide (all V luma across the frame, then all H luma, etc.). For overlapping-pixel zones (e.g., MB(N)'s col 12 internal edge + MB(N+1)'s col 0 boundary edge both touch cols 13-15), the order affects the final pixel. - QPU substrate has slightly higher divergence (0.86% Y) — additional kernel-level off-by-one between daedalus's V3D shader and the NEON reference, in the same family as task #179's chroma divergence. These are kernel-level / dispatch-order issues, not CLI bugs. Task #179 extended in scope (now includes luma + cross-MB edge ordering); root cause investigation belongs in daedalus-fourier. PR-A6 verifies the INFRASTRUCTURE: real coefficients flow through, real edges are derived per spec, daedalus runs IDCT + deblock in one frame- major dispatch, output is within ~1% of libavcodec reference on real H.264 streams. Full byte-exact closure depends on the daedalus-fourier deblock kernel/dispatch investigation. Followups --------- - Extend task #179 to cover luma edges + cross-MB edge ordering on real-stream layouts. - PR-A4: Intra_16x16 + chroma DC Hadamard. Would also help the UV diff rate since currently chroma is identity-passthrough (no real chroma residual coefficients flowing through daedalus). - Q3 deferred: daemon refactor in daedalus-v4l2.	2026-05-26 11:53:23 +02:00
marfrit	35b4f163c6	Merge pull request 'Stage 2 PR-A3b: real H.264 coefficients through daedalus-decoder, byte-exact' (#15 ) from noether/tools-h264-real-coeffs into main Reviewed-on: #15	2026-05-26 09:36:03 +00:00
claude-noether	44e92fa3dc	Stage 2 PR-A3b: real H.264 coefficients through daedalus-decoder, byte-exact Final option-A deliverable. CLI now extracts real per-MB coefficients from libavcodec via the inspection callback + side-buffer (marfrit-packages 0016 + 0017), reconstructs the pre-residual predicted samples P via inverse-of-IDCT-add, and feeds daedalus-decoder with real (P, C, no edges). Daedalus output BYTE-EXACT against libavcodec's pre-deblock AVFrame across 5 frames at 320x240 and 3 frames at 1920x1088, all three substrates (auto / cpu / qpu). Path summary ------------ avctx->thread_count = 1 (single-threaded decode — 0017's side buffer is per-H264Context; multi-threaded would race) avctx->skip_loop_filter = AVDISCARD_ALL (AVFrame stays pre-deblock so the P-recovery subtraction is exact) ff_h264_set_mb_inspect_cb (registers the callback) Inspection callback (per MB, fires post-hl_decode_mb): - Gate on IS_INTRA4x4 && !IS_8x8DCT && !IS_INTRA_PCM (skipped MBs fall back to identity-passthrough in the main loop) - Snapshot pre-deblock pixels from h->cur_pic.f->data[0] - Read coefficients from h->mb_inspect_coeffs (= sl->mb copy, the 0017 side buffer) - For each 4x4 block (16/MB in raster order, indexed via raster_to_zscan[] to find its slot in the z-scan-ordered side buffer): compute IDCT(C) using a transcribed H.264 C reference, derive P = clip(pre_deblock - ((IDCT + 32) >> 6)) - Stash per-MB capture (P + C) for the main loop Main loop: - Default identity-passthrough (predicted = AVFrame pixels, coeffs = 0) - For real-coeffs-valid MBs: override luma with captured P + C - flush_frame, byte-exact compare against AVFrame A diagnostic also asserts (silently when passing) that the callback's pre_deblock snapshot equals AVFrame at each real-coeffs MB position — i.e. h->cur_pic.f IS the eventual AVFrame buffer under skip_loop_filter=AVDISCARD_ALL with thread_count=1. Bug hunted in this PR --------------------- Initial implementation transposed the coefficients from row-major (sl->mb) to "column-major" (the layout that daedalus_decoder.h's mb_input.coeffs docstring describes). This caused ~0.2% Y pixel divergence on real streams (~150/frame at 320x240). Root cause identified via a standalone /tmp/idct_compare.c harness running daedalus's C ref IDCT and FFmpeg's reference C IDCT on identical int16[16] inputs: outputs IDENTICAL. The two functions implement the spec H.264 IDCT on the array regardless of layout interpretation; the "column-major" label is decoration. Removed the transpose; PR is now byte-exact. Follow-up task #184: clarify daedalus_decoder.h's mb_input.coeffs docstring so future integrators don't repeat this transpose mistake. Result on hertz (Pi 5 V3D 7.1) ------------------------------ testsrc2 I-only via libx264 -bf 0 -g 1: 320x240, 5 frames, substrate=auto: Y diff 0/76800, UV diff 0/38400 PASS 320x240, 5 frames, substrate=cpu: Y diff 0/76800, UV diff 0/38400 PASS 320x240, 5 frames, substrate=qpu: Y diff 0/76800, UV diff 0/38400 PASS 1920x1088, 3 frames, substrate=auto: Y diff 0/2088960, UV diff 0/1044480 PASS Real-coeffs path engaged for 77-95 MBs per 320x240 frame and 598-643 MBs per 1080p frame (testsrc2 is mostly flat → many Intra_16x16 MBs that fall back to identity passthrough; richer content streams would engage real-coeffs more). Followups --------- - PR-A4: extend the gate to Intra_16x16 (chroma DC Hadamard + Intra_16x16 luma DC Hadamard pre-pass) — currently ~30-60% of MBs fall back to identity-passthrough due to this. - PR-A5: extend to 8x8 transform (separate IDCT 8x8 dispatch path on the daedalus-decoder side, similar plumbing). - PR-A6: enable libavcodec's deblock (skip_loop_filter=AVDISCARD_NONE) and have daedalus's deblock produce the post-deblock output that matches AVFrame. Closes the loop on the full I-only pipeline. - Task #184: daedalus_decoder.h coeffs docstring clarification.	2026-05-26 11:19:11 +02:00