phase1/stage1: chroma 4x4 IDCT dispatch (Cb+Cr planar scratch, NV12 interleave)

Replaces the chroma placeholder (memset 128) with a real frame-scaled 4x4 IDCT dispatch for the Cb and Cr components. Two Vulkan submits + waits per frame now (one luma, one chroma) instead of one + memset.

Implementation:

- One combined planar scratch buffer (W*H/2 bytes) holds Cb then Cr; a single `daedalus_recipe_dispatch_h264_idct4` call processes both components by setting meta[].dst_off accordingly (Cr blocks add cb_plane_size).
- Stride = W/2 (chroma row pitch); shared between Cb and Cr since they have identical geometry.
- Per-MB coeff layout already had [256..320) for Cb and [320..384) for Cr (4 raster-order 4x4 blocks per component) from the original daedalus_decoder_append_mb design — no header-side changes.
- Post-dispatch CPU memcpy loop interleaves Cb[r][c] and Cr[r][c] into NV12 UV at out_uv[r][2c..2c+1]. ~1 MB/frame at 1080p, well off the critical path; a GPU-side interleave shader is a Stage-5 optimisation.
- Chroma dispatch is gated on out_uv != NULL so callers that only want luma (e.g. the bit-exact test before this PR) still pay nothing.

Test changes:

- tests/test_idct_bitexact.c extended with parallel reference IDCT for Cb and Cr planes (W/2 x H/2 each), then deinterleaves NV12 UV back into Cb/Cr for the compare. Random coeffs in [-512, 511] for all 384 per-MB int16 slots (previously only luma was randomised).
- tests/test_smoke.c UV expectation flipped from "all 128 placeholder" to "all 0" (real dispatch with zero coeffs). Sentinel 0xcd pre-fill stays — same purpose: catches read-then-write bugs.

Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0):

    $ ctest --test-dir build --output-on-failure
        Start 1: smoke
    1/2 Test #1: smoke ............................   Passed    1.27 sec
        Start 2: idct_bitexact
    2/2 Test #2: idct_bitexact ....................   Passed    0.05 sec

    100% tests passed, 0 tests failed out of 2

    $ ./build/test_idct_bitexact
    test_idct_bitexact: 320x240 (300 MBs), seed=0xfeedface5a5a5a5a
    Y bytes total:  76800
    Y bytes diff:   0 (0.0000%)
    Cb bytes total: 19200  diff: 0 (0.0000%)
    Cr bytes total: 19200  diff: 0 (0.0000%)
    BIT-EXACT PASS (Y + Cb + Cr)

    $ ./build/test_smoke
    daedalus-decoder version: 0.0.1
    ctx created: 1920x1088, has_qpu=1
    appended 8160 MBs (120x68)
    flush_frame rc=0
    Y non-zero bytes: 0 / 2088960
    UV non-zero bytes: 0 / 1044480
    smoke OK

(Smoke's 1.27s includes the 1080p frame: 8160 MBs * 16 = 130,560 luma blocks + 8160 * 8 = 65,280 chroma blocks across two dispatches — shader pool warm-up dominates the wall time, not the IDCT work.)

What's NOT covered yet (deferred):

- Chroma DC 2x2 / Intra16x16 luma DC 4x4 Hadamard pre-pass. Real H.264 chroma puts the per-block DC coefficients through a Hadamard before each one is combined with its AC block; we currently treat all chroma blocks as plain 4x4 AC. Will land alongside the libavcodec intercept patch, since CABAC/CAVLC is where the DC vs AC distinction is exposed.
- Z-scan permutation for FFmpeg compatibility — only matters at the intercept boundary, not here.
- IDCT 8x8 (High profile).

Closes the "chroma is a stub" item from PR #3's "what's NOT done" list.
2026-05-24 22:34:42 +02:00
parent 41306e48ee
commit 58848bd162
3 changed files with 238 additions and 52 deletions
@@ -19,9 +19,14 @@
 * layout is a separate concern (handled in the eventual libavcodec-
 * intercept patch).
 *
+ * Covers BOTH luma (Y plane, 16 blocks/MB) and chroma (UV plane,
+ * 4 Cb + 4 Cr blocks/MB, NV12-interleaved).  Random coeffs for all
+ * three components; reference IDCT applied per block.  The chroma
+ * compare deinterleaves NV12 UV and checks it against planar Cb/Cr refs.
+ *
 * Not in scope (covered by other tests / future PRs):
- *   - chroma planes (Phase 1 stage 1 fills UV with grey 128)
 *   - IDCT 8×8 (Phase 1 follow-on)
+ *   - Chroma DC / Intra16x16 DC Hadamard pre-pass
 *   - bit-exactness against real H.264 streams (test-vector PR)
 *   - non-zero predicted pixels (intra prediction lands in Stage 2a)
 */
@@ -120,10 +125,9 @@ int main(int argc, char **argv)

    for (int mb = 0; mb < n_mbs; mb++) {
        for (int i = 0; i < 384; i++) {
-            if (i < 256)
-                per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
-            else
-                per_mb_coeffs[mb][i] = 0;  /* chroma — unused this stage */
+            /* Random coeffs in [-512, 511] for all of luma + Cb + Cr.
+             * Same range as the daedalus-fourier cycle-6 M1 gate. */
+            per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
        }
    }

@@ -142,12 +146,16 @@ int main(int argc, char **argv)
        }
    }

-    /* Flush. */
-    size_t y_size = (size_t) width * height;
-    uint8_t *gpu_y = calloc(1, y_size);
-    if (!gpu_y) return 1;
+    /* Flush — exercise BOTH the luma path (out_y) and the chroma path
+     * (out_uv set to non-NULL so flush_frame runs the chroma dispatch
+     * + NV12 interleave). */
+    size_t y_size  = (size_t) width * height;
+    size_t uv_size = (size_t) width * height / 2;
+    uint8_t *gpu_y  = calloc(1, y_size);
+    uint8_t *gpu_uv = calloc(1, uv_size);
+    if (!gpu_y || !gpu_uv) return 1;
    int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
-                                            NULL, 0);
+                                            gpu_uv, (size_t) width);
    if (frc != 0) {
        fprintf(stderr, "flush_frame rc=%d\n", frc);
        return 1;
@@ -180,29 +188,101 @@ int main(int argc, char **argv)
        }
    }

-    /* Byte-by-byte compare. */
-    size_t diffs = 0;
-    size_t first_diff = 0;
+    /* Build the chroma reference: separate planar Cb and Cr (W/2 by
+     * H/2), each block IDCT'd into its plane.  Chroma per-MB layout
+     * matches flush_frame: 4 Cb blocks then 4 Cr blocks, raster order
+     * within each component (sb_y * 2 + sb_x). */
+    size_t chroma_w = (size_t) width  / 2;
+    size_t chroma_h = (size_t) height / 2;
+    size_t chroma_plane_size = chroma_w * chroma_h;
+    uint8_t *ref_cb = calloc(1, chroma_plane_size);
+    uint8_t *ref_cr = calloc(1, chroma_plane_size);
+    if (!ref_cb || !ref_cr) return 1;
+    for (int my = 0; my < mb_h; my++) {
+        for (int mx = 0; mx < mb_w; mx++) {
+            int mb_idx = my * mb_w + mx;
+            for (int comp = 0; comp < 2; comp++) {
+                uint8_t *plane = (comp == 0) ? ref_cb : ref_cr;
+                size_t coeff_base = 256u + (size_t) comp * 64u;
+                for (int sb_y = 0; sb_y < 2; sb_y++) {
+                    for (int sb_x = 0; sb_x < 2; sb_x++) {
+                        int block_in_comp = sb_y * 2 + sb_x;
+                        memcpy(block_scratch,
+                               &per_mb_coeffs[mb_idx][coeff_base +
+                                                       (size_t) block_in_comp * 16],
+                               16 * sizeof(int16_t));
+                        size_t px_y = (size_t) my * 8 + (size_t) sb_y * 4;
+                        size_t px_x = (size_t) mx * 8 + (size_t) sb_x * 4;
+                        ref_idct4_add(&plane[px_y * chroma_w + px_x],
+                                      (ptrdiff_t) chroma_w, block_scratch);
+                    }
+                }
+            }
+        }
+    }
+
+    /* Y compare. */
+    size_t y_diffs = 0, y_first_diff = 0;
    for (size_t i = 0; i < y_size; i++) {
        if (gpu_y[i] != ref_y[i]) {
-            if (diffs == 0) first_diff = i;
-            diffs++;
+            if (y_diffs == 0) y_first_diff = i;
+            y_diffs++;
        }
    }
    printf("Y bytes total:  %zu\n", y_size);
-    printf("Y bytes diff:   %zu (%.4f%%)\n", diffs, 100.0 * diffs / y_size);
-    if (diffs) {
-        printf("first diff at offset %zu: gpu=%u ref=%u\n",
-               first_diff, gpu_y[first_diff], ref_y[first_diff]);
+    printf("Y bytes diff:   %zu (%.4f%%)\n", y_diffs, 100.0 * y_diffs / y_size);
+    if (y_diffs) {
+        printf("Y first diff at offset %zu: gpu=%u ref=%u\n",
+               y_first_diff, gpu_y[y_first_diff], ref_y[y_first_diff]);
    }

+    /* UV compare — deinterleave NV12 back into Cb/Cr and compare. */
+    size_t cb_diffs = 0, cr_diffs = 0;
+    size_t cb_first = 0, cr_first = 0;
+    for (size_t r = 0; r < chroma_h; r++) {
+        const uint8_t *gpu_row = gpu_uv + r * (size_t) width;
+        const uint8_t *cb_row  = ref_cb + r * chroma_w;
+        const uint8_t *cr_row  = ref_cr + r * chroma_w;
+        for (size_t c = 0; c < chroma_w; c++) {
+            uint8_t gpu_cb = gpu_row[c * 2 + 0];
+            uint8_t gpu_cr = gpu_row[c * 2 + 1];
+            if (gpu_cb != cb_row[c]) {
+                if (cb_diffs == 0) cb_first = r * chroma_w + c;
+                cb_diffs++;
+            }
+            if (gpu_cr != cr_row[c]) {
+                if (cr_diffs == 0) cr_first = r * chroma_w + c;
+                cr_diffs++;
+            }
+        }
+    }
+    printf("Cb bytes total: %zu  diff: %zu (%.4f%%)\n",
+           chroma_plane_size, cb_diffs,
+           100.0 * cb_diffs / chroma_plane_size);
+    printf("Cr bytes total: %zu  diff: %zu (%.4f%%)\n",
+           chroma_plane_size, cr_diffs,
+           100.0 * cr_diffs / chroma_plane_size);
+    if (cb_diffs) {
+        size_t r = cb_first / chroma_w, c = cb_first % chroma_w;
+        printf("Cb first diff at (%zu,%zu): gpu=%u ref=%u\n",
+               r, c, gpu_uv[r * (size_t) width + c * 2 + 0], ref_cb[cb_first]);
+    }
+    if (cr_diffs) {
+        size_t r = cr_first / chroma_w, c = cr_first % chroma_w;
+        printf("Cr first diff at (%zu,%zu): gpu=%u ref=%u\n",
+               r, c, gpu_uv[r * (size_t) width + c * 2 + 1], ref_cr[cr_first]);
+    }
+
+    free(ref_cr);
+    free(ref_cb);
    free(ref_y);
+    free(gpu_uv);
    free(gpu_y);
    free(per_mb_coeffs);
    daedalus_decoder_destroy(dec);

-    if (diffs == 0) {
-        printf("BIT-EXACT PASS\n");
+    if (y_diffs == 0 && cb_diffs == 0 && cr_diffs == 0) {
+        printf("BIT-EXACT PASS (Y + Cb + Cr)\n");
        return 0;
    }
    fprintf(stderr, "BIT-EXACT FAIL\n");
@@ -145,12 +145,16 @@ int main(void)
    printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
    EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");

-    /* UV plane should be neutral grey (128) per Phase 1 placeholder. */
-    int uv_wrong = 0;
+    /* UV plane should be all zero now (real chroma IDCT runs with
+     * zero coeffs → zero residual → clip255(0+0) = 0).  Previously a
+     * 128 placeholder when chroma was a memset stub; this PR replaced
+     * that with the real dispatch.  Sentinel 0xcd above guarantees any
+     * zero read here was written by the dispatch, not left over from setup. */
+    int uv_nz = 0;
    for (size_t i = 0; i < uv_size; i++)
-        if (out_uv[i] != 128) uv_wrong++;
-    printf("UV non-128 bytes: %d / %zu\n", uv_wrong, uv_size);
-    EXPECT(uv_wrong == 0, "UV plane is grey (128) Phase 1 placeholder");
+        if (out_uv[i] != 0) uv_nz++;
+    printf("UV non-zero bytes: %d / %zu\n", uv_nz, uv_size);
+    EXPECT(uv_nz == 0, "UV plane all zero for zero-coeff frame");

    free(out_y);
    free(out_uv);