2026-05-24 20:18:44 +00:00
2 changed files with 186 additions and 9 deletions
@@ -128,6 +128,41 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec,
    return 0;
 }

+/* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch.
+ *
+ * Brings up the GPU substrate by calling daedalus-fourier's existing
+ * `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity
+ * (n_blocks = N_MBs × 16 luma 4×4 blocks per frame), in contrast to
+ * the substitution-arc shim that called it with n_blocks = 1 per call.
+ * ONE Vulkan submit + wait round-trip per frame instead of one per block.
+ *
+ * What's done in this stage:
+ *   - Build a per-frame luma-4x4 meta[] in raster order across all MBs
+ *   - Repack the per-MB coeffs[] (384 int16, first 256 are luma) into
+ *     a flat block-major coeffs buffer (n_blocks × 16 int16)
+ *   - Allocate a frame-sized scratch Y plane (zero-initialised — no
+ *     intra prediction yet, so "predicted" = 0)
+ *   - Dispatch once via the recipe layer; the shader does
+ *     clip255(predicted + idct(coeffs)), i.e. with predicted=0 it's
+ *     clip255(idct(coeffs))
+ *   - Copy the scratch Y plane to the caller's out_y at the requested
+ *     stride
+ *
+ * What's NOT done yet (follow-on Phase 1 sub-PRs):
+ *   - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
+ *     so output pixels are residual-only and not a valid frame decode.
+ *     Sufficient for Vulkan round-trip validation, not for bit-exact
+ *     against FFmpeg.
+ *   - Motion compensation (Stage 2b): inter MBs not handled.
+ *   - High-profile IDCT 8x8 (Stage 1 extension)
+ *   - Deblock (Stage 4)
+ *   - Chroma planes — the daedalus-fourier idct4 shader is luma-only
+ *     in this revision; chroma blocks (4×4, 4 cb + 4 cr per MB) need a
+ *     separate dispatch with different meta/dst layout.  out_uv is
+ *     filled with neutral grey (128) as placeholder.
+ *   - dmabuf export — still memcpy-out to caller-provided planes.
+ *   - Stage 5 RGBA opt-in.
+ */
 int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                  uint8_t *out_y,  size_t y_stride,
                                  uint8_t *out_uv, size_t uv_stride)
@@ -136,19 +171,100 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
        return -1;
    if (dec->mbs_appended != dec->n_mbs)
        return -1;  /* incomplete frame */
+    if (!out_y)
+        return -1;

-    /* TODO Phase 1: build VkCommandBuffer with the 4 (or 5) pipeline
-     * stages, vkQueueSubmit, wait on fence, copy out to the caller's
-     * planes (or dma_buf-export when caller uses the export API).
+    int rc = 0;
+
+    /* ---- Build frame-scaled luma-4x4 dispatch ---- */
+
+    const size_t n_luma_blocks_per_mb = 16;
+    const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb;
+
+    /* Scratch Y plane — coded-size byte buffer.  Zero-initialised so
+     * the IDCT-ADD-clip operation reduces to clip255(IDCT) per block
+     * (predicted=0 because no intra/MC has run yet). */
+    const size_t y_stride_int = (size_t) dec->width;
+    const size_t y_size = y_stride_int * (size_t) dec->height;
+    uint8_t *scratch_y = calloc(1, y_size);
+    int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t));
+    daedalus_h264_block_meta *meta = malloc(
+        n_luma_blocks * sizeof(daedalus_h264_block_meta));
+
+    if (!scratch_y || !flat_coeffs || !meta) {
+        rc = -1;
+        goto cleanup;
+    }
+
+    /* Raster-order layout: walk each MB, then each of its 16 luma 4×4
+     * sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner).
     *
-     * Scaffold behaviour: zero the output planes so downstream
-     * consumers don't read uninitialised memory, then reset for the
-     * next frame.  Returns -ENOSYS-equivalent (-2) so any test that
-     * expects real pixels notices. */
-    (void) out_y; (void) y_stride; (void) out_uv; (void) uv_stride;
+     * NB: H.264's actual ordering of the 16 luma 4×4 blocks within an MB is
+     * the z-scan from spec §6.4.3 / fig 6-10.  We're using a flat raster
+     * here because Phase 1 stage 1 only validates the dispatch
+     * round-trip; bit-exact against an FFmpeg reference requires the
+     * z-scan permutation and is a follow-on test.  The per-MB
+     * coeffs[] field's first 256 entries are interpreted as 16
+     * consecutive 4×4 blocks in the same raster order on the input
+     * side, so this is self-consistent for the validation. */
+    size_t bi = 0;
+    for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
+        for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
+            int mb_idx = mb_y * dec->mb_width + mb_x;
+            const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];

+            for (int sb_y = 0; sb_y < 4; sb_y++) {
+                for (int sb_x = 0; sb_x < 4; sb_x++) {
+                    /* Block top-left pixel in the coded Y plane. */
+                    size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
+                    size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
+                    meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x);
+
+                    /* Copy 16 coeffs for this block from the per-MB
+                     * coeffs[] (luma offset = block_idx * 16). */
+                    int block_in_mb = sb_y * 4 + sb_x;
+                    memcpy(&flat_coeffs[bi * 16],
+                           &mb_coeffs[block_in_mb * 16],
+                           16 * sizeof(int16_t));
+                    bi++;
+                }
+            }
+        }
+    }
+    /* Invariant: bi == n_luma_blocks, provided n_mbs == mb_width * mb_height. */
+
+    /* ---- One Vulkan submit + wait for the whole frame's luma IDCT.
+     * AUTO substrate picks QPU per the post-decree recipe table; falls
+     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */
+    int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
+                                                  scratch_y, y_stride_int,
+                                                  flat_coeffs,
+                                                  n_luma_blocks,
+                                                  meta);
+    if (dr != 0) {
+        rc = -3;  /* GPU dispatch failure */
+        goto cleanup;
+    }
+
+    /* ---- Copy out to caller's planes at the requested stride. ---- */
+    for (int r = 0; r < dec->height; r++)
+        memcpy(out_y + (size_t) r * y_stride,
+               &scratch_y[(size_t) r * y_stride_int],
+               (size_t) dec->width);
+
+    /* Chroma placeholder: 128 = mid-grey (NV12 neutral).  Real chroma
+     * IDCT dispatch is the next sub-PR. */
+    if (out_uv) {
+        for (int r = 0; r < dec->height / 2; r++)
+            memset(out_uv + (size_t) r * uv_stride, 128, (size_t) dec->width);
+    }
+
+cleanup:
+    free(meta);
+    free(flat_coeffs);
+    free(scratch_y);
    dec->mbs_appended = 0;
-    return -2;  /* not implemented */
+    return rc;
 }

 int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
@@ -95,6 +95,67 @@ int main(void)

    daedalus_decoder_destroy(dec);

+    /* ---- Full-frame round-trip with all-zero coefficients.
+     * Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT
+     * dispatch and a successful GPU round-trip returns 0.  IDCT of
+     * all-zero coefficients with zero-initialised predicted = all-zero
+     * output pixels. */
+    dec = daedalus_decoder_create(1920, 1088);
+    if (!dec) {
+        fprintf(stderr, "SKIP roundtrip: ctx create failed\n");
+        return 0;
+    }
+
+    static int16_t zero_coeffs[384] = {0};
+    struct daedalus_decoder_mb_input zmb = {0};
+    zmb.coeffs = zero_coeffs;
+
+    int mb_width = 1920 / 16;   /* 120 */
+    int mb_height = 1088 / 16;  /* 68 */
+    int n_mbs = mb_width * mb_height;
+
+    for (int mby = 0; mby < mb_height; mby++) {
+        for (int mbx = 0; mbx < mb_width; mbx++) {
+            zmb.mb_x = (uint16_t) mbx;
+            zmb.mb_y = (uint16_t) mby;
+            if (daedalus_decoder_append_mb(dec, &zmb) != 0) {
+                fprintf(stderr, "append (%d, %d) failed\n", mbx, mby);
+                return 1;
+            }
+        }
+    }
+    printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height);
+
+    size_t y_size = (size_t) 1920 * 1088;
+    size_t uv_size = (size_t) 1920 * 1088 / 2;
+    uint8_t *out_y = malloc(y_size);
+    uint8_t *out_uv = malloc(uv_size);
+    /* Pre-fill with sentinels so any byte flush_frame leaves unwritten is caught. */
+    memset(out_y, 0xab, y_size);
+    memset(out_uv, 0xcd, uv_size);
+
+    int frc = daedalus_decoder_flush_frame(dec, out_y, 1920, out_uv, 1920);
+    printf("flush_frame rc=%d\n", frc);
+    EXPECT(frc == 0, "flush succeeds on full frame");
+
+    /* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */
+    int y_nz = 0;
+    for (size_t i = 0; i < y_size; i++)
+        if (out_y[i] != 0) y_nz++;
+    printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
+    EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");
+
+    /* UV plane should be neutral grey (128) per Phase 1 placeholder. */
+    int uv_wrong = 0;
+    for (size_t i = 0; i < uv_size; i++)
+        if (out_uv[i] != 128) uv_wrong++;
+    printf("UV non-128 bytes: %d / %zu\n", uv_wrong, uv_size);
+    EXPECT(uv_wrong == 0, "UV plane is grey (128) Phase 1 placeholder");
+
+    free(out_y);
+    free(out_uv);
+    daedalus_decoder_destroy(dec);
+
    printf("smoke OK\n");
    return 0;
 }