2026-05-24 20:18:44 +00:00
2 changed files with 186 additions and 9 deletions
@@ -128,6 +128,41 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec,
    return 0;
 }
 /* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch.
 *
 * Brings up the GPU substrate by calling daedalus-fourier's existing
 * `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity
 * (n_blocks = N_MBs × 16 luma 4×4 blocks per frame), in contrast to
 * the substitution-arc shim that called it with n_blocks = 1 per call.
 * ONE Vulkan submit + wait round-trip per frame instead of millions.
 *
 * What's done in this stage:
 *   - Build a per-frame luma-4x4 meta[] in raster order across all MBs
 *   - Repack the per-MB coeffs[] (384 int16, first 256 are luma) into
 *     a flat block-major coeffs buffer (n_blocks × 16 int16)
 *   - Allocate a frame-sized scratch Y plane (zero-initialised — no
 *     intra prediction yet, so "predicted" = 0)
 *   - Dispatch once via the recipe layer; the shader does
 *     clip255(predicted + idct(coeffs)), i.e. with predicted=0 it's
 *     clip255(idct(coeffs))
 *   - Copy the scratch Y plane to the caller's out_y at the requested
 *     stride
 *
 * What's NOT done yet (follow-on Phase 1 sub-PRs):
 *   - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
 *     so output pixels are residual-only and not a valid frame decode.
 *     Sufficient for Vulkan round-trip validation, not for bit-exact
 *     comparison against FFmpeg.
 *   - Motion compensation (Stage 2b): inter MBs not handled.
 *   - High-profile IDCT 8x8 (Stage 1 extension)
 *   - Deblock (Stage 4)
 *   - Chroma planes — the daedalus-fourier idct4 shader is luma-only
 *     in this revision; chroma blocks (4×4, 4 cb + 4 cr per MB) need a
 *     separate dispatch with different meta/dst layout.  out_uv is
 *     filled with neutral grey (128) as placeholder.
 *   - dmabuf export — still memcpy-out to caller-provided planes.
 *   - Stage 5 RGBA opt-in.
 */
 int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                  uint8_t *out_y,  size_t y_stride,
                                  uint8_t *out_uv, size_t uv_stride)
@@ -136,19 +171,100 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
        return -1;
    if (dec->mbs_appended != dec->n_mbs)
        return -1;  /* incomplete frame */
    if (!out_y)
        return -1;
-    /* TODO Phase 1: build VkCommandBuffer with the 4 (or 5) pipeline
+    int rc = 0;
-     * stages, vkQueueSubmit, wait on fence, copy out to the caller's
+
-     * planes (or dma_buf-export when caller uses the export API).
+    /* ---- Build frame-scaled luma-4x4 dispatch ---- */
    const size_t n_luma_blocks_per_mb = 16;
    const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb;
    /* Scratch Y plane — coded-size byte buffer.  Zero-initialised so
     * the IDCT-ADD-clip operation reduces to clip255(IDCT) per block
     * (predicted=0 because no intra/MC has run yet). */
    const size_t y_stride_int = (size_t) dec->width;
    const size_t y_size = y_stride_int * (size_t) dec->height;
    uint8_t *scratch_y = calloc(1, y_size);
    int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t));
    daedalus_h264_block_meta *meta = malloc(
        n_luma_blocks * sizeof(daedalus_h264_block_meta));
    if (!scratch_y || !flat_coeffs || !meta) {
        rc = -1;
        goto cleanup;
    }
    /* Raster-order layout: walk each MB, then each of its 16 luma 4×4
     * sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner).
     *
-     * Scaffold behaviour: zero the output planes so downstream
+     * NB: H.264's actual per-MB 4×4 coefficient scan order is the
-     * consumers don't read uninitialised memory, then reset for the
+     * z-scan from spec §6.4.3 / fig 6-10.  We're using a flat raster
-     * next frame.  Returns -ENOSYS-equivalent (-2) so any test that
+     * here because Phase 1 stage 1 only validates the dispatch
-     * expects real pixels notices. */
+     * round-trip; bit-exact against an FFmpeg reference requires the
-    (void) out_y; (void) y_stride; (void) out_uv; (void) uv_stride;
+     * z-scan permutation and is a follow-on test.  The per-MB
     * coeffs[] field's first 256 entries are interpreted as 16
     * consecutive 4×4 blocks in the same raster order on the input
     * side, so this is self-consistent for the validation. */
    size_t bi = 0;
    for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
        for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
            int mb_idx = mb_y * dec->mb_width + mb_x;
            const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
            for (int sb_y = 0; sb_y < 4; sb_y++) {
                for (int sb_x = 0; sb_x < 4; sb_x++) {
                    /* Block top-left pixel in the coded Y plane. */
                    size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
                    size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
                    meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x);
                    /* Copy 16 coeffs for this block from the per-MB
                     * coeffs[] (luma offset = block_idx * 16). */
                    int block_in_mb = sb_y * 4 + sb_x;
                    memcpy(&flat_coeffs[bi * 16],
                           &mb_coeffs[block_in_mb * 16],
                           16 * sizeof(int16_t));
                    bi++;
                }
            }
        }
    }
    /* bi == n_luma_blocks here, provided dec->n_mbs == mb_width * mb_height */
    /* ---- One Vulkan submit + wait for the whole frame's luma IDCT.
     * AUTO substrate picks QPU per the post-decree recipe table; falls
     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */
    int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
                                                  scratch_y, y_stride_int,
                                                  flat_coeffs,
                                                  n_luma_blocks,
                                                  meta);
    if (dr != 0) {
        rc = -3;  /* GPU dispatch failure */
        goto cleanup;
    }
    /* ---- Copy out to caller's planes at the requested stride. ---- */
    for (int r = 0; r < dec->height; r++)
        memcpy(out_y + (size_t) r * y_stride,
               &scratch_y[(size_t) r * y_stride_int],
               (size_t) dec->width);
    /* Chroma placeholder: 128 = mid-grey (NV12 neutral).  Real chroma
     * IDCT dispatch is the next sub-PR. */
    if (out_uv) {
        for (int r = 0; r < dec->height / 2; r++)
            memset(out_uv + (size_t) r * uv_stride, 128, (size_t) dec->width);
    }
 cleanup:
    free(meta);
    free(flat_coeffs);
    free(scratch_y);
    dec->mbs_appended = 0;
-    return -2;  /* not implemented */
+    return rc;
 }
 int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
@@ -95,6 +95,67 @@ int main(void)
    daedalus_decoder_destroy(dec);
    /* ---- Full-frame round-trip with all-zero coefficients.
     * Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT
     * dispatch and a successful GPU round-trip returns 0.  The IDCT of
     * all-zero coefficients added to a zero-initialised prediction
     * yields all-zero output pixels. */
    dec = daedalus_decoder_create(1920, 1088);
    if (!dec) {
        fprintf(stderr, "SKIP roundtrip: ctx create failed\n");
        return 0;
    }
    static int16_t zero_coeffs[384] = {0};
    struct daedalus_decoder_mb_input zmb = {0};
    zmb.coeffs = zero_coeffs;
    int mb_width = 1920 / 16;   /* 120 */
    int mb_height = 1088 / 16;  /* 68 */
    int n_mbs = mb_width * mb_height;
    for (int mby = 0; mby < mb_height; mby++) {
        for (int mbx = 0; mbx < mb_width; mbx++) {
            zmb.mb_x = (uint16_t) mbx;
            zmb.mb_y = (uint16_t) mby;
            if (daedalus_decoder_append_mb(dec, &zmb) != 0) {
                fprintf(stderr, "append (%d, %d) failed\n", mbx, mby);
                return 1;
            }
        }
    }
    printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height);
    size_t y_size = (size_t) 1920 * 1088;
    size_t uv_size = (size_t) 1920 * 1088 / 2;
    uint8_t *out_y = malloc(y_size);
    uint8_t *out_uv = malloc(uv_size);
    /* Pre-fill with sentinels so unwritten or read-before-write output bytes become visible. */
    memset(out_y, 0xab, y_size);
    memset(out_uv, 0xcd, uv_size);
    int frc = daedalus_decoder_flush_frame(dec, out_y, 1920, out_uv, 1920);
    printf("flush_frame rc=%d\n", frc);
    EXPECT(frc == 0, "flush succeeds on full frame");
    /* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */
    int y_nz = 0;
    for (size_t i = 0; i < y_size; i++)
        if (out_y[i] != 0) y_nz++;
    printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
    EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");
    /* UV plane should be neutral grey (128) per Phase 1 placeholder. */
    int uv_wrong = 0;
    for (size_t i = 0; i < uv_size; i++)
        if (out_uv[i] != 128) uv_wrong++;
    printf("UV non-128 bytes: %d / %zu\n", uv_wrong, uv_size);
    EXPECT(uv_wrong == 0, "UV plane is grey (128) Phase 1 placeholder");
    free(out_y);
    free(out_uv);
    daedalus_decoder_destroy(dec);
    printf("smoke OK\n");
    return 0;
 }