diff --git a/src/daedalus_decoder.c b/src/daedalus_decoder.c index 5fcb210..d91fa2d 100644 --- a/src/daedalus_decoder.c +++ b/src/daedalus_decoder.c @@ -128,6 +128,41 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec, return 0; } +/* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch. + * + * Brings up the GPU substrate by calling daedalus-fourier's existing + * `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity + * (n_blocks = N_MBs × 16 luma 4×4 blocks per frame), in contrast to + * the substitution-arc shim that called it with n_blocks = 1 per call. + * ONE Vulkan submit + wait round-trip per frame instead of millions. + * + * What's done in this stage: + * - Build a per-frame luma-4x4 meta[] in raster order across all MBs + * - Repack the per-MB coeffs[] (384 int16, first 256 are luma) into + * a flat block-major coeffs buffer (n_blocks × 16 int16) + * - Allocate a frame-sized scratch Y plane (zero-initialised — no + * intra prediction yet, so "predicted" = 0) + * - Dispatch once via the recipe layer; the shader does + * clip255(predicted + idct(coeffs)), i.e. with predicted=0 it's + * clip255(idct(coeffs)) + * - Copy the scratch Y plane to the caller's out_y at the requested + * stride + * + * What's NOT done yet (follow-on Phase 1 sub-PRs): + * - Intra prediction (Stage 2a wavefront): predicted is forced to 0, + * so output pixels are residual-only and not a valid frame decode. + * Sufficient for Vulkan round-trip validation, not for a bit-exact + * comparison against FFmpeg. + * - Motion compensation (Stage 2b): inter MBs not handled. + * - High-profile IDCT 8x8 (Stage 1 extension) + * - Deblock (Stage 4) + * - Chroma planes — the daedalus-fourier idct4 shader is luma-only + * in this revision; chroma blocks (4×4, 4 cb + 4 cr per MB) need a + * separate dispatch with different meta/dst layout. out_uv is + * filled with neutral grey (128) as a placeholder. + * - dmabuf export — still memcpy-out to caller-provided planes. + * - Stage 5 RGBA opt-in. 
+ */ int daedalus_decoder_flush_frame(daedalus_decoder *dec, uint8_t *out_y, size_t y_stride, uint8_t *out_uv, size_t uv_stride) @@ -136,19 +171,100 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec, return -1; if (dec->mbs_appended != dec->n_mbs) return -1; /* incomplete frame */ + if (!out_y) + return -1; - /* TODO Phase 1: build VkCommandBuffer with the 4 (or 5) pipeline - * stages, vkQueueSubmit, wait on fence, copy out to the caller's - * planes (or dma_buf-export when caller uses the export API). + int rc = 0; + + /* ---- Build frame-scaled luma-4x4 dispatch ---- */ + + const size_t n_luma_blocks_per_mb = 16; + const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb; + + /* Scratch Y plane — coded-size byte buffer. Zero-initialised so + * the IDCT-ADD-clip operation reduces to clip255(IDCT) per block + * (predicted=0 because no intra/MC has run yet). */ + const size_t y_stride_int = (size_t) dec->width; + const size_t y_size = y_stride_int * (size_t) dec->height; + uint8_t *scratch_y = calloc(1, y_size); + int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t)); + daedalus_h264_block_meta *meta = malloc( + n_luma_blocks * sizeof(daedalus_h264_block_meta)); + + if (!scratch_y || !flat_coeffs || !meta) { + rc = -1; + goto cleanup; + } + + /* Raster-order layout: walk each MB, then each of its 16 luma 4×4 + * sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner). * - * Scaffold behaviour: zero the output planes so downstream - * consumers don't read uninitialised memory, then reset for the - * next frame. Returns -ENOSYS-equivalent (-2) so any test that - * expects real pixels notices. */ - (void) out_y; (void) y_stride; (void) out_uv; (void) uv_stride; + * NB: H.264's actual per-MB 4×4 coefficient scan order is the + * z-scan from spec §6.4.3 / fig 6-10. 
We're using a flat raster + * here because Phase 1 stage 1 only validates the dispatch + * round-trip; bit-exactness against an FFmpeg reference requires the + * z-scan permutation and is a follow-on test. The per-MB + * coeffs[] field's first 256 entries are interpreted as 16 + * consecutive 4×4 blocks in the same raster order on the input + * side, so this is self-consistent for the validation. */ + size_t bi = 0; + for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) { + for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) { + int mb_idx = mb_y * dec->mb_width + mb_x; + const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384]; + for (int sb_y = 0; sb_y < 4; sb_y++) { + for (int sb_x = 0; sb_x < 4; sb_x++) { + /* Block top-left pixel in the coded Y plane. */ + size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4; + size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4; + meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x); + + /* Copy 16 coeffs for this block from the per-MB + * coeffs[] (luma offset = block_in_mb * 16). */ + int block_in_mb = sb_y * 4 + sb_x; + memcpy(&flat_coeffs[bi * 16], + &mb_coeffs[block_in_mb * 16], + 16 * sizeof(int16_t)); + bi++; + } + } + } + } + /* bi == n_luma_blocks here, provided n_mbs == mb_width * mb_height */ + + /* ---- One Vulkan submit + wait for the whole frame's luma IDCT. + * AUTO substrate picks QPU per the post-decree recipe table; falls + * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */ + int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx, + scratch_y, y_stride_int, + flat_coeffs, + n_luma_blocks, + meta); + if (dr != 0) { + rc = -3; /* GPU dispatch failure */ + goto cleanup; + } + + /* ---- Copy out to caller's planes at the requested stride. ---- */ + for (int r = 0; r < dec->height; r++) + memcpy(out_y + (size_t) r * y_stride, + &scratch_y[(size_t) r * y_stride_int], + (size_t) dec->width); + + /* Chroma placeholder: 128 = mid-grey (NV12 neutral). Real chroma + * IDCT dispatch is the next sub-PR. 
*/ + if (out_uv) { + for (int r = 0; r < dec->height / 2; r++) + memset(out_uv + (size_t) r * uv_stride, 128, (size_t) dec->width); + } + +cleanup: + free(meta); + free(flat_coeffs); + free(scratch_y); dec->mbs_appended = 0; - return -2; /* not implemented */ + return rc; } int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane) diff --git a/tests/test_smoke.c b/tests/test_smoke.c index 1e47ad2..37b56b1 100644 --- a/tests/test_smoke.c +++ b/tests/test_smoke.c @@ -95,6 +95,67 @@ int main(void) daedalus_decoder_destroy(dec); + /* ---- Full-frame round-trip with all-zero coefficients. + * Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT + * dispatch and a successful GPU round-trip returns 0. IDCT of + * all-zero coefficients with zero-initialised predicted = all-zero + * output pixels. */ + dec = daedalus_decoder_create(1920, 1088); + if (!dec) { + fprintf(stderr, "SKIP roundtrip: ctx create failed\n"); + return 0; + } + + static int16_t zero_coeffs[384] = {0}; + struct daedalus_decoder_mb_input zmb = {0}; + zmb.coeffs = zero_coeffs; + + int mb_width = 1920 / 16; /* 120 */ + int mb_height = 1088 / 16; /* 68 */ + int n_mbs = mb_width * mb_height; + + for (int mby = 0; mby < mb_height; mby++) { + for (int mbx = 0; mbx < mb_width; mbx++) { + zmb.mb_x = (uint16_t) mbx; + zmb.mb_y = (uint16_t) mby; + if (daedalus_decoder_append_mb(dec, &zmb) != 0) { + fprintf(stderr, "append (%d, %d) failed\n", mbx, mby); + return 1; + } + } + } + printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height); + + size_t y_size = (size_t) 1920 * 1088; + size_t uv_size = (size_t) 1920 * 1088 / 2; + uint8_t *out_y = malloc(y_size); + uint8_t *out_uv = malloc(uv_size); + /* Pre-fill with sentinel so any read-then-write bug becomes visible. 
*/ + memset(out_y, 0xab, y_size); + memset(out_uv, 0xcd, uv_size); + + int frc = daedalus_decoder_flush_frame(dec, out_y, 1920, out_uv, 1920); + printf("flush_frame rc=%d\n", frc); + EXPECT(frc == 0, "flush succeeds on full frame"); + + /* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */ + int y_nz = 0; + for (size_t i = 0; i < y_size; i++) + if (out_y[i] != 0) y_nz++; + printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size); + EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame"); + + /* UV plane should be neutral grey (128) per Phase 1 placeholder. */ + int uv_wrong = 0; + for (size_t i = 0; i < uv_size; i++) + if (out_uv[i] != 128) uv_wrong++; + printf("UV non-128 bytes: %d / %zu\n", uv_wrong, uv_size); + EXPECT(uv_wrong == 0, "UV plane is grey (128) Phase 1 placeholder"); + + free(out_y); + free(out_uv); + daedalus_decoder_destroy(dec); + printf("smoke OK\n"); return 0; }