phase1/stage1: frame-scaled luma IDCT 4×4 — first GPU round-trip #3

Merged
marfrit merged 1 commit from noether/phase1-stage1-idct into main 2026-05-24 20:18:44 +00:00
2 changed files with 186 additions and 9 deletions
+125 -9
View File
@@ -128,6 +128,41 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec,
return 0; return 0;
} }
/* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch.
*
* Brings up the GPU substrate by calling daedalus-fourier's existing
* `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity
* (n_blocks = N_MBs × 16 luma 4×4 blocks per frame), in contrast to
* the substitution-arc shim that called it with n_blocks = 1 per call.
* ONE Vulkan submit + wait round-trip per frame instead of millions.
*
* What's done in this stage:
* - Build a per-frame luma-4x4 meta[] in raster order across all MBs
* - Repack the per-MB coeffs[] (384 int16, first 256 are luma) into
* a flat block-major coeffs buffer (n_blocks × 16 int16)
* - Allocate a frame-sized scratch Y plane (zero-initialised — no
* intra prediction yet, so "predicted" = 0)
* - Dispatch once via the recipe layer; the shader does
* clip255(predicted + idct(coeffs)), i.e. with predicted=0 it's
* clip255(idct(coeffs))
* - Copy the scratch Y plane to the caller's out_y at the requested
* stride
*
* What's NOT done yet (follow-on Phase 1 sub-PRs):
* - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
* so output pixels are residual-only and not a valid frame decode.
* This is sufficient for Vulkan round-trip validation, but not for
* bit-exact comparison against FFmpeg.
* - Motion compensation (Stage 2b): inter MBs not handled.
* - High-profile IDCT 8x8 (Stage 1 extension)
* - Deblock (Stage 4)
* - Chroma planes — the daedalus-fourier idct4 shader is luma-only
* in this revision; chroma blocks (4×4, 4 cb + 4 cr per MB) need a
* separate dispatch with different meta/dst layout. out_uv is
* filled with neutral grey (128) as placeholder.
* - dmabuf export — still memcpy-out to caller-provided planes.
* - Stage 5 RGBA opt-in.
*/
int daedalus_decoder_flush_frame(daedalus_decoder *dec, int daedalus_decoder_flush_frame(daedalus_decoder *dec,
uint8_t *out_y, size_t y_stride, uint8_t *out_y, size_t y_stride,
uint8_t *out_uv, size_t uv_stride) uint8_t *out_uv, size_t uv_stride)
@@ -136,19 +171,100 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
return -1; return -1;
if (dec->mbs_appended != dec->n_mbs) if (dec->mbs_appended != dec->n_mbs)
return -1; /* incomplete frame */ return -1; /* incomplete frame */
if (!out_y)
return -1;
/* TODO Phase 1: build VkCommandBuffer with the 4 (or 5) pipeline int rc = 0;
* stages, vkQueueSubmit, wait on fence, copy out to the caller's
* planes (or dma_buf-export when caller uses the export API). /* ---- Build frame-scaled luma-4x4 dispatch ---- */
const size_t n_luma_blocks_per_mb = 16;
const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb;
/* Scratch Y plane — coded-size byte buffer. Zero-initialised so
* the IDCT-ADD-clip operation reduces to clip255(IDCT) per block
* (predicted=0 because no intra/MC has run yet). */
const size_t y_stride_int = (size_t) dec->width;
const size_t y_size = y_stride_int * (size_t) dec->height;
uint8_t *scratch_y = calloc(1, y_size);
int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t));
daedalus_h264_block_meta *meta = malloc(
n_luma_blocks * sizeof(daedalus_h264_block_meta));
if (!scratch_y || !flat_coeffs || !meta) {
rc = -1;
goto cleanup;
}
/* Raster-order layout: walk each MB, then each of its 16 luma 4×4
* sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner).
* *
* Scaffold behaviour: zero the output planes so downstream * NB: H.264's actual per-MB 4×4 coefficient scan order is the
* consumers don't read uninitialised memory, then reset for the * z-scan from spec §6.4.3 / fig 6-10. We're using a flat raster
* next frame. Returns -ENOSYS-equivalent (-2) so any test that * here because Phase 1 stage 1 only validates the dispatch
* expects real pixels notices. */ * round-trip; bit-exact against an FFmpeg reference requires the
(void) out_y; (void) y_stride; (void) out_uv; (void) uv_stride; * z-scan permutation and is a follow-on test. The per-MB
* coeffs[] field's first 256 entries are interpreted as 16
* consecutive 4×4 blocks in the same raster order on the input
* side, so this is self-consistent for the validation. */
size_t bi = 0;
for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
int mb_idx = mb_y * dec->mb_width + mb_x;
const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
for (int sb_y = 0; sb_y < 4; sb_y++) {
for (int sb_x = 0; sb_x < 4; sb_x++) {
/* Block top-left pixel in the coded Y plane. */
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x);
/* Copy 16 coeffs for this block from the per-MB
* coeffs[] (luma offset = block_idx * 16). */
int block_in_mb = sb_y * 4 + sb_x;
memcpy(&flat_coeffs[bi * 16],
&mb_coeffs[block_in_mb * 16],
16 * sizeof(int16_t));
bi++;
}
}
}
}
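/* Invariant: bi == n_luma_blocks here. The loops emit 16 blocks for each
 * of mb_width * mb_height MBs, so this holds only when dec->n_mbs equals
 * dec->mb_width * dec->mb_height. */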
/* ---- One Vulkan submit + wait for the whole frame's luma IDCT.
* AUTO substrate picks QPU per the post-decree recipe table; falls
* back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */
int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
scratch_y, y_stride_int,
flat_coeffs,
n_luma_blocks,
meta);
if (dr != 0) {
rc = -3; /* GPU dispatch failure */
goto cleanup;
}
/* ---- Copy out to caller's planes at the requested stride. ---- */
for (int r = 0; r < dec->height; r++)
memcpy(out_y + (size_t) r * y_stride,
&scratch_y[(size_t) r * y_stride_int],
(size_t) dec->width);
/* Chroma placeholder: 128 = mid-grey (NV12 neutral). Real chroma
* IDCT dispatch is the next sub-PR. */
if (out_uv) {
for (int r = 0; r < dec->height / 2; r++)
memset(out_uv + (size_t) r * uv_stride, 128, (size_t) dec->width);
}
cleanup:
free(meta);
free(flat_coeffs);
free(scratch_y);
dec->mbs_appended = 0; dec->mbs_appended = 0;
return -2; /* not implemented */ return rc;
} }
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane) int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
+61
View File
@@ -95,6 +95,67 @@ int main(void)
daedalus_decoder_destroy(dec); daedalus_decoder_destroy(dec);
/* ---- Full-frame round-trip with all-zero coefficients.
* Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT
* dispatch and returns 0 when the round-trip succeeds. With all-zero
* coefficients and a zero-initialised prediction plane, every output
* luma pixel must be zero. */
dec = daedalus_decoder_create(1920, 1088);
if (!dec) {
fprintf(stderr, "SKIP roundtrip: ctx create failed\n");
return 0;
}
static int16_t zero_coeffs[384] = {0};
struct daedalus_decoder_mb_input zmb = {0};
zmb.coeffs = zero_coeffs;
int mb_width = 1920 / 16; /* 120 */
int mb_height = 1088 / 16; /* 68 */
int n_mbs = mb_width * mb_height;
for (int mby = 0; mby < mb_height; mby++) {
for (int mbx = 0; mbx < mb_width; mbx++) {
zmb.mb_x = (uint16_t) mbx;
zmb.mb_y = (uint16_t) mby;
if (daedalus_decoder_append_mb(dec, &zmb) != 0) {
fprintf(stderr, "append (%d, %d) failed\n", mbx, mby);
return 1;
}
}
}
printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height);
size_t y_size = (size_t) 1920 * 1088;
size_t uv_size = (size_t) 1920 * 1088 / 2;
uint8_t *out_y = malloc(y_size);
uint8_t *out_uv = malloc(uv_size);
/* Pre-fill with sentinel values (0xab / 0xcd) so any byte that flush_frame
 * fails to write stays visible in the checks below. */
memset(out_y, 0xab, y_size);
memset(out_uv, 0xcd, uv_size);
int frc = daedalus_decoder_flush_frame(dec, out_y, 1920, out_uv, 1920);
printf("flush_frame rc=%d\n", frc);
EXPECT(frc == 0, "flush succeeds on full frame");
/* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */
int y_nz = 0;
for (size_t i = 0; i < y_size; i++)
if (out_y[i] != 0) y_nz++;
printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");
/* UV plane should be neutral grey (128) per Phase 1 placeholder. */
int uv_wrong = 0;
for (size_t i = 0; i < uv_size; i++)
if (out_uv[i] != 128) uv_wrong++;
printf("UV non-128 bytes: %d / %zu\n", uv_wrong, uv_size);
EXPECT(uv_wrong == 0, "UV plane is grey (128) Phase 1 placeholder");
free(out_y);
free(out_uv);
daedalus_decoder_destroy(dec);
printf("smoke OK\n"); printf("smoke OK\n");
return 0; return 0;
} }