phase1/stage1: frame-scaled luma IDCT 4×4 — first GPU round-trip #3
+125
-9
@@ -128,6 +128,41 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch.
|
||||
*
|
||||
* Brings up the GPU substrate by calling daedalus-fourier's existing
|
||||
* `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity
|
||||
* (n_blocks = N_MBs × 16 luma 4×4 blocks per frame), in contrast to
|
||||
* the substitution-arc shim that called it with n_blocks = 1 per call.
|
||||
* ONE Vulkan submit + wait round-trip per frame instead of millions.
|
||||
*
|
||||
* What's done in this stage:
|
||||
* - Build a per-frame luma-4x4 meta[] in raster order across all MBs
|
||||
* - Repack the per-MB coeffs[] (384 int16, first 256 are luma) into
|
||||
* a flat block-major coeffs buffer (n_blocks × 16 int16)
|
||||
* - Allocate a frame-sized scratch Y plane (zero-initialised — no
|
||||
* intra prediction yet, so "predicted" = 0)
|
||||
* - Dispatch once via the recipe layer; the shader does
|
||||
* clip255(predicted + idct(coeffs)), i.e. with predicted=0 it's
|
||||
* clip255(idct(coeffs))
|
||||
* - Copy the scratch Y plane to the caller's out_y at the requested
|
||||
* stride
|
||||
*
|
||||
* What's NOT done yet (follow-on Phase 1 sub-PRs):
|
||||
* - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
|
||||
* so output pixels are residual-only and not a valid frame decode.
|
||||
* Sufficient for Vulkan round-trip validation, not for bit-exact
|
||||
* against FFmpeg.
|
||||
* - Motion compensation (Stage 2b): inter MBs not handled.
|
||||
* - High-profile IDCT 8x8 (Stage 1 extension)
|
||||
* - Deblock (Stage 4)
|
||||
* - Chroma planes — the daedalus-fourier idct4 shader is luma-only
|
||||
* in this revision; chroma blocks (4×4, 4 cb + 4 cr per MB) need a
|
||||
* separate dispatch with different meta/dst layout. out_uv is
|
||||
* filled with neutral grey (128) as placeholder.
|
||||
* - dmabuf export — still memcpy-out to caller-provided planes.
|
||||
* - Stage 5 RGBA opt-in.
|
||||
*/
|
||||
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
||||
uint8_t *out_y, size_t y_stride,
|
||||
uint8_t *out_uv, size_t uv_stride)
|
||||
@@ -136,19 +171,100 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
||||
return -1;
|
||||
if (dec->mbs_appended != dec->n_mbs)
|
||||
return -1; /* incomplete frame */
|
||||
if (!out_y)
|
||||
return -1;
|
||||
|
||||
/* TODO Phase 1: build VkCommandBuffer with the 4 (or 5) pipeline
|
||||
* stages, vkQueueSubmit, wait on fence, copy out to the caller's
|
||||
* planes (or dma_buf-export when caller uses the export API).
|
||||
int rc = 0;
|
||||
|
||||
/* ---- Build frame-scaled luma-4x4 dispatch ---- */
|
||||
|
||||
const size_t n_luma_blocks_per_mb = 16;
|
||||
const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb;
|
||||
|
||||
/* Scratch Y plane — coded-size byte buffer. Zero-initialised so
|
||||
* the IDCT-ADD-clip operation reduces to clip255(IDCT) per block
|
||||
* (predicted=0 because no intra/MC has run yet). */
|
||||
const size_t y_stride_int = (size_t) dec->width;
|
||||
const size_t y_size = y_stride_int * (size_t) dec->height;
|
||||
uint8_t *scratch_y = calloc(1, y_size);
|
||||
int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t));
|
||||
daedalus_h264_block_meta *meta = malloc(
|
||||
n_luma_blocks * sizeof(daedalus_h264_block_meta));
|
||||
|
||||
if (!scratch_y || !flat_coeffs || !meta) {
|
||||
rc = -1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Raster-order layout: walk each MB, then each of its 16 luma 4×4
|
||||
* sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner).
|
||||
*
|
||||
* Scaffold behaviour: zero the output planes so downstream
|
||||
* consumers don't read uninitialised memory, then reset for the
|
||||
* next frame. Returns -ENOSYS-equivalent (-2) so any test that
|
||||
* expects real pixels notices. */
|
||||
(void) out_y; (void) y_stride; (void) out_uv; (void) uv_stride;
|
||||
* NB: H.264's actual per-MB 4×4 coefficient scan order is the
|
||||
* z-scan from spec §6.4.3 / fig 6-10. We're using a flat raster
|
||||
* here because Phase 1 stage 1 only validates the dispatch
|
||||
* round-trip; bit-exact against an FFmpeg reference requires the
|
||||
* z-scan permutation and is a follow-on test. The per-MB
|
||||
* coeffs[] field's first 256 entries are interpreted as 16
|
||||
* consecutive 4×4 blocks in the same raster order on the input
|
||||
* side, so this is self-consistent for the validation. */
|
||||
size_t bi = 0;
|
||||
for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
|
||||
for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
|
||||
int mb_idx = mb_y * dec->mb_width + mb_x;
|
||||
const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
|
||||
|
||||
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||||
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||||
/* Block top-left pixel in the coded Y plane. */
|
||||
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
|
||||
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
|
||||
meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x);
|
||||
|
||||
/* Copy 16 coeffs for this block from the per-MB
|
||||
* coeffs[] (luma offset = block_idx * 16). */
|
||||
int block_in_mb = sb_y * 4 + sb_x;
|
||||
memcpy(&flat_coeffs[bi * 16],
|
||||
&mb_coeffs[block_in_mb * 16],
|
||||
16 * sizeof(int16_t));
|
||||
bi++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Invariant: bi == n_luma_blocks here, provided dec->n_mbs == dec->mb_width * dec->mb_height (each MB contributes exactly 16 blocks). */
|
||||
|
||||
/* ---- One Vulkan submit + wait for the whole frame's luma IDCT.
|
||||
* AUTO substrate picks QPU per the post-decree recipe table; falls
|
||||
* back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */
|
||||
int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
|
||||
scratch_y, y_stride_int,
|
||||
flat_coeffs,
|
||||
n_luma_blocks,
|
||||
meta);
|
||||
if (dr != 0) {
|
||||
rc = -3; /* GPU dispatch failure */
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* ---- Copy out to caller's planes at the requested stride. ---- */
|
||||
for (int r = 0; r < dec->height; r++)
|
||||
memcpy(out_y + (size_t) r * y_stride,
|
||||
&scratch_y[(size_t) r * y_stride_int],
|
||||
(size_t) dec->width);
|
||||
|
||||
/* Chroma placeholder: 128 = mid-grey (NV12 neutral). Real chroma
|
||||
* IDCT dispatch is the next sub-PR. */
|
||||
if (out_uv) {
|
||||
for (int r = 0; r < dec->height / 2; r++)
|
||||
memset(out_uv + (size_t) r * uv_stride, 128, (size_t) dec->width);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
free(meta);
|
||||
free(flat_coeffs);
|
||||
free(scratch_y);
|
||||
dec->mbs_appended = 0;
|
||||
return -2; /* not implemented */
|
||||
return rc;
|
||||
}
|
||||
|
||||
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
|
||||
|
||||
@@ -95,6 +95,67 @@ int main(void)
|
||||
|
||||
daedalus_decoder_destroy(dec);
|
||||
|
||||
/* ---- Full-frame round-trip with all-zero coefficients.
|
||||
* Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT
|
||||
* dispatch and a successful GPU round-trip returns 0. With all-zero
|
||||
* coefficients and a zero-initialised prediction plane, the expected
|
||||
* output is all-zero pixels. */
|
||||
dec = daedalus_decoder_create(1920, 1088);
|
||||
if (!dec) {
|
||||
fprintf(stderr, "SKIP roundtrip: ctx create failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int16_t zero_coeffs[384] = {0};
|
||||
struct daedalus_decoder_mb_input zmb = {0};
|
||||
zmb.coeffs = zero_coeffs;
|
||||
|
||||
int mb_width = 1920 / 16; /* 120 */
|
||||
int mb_height = 1088 / 16; /* 68 */
|
||||
int n_mbs = mb_width * mb_height;
|
||||
|
||||
for (int mby = 0; mby < mb_height; mby++) {
|
||||
for (int mbx = 0; mbx < mb_width; mbx++) {
|
||||
zmb.mb_x = (uint16_t) mbx;
|
||||
zmb.mb_y = (uint16_t) mby;
|
||||
if (daedalus_decoder_append_mb(dec, &zmb) != 0) {
|
||||
fprintf(stderr, "append (%d, %d) failed\n", mbx, mby);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height);
|
||||
|
||||
size_t y_size = (size_t) 1920 * 1088;
|
||||
size_t uv_size = (size_t) 1920 * 1088 / 2;
|
||||
uint8_t *out_y = malloc(y_size);
|
||||
uint8_t *out_uv = malloc(uv_size);
|
||||
/* Pre-fill with sentinels so any byte flush_frame fails to overwrite shows up in the checks below. */
|
||||
memset(out_y, 0xab, y_size);
|
||||
memset(out_uv, 0xcd, uv_size);
|
||||
|
||||
int frc = daedalus_decoder_flush_frame(dec, out_y, 1920, out_uv, 1920);
|
||||
printf("flush_frame rc=%d\n", frc);
|
||||
EXPECT(frc == 0, "flush succeeds on full frame");
|
||||
|
||||
/* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */
|
||||
int y_nz = 0;
|
||||
for (size_t i = 0; i < y_size; i++)
|
||||
if (out_y[i] != 0) y_nz++;
|
||||
printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
|
||||
EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");
|
||||
|
||||
/* UV plane should be neutral grey (128) per Phase 1 placeholder. */
|
||||
int uv_wrong = 0;
|
||||
for (size_t i = 0; i < uv_size; i++)
|
||||
if (out_uv[i] != 128) uv_wrong++;
|
||||
printf("UV non-128 bytes: %d / %zu\n", uv_wrong, uv_size);
|
||||
EXPECT(uv_wrong == 0, "UV plane is grey (128) Phase 1 placeholder");
|
||||
|
||||
free(out_y);
|
||||
free(out_uv);
|
||||
daedalus_decoder_destroy(dec);
|
||||
|
||||
printf("smoke OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user