From adaabb1f63de7c8a8934d73398d224d6584447e5 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sun, 24 May 2026 22:41:05 +0200 Subject: [PATCH] phase1: IDCT 8x8 dispatch (High profile transform_8x8_size_flag) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the High-profile 8x8 luma transform path alongside the existing 4x4 dispatch. flush_frame now partitions macroblocks by each MB's transform_8x8 flag and issues a separate luma dispatch per partition: - mb.transform_8x8 == 0 (Baseline/Main) → coeffs[0..256) interpreted as 16 4x4 blocks, fed to daedalus_recipe_dispatch_h264_idct4 (existing behaviour, unchanged). - mb.transform_8x8 == 1 (High) → coeffs[0..256) interpreted as 4 8x8 blocks (64 int16 each, column-major), fed to the new daedalus_recipe_dispatch_h264_idct8 call. Both luma partitions can be non-empty in the same frame (FFmpeg sets the flag per-MB). Each non-empty partition costs one vkQueueSubmit + vkQueueWaitIdle; empty partitions are skipped (common case: Baseline streams skip the 8x8 dispatch entirely). Chroma is unchanged — 4:2:0 chroma always uses the 4x4 transform. API surface: - New uint8_t `transform_8x8` field in `struct daedalus_decoder_mb_input` (after deblock_*). Backwards-compatible at the source level because the field defaults to 0 with C99 designated initialisers or {0} struct zeroing, both of which select the existing 4x4 path. ABI is pre-0.1 (per the header doc) so structural change is fine. - Mirrored in `struct daedalus_decoder_mb_desc` (internal layout). Test changes: - test_idct_bitexact now exercises a mixed-mode frame: every odd raster MB uses 8x8, every even uses 4x4 (so flush_frame's partitioning is also under test, not just the underlying shaders). - 8x8 C reference (h264_idct8_butterfly + ref_idct8_add) transcribed from daedalus-fourier tests/h264_idct8_ref.c per H.264 §8.5.13.2. Block layout column-major; +32 >> 6 rounding; add-to-predicted; clip255. 
- Reference luma compute branches per MB on the same mb_8x8[] array used to set the input flag. Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0): $ ./build/test_idct_bitexact test_idct_bitexact: 320x240 (300 MBs), seed=0xfeedface5a5a5a5a MB mix: 150 4x4 MBs, 150 8x8 MBs Y bytes total: 76800 Y bytes diff: 0 (0.0000%) Cb bytes total: 19200 diff: 0 (0.0000%) Cr bytes total: 19200 diff: 0 (0.0000%) BIT-EXACT PASS (Y + Cb + Cr) $ ctest --test-dir build 100% tests passed, 0 tests failed out of 2 Bit-exact PASS first try for the 8x8 path — 150 8x8 MBs × 4 blocks = 600 8x8 IDCTs against the spec C reference, identical output. Validates both the daedalus-fourier IDCT 8x8 shader (already gated by its own cycle-7 bit-exact test, now also gated end-to-end through our flush_frame), and our 8x8 layout assumptions (column-major coeffs, raster sb_y*2+sb_x block order, top-left = mb*16 + sb*8). What's NOT covered yet (deferred): - Z-scan permutation for FFmpeg compatibility (libavcodec intercept patch's concern; both 4x4 and 8x8 z-scans differ). - Chroma DC / luma Intra16x16 DC Hadamard pre-pass. - Mixed intra/inter MB handling — currently all MBs treated as residual-only (predicted=0). Closes the "IDCT 8x8 (High profile)" item from PR #3's deferred list. --- include/daedalus_decoder.h | 16 ++++- src/daedalus_decoder.c | 131 +++++++++++++++++++++++-------------- src/internal.h | 3 +- tests/test_idct_bitexact.c | 126 +++++++++++++++++++++++++++++------ 4 files changed, 203 insertions(+), 73 deletions(-) diff --git a/include/daedalus_decoder.h b/include/daedalus_decoder.h index f1a626c..925a953 100644 --- a/include/daedalus_decoder.h +++ b/include/daedalus_decoder.h @@ -75,9 +75,19 @@ struct daedalus_decoder_mb_input { int8_t deblock_alpha_c0; int8_t deblock_beta; - /* Transform coefficients — 256 luma (4x4 x 16) + 64 cb + 64 cr, - * column-major within each 4x4 block (matches FFmpeg convention). - * Caller-owned; copied during append. 
*/ + /* High-profile 8x8 transform selector. + * 0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks + * (16 coeffs each, raster sb_y*4+sb_x); the chroma section is + * always 4x4. + * 1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs + * each, raster sb_y*2+sb_x). Set per H.264's + * transform_8x8_size_flag. Chroma remains 4x4 (4:2:0). + */ + uint8_t transform_8x8; + + /* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all + * column-major within each 4x4 or 8x8 block (matches FFmpeg + * convention). Caller-owned; copied during append. */ const int16_t *coeffs; /* points at exactly 384 int16_t */ }; diff --git a/src/daedalus_decoder.c b/src/daedalus_decoder.c index d130f56..74934f0 100644 --- a/src/daedalus_decoder.c +++ b/src/daedalus_decoder.c @@ -119,6 +119,7 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec, d->deblock_disable = mb->deblock_disable; d->deblock_alpha_c0 = mb->deblock_alpha_c0; d->deblock_beta = mb->deblock_beta; + d->transform_8x8 = mb->transform_8x8; memcpy(&dec->coeffs[(size_t) expected * 384], mb->coeffs, @@ -179,74 +180,104 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec, int rc = 0; - /* ---- Build frame-scaled luma-4x4 dispatch ---- */ + /* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */ - const size_t n_luma_blocks_per_mb = 16; - const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb; - - /* Scratch Y plane — coded-size byte buffer. Zero-initialised so - * the IDCT-ADD-clip operation reduces to clip255(IDCT) per block - * (predicted=0 because no intra/MC has run yet). */ + /* Two partitions of the per-MB luma section based on each MB's + * transform_8x8 flag: + * + * transform_8x8 == 0 → 16 4x4 blocks contribute to the 4x4 + * dispatch (16 coeffs each). + * transform_8x8 == 1 → 4 8x8 blocks contribute to the 8x8 + * dispatch (64 coeffs each). 
+ * + * Both partitions can be non-empty in the same frame (FFmpeg sets + * transform_8x8_size_flag per MB), so we allocate worst-case for + * each and track actual counts. + */ const size_t y_stride_int = (size_t) dec->width; const size_t y_size = y_stride_int * (size_t) dec->height; uint8_t *scratch_y = calloc(1, y_size); - int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t)); - daedalus_h264_block_meta *meta = malloc( - n_luma_blocks * sizeof(daedalus_h264_block_meta)); - if (!scratch_y || !flat_coeffs || !meta) { + const size_t worst_4x4 = (size_t) dec->n_mbs * 16; + const size_t worst_8x8 = (size_t) dec->n_mbs * 4; + int16_t *coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t)); + int16_t *coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t)); + daedalus_h264_block_meta *meta4 = malloc(worst_4x4 * sizeof(*meta4)); + daedalus_h264_block_meta *meta8 = malloc(worst_8x8 * sizeof(*meta8)); + + if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) { rc = -1; goto cleanup; } - /* Raster-order layout: walk each MB, then each of its 16 luma 4×4 - * sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner). + /* Walk MBs in raster order, append each MB's luma blocks to the + * partition selected by its transform_8x8 flag. * - * NB: H.264's actual per-MB 4×4 coefficient scan order is the - * z-scan from spec §6.4.3 / fig 6-10. We're using a flat raster - * here because Phase 1 stage 1 only validates the dispatch - * round-trip; bit-exact against an FFmpeg reference requires the - * z-scan permutation and is a follow-on test. The per-MB - * coeffs[] field's first 256 entries are interpreted as 16 - * consecutive 4×4 blocks in the same raster order on the input - * side, so this is self-consistent for the validation. */ - size_t bi = 0; + * NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream + * follows the z-scan from spec §6.4.3 / fig 6-10. 
We're using + * flat raster on the input side too (sb_y outer, sb_x inner) for + * Phase 1 self-consistency; the z-scan permutation is the + * libavcodec-intercept patch's responsibility. + */ + size_t bi4 = 0, bi8 = 0; for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) { for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) { int mb_idx = mb_y * dec->mb_width + mb_x; + const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx]; const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384]; - for (int sb_y = 0; sb_y < 4; sb_y++) { - for (int sb_x = 0; sb_x < 4; sb_x++) { - /* Block top-left pixel in the coded Y plane. */ - size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4; - size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4; - meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x); - - /* Copy 16 coeffs for this block from the per-MB - * coeffs[] (luma offset = block_idx * 16). */ - int block_in_mb = sb_y * 4 + sb_x; - memcpy(&flat_coeffs[bi * 16], - &mb_coeffs[block_in_mb * 16], - 16 * sizeof(int16_t)); - bi++; + if (d->transform_8x8) { + /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */ + for (int sb_y = 0; sb_y < 2; sb_y++) { + for (int sb_x = 0; sb_x < 2; sb_x++) { + size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8; + size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8; + meta8[bi8].dst_off = (uint32_t) + (px_y * y_stride_int + px_x); + int block_in_mb = sb_y * 2 + sb_x; + memcpy(&coeffs8[bi8 * 64], + &mb_coeffs[block_in_mb * 64], + 64 * sizeof(int16_t)); + bi8++; + } + } + } else { + /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. 
*/ + for (int sb_y = 0; sb_y < 4; sb_y++) { + for (int sb_x = 0; sb_x < 4; sb_x++) { + size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4; + size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4; + meta4[bi4].dst_off = (uint32_t) + (px_y * y_stride_int + px_x); + int block_in_mb = sb_y * 4 + sb_x; + memcpy(&coeffs4[bi4 * 16], + &mb_coeffs[block_in_mb * 16], + 16 * sizeof(int16_t)); + bi4++; + } } } } } - /* assert bi == n_luma_blocks; the loop math guarantees it */ + /* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */ - /* ---- One Vulkan submit + wait for the whole frame's luma IDCT. + /* ---- One Vulkan submit + wait per non-empty luma partition. * AUTO substrate picks QPU per the post-decree recipe table; falls - * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */ - int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx, - scratch_y, y_stride_int, - flat_coeffs, - n_luma_blocks, - meta); - if (dr != 0) { - rc = -3; /* GPU dispatch failure */ - goto cleanup; + * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. + * Skipping the dispatch when the partition is empty avoids the + * shader-pool warm-up cost on the common case (a typical Baseline + * stream is all-4x4 → 8x8 dispatch is no-op). */ + if (bi4 > 0) { + int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx, + scratch_y, y_stride_int, + coeffs4, bi4, meta4); + if (dr != 0) { rc = -3; goto cleanup; } + } + if (bi8 > 0) { + int dr = daedalus_recipe_dispatch_h264_idct8(dec->dctx, + scratch_y, y_stride_int, + coeffs8, bi8, meta8); + if (dr != 0) { rc = -3; goto cleanup; } } /* ---- Copy Y out to caller's plane at the requested stride. 
---- */ @@ -362,8 +393,10 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec, } cleanup: - free(meta); - free(flat_coeffs); + free(meta8); + free(meta4); + free(coeffs8); + free(coeffs4); free(scratch_y); dec->mbs_appended = 0; return rc; diff --git a/src/internal.h b/src/internal.h index bb006e4..637f1e0 100644 --- a/src/internal.h +++ b/src/internal.h @@ -41,7 +41,8 @@ struct daedalus_decoder_mb_desc { uint8_t deblock_disable; int8_t deblock_alpha_c0; int8_t deblock_beta; - uint8_t _pad1; + uint8_t transform_8x8; /* 0 = 16 luma blocks of 4x4, + * 1 = 4 luma blocks of 8x8. */ }; struct daedalus_decoder { diff --git a/tests/test_idct_bitexact.c b/tests/test_idct_bitexact.c index 5b76226..8b5a92f 100644 --- a/tests/test_idct_bitexact.c +++ b/tests/test_idct_bitexact.c @@ -19,13 +19,15 @@ * layout is a separate concern (handled in the eventual libavcodec- * intercept patch). * - * Covers BOTH luma (Y plane, 16 blocks/MB) and chroma (UV plane, - * 4 Cb + 4 Cr blocks/MB, NV12-interleaved). Random coeffs for all - * three components; reference IDCT applied per block. The chroma + * Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved). + * Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use + * transform_8x8=0 (16 luma 4x4 blocks); both partitions are + * exercised in the same frame so the flush_frame partitioning logic + * is also under test, not just the underlying shaders. Random coeffs + * for all components; reference IDCT applied per block. The chroma * compare deinterleaves NV12 UV back into separate Cb/Cr expectations. * * Not in scope (covered by other tests / future PRs): - * - IDCT 8×8 (Phase 1 follow-on) * - Chroma DC / Intra16x16 DC Hadamard pre-pass * - bit-exactness against real H.264 streams (test-vector PR) * - non-zero predicted pixels (intra prediction lands in Stage 2a) @@ -66,6 +68,65 @@ static void h264_idct4_butterfly(const int d[4], int out[4]) out[3] = e - h; } +/* 1D 8-point butterfly per H.264 §8.5.13.2. 
Transcribed from + * daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+ in the original — + * algorithm reproduced here for test purposes, no copy of code). */ +static void h264_idct8_butterfly(const int d[8], int g[8]) +{ + int e[8], f[8]; + e[0] = d[0] + d[4]; + e[1] = -d[3] + d[5] - d[7] - (d[7] >> 1); + e[2] = d[0] - d[4]; + e[3] = d[1] + d[7] - d[3] - (d[3] >> 1); + e[4] = (d[2] >> 1) - d[6]; + e[5] = -d[1] + d[7] + d[5] + (d[5] >> 1); + e[6] = d[2] + (d[6] >> 1); + e[7] = d[3] + d[5] + d[1] + (d[1] >> 1); + + f[0] = e[0] + e[6]; + f[1] = e[1] + (e[7] >> 2); + f[2] = e[2] + e[4]; + f[3] = e[3] + (e[5] >> 2); + f[4] = e[2] - e[4]; + f[5] = (e[3] >> 2) - e[5]; + f[6] = e[0] - e[6]; + f[7] = e[7] - (e[1] >> 2); + + g[0] = f[0] + f[7]; + g[1] = f[2] + f[5]; + g[2] = f[4] + f[3]; + g[3] = f[6] + f[1]; + g[4] = f[6] - f[1]; + g[5] = f[4] - f[3]; + g[6] = f[2] - f[5]; + g[7] = f[0] - f[7]; +} + +static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block) +{ + /* block layout COLUMN-MAJOR: block[c*8 + r] = coef at (row=r, col=c). 
*/ + int tmp[8][8]; + for (int r = 0; r < 8; r++) { + int d[8]; + for (int c = 0; c < 8; c++) d[c] = block[c * 8 + r]; + int g[8]; + h264_idct8_butterfly(d, g); + for (int c = 0; c < 8; c++) tmp[r][c] = g[c]; + } + int col_out[8][8]; + for (int c = 0; c < 8; c++) { + int d[8]; + for (int r = 0; r < 8; r++) d[r] = tmp[r][c]; + int g[8]; + h264_idct8_butterfly(d, g); + for (int r = 0; r < 8; r++) col_out[r][c] = g[r]; + } + for (int r = 0; r < 8; r++) + for (int c = 0; c < 8; c++) + dst[r * stride + c] = (uint8_t) clip_u8( + dst[r * stride + c] + ((col_out[r][c] + 32) >> 6)); +} + static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block) { /* block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier): @@ -131,20 +192,31 @@ int main(int argc, char **argv) } } + /* Per-MB transform mode (deterministic split: every odd raster MB + * is 8x8, every even is 4x4 — exercises BOTH partitions in the + * same frame so the flush_frame partitioning logic is under test). */ + uint8_t *mb_8x8 = malloc((size_t) n_mbs); + if (!mb_8x8) { fprintf(stderr, "alloc fail\n"); return 1; } + for (int i = 0; i < n_mbs; i++) mb_8x8[i] = (i & 1) ? 1 : 0; + /* Append in raster order. 
*/ struct daedalus_decoder_mb_input mb = {0}; + int n_8x8_mbs = 0, n_4x4_mbs = 0; for (int my = 0; my < mb_h; my++) { for (int mx = 0; mx < mb_w; mx++) { int idx = my * mb_w + mx; mb.mb_x = (uint16_t) mx; mb.mb_y = (uint16_t) my; mb.coeffs = per_mb_coeffs[idx]; + mb.transform_8x8 = mb_8x8[idx]; + if (mb_8x8[idx]) n_8x8_mbs++; else n_4x4_mbs++; if (daedalus_decoder_append_mb(dec, &mb) != 0) { fprintf(stderr, "append (%d,%d) failed\n", mx, my); return 1; } } } + printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs); /* Flush — exercise BOTH the luma path (out_y) and the chroma path * (out_uv set to non-NULL so flush_frame runs the chroma dispatch @@ -162,27 +234,40 @@ int main(int argc, char **argv) } /* Compute the reference output: same per-MB → flat raster block - * layout as flush_frame uses. */ + * layout as flush_frame uses. Branch per MB on transform_8x8. */ uint8_t *ref_y = calloc(1, y_size); if (!ref_y) return 1; - /* Need a destructively-mutable copy because the reference IDCT - * doesn't actually mutate, but the GPU's IDCT shader does zero - * the coeffs. Our reference doesn't zero; that's fine because we - * use a fresh copy per block. */ - int16_t block_scratch[16]; + int16_t block_scratch[64]; /* large enough for 8x8 */ for (int my = 0; my < mb_h; my++) { for (int mx = 0; mx < mb_w; mx++) { int mb_idx = my * mb_w + mx; - for (int sb_y = 0; sb_y < 4; sb_y++) { - for (int sb_x = 0; sb_x < 4; sb_x++) { - int block_in_mb = sb_y * 4 + sb_x; - memcpy(block_scratch, - &per_mb_coeffs[mb_idx][block_in_mb * 16], - 16 * sizeof(int16_t)); - size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4; - size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4; - ref_idct4_add(&ref_y[px_y * (size_t) width + px_x], - width, block_scratch); + if (mb_8x8[mb_idx]) { + /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. 
*/ + for (int sb_y = 0; sb_y < 2; sb_y++) { + for (int sb_x = 0; sb_x < 2; sb_x++) { + int block_in_mb = sb_y * 2 + sb_x; + memcpy(block_scratch, + &per_mb_coeffs[mb_idx][block_in_mb * 64], + 64 * sizeof(int16_t)); + size_t px_y = (size_t) my * 16 + (size_t) sb_y * 8; + size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 8; + ref_idct8_add(&ref_y[px_y * (size_t) width + px_x], + width, block_scratch); + } + } + } else { + /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */ + for (int sb_y = 0; sb_y < 4; sb_y++) { + for (int sb_x = 0; sb_x < 4; sb_x++) { + int block_in_mb = sb_y * 4 + sb_x; + memcpy(block_scratch, + &per_mb_coeffs[mb_idx][block_in_mb * 16], + 16 * sizeof(int16_t)); + size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4; + size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4; + ref_idct4_add(&ref_y[px_y * (size_t) width + px_x], + width, block_scratch); + } } } } @@ -278,6 +363,7 @@ int main(int argc, char **argv) free(ref_y); free(gpu_uv); free(gpu_y); + free(mb_8x8); free(per_mb_coeffs); daedalus_decoder_destroy(dec); -- 2.47.3