Merge pull request 'phase1: IDCT 8x8 dispatch (High profile transform_8x8_size_flag)' (#6) from noether/phase1-idct8 into main
Reviewed-on: #6
This commit was merged in pull request #6.
This commit is contained in:
@@ -75,9 +75,19 @@ struct daedalus_decoder_mb_input {
|
||||
int8_t deblock_alpha_c0;
|
||||
int8_t deblock_beta;
|
||||
|
||||
/* Transform coefficients — 256 luma (4x4 x 16) + 64 cb + 64 cr,
|
||||
* column-major within each 4x4 block (matches FFmpeg convention).
|
||||
* Caller-owned; copied during append. */
|
||||
/* High-profile 8x8 transform selector.
|
||||
* 0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks
|
||||
* (16 coeffs each, raster sb_y*4+sb_x); the chroma section is
|
||||
* always 4x4.
|
||||
* 1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs
|
||||
* each, raster sb_y*2+sb_x). Set per H.264's
|
||||
* transform_size_8x8_flag. Chroma remains 4x4 (4:2:0).
|
||||
*/
|
||||
uint8_t transform_8x8;
|
||||
|
||||
/* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all
|
||||
* column-major within each 4x4 or 8x8 block (matches FFmpeg
|
||||
* convention). Caller-owned; copied during append. */
|
||||
const int16_t *coeffs; /* points at exactly 384 int16_t */
|
||||
};
|
||||
|
||||
|
||||
+82
-49
@@ -119,6 +119,7 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec,
|
||||
d->deblock_disable = mb->deblock_disable;
|
||||
d->deblock_alpha_c0 = mb->deblock_alpha_c0;
|
||||
d->deblock_beta = mb->deblock_beta;
|
||||
d->transform_8x8 = mb->transform_8x8;
|
||||
|
||||
memcpy(&dec->coeffs[(size_t) expected * 384],
|
||||
mb->coeffs,
|
||||
@@ -179,74 +180,104 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
||||
|
||||
int rc = 0;
|
||||
|
||||
/* ---- Build frame-scaled luma-4x4 dispatch ---- */
|
||||
/* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */
|
||||
|
||||
const size_t n_luma_blocks_per_mb = 16;
|
||||
const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb;
|
||||
|
||||
/* Scratch Y plane — coded-size byte buffer. Zero-initialised so
|
||||
* the IDCT-ADD-clip operation reduces to clip255(IDCT) per block
|
||||
* (predicted=0 because no intra/MC has run yet). */
|
||||
/* Two partitions of the per-MB luma section based on each MB's
|
||||
* transform_8x8 flag:
|
||||
*
|
||||
* transform_8x8 == 0 → 16 4x4 blocks contribute to the 4x4
|
||||
* dispatch (16 coeffs each).
|
||||
* transform_8x8 == 1 → 4 8x8 blocks contribute to the 8x8
|
||||
* dispatch (64 coeffs each).
|
||||
*
|
||||
* Both partitions can be non-empty in the same frame (FFmpeg sets
|
||||
* transform_size_8x8_flag per MB), so we allocate worst-case for
|
||||
* each and track actual counts.
|
||||
*/
|
||||
const size_t y_stride_int = (size_t) dec->width;
|
||||
const size_t y_size = y_stride_int * (size_t) dec->height;
|
||||
uint8_t *scratch_y = calloc(1, y_size);
|
||||
int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t));
|
||||
daedalus_h264_block_meta *meta = malloc(
|
||||
n_luma_blocks * sizeof(daedalus_h264_block_meta));
|
||||
|
||||
if (!scratch_y || !flat_coeffs || !meta) {
|
||||
const size_t worst_4x4 = (size_t) dec->n_mbs * 16;
|
||||
const size_t worst_8x8 = (size_t) dec->n_mbs * 4;
|
||||
int16_t *coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t));
|
||||
int16_t *coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t));
|
||||
daedalus_h264_block_meta *meta4 = malloc(worst_4x4 * sizeof(*meta4));
|
||||
daedalus_h264_block_meta *meta8 = malloc(worst_8x8 * sizeof(*meta8));
|
||||
|
||||
if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
|
||||
rc = -1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Raster-order layout: walk each MB, then each of its 16 luma 4×4
|
||||
* sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner).
|
||||
/* Walk MBs in raster order, append each MB's luma blocks to the
|
||||
* partition selected by its transform_8x8 flag.
|
||||
*
|
||||
* NB: H.264's actual per-MB 4×4 coefficient scan order is the
|
||||
* z-scan from spec §6.4.3 / fig 6-10. We're using a flat raster
|
||||
* here because Phase 1 stage 1 only validates the dispatch
|
||||
* round-trip; bit-exact against an FFmpeg reference requires the
|
||||
* z-scan permutation and is a follow-on test. The per-MB
|
||||
* coeffs[] field's first 256 entries are interpreted as 16
|
||||
* consecutive 4×4 blocks in the same raster order on the input
|
||||
* side, so this is self-consistent for the validation. */
|
||||
size_t bi = 0;
|
||||
* NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
|
||||
* follows the z-scan from spec §6.4.3 / fig 6-10. We're using
|
||||
* flat raster on the input side too (sb_y outer, sb_x inner) for
|
||||
* Phase 1 self-consistency; the z-scan permutation is the
|
||||
* libavcodec-intercept patch's responsibility.
|
||||
*/
|
||||
size_t bi4 = 0, bi8 = 0;
|
||||
for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
|
||||
for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
|
||||
int mb_idx = mb_y * dec->mb_width + mb_x;
|
||||
const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
|
||||
const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
|
||||
|
||||
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||||
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||||
/* Block top-left pixel in the coded Y plane. */
|
||||
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
|
||||
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
|
||||
meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x);
|
||||
|
||||
/* Copy 16 coeffs for this block from the per-MB
|
||||
* coeffs[] (luma offset = block_idx * 16). */
|
||||
int block_in_mb = sb_y * 4 + sb_x;
|
||||
memcpy(&flat_coeffs[bi * 16],
|
||||
&mb_coeffs[block_in_mb * 16],
|
||||
16 * sizeof(int16_t));
|
||||
bi++;
|
||||
if (d->transform_8x8) {
|
||||
/* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
|
||||
for (int sb_y = 0; sb_y < 2; sb_y++) {
|
||||
for (int sb_x = 0; sb_x < 2; sb_x++) {
|
||||
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
|
||||
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
|
||||
meta8[bi8].dst_off = (uint32_t)
|
||||
(px_y * y_stride_int + px_x);
|
||||
int block_in_mb = sb_y * 2 + sb_x;
|
||||
memcpy(&coeffs8[bi8 * 64],
|
||||
&mb_coeffs[block_in_mb * 64],
|
||||
64 * sizeof(int16_t));
|
||||
bi8++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
|
||||
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||||
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||||
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
|
||||
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
|
||||
meta4[bi4].dst_off = (uint32_t)
|
||||
(px_y * y_stride_int + px_x);
|
||||
int block_in_mb = sb_y * 4 + sb_x;
|
||||
memcpy(&coeffs4[bi4 * 16],
|
||||
&mb_coeffs[block_in_mb * 16],
|
||||
16 * sizeof(int16_t));
|
||||
bi4++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* assert bi == n_luma_blocks; the loop math guarantees it */
|
||||
/* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */
|
||||
|
||||
/* ---- One Vulkan submit + wait for the whole frame's luma IDCT.
|
||||
/* ---- One Vulkan submit + wait per non-empty luma partition.
|
||||
* AUTO substrate picks QPU per the post-decree recipe table; falls
|
||||
* back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */
|
||||
int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
|
||||
scratch_y, y_stride_int,
|
||||
flat_coeffs,
|
||||
n_luma_blocks,
|
||||
meta);
|
||||
if (dr != 0) {
|
||||
rc = -3; /* GPU dispatch failure */
|
||||
goto cleanup;
|
||||
* back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable.
|
||||
* Skipping the dispatch when the partition is empty avoids the
|
||||
* shader-pool warm-up cost on the common case (a typical Baseline
|
||||
* stream is all-4x4 → 8x8 dispatch is no-op). */
|
||||
if (bi4 > 0) {
|
||||
int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
|
||||
scratch_y, y_stride_int,
|
||||
coeffs4, bi4, meta4);
|
||||
if (dr != 0) { rc = -3; goto cleanup; }
|
||||
}
|
||||
if (bi8 > 0) {
|
||||
int dr = daedalus_recipe_dispatch_h264_idct8(dec->dctx,
|
||||
scratch_y, y_stride_int,
|
||||
coeffs8, bi8, meta8);
|
||||
if (dr != 0) { rc = -3; goto cleanup; }
|
||||
}
|
||||
|
||||
/* ---- Copy Y out to caller's plane at the requested stride. ---- */
|
||||
@@ -362,8 +393,10 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
||||
}
|
||||
|
||||
cleanup:
|
||||
free(meta);
|
||||
free(flat_coeffs);
|
||||
free(meta8);
|
||||
free(meta4);
|
||||
free(coeffs8);
|
||||
free(coeffs4);
|
||||
free(scratch_y);
|
||||
dec->mbs_appended = 0;
|
||||
return rc;
|
||||
|
||||
+2
-1
@@ -41,7 +41,8 @@ struct daedalus_decoder_mb_desc {
|
||||
uint8_t deblock_disable;
|
||||
int8_t deblock_alpha_c0;
|
||||
int8_t deblock_beta;
|
||||
uint8_t _pad1;
|
||||
uint8_t transform_8x8; /* 0 = 16 luma blocks of 4x4,
|
||||
* 1 = 4 luma blocks of 8x8. */
|
||||
};
|
||||
|
||||
struct daedalus_decoder {
|
||||
|
||||
+106
-20
@@ -19,13 +19,15 @@
|
||||
* layout is a separate concern (handled in the eventual libavcodec-
|
||||
* intercept patch).
|
||||
*
|
||||
* Covers BOTH luma (Y plane, 16 blocks/MB) and chroma (UV plane,
|
||||
* 4 Cb + 4 Cr blocks/MB, NV12-interleaved). Random coeffs for all
|
||||
* three components; reference IDCT applied per block. The chroma
|
||||
* Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved).
|
||||
* Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use
|
||||
* transform_8x8=0 (16 luma 4x4 blocks); both partitions are
|
||||
* exercised in the same frame so the flush_frame partitioning logic
|
||||
* is also under test, not just the underlying shaders. Random coeffs
|
||||
* for all components; reference IDCT applied per block. The chroma
|
||||
* compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
|
||||
*
|
||||
* Not in scope (covered by other tests / future PRs):
|
||||
* - IDCT 8×8 (Phase 1 follow-on)
|
||||
* - Chroma DC / Intra16x16 DC Hadamard pre-pass
|
||||
* - bit-exactness against real H.264 streams (test-vector PR)
|
||||
* - non-zero predicted pixels (intra prediction lands in Stage 2a)
|
||||
@@ -66,6 +68,65 @@ static void h264_idct4_butterfly(const int d[4], int out[4])
|
||||
out[3] = e - h;
|
||||
}
|
||||
|
||||
/* 1D 8-point butterfly per H.264 §8.5.13.2. Transcribed from
|
||||
* daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+ in the original —
|
||||
* algorithm reproduced here for test purposes, no copy of code). */
|
||||
static void h264_idct8_butterfly(const int d[8], int g[8])
{
    /* One 1-D pass of the 8-point inverse-transform butterfly.
     *
     * d: eight input samples (read only).
     * g: eight output samples (written only after every input has been
     *    consumed).
     *
     * Stage 1 — even-indexed inputs (0, 2, 4, 6). */
    const int a0 = d[0] + d[4];
    const int a2 = d[0] - d[4];
    const int a4 = (d[2] >> 1) - d[6];
    const int a6 = d[2] + (d[6] >> 1);

    /* Stage 1 — odd-indexed inputs (1, 3, 5, 7). */
    const int a1 = -d[3] + d[5] - d[7] - (d[7] >> 1);
    const int a3 = d[1] + d[7] - d[3] - (d[3] >> 1);
    const int a5 = -d[1] + d[7] + d[5] + (d[5] >> 1);
    const int a7 = d[3] + d[5] + d[1] + (d[1] >> 1);

    /* Stage 2 — the even terms are ordered so that entry k feeds
     * outputs k and 7-k; the odd terms are listed in the matching
     * (reversed) order. */
    const int even_term[4] = {
        a0 + a6,
        a2 + a4,
        a2 - a4,
        a0 - a6,
    };
    const int odd_term[4] = {
        a7 - (a1 >> 2),
        (a3 >> 2) - a5,
        a3 + (a5 >> 2),
        a1 + (a7 >> 2),
    };

    /* Stage 3 — mirrored sum/difference pairs. */
    for (int k = 0; k < 4; k++) {
        g[k] = even_term[k] + odd_term[k];
        g[7 - k] = even_term[k] - odd_term[k];
    }
}
|
||||
|
||||
static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
    /* Reference 8x8 inverse transform, added onto the pixels at dst.
     *
     * dst:    top-left pixel of the 8x8 destination area.
     * stride: distance between consecutive destination rows, in bytes.
     * block:  64 coefficients, COLUMN-MAJOR — block[c*8 + r] is the
     *         coefficient at (row=r, col=c).
     */
    int after_rows[8][8];
    int line_in[8];
    int line_out[8];

    /* First pass: 1-D transform along each row of coefficients. */
    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 8; col++)
            line_in[col] = block[col * 8 + row];
        h264_idct8_butterfly(line_in, after_rows[row]);
    }

    /* Second pass: 1-D transform down each column, then round with
     * (x + 32) >> 6, add to the existing pixel and clip via clip_u8. */
    for (int col = 0; col < 8; col++) {
        for (int row = 0; row < 8; row++)
            line_in[row] = after_rows[row][col];
        h264_idct8_butterfly(line_in, line_out);

        for (int row = 0; row < 8; row++) {
            uint8_t *px = &dst[row * stride + col];
            *px = (uint8_t) clip_u8(*px + ((line_out[row] + 32) >> 6));
        }
    }
}
|
||||
|
||||
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
|
||||
{
|
||||
/* block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
|
||||
@@ -131,20 +192,31 @@ int main(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
/* Per-MB transform mode (deterministic split: every odd raster MB
|
||||
* is 8x8, every even is 4x4 — exercises BOTH partitions in the
|
||||
* same frame so the flush_frame partitioning logic is under test). */
|
||||
uint8_t *mb_8x8 = malloc((size_t) n_mbs);
|
||||
if (!mb_8x8) { fprintf(stderr, "alloc fail\n"); return 1; }
|
||||
for (int i = 0; i < n_mbs; i++) mb_8x8[i] = (i & 1) ? 1 : 0;
|
||||
|
||||
/* Append in raster order. */
|
||||
struct daedalus_decoder_mb_input mb = {0};
|
||||
int n_8x8_mbs = 0, n_4x4_mbs = 0;
|
||||
for (int my = 0; my < mb_h; my++) {
|
||||
for (int mx = 0; mx < mb_w; mx++) {
|
||||
int idx = my * mb_w + mx;
|
||||
mb.mb_x = (uint16_t) mx;
|
||||
mb.mb_y = (uint16_t) my;
|
||||
mb.coeffs = per_mb_coeffs[idx];
|
||||
mb.transform_8x8 = mb_8x8[idx];
|
||||
if (mb_8x8[idx]) n_8x8_mbs++; else n_4x4_mbs++;
|
||||
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
|
||||
fprintf(stderr, "append (%d,%d) failed\n", mx, my);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs);
|
||||
|
||||
/* Flush — exercise BOTH the luma path (out_y) and the chroma path
|
||||
* (out_uv set to non-NULL so flush_frame runs the chroma dispatch
|
||||
@@ -162,27 +234,40 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
/* Compute the reference output: same per-MB → flat raster block
|
||||
* layout as flush_frame uses. */
|
||||
* layout as flush_frame uses. Branch per MB on transform_8x8. */
|
||||
uint8_t *ref_y = calloc(1, y_size);
|
||||
if (!ref_y) return 1;
|
||||
/* Need a destructively-mutable copy because the reference IDCT
|
||||
* doesn't actually mutate, but the GPU's IDCT shader does zero
|
||||
* the coeffs. Our reference doesn't zero; that's fine because we
|
||||
* use a fresh copy per block. */
|
||||
int16_t block_scratch[16];
|
||||
int16_t block_scratch[64]; /* large enough for 8x8 */
|
||||
for (int my = 0; my < mb_h; my++) {
|
||||
for (int mx = 0; mx < mb_w; mx++) {
|
||||
int mb_idx = my * mb_w + mx;
|
||||
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||||
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||||
int block_in_mb = sb_y * 4 + sb_x;
|
||||
memcpy(block_scratch,
|
||||
&per_mb_coeffs[mb_idx][block_in_mb * 16],
|
||||
16 * sizeof(int16_t));
|
||||
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
|
||||
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
|
||||
ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
|
||||
width, block_scratch);
|
||||
if (mb_8x8[mb_idx]) {
|
||||
/* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
|
||||
for (int sb_y = 0; sb_y < 2; sb_y++) {
|
||||
for (int sb_x = 0; sb_x < 2; sb_x++) {
|
||||
int block_in_mb = sb_y * 2 + sb_x;
|
||||
memcpy(block_scratch,
|
||||
&per_mb_coeffs[mb_idx][block_in_mb * 64],
|
||||
64 * sizeof(int16_t));
|
||||
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 8;
|
||||
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 8;
|
||||
ref_idct8_add(&ref_y[px_y * (size_t) width + px_x],
|
||||
width, block_scratch);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
|
||||
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||||
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||||
int block_in_mb = sb_y * 4 + sb_x;
|
||||
memcpy(block_scratch,
|
||||
&per_mb_coeffs[mb_idx][block_in_mb * 16],
|
||||
16 * sizeof(int16_t));
|
||||
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
|
||||
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
|
||||
ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
|
||||
width, block_scratch);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -278,6 +363,7 @@ int main(int argc, char **argv)
|
||||
free(ref_y);
|
||||
free(gpu_uv);
|
||||
free(gpu_y);
|
||||
free(mb_8x8);
|
||||
free(per_mb_coeffs);
|
||||
daedalus_decoder_destroy(dec);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user