h264: chroma DC 2x2 Hadamard pre-pass primitive #23
+9
-2
@@ -564,14 +564,21 @@ add_executable(test_intra_pred_chroma8x8
|
||||
target_compile_options(test_intra_pred_chroma8x8 PRIVATE -O2)
|
||||
|
||||
# H.264 Intra_8x8 luma prediction (High profile, 9 modes + 1-2-1
|
||||
# reference-sample pre-filter). This PR ships the pre-filter + the
|
||||
# 3 simple modes (V, H, DC); the 6 directional modes follow.
|
||||
# reference-sample pre-filter).
|
||||
add_executable(test_intra_pred_8x8_luma
|
||||
tests/test_intra_pred_8x8_luma.c
|
||||
tests/h264_intra_pred_8x8_luma_ref.c
|
||||
)
|
||||
target_compile_options(test_intra_pred_8x8_luma PRIVATE -O2)
|
||||
|
||||
# H.264 chroma DC 2x2 Hadamard pre-pass primitive. Pure transform,
|
||||
# no QP-dependent scaling (that's caller-side composition).
|
||||
add_executable(test_chroma_dc_hadamard
|
||||
tests/test_chroma_dc_hadamard.c
|
||||
tests/h264_chroma_dc_hadamard_ref.c
|
||||
)
|
||||
target_compile_options(test_chroma_dc_hadamard PRIVATE -O2)
|
||||
|
||||
add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
|
||||
target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
|
||||
target_compile_options(bench_pool_overhead PRIVATE -O2)
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for the H.264 chroma DC 2x2
|
||||
* Hadamard transform (per H.264 §8.5.11.1).
|
||||
*
|
||||
* In 4:2:0 chroma, the four DC coefficients (one from each chroma
|
||||
* 4x4 block of one chroma plane, Cb or Cr, within an MB) are arranged into a 2x2 block:
|
||||
*
|
||||
* c[0,0] c[0,1] block (0,0) DC block (0,1) DC
|
||||
* c[1,0] c[1,1] block (1,0) DC block (1,1) DC
|
||||
*
|
||||
* The 2x2 Hadamard transform:
|
||||
*
|
||||
* f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]
|
||||
* f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]
|
||||
* f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]
|
||||
* f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]
|
||||
*
|
||||
* Equivalently expressed as 2-stage butterflies (row then col), which
|
||||
* the NEON impl uses for SIMD friendliness — we present that form
|
||||
* here too so the QPU/NEON ports are 1:1.
|
||||
*
|
||||
* Output f[] replaces the input c[]. The QP-dependent scaling per
|
||||
* §8.5.11.2 happens AFTER this primitive — the intercept patch
|
||||
* composes Hadamard + LevelScale + shift itself, since the scaling
|
||||
* shape depends on QP and on whether we're in the chroma_qp_offset
|
||||
* adjustment regime.
|
||||
*
|
||||
* Input/output layout:
|
||||
* c[0..3] in row-major order: [c[0,0], c[0,1], c[1,0], c[1,1]]
|
||||
*
|
||||
* License: BSD-2-Clause. Algorithm is in the H.264 spec.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
|
||||
void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4])
{
    /*
     * In-place 2x2 Hadamard on four row-major coefficients
     * [c00, c01, c10, c11]. No scaling is applied here.
     *
     * Load everything first: the outputs overwrite c[], and every
     * output depends on all four inputs. Arithmetic is done in int,
     * then narrowed back to int16_t on store.
     */
    const int c00 = c[0];
    const int c01 = c[1];
    const int c10 = c[2];
    const int c11 = c[3];

    /* Horizontal pass: sum and difference inside each row. */
    const int top_sum  = c00 + c01;
    const int top_diff = c00 - c01;
    const int bot_sum  = c10 + c11;
    const int bot_diff = c10 - c11;

    /* Vertical pass: combine the two rows' partial results. */
    c[0] = (int16_t)(top_sum  + bot_sum);   /* f[0,0]: sum of all four     */
    c[1] = (int16_t)(top_diff + bot_diff);  /* f[0,1]: c00 - c01 + c10 - c11 */
    c[2] = (int16_t)(top_sum  - bot_sum);   /* f[1,0]: c00 + c01 - c10 - c11 */
    c[3] = (int16_t)(top_diff - bot_diff);  /* f[1,1]: c00 - c01 - c10 + c11 */
}
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Tests the H.264 chroma DC 2x2 Hadamard primitive against
|
||||
* spec-derived expected outputs.
|
||||
*
|
||||
* f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1] "sum"
|
||||
* f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1] "col-diff"
|
||||
* f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1] "row-diff"
|
||||
* f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1] "anti-diag"
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4]);
|
||||
|
||||
/*
 * Run the reference Hadamard on a private copy of `in` and compare the
 * result element by element against `expect`.
 *
 *   name    label used in the PASS/FAIL line and in mismatch diagnostics
 *   in      four input coefficients, row-major; not modified
 *   expect  four expected outputs, row-major; not modified
 *
 * Each mismatching element is reported on stderr; a one-line PASS/FAIL
 * summary is printed on stdout.
 *
 * Returns 0 when all four outputs match, 1 otherwise.
 */
static int check(const char *name, const int16_t in[4], const int16_t expect[4])
{
    int16_t c[4];
    int fail = 0;

    /* The primitive overwrites its argument, so transform a copy. */
    memcpy(c, in, sizeof(c));
    daedalus_h264_chroma_dc_hadamard_2x2_ref(c);

    for (int i = 0; i < 4; i++) {
        if (c[i] != expect[i]) {
            fprintf(stderr, "%s: c[%d] = %d, expected %d\n",
                    name, i, c[i], expect[i]);
            fail = 1;
        }
    }

    if (!fail) {
        printf(" %-32s PASS\n", name);
    } else {
        printf(" %-32s FAIL\n", name);
    }
    return fail;
}
|
||||
|
||||
/*
 * Test driver: runs six fixed-vector cases through check(), then one
 * double-application case, and prints a summary.
 *
 * Returns 0 when every case passes, 1 otherwise.
 */
int main(void)
{
    /*
     * Fixed-vector cases. Layout of in/expect is row-major:
     * [c[0,0], c[0,1], c[1,0], c[1,1]] -> [f[0,0], f[0,1], f[1,0], f[1,1]].
     * The arrays are deliberately non-const so they can be handed to
     * check() regardless of how its array parameters are qualified.
     */
    struct {
        const char *name;
        int16_t in[4];
        int16_t expect[4];
    } cases[] = {
        /* Test 1: all-same input.
         *   f[0,0] = 20, f[0,1] = 0, f[1,0] = 0, f[1,1] = 0
         */
        { "all-uniform 5", { 5, 5, 5, 5 }, { 20, 0, 0, 0 } },

        /* Test 2: single-axis variation (col 0 = 0, col 1 = 10).
         *   f[0,0] = 0+10+0+10 =  20
         *   f[0,1] = 0-10+0-10 = -20
         *   f[1,0] = 0+10-0-10 =   0
         *   f[1,1] = 0-10-0+10 =   0
         */
        { "col gradient [0,10,0,10]", { 0, 10, 0, 10 }, { 20, -20, 0, 0 } },

        /* Test 3: row gradient (row 0 = 0, row 1 = 10).
         *   f[0,0] = 20, f[0,1] = 0, f[1,0] = 0-20 = -20, f[1,1] = 0
         */
        { "row gradient [0,0,10,10]", { 0, 0, 10, 10 }, { 20, 0, -20, 0 } },

        /* Test 4: anti-diagonal pattern.
         *   f[0,0] = 20
         *   f[0,1] = 10-0+0-10 =  0
         *   f[1,0] = 10+0-0-10 =  0
         *   f[1,1] = 10-0-0+10 = 20
         */
        { "anti-diagonal [10,0,0,10]", { 10, 0, 0, 10 }, { 20, 0, 0, 20 } },

        /* Test 5: asymmetric input.
         *   f[0,0] = 10
         *   f[0,1] = 1-2+3-4 = -2
         *   f[1,0] = 1+2-3-4 = -4
         *   f[1,1] = 1-2-3+4 =  0
         */
        { "asymmetric [1,2,3,4]", { 1, 2, 3, 4 }, { 10, -2, -4, 0 } },

        /* Test 6: negative inputs.
         *   f[0,0] = -5+5-5+5 =   0
         *   f[0,1] = -5-5-5-5 = -20
         *   f[1,0] = -5+5+5-5 =   0
         *   f[1,1] = -5-5+5+5 =   0
         */
        { "sign-alternating [-5,5,-5,5]", { -5, 5, -5, 5 }, { 0, -20, 0, 0 } },
    };

    /*
     * Count failures rather than OR-ing them together, so the number
     * printed in the final "test(s) FAILED" message is the real count.
     */
    int failures = 0;

    for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
        failures += check(cases[i].name, cases[i].in, cases[i].expect);
    }

    /* Test 7: inverse-property check. H * H = 4*I for the unscaled
     * 2x2 Hadamard, so applying it twice multiplies each input by 4.
     *   c = [1, 2, 3, 4]
     *   first pass:  [10, -2, -4, 0]
     *   second pass: [4, 8, 12, 16]
     */
    {
        const char *name = "double-Hadamard = 4*orig";
        const int16_t expect[4] = { 4, 8, 12, 16 };
        int16_t c[4] = { 1, 2, 3, 4 };
        int mismatch = 0;

        daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
        daedalus_h264_chroma_dc_hadamard_2x2_ref(c);

        for (int i = 0; i < 4; i++) {
            if (c[i] != expect[i]) {
                /* Same diagnostic shape as check() so failures read alike. */
                fprintf(stderr, "%s: c[%d] = %d, expected %d\n",
                        name, i, c[i], expect[i]);
                mismatch = 1;
            }
        }

        printf(" %-32s %s\n", name, mismatch ? "FAIL" : "PASS");
        failures += mismatch;
    }

    if (failures == 0) {
        printf("\nALL chroma DC Hadamard tests PASS\n");
    } else {
        fprintf(stderr, "\n%d test(s) FAILED\n", failures);
    }
    return failures ? 1 : 0;
}
|
||||
Reference in New Issue
Block a user