Files
daedalus-fourier/tests/test_api_h264.c
T
claude-noether 9d5451e0fe h264: deblock_luma_h — CPU/NEON via vendored ff_h264_h_loop_filter
Adds the horizontal-edge sibling of cycle 8's deblock_luma_v.  The
vendored FFmpeg snapshot already includes ff_h264_h_loop_filter_luma_neon
in libavcodec/aarch64/h264dsp_neon.S — this PR wires up the symbol,
the bit-exact reference, and the recipe-table entry so daedalus-decoder
and other consumers can call the H variant through the same dispatch
shape they use for _v.

Scope:
  - Public API: daedalus_dispatch_h264_deblock_luma_h(ctx, sub, ...)
    + daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, ...) wrapper.
  - Internal: dispatch_h264_deblock_h_cpu() calls the NEON entry.
  - Recipe table: new DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10, mapped
    to DAEDALUS_SUBSTRATE_CPU until a QPU shader is written.  An
    explicit SUBSTRATE_QPU request on the H dispatch returns -1
    (fails fast, no silent CPU degradation).
  - C reference: tests/h264_h_loop_filter_luma_ref.c — the
    column-axis transpose of h264_deblock_ref.c.  Same per-segment
    kernel; pix[-4..+3] accesses cols instead of rows*stride.
  - Test: test_api_h264 grows a test_deblock_h() with 8 tiles
    (8 cols x 16 rows each, edge at col 4), random alpha/beta/tc0;
    compares NEON dispatch against reference byte-for-byte.

Verified on hertz (Pi 5 / V3D 7.1):

  $ ./build/test_api_h264
  === Phase 8a API smoke: H.264 kernels via recipe dispatch ===
    H264_IDCT4 recipe substrate:      2 (1=CPU, 2=QPU)
    H264_IDCT8 recipe substrate:      2
    H264_DEBLOCK_LV recipe substrate: 2
    H264_QPEL_MC20 recipe substrate:  2
    H264_DEBLOCK_LH recipe substrate: 1 (CPU, no QPU H shader yet)
    H.264 IDCT 4x4: 2048/2048 bytes bit-exact (100.0000%)
    H.264 IDCT 8x8: 2048/2048 bytes bit-exact (100.0000%)
    H.264 deblock luma v: 2048/2048 bytes bit-exact (100.0000%)
    H.264 deblock luma h: 1024/1024 bytes bit-exact (100.0000%)
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)

  All 5 kernels bit-exact PASS.  The new H variant joins the suite
  with 128 random-input bytes per tile x 8 tiles (1024 bytes total).

Why CPU-only for now: the daedalus-decoder downstream needs the H
edge dispatched somewhere — even at CPU NEON cost (~6 ns/edge per
the cycle 8 M3 baseline) a frame's worth at 1080p is
~ 8160 MBs * 4 edges = 32 640 edges = ~200 us — well inside the
30 fps budget.  Writing the V3D H-edge shader is a follow-up
(would be cycle 8' or similar; the V-edge shader's transpose isn't
mechanical because of how the workgroup organisation maps to columns
vs rows).

Backlog addition (out of scope for this PR):
  - V3D shader for the H variant (mirror of v3d_h264deblock.spv).
  - bS=4 intra-strength filter (different algebra; both _v and _h).
  - Chroma deblock _v/_h (8-cell variants).
2026-05-24 23:28:56 +02:00

257 lines
10 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
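/*
* Phase 8a — H.264 kernels through the public API.
*
* Covers IDCT 4x4, IDCT 8x8, deblock luma vertical, deblock luma
* horizontal and qpel mc20. Each kernel is exercised through
* daedalus_recipe_dispatch_* and compared byte-for-byte to the C
* reference. The first three kernels were originally routed to CPU
* (per cycles 6+7+8 verdicts); main() prints the substrate the recipe
* currently selects for each kernel.
*/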
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "../include/daedalus.h"
/*
 * C reference kernels, defined outside this file.  Each test below runs
 * one of these on a copy of its input and compares the result with what
 * the matching daedalus_recipe_dispatch_* call produced.
 */
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
/* Loop-filter references: pix is the address passed as dst + meta.dst_off. */
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
/* qpel reference: dst and src share the single stride argument. */
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
/* Running state of the test PRNG; the initial value is the fixed seed. */
static uint64_t xs_state = 0xa11264ULL;

/*
 * Advance the 64-bit shift/xor generator (shifts 13, 7, 17) by one step.
 * Stores the new state in xs_state and returns it, so every test draws
 * from one deterministic sequence.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);

    xs_state = s;
    return s;
}
/*
 * Bit-exactness check for the H.264 4x4 IDCT-add kernel via the recipe
 * dispatch.
 *
 * Layout: an 8x8 grid of 4x4 blocks.  Block (bx, by) starts at byte
 * offset by*4*STRIDE + bx*4, so the grid needs 8 block-rows * 4 rows =
 * 32 rows of 64 bytes = 2048 bytes (FULL_BYTES).
 *
 * The reference runs on big_dst_ref/coeffs_ref, the dispatch on
 * big_dst/coeffs; both pairs start out identical.  The whole buffer is
 * compared afterwards, so writes outside the blocks also count as
 * mismatches.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure, or any mismatching byte.
 */
static int test_idct4(void)
{
    enum { N = 64, STRIDE = 64, BX = 8, BY = 8, FULL_BYTES = BY * 4 * STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        return 1;
    }

    int16_t coeffs[N * 16], coeffs_ref[N * 16];
    uint8_t big_dst[FULL_BYTES], big_dst_ref[FULL_BYTES];
    daedalus_h264_block_meta meta[N];

    /* Identical pseudo-random pixels in both destination buffers. */
    for (int i = 0; i < FULL_BYTES; i++) {
        big_dst[i] = big_dst_ref[i] = (uint8_t)(xs() & 0xff);
    }
    /* Coefficients in [-512, 511]; each path gets its own copy. */
    for (int i = 0; i < N * 16; i++) {
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512);
    }

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++) {
            int i = by * BX + bx;
            meta[i].dst_off = by * 4 * STRIDE + bx * 4;
        }
    }

    for (int i = 0; i < N; i++) {
        daedalus_h264_idct_add_ref(big_dst_ref + meta[i].dst_off,
                                   coeffs_ref + i * 16, STRIDE);
    }

    int rc = daedalus_recipe_dispatch_h264_idct4(ctx, big_dst, STRIDE,
                                                 coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct4 dispatch rc=%d\n", rc);
        /* Release the context on the failure path too. */
        daedalus_ctx_destroy(ctx);
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < FULL_BYTES; i++) {
        if (big_dst[i] != big_dst_ref[i]) {
            diff++;
        }
    }
    printf(" H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n",
           FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exactness check for the H.264 8x8 IDCT-add kernel via the recipe
 * dispatch.
 *
 * Layout: 8 blocks across x 2 block-rows = 16 blocks (N).  Block (bx, by)
 * starts at byte offset by*8*STRIDE + bx*8, so the blocks occupy the
 * first 16 rows of the 32-row x 64-byte buffer (BYTES = 2048).  The
 * comparison spans all of BYTES, so the untouched rows also detect
 * stray writes.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure, or any mismatching byte.
 */
static int test_idct8(void)
{
    enum { N = 16, STRIDE = 64, BYTES = (8 * 4) * STRIDE, BX = 8, BY = 2 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        return 1;
    }

    int16_t coeffs[N * 64], coeffs_ref[N * 64];
    uint8_t dst[BYTES], dst_ref[BYTES];
    daedalus_h264_block_meta meta[N];

    /* Identical pseudo-random pixels in both destination buffers. */
    for (int i = 0; i < BYTES; i++) {
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    }
    /* Coefficients in [-1024, 1023]; each path gets its own copy. */
    for (int i = 0; i < N * 64; i++) {
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024);
    }

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++) {
            int i = by * BX + bx;
            meta[i].dst_off = by * 8 * STRIDE + bx * 8;
        }
    }

    for (int i = 0; i < N; i++) {
        daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off,
                                    coeffs_ref + i * 64, STRIDE);
    }

    int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE,
                                                 coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct8 dispatch rc=%d\n", rc);
        /* Release the context on the failure path too. */
        daedalus_ctx_destroy(ctx);
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < BYTES; i++) {
        if (dst[i] != dst_ref[i]) {
            diff++;
        }
    }
    printf(" H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n",
           BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exactness check for the vertical luma deblock kernel via the
 * recipe dispatch.
 *
 * Eight 16x16 tiles (stride 16), one edge per tile.  Each edge's
 * dst_off points at row EDGE_ROW (4), column 0 of its tile.  Per edge:
 * alpha is drawn from 1..64, beta from 1..16, and each of the four tc0
 * entries from {-1, 0..6}.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure, or any mismatching byte.
 */
static int test_deblock(void)
{
    enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4,
           EDGE_OFF = EDGE_ROW * TILE_STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        return 1;
    }

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    /* Identical pseudo-random pixels in both buffers. */
    for (int i = 0; i < TOTAL; i++) {
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    }

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta = (int)(xs() % 16) + 1;
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            /* r == 0 maps to -1, r in 1..7 maps to 0..6. */
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes a non-const tc0, so hand it a private copy. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                             meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE,
                                                          N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock dispatch rc=%d\n", rc);
        /* Release the context on the failure path too. */
        daedalus_ctx_destroy(ctx);
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst[i] != dst_ref[i]) {
            diff++;
        }
    }
    printf(" H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exactness check for the horizontal luma deblock kernel via the
 * recipe dispatch.  Mirror of test_deblock for the H variant.
 *
 * Eight tiles of 8 cols x 16 rows (stride 8), one vertical edge per
 * tile between cols 3 and 4.  EDGE_COL = 4 puts dst_off at the leftmost
 * column of the right block so the kernel's pix[-4..+3] accesses sit
 * inside the tile.  Per edge: alpha is drawn from 1..64, beta from
 * 1..16, and each of the four tc0 entries from {-1, 0..6}.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure, or any mismatching byte.
 */
static int test_deblock_h(void)
{
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        return 1;
    }

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    /* Identical pseudo-random pixels in both buffers. */
    for (int i = 0; i < TOTAL; i++) {
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    }

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta = (int)(xs() % 16) + 1;
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            /* r == 0 maps to -1, r in 1..7 maps to 0..6. */
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes a non-const tc0, so hand it a private copy. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                             meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
                                                          N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_h dispatch rc=%d\n", rc);
        /* Release the context on the failure path too. */
        daedalus_ctx_destroy(ctx);
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst[i] != dst_ref[i]) {
            diff++;
        }
    }
    printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exactness check for the qpel8 mc20 kernel via the recipe dispatch.
 *
 * Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles.  Each tile
 * holds rows 0..7; the src[c-2..c+3] reads via the SRC_COL offset match
 * the cycle-9 bench convention so the same C reference and NEON .S can
 * be compared.  Source and destination offsets are identical per tile,
 * and both destination buffers start zeroed.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure, or any mismatching byte.
 */
static int test_qpel_mc20(void)
{
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_COL = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        return 1;
    }

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++) {
        src[i] = (uint8_t)(xs() & 0xff);
    }
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
    }

    for (int i = 0; i < N; i++) {
        daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
                                         src + meta[i].src_off,
                                         TILE_STRIDE);
    }

    int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
                                                     TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc);
        /* Release the context on the failure path too. */
        daedalus_ctx_destroy(ctx);
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst[i] != dst_ref[i]) {
            diff++;
        }
    }
    printf(" H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
int main(void)
{
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
printf(" H264_IDCT4 recipe substrate: %d (1=CPU, 2=QPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4));
printf(" H264_IDCT8 recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
printf(" H264_DEBLOCK_LV recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
int fail = 0;
fail |= test_idct4();
fail |= test_idct8();
fail |= test_deblock();
fail |= test_deblock_h();
fail |= test_qpel_mc20();
return fail;
}