Files
daedalus-fourier/tests/bench_neon_cdef.c
T
marfrit 9c0bd72e70 Cycle 5 Phase 3 closed: M1 PASS via bench pointer-convention fix
The previous "layout mismatch" deferral was a one-line bench bug:
NEON expects the caller to pass tmp pointing at the 8x8 block
origin (after the 2*16+2 padding skip), but the bench passed the
raw padded-buffer origin. C ref does the advance internally, so it
filtered the correct block; NEON filtered a (+2 rows, +2 cols)
shifted region. Diagonal-shift trace in the partial doc was
exactly that.

Fix: tmps + i*TMP_INTS + (2*TMP_W + 2) for NEON calls.

Results:
  M1: 10000/10000 bit-exact (100.0000%), all 8 dirs balanced
  M3: 3.809 Mblock/s (consistent with 3.923 from longer window)

Phase 4 unblocked; predicted R5 = 0.02-0.05 (deep RED) per earlier
analysis. Will build QPU CDEF anyway for cycle-completeness +
V4L2 dispatch-path existence.

- tests/bench_neon_cdef.c: 3-line tmp pointer fix
- docs/k5_cdef_phase3.md: supersedes k5_cdef_phase3_partial.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:46:50 +00:00

284 lines
10 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 5 Phase 3 — NEON M3₅ baseline for AV1 CDEF filter, 8x8 luma
* 8bpc, combined primary + secondary path.
*
* Calls dav1d's NEON dispatcher `dav1d_cdef_filter8_8bpc_neon`
* (which jumps to the pri_sec variant when both strengths are nonzero).
*
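* Approach: pre-construct a padded uint16 buffer per block (12 rows at a
* 16-element row stride, with the 8x8 block at rows/cols 2..9) filled with
* synthetic uint8 pixels (all valid, no INT16_MIN sentinels — i.e. every
* neighbour is treated as available, as with edges=0xf, although the NEON
* call itself passes edges=0 to select the uint16 tmp path). Initialise
* dst from the center 8x8 of tmp. Call NEON + our C ref independently with
* copies of dst; compare.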
*
* License: BSD-2-Clause (links dav1d 1.4.3 BSD snapshot).
*/
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Scalar C reference for the combined primary + secondary 8x8 CDEF filter;
 * defined in another translation unit and used here as the bit-exactness
 * oracle for the NEON kernel.
 *
 * Calling convention as used by this bench: `tmp` is the ORIGIN of the
 * padded uint16 buffer (the call-site note in correctness_check states that
 * the reference advances by 2 rows + 2 columns internally), `dst` is the
 * 8x8 output block with `dst_stride` bytes per row, and `h` is the number
 * of rows to filter (always 8 here). */
extern void daedalus_cdef_filter_8x8_pri_sec_ref(
uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp,
int pri_strength, int sec_strength,
int dir, int damping, int h);
/* dav1d's exported dispatcher — see external/dav1d-snapshot/src/arm/64/
 * cdef_tmpl.S line 261. PRIVATE_PREFIX is `dav1d_` so the full symbol
 * is dav1d_cdef_filter8_8bpc_neon. Signature per the comment in
 * cdef_tmpl.S line 104-106.
 *
 * Calling convention as used by this bench: unlike the C reference, `tmp`
 * must already point at the 8x8 block origin inside the padded buffer
 * (2 rows + 2 columns past the buffer start), and `edges` is passed as 0
 * so the uint16 tmp path is taken. */
extern void dav1d_cdef_filter8_8bpc_neon(
uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp,
int pri_strength, int sec_strength,
int dir, int damping, int h, size_t edges);
/* Buffer geometry shared by the C reference and the NEON kernel.
 *
 * dav1d NEON expects tmp stride=16 uint16 elements (32 bytes) per row,
 * not 12: cdef_tmpl.S `dir_table 8, 16` bakes offsets at stride 16.
 * Layout: 12 rows × 16 cols = 192 uint16, center at [r=2..9][c=2..9]. */
enum {
    TMP_W     = 16,                /* row stride of tmp, in uint16 elements */
    TMP_H     = 12,                /* rows in tmp: 8 block rows + 2 above + 2 below */
    TMP_INTS  = TMP_W * TMP_H,     /* 192 uint16 elements per block */
    TMP_BYTES = TMP_INTS * 2,      /* 384 bytes per block */
    DST_W     = 8,                 /* dst block width == dst stride, in bytes */
    DST_H     = 8,                 /* dst block height, in rows */
    DST_BYTES = DST_H * DST_W      /* 64 bytes per block */
};
/* State of the bench's pseudo-random generator. Callers assign a non-zero
 * seed before drawing; a zero state would stay zero forever. */
static uint64_t xs_state;

/* Advance the generator by one xorshift step (shifts 13, 7, 17), store the
 * new state and return it. */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
/* Fill one padded tmp buffer (TMP_INTS uint16 elements: 12 rows at a
 * 16-element stride) with random pixel values in 0..255. Every position
 * is written, including the halo around the 8x8 block, so the buffer
 * models the "all edges valid" case — no INT16_MIN sentinels. */
static void gen_tmp(uint16_t *tmp)
{
    uint16_t *const end = tmp + TMP_INTS;

    for (uint16_t *px = tmp; px != end; ++px)
        *px = (uint16_t)(xs() & 0xff);
}
/* Copy the 8x8 block at rows/cols 2..9 of the padded tmp buffer into a
 * tightly packed (stride 8) uint8 dst buffer, narrowing each uint16
 * element to uint8. */
static void tmp_center_to_dst(uint8_t *dst, const uint16_t *tmp)
{
    /* Skip the 2-row, 2-column halo to reach the block origin. */
    const uint16_t *src_row = tmp + 2 * TMP_W + 2;

    for (int row = 0; row < 8; row++, src_row += TMP_W, dst += 8)
        for (int col = 0; col < 8; col++)
            dst[col] = (uint8_t) src_row[col];
}
/* Draw one random set of CDEF filter parameters for the combined
 * primary + secondary path. Ranges used by this bench:
 *   pri_strength: 1..7  (non-zero so the combined path is taken)
 *   sec_strength: 1..4  (non-zero for the same reason)
 *   dir:          0..7
 *   damping:      3..6
 * The generator is consumed in the order pri, sec, dir, damping. */
static void gen_filter_params(int *pri, int *sec, int *dir, int *damping)
{
    const int pri_strength = 1 + (int)(xs() % 7);
    const int sec_strength = 1 + (int)(xs() % 4);
    const int direction    = (int)(xs() & 7);
    const int damp         = 3 + (int)(xs() % 4);

    *pri = pri_strength;
    *sec = sec_strength;
    *dir = direction;
    *damping = damp;
}
/* Read CLOCK_MONOTONIC_RAW and return it as seconds in a double. */
static double now_seconds(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);

    const double whole = now.tv_sec;
    const double frac = now.tv_nsec * 1e-9;
    return whole + frac;
}
/* Print the rows of a packed 8x8 byte block to stderr, each row prefixed
 * by a newline and its "rN" label. */
static void dump_block_rows(const uint8_t *blk)
{
    for (int row = 0; row < 8; row++) {
        fprintf(stderr, "\n r%d ", row);
        for (int col = 0; col < 8; col++)
            fprintf(stderr, "%3u ", (unsigned) blk[row * 8 + col]);
    }
}

/* Bit-exactness gate: run `n` random 8x8 blocks through both the C
 * reference and the NEON kernel and compare the outputs byte for byte.
 *
 * seed  generator seed; 0 selects the built-in default seed.
 * n     number of random blocks to test.
 *
 * Dumps the first three mismatching blocks to stderr, prints a summary and
 * the per-direction coverage to stdout, and returns the mismatch count. */
static int correctness_check(uint64_t seed, int n)
{
    enum { MAX_DUMPS = 3 };

    uint16_t tmp[TMP_INTS];
    uint8_t dst_ref[DST_BYTES], dst_neon[DST_BYTES];
    int dir_hist[8] = {0};
    int mismatches = 0;

    xs_state = seed ? seed : 0xc0defacedcafebebULL;

    for (int blk = 0; blk < n; blk++) {
        int pri, sec, dir, damping;

        gen_tmp(tmp);
        gen_filter_params(&pri, &sec, &dir, &damping);
        dir_hist[dir] += 1;

        /* Both filters start from the same unfiltered center block. */
        tmp_center_to_dst(dst_ref, tmp);
        memcpy(dst_neon, dst_ref, DST_BYTES);

        /* The C ref advances tmp internally by +2*stride+2, so it gets the
         * padded-buffer origin. NEON expects the caller to pass the
         * already-advanced pointer (the block-data origin), hence the
         * explicit offset on its tmp argument. */
        daedalus_cdef_filter_8x8_pri_sec_ref(
            dst_ref, DST_W, tmp, pri, sec, dir, damping, 8);
        dav1d_cdef_filter8_8bpc_neon(
            dst_neon, DST_W, tmp + (2 * TMP_W + 2),
            pri, sec, dir, damping, 8,
            /* edges = */ 0);   /* uint16 tmp non-edged path */

        if (memcmp(dst_ref, dst_neon, DST_BYTES) == 0)
            continue;

        /* Only the first few mismatches are dumped in full. */
        if (mismatches < MAX_DUMPS) {
            fprintf(stderr,
                    "MISMATCH block %d pri=%d sec=%d dir=%d damping=%d:\n",
                    blk, pri, sec, dir, damping);
            fprintf(stderr, " ref:");
            dump_block_rows(dst_ref);
            fprintf(stderr, "\n neon:");
            dump_block_rows(dst_neon);
            fprintf(stderr, "\n");
        }
        mismatches++;
    }

    const int exact = n - mismatches;
    printf("M1₅_c correctness: %d / %d blocks bit-exact (%.4f%%)\n",
           exact, n,
           100.0 * exact / n);

    /* Report how evenly the eight directions were exercised. */
    int fewest = dir_hist[0];
    int most = dir_hist[0];
    for (int d = 1; d < 8; d++) {
        const int hits = dir_hist[d];
        if (hits < fewest)
            fewest = hits;
        if (hits > most)
            most = hits;
    }
    printf(" dir coverage: min=%d max=%d (8 directions sampled)\n",
           fewest, most);

    return mismatches;
}
/* Run one batch: reset every working dst block from its pristine copy,
 * then filter all `count` blocks with the NEON kernel. Shared by the
 * warm-up pass and the timed loop so both execute exactly the same work. */
static void neon_filter_batch(uint8_t *work_dst, const uint8_t *master_dst,
                              const uint16_t *tmps,
                              const int *pris, const int *secs,
                              const int *dirs, const int *damps,
                              size_t count)
{
    memcpy(work_dst, master_dst, count * DST_BYTES);
    for (size_t i = 0; i < count; i++)
        dav1d_cdef_filter8_8bpc_neon(
            work_dst + i * DST_BYTES, DST_W,
            /* NEON takes the block origin, not the padded-buffer origin. */
            tmps + i * TMP_INTS + (2 * TMP_W + 2),
            pris[i], secs[i], dirs[i], damps[i], 8, 0);
}

/* Measure NEON CDEF throughput over pre-generated random blocks.
 *
 * seed        generator seed; 0 selects the built-in default seed.
 * n_blocks    blocks per batch; must be positive.
 * duration_s  wall-clock budget for the timed loop, in seconds.
 *
 * Generates `n_blocks` random inputs, runs one warm-up batch, then repeats
 * whole batches until the budget expires. The per-batch dst reset is
 * replayed afterwards and subtracted so that the reported time covers the
 * filter calls only. Results are printed to stdout. Exits with status 1 on
 * an invalid block count or allocation failure. */
static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
{
    /* A non-positive batch size would divide by zero when deriving the
     * batch count and would turn into absurd allocation sizes. */
    if (n_blocks <= 0) {
        fprintf(stderr, "throughput: block count must be positive (got %d)\n",
                n_blocks);
        exit(1);
    }
    const size_t count = (size_t) n_blocks;

    xs_state = seed ? seed : 0xc0defacedcafebebULL;

    /* calloc checks count * size for overflow. Every buffer is fully
     * written before the timed loop starts, so zero-filling does not
     * affect the measurement. */
    uint16_t *tmps = calloc(count, TMP_BYTES);
    uint8_t *master_dst = calloc(count, DST_BYTES);
    uint8_t *work_dst = calloc(count, DST_BYTES);
    int *pris = calloc(count, sizeof *pris);
    int *secs = calloc(count, sizeof *secs);
    int *dirs = calloc(count, sizeof *dirs);
    int *damps = calloc(count, sizeof *damps);
    if (!tmps || !master_dst || !work_dst || !pris || !secs || !dirs || !damps) {
        fprintf(stderr, "alloc fail\n");
        exit(1);
    }

    for (size_t i = 0; i < count; i++) {
        gen_tmp(tmps + i * TMP_INTS);
        tmp_center_to_dst(master_dst + i * DST_BYTES, tmps + i * TMP_INTS);
        gen_filter_params(&pris[i], &secs[i], &dirs[i], &damps[i]);
    }

    /* Warm-up. */
    neon_filter_batch(work_dst, master_dst, tmps, pris, secs, dirs, damps, count);

    uint64_t batches = 0;
    const double t0 = now_seconds();
    const double t_end = t0 + duration_s;
    while (now_seconds() < t_end) {
        neon_filter_batch(work_dst, master_dst, tmps, pris, secs, dirs, damps,
                          count);
        batches++;
    }
    const double elapsed = now_seconds() - t0;
    const uint64_t done = batches * (uint64_t) count;

    if (done == 0) {
        /* Nothing ran (zero or negative budget): every ratio below would
         * be NaN or infinite, so report that instead of garbage. */
        fprintf(stderr,
                "throughput: no batch completed within %.3f s; nothing to report\n",
                duration_s);
    } else {
        /* Replay only the per-batch dst reset to estimate its cost. */
        const double s0 = now_seconds();
        for (uint64_t i = 0; i < batches; i++)
            memcpy(work_dst, master_dst, count * DST_BYTES);
        const double s1 = now_seconds();

        const double kernel_seconds = elapsed - (s1 - s0);
        const double mbps = done / kernel_seconds / 1e6;

        printf("M3₅ NEON throughput:\n");
        printf(" blocks/batch: %d\n", n_blocks);
        printf(" batches done: %llu\n", (unsigned long long) batches);
        printf(" total blocks: %llu\n", (unsigned long long) done);
        printf(" elapsed (kernel)=%.6f s\n", kernel_seconds);
        printf(" elapsed (setup) =%.6f s\n", s1 - s0);
        printf(" throughput = %.3f Mblock/s\n", mbps);
        printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9);
        /* 1080p luma: ~32400 8x8 blocks/frame (full coverage; real AV1
         * applies CDEF to subset of blocks per superblock decision). */
        printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n",
               mbps * 1e6 / 32400.0);
    }

    free(tmps); free(master_dst); free(work_dst);
    free(pris); free(secs); free(dirs); free(damps);
}
/* Entry point.
 *
 * Options:
 *   -b, --blocks N        blocks per throughput batch (positive integer,
 *                         default 65536)
 *   -d, --duration S      timed-loop budget in seconds (positive, finite,
 *                         default 5.0)
 *   -s, --seed V          generator seed (any base accepted by strtoull;
 *                         0 selects the built-in default seed)
 *   -C, --no-correctness  skip the M1 bit-exactness gate
 *
 * Exit status: 0 on success, 1 if the M1 gate reported mismatches (the M3
 * measurement is still run and printed), 2 on a usage error. */
int main(int argc, char **argv)
{
    const int m1_blocks = 10000;
    int n_blocks = 65536;
    double duration = 5.0;
    uint64_t seed = 0;
    int do_correctness = 1;
    int m1_failed = 0;
    static const struct option opts[] = {
        {"blocks", required_argument, 0, 'b'},
        {"duration", required_argument, 0, 'd'},
        {"seed", required_argument, 0, 's'},
        {"no-correctness", no_argument, 0, 'C'},
        {0,0,0,0}
    };

    for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) {
        char *end = NULL;

        switch (c) {
        case 'b': {
            errno = 0;
            const long v = strtol(optarg, &end, 10);
            if (end == optarg || *end != '\0' || errno == ERANGE ||
                v <= 0 || v > INT_MAX) {
                fprintf(stderr,
                        "invalid --blocks value '%s' (expected a positive integer)\n",
                        optarg);
                return 2;
            }
            n_blocks = (int) v;
            break;
        }
        case 'd': {
            errno = 0;
            const double v = strtod(optarg, &end);
            /* !(v > 0.0) also rejects NaN. */
            if (end == optarg || *end != '\0' || errno == ERANGE ||
                !isfinite(v) || !(v > 0.0)) {
                fprintf(stderr,
                        "invalid --duration value '%s' (expected seconds > 0)\n",
                        optarg);
                return 2;
            }
            duration = v;
            break;
        }
        case 's': {
            errno = 0;
            const unsigned long long v = strtoull(optarg, &end, 0);
            if (end == optarg || *end != '\0' || errno == ERANGE) {
                fprintf(stderr, "invalid --seed value '%s'\n", optarg);
                return 2;
            }
            seed = v;
            break;
        }
        case 'C':
            do_correctness = 0;
            break;
        default:
            return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1₅_c bit-exact (%d random 8x8 blocks) ===\n", m1_blocks);
        const int mis = correctness_check(seed, m1_blocks);
        if (mis != 0) {
            /* The bench passes the NEON kernel the block-origin tmp
             * pointer it expects, so a mismatch here is a real divergence
             * between the NEON kernel and the C reference, not a buffer
             * layout artefact. The throughput run below is still useful
             * as a timing data point, but the failure is reflected in the
             * exit status.
             *
             * Run with --no-correctness to skip the gate entirely. */
            fprintf(stderr, "\nWARNING: M1 gate failed (%d/%d mismatches).\n",
                    mis, m1_blocks);
            fprintf(stderr, " NEON output differs from the C reference.\n");
            fprintf(stderr, " Proceeding to M3 anyway; exit status will\n");
            fprintf(stderr, " be non-zero.\n\n");
            m1_failed = 1;
        }
        printf("\n");
    }

    printf("=== M3₅ NEON throughput ===\n");
    throughput_neon(seed, n_blocks, duration);
    return m1_failed ? 1 : 0;
}