9c0bd72e70
The previous "layout mismatch" deferral was a one-line bench bug: NEON expects the caller to pass tmp pointing at the 8x8 block origin (after the 2*16+2 padding skip), but the bench passed the raw padded-buffer origin. C ref does the advance internally, so it filtered the correct block; NEON filtered a (+2 rows, +2 cols) shifted region. Diagonal-shift trace in the partial doc was exactly that. Fix: tmps + i*TMP_INTS + (2*TMP_W + 2) for NEON calls. Results: M1: 10000/10000 bit-exact (100.0000%), all 8 dirs balanced M3: 3.809 Mblock/s (consistent with 3.923 from longer window) Phase 4 unblocked; predicted R5 = 0.02-0.05 (deep RED) per earlier analysis. Will build QPU CDEF anyway for cycle-completeness + V4L2 dispatch-path existence. - tests/bench_neon_cdef.c: 3-line tmp pointer fix - docs/k5_cdef_phase3.md: supersedes k5_cdef_phase3_partial.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
284 lines
10 KiB
C
284 lines
10 KiB
C
/*
 * Cycle 5 Phase 3 — NEON M3₅ baseline for AV1 CDEF filter, 8x8 luma
 * 8bpc, combined primary + secondary path.
 *
 * Calls dav1d's NEON dispatcher `dav1d_cdef_filter8_8bpc_neon`
 * (which jumps to the pri_sec variant when both strengths are nonzero).
 *
 * Approach: pre-construct a padded uint16 buffer per block (12 rows at a
 * 16-element row stride: the 8x8 block plus a 2-pixel halo) filled with
 * synthetic uint8 pixels (all valid, no INT16_MIN sentinels — the buffer
 * content corresponds to the all-edges-available, edges=0xf, case, even
 * though the NEON entry point is called with edges=0 to select its
 * uint16-tmp path). Initialise dst from the center 8x8 of tmp. Call
 * NEON + our C ref independently with copies of dst; compare.
 *
 * License: BSD-2-Clause (links dav1d 1.4.3 BSD snapshot).
 */
|
||
#define _POSIX_C_SOURCE 200809L

#include <errno.h>
#include <float.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
|
||
|
||
/* Standalone C reference for the combined primary + secondary CDEF filter.
 * Defined in another translation unit. This file calls it with `tmp`
 * pointing at the padded-buffer origin (the reference advances to the
 * block origin itself) and with h = 8. */
extern void daedalus_cdef_filter_8x8_pri_sec_ref(
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const uint16_t *tmp,
    int pri_strength,
    int sec_strength,
    int dir,
    int damping,
    int h);
|
||
|
||
/* dav1d's exported NEON dispatcher — see
 * external/dav1d-snapshot/src/arm/64/cdef_tmpl.S line 261.
 * PRIVATE_PREFIX is `dav1d_`, so the full symbol is
 * dav1d_cdef_filter8_8bpc_neon. The signature follows the comment in
 * cdef_tmpl.S lines 104-106.
 *
 * Unlike the C reference, this entry point is called in this file with
 * `tmp` already advanced to the 8x8 block origin inside the padded
 * buffer, and with edges = 0. */
extern void dav1d_cdef_filter8_8bpc_neon(
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const uint16_t *tmp,
    int pri_strength,
    int sec_strength,
    int dir,
    int damping,
    int h,
    size_t edges);
|
||
|
||
/* Padded tmp buffer geometry.
 *
 * dav1d's NEON code expects a tmp row stride of 16 uint16 elements
 * (32 bytes), not 12: cdef_tmpl.S `dir_table 8, 16` bakes its offsets at
 * stride 16. The buffer is therefore 12 rows x 16 columns = 192 uint16,
 * with the 8x8 block occupying rows 2..9, columns 2..9. */
#define TMP_W     16                  /* row stride, in uint16 elements */
#define TMP_H     12                  /* rows: 8 block rows + 2 halo rows each side */
#define TMP_INTS  (TMP_W * TMP_H)     /* 192 elements per block */
#define TMP_BYTES (TMP_INTS * 2)      /* 384 bytes per block */

/* Destination block geometry (uint8 pixels, tightly packed). */
#define DST_W     8
#define DST_H     8
#define DST_BYTES (DST_H * DST_W)     /* 64 bytes per block */
|
||
|
||
/* State of the file-local pseudo-random generator. A state of 0 is a fixed
 * point of the update below, so callers seed it with a non-zero value. */
static uint64_t xs_state;

/* Advance the generator by one xorshift step (shifts 13 / 7 / 17) and
 * return the new 64-bit state. */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
|
||
|
||
/* Fill a 12x12 padded tmp buffer with random uint8 pixel values
|
||
* (all positions, including the 2-pixel halo). All values 0..255,
|
||
* representing the "all edges valid" case — no INT16_MIN sentinels. */
|
||
static void gen_tmp(uint16_t *tmp)
|
||
{
|
||
for (int i = 0; i < TMP_INTS; i++)
|
||
tmp[i] = (uint16_t)(xs() & 0xff);
|
||
}
|
||
|
||
/* Extract the center 8x8 from tmp into a uint8 dst buffer. */
|
||
static void tmp_center_to_dst(uint8_t *dst, const uint16_t *tmp)
|
||
{
|
||
for (int r = 0; r < 8; r++)
|
||
for (int c = 0; c < 8; c++)
|
||
dst[r * 8 + c] = (uint8_t) tmp[(r + 2) * TMP_W + (c + 2)];
|
||
}
|
||
|
||
/* Draw one random set of CDEF filter parameters.
 *
 * Ranges produced (all strengths non-zero, so the combined pri+sec path
 * is always exercised):
 *   pri_strength: 1..7
 *   sec_strength: 1..4
 *   dir:          0..7
 *   damping:      3..6
 *
 * Exactly four values are taken from xs(), in the order pri, sec, dir,
 * damping; keeping that order preserves the sequence for a given seed.
 *
 * NOTE(review): the original comment called these "realistic VP9/AV1"
 * ranges. CDEF looks AV1-only, and the AV1 bitstream appears to allow
 * pri 0..15 and sec in {0, 1, 2, 4} (never 3) — confirm against the spec
 * before relying on these as representative inputs. */
static void gen_filter_params(int *pri, int *sec, int *dir, int *damping)
{
    const uint64_t r_pri  = xs();
    const uint64_t r_sec  = xs();
    const uint64_t r_dir  = xs();
    const uint64_t r_damp = xs();

    *pri     = 1 + (int)(r_pri % 7);
    *sec     = 1 + (int)(r_sec % 4);
    *dir     = (int)(r_dir & 7);
    *damping = 3 + (int)(r_damp % 4);
}
|
||
|
||
/* Return the current CLOCK_MONOTONIC_RAW reading as seconds in a double.
 * The return value of clock_gettime() is not checked. */
static double now_seconds(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);

    const double whole = now.tv_sec;
    const double frac  = now.tv_nsec * 1e-9;
    return whole + frac;
}
|
||
|
||
static int correctness_check(uint64_t seed, int n)
|
||
{
|
||
xs_state = seed ? seed : 0xc0defacedcafebebULL;
|
||
int mismatches = 0;
|
||
int dir_hist[8] = {0};
|
||
|
||
uint16_t tmp[TMP_INTS];
|
||
uint8_t dst_a[DST_BYTES], dst_b[DST_BYTES];
|
||
|
||
for (int i = 0; i < n; i++) {
|
||
gen_tmp(tmp);
|
||
int pri, sec, dir, damping;
|
||
gen_filter_params(&pri, &sec, &dir, &damping);
|
||
dir_hist[dir]++;
|
||
|
||
/* Initialise both dst buffers from tmp center. */
|
||
tmp_center_to_dst(dst_a, tmp);
|
||
memcpy(dst_b, dst_a, DST_BYTES);
|
||
|
||
/* C ref advances tmp internally by +2*stride+2.
|
||
* NEON expects the caller to pass the already-advanced pointer
|
||
* (i.e. pointer to the block-data origin, not the padded-buffer
|
||
* origin). Hence the tmp+34 for the NEON call. */
|
||
daedalus_cdef_filter_8x8_pri_sec_ref(
|
||
dst_a, DST_W, tmp, pri, sec, dir, damping, 8);
|
||
dav1d_cdef_filter8_8bpc_neon(
|
||
dst_b, DST_W, tmp + (2 * TMP_W + 2),
|
||
pri, sec, dir, damping, 8,
|
||
/* edges = */ 0); /* uint16 tmp non-edged path */
|
||
|
||
if (memcmp(dst_a, dst_b, DST_BYTES) != 0) {
|
||
if (mismatches < 3) {
|
||
fprintf(stderr,
|
||
"MISMATCH block %d pri=%d sec=%d dir=%d damping=%d:\n",
|
||
i, pri, sec, dir, damping);
|
||
fprintf(stderr, " ref:");
|
||
for (int r = 0; r < 8; r++) {
|
||
fprintf(stderr, "\n r%d ", r);
|
||
for (int c = 0; c < 8; c++)
|
||
fprintf(stderr, "%3u ", dst_a[r * 8 + c]);
|
||
}
|
||
fprintf(stderr, "\n neon:");
|
||
for (int r = 0; r < 8; r++) {
|
||
fprintf(stderr, "\n r%d ", r);
|
||
for (int c = 0; c < 8; c++)
|
||
fprintf(stderr, "%3u ", dst_b[r * 8 + c]);
|
||
}
|
||
fprintf(stderr, "\n");
|
||
}
|
||
mismatches++;
|
||
}
|
||
}
|
||
|
||
printf("M1₅_c correctness: %d / %d blocks bit-exact (%.4f%%)\n",
|
||
n - mismatches, n,
|
||
100.0 * (n - mismatches) / n);
|
||
int min_d = dir_hist[0], max_d = dir_hist[0];
|
||
for (int i = 1; i < 8; i++) {
|
||
if (dir_hist[i] < min_d) min_d = dir_hist[i];
|
||
if (dir_hist[i] > max_d) max_d = dir_hist[i];
|
||
}
|
||
printf(" dir coverage: min=%d max=%d (8 directions sampled)\n",
|
||
min_d, max_d);
|
||
return mismatches;
|
||
}
|
||
|
||
static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
|
||
{
|
||
xs_state = seed ? seed : 0xc0defacedcafebebULL;
|
||
uint16_t *tmps = malloc((size_t) n_blocks * TMP_BYTES);
|
||
uint8_t *master_dst = malloc((size_t) n_blocks * DST_BYTES);
|
||
uint8_t *work_dst = malloc((size_t) n_blocks * DST_BYTES);
|
||
int *pris = malloc(n_blocks * sizeof(int));
|
||
int *secs = malloc(n_blocks * sizeof(int));
|
||
int *dirs = malloc(n_blocks * sizeof(int));
|
||
int *damps = malloc(n_blocks * sizeof(int));
|
||
if (!tmps || !master_dst || !work_dst || !pris || !secs || !dirs || !damps) {
|
||
fprintf(stderr, "alloc fail\n"); exit(1);
|
||
}
|
||
for (int i = 0; i < n_blocks; i++) {
|
||
gen_tmp(tmps + (size_t)i * TMP_INTS);
|
||
tmp_center_to_dst(master_dst + (size_t)i * DST_BYTES,
|
||
tmps + (size_t)i * TMP_INTS);
|
||
gen_filter_params(&pris[i], &secs[i], &dirs[i], &damps[i]);
|
||
}
|
||
|
||
/* Warm-up. */
|
||
memcpy(work_dst, master_dst, (size_t) n_blocks * DST_BYTES);
|
||
for (int i = 0; i < n_blocks; i++)
|
||
dav1d_cdef_filter8_8bpc_neon(
|
||
work_dst + (size_t)i * DST_BYTES, DST_W,
|
||
tmps + (size_t)i * TMP_INTS + (2 * TMP_W + 2),
|
||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||
|
||
double t0 = now_seconds();
|
||
double t_end = t0 + duration_s;
|
||
uint64_t done = 0;
|
||
while (now_seconds() < t_end) {
|
||
memcpy(work_dst, master_dst, (size_t) n_blocks * DST_BYTES);
|
||
for (int i = 0; i < n_blocks; i++)
|
||
dav1d_cdef_filter8_8bpc_neon(
|
||
work_dst + (size_t)i * DST_BYTES, DST_W,
|
||
tmps + (size_t)i * TMP_INTS + (2 * TMP_W + 2),
|
||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||
done += n_blocks;
|
||
}
|
||
double elapsed = now_seconds() - t0;
|
||
|
||
int setup_iters = (int)(done / n_blocks);
|
||
double s0 = now_seconds();
|
||
for (int i = 0; i < setup_iters; i++)
|
||
memcpy(work_dst, master_dst, (size_t) n_blocks * DST_BYTES);
|
||
double s1 = now_seconds();
|
||
|
||
double kernel_seconds = elapsed - (s1 - s0);
|
||
double mbps = done / kernel_seconds / 1e6;
|
||
|
||
printf("M3₅ NEON throughput:\n");
|
||
printf(" blocks/batch: %d\n", n_blocks);
|
||
printf(" batches done: %d\n", setup_iters);
|
||
printf(" total blocks: %llu\n", (unsigned long long) done);
|
||
printf(" elapsed (kernel)=%.6f s\n", kernel_seconds);
|
||
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
|
||
printf(" throughput = %.3f Mblock/s\n", mbps);
|
||
printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9);
|
||
/* 1080p luma: ~32400 8x8 blocks/frame (full coverage; real AV1
|
||
* applies CDEF to subset of blocks per superblock decision). */
|
||
printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n",
|
||
mbps * 1e6 / 32400.0);
|
||
|
||
free(tmps); free(master_dst); free(work_dst);
|
||
free(pris); free(secs); free(dirs); free(damps);
|
||
}
|
||
|
||
/* Print a one-line usage summary to stderr. */
static void bench_usage(const char *prog)
{
    fprintf(stderr,
            "usage: %s [-b|--blocks N] [-d|--duration SECONDS] "
            "[-s|--seed S] [-C|--no-correctness]\n",
            prog);
}

/* Parse `text` as a decimal integer in 1..INT_MAX. Returns 0 and stores the
 * value in *out on success; returns -1 (leaving *out untouched) on empty
 * input, trailing characters, or an out-of-range value. */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;
    errno = 0;
    const long v = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || v < 1 || v > INT_MAX)
        return -1;
    *out = (int) v;
    return 0;
}

/* Parse `text` as a finite, strictly positive number of seconds. Returns 0
 * on success, -1 otherwise (including NaN, infinity, zero and negatives). */
static int parse_positive_seconds(const char *text, double *out)
{
    char *end = NULL;
    errno = 0;
    const double v = strtod(text, &end);
    if (end == text || *end != '\0' || errno == ERANGE)
        return -1;
    if (!(v > 0.0) || v > DBL_MAX)      /* rejects NaN, <= 0 and +inf */
        return -1;
    *out = v;
    return 0;
}

/* Parse `text` as an unsigned 64-bit seed (base auto-detected, so 0x...
 * hex is accepted). Returns 0 on success, -1 on malformed input. */
static int parse_seed(const char *text, uint64_t *out)
{
    char *end = NULL;
    errno = 0;
    const unsigned long long v = strtoull(text, &end, 0);
    if (end == text || *end != '\0' || errno == ERANGE)
        return -1;
    *out = (uint64_t) v;
    return 0;
}

/* Entry point.
 *
 * Options:
 *   -b, --blocks N          blocks per throughput batch (default 65536)
 *   -d, --duration SECONDS  throughput measurement window (default 5.0)
 *   -s, --seed S            PRNG seed; 0 selects the built-in default
 *   -C, --no-correctness    skip the M1 bit-exactness gate
 *
 * Exit status: 0 on success, 1 if the M1 gate finds any mismatch, 2 on a
 * command-line error. */
int main(int argc, char **argv)
{
    const int m1_blocks = 10000;    /* blocks checked by the M1 gate */
    int n_blocks = 65536;
    double duration = 5.0;
    uint64_t seed = 0;
    int do_correctness = 1;

    static const struct option opts[] = {
        {"blocks",         required_argument, 0, 'b'},
        {"duration",       required_argument, 0, 'd'},
        {"seed",           required_argument, 0, 's'},
        {"no-correctness", no_argument,       0, 'C'},
        {0, 0, 0, 0}
    };

    for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, NULL)) != -1;) {
        int bad = 0;
        switch (c) {
        case 'b': bad = parse_positive_int(optarg, &n_blocks);     break;
        case 'd': bad = parse_positive_seconds(optarg, &duration); break;
        case 's': bad = parse_seed(optarg, &seed);                 break;
        case 'C': do_correctness = 0;                              break;
        default:
            bench_usage(argv[0]);
            return 2;
        }
        if (bad) {
            fprintf(stderr, "%s: invalid value '%s' for option -%c\n",
                    argv[0], optarg, c);
            bench_usage(argv[0]);
            return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1₅_c bit-exact (%d random 8x8 blocks) ===\n", m1_blocks);
        const int mis = correctness_check(seed, m1_blocks);
        if (mis != 0) {
            /* The NEON call is handed the block-origin pointer the kernel
             * expects, so a mismatch here is a genuine divergence between
             * the NEON filter and the C reference, not a buffer-layout
             * artefact. Treat the gate as a hard failure. */
            fprintf(stderr, "\nERROR: M1 gate failed (%d/%d mismatches).\n",
                    mis, m1_blocks);
            fprintf(stderr,
                    " NEON output differs from the C reference; skipping M3.\n");
            fprintf(stderr,
                    " Re-run with --no-correctness to measure throughput only.\n\n");
            return 1;
        }
        printf("\n");
    }

    printf("=== M3₅ NEON throughput ===\n");
    throughput_neon(seed, n_blocks, duration);
    return 0;
}
|