Files
marfrit 20b59cd6a5 Cycle 5 phase 3 partial: M3 NEON = 3.923 Mblock/s; M1 deferred
CDEF is the most compute-intensive kernel measured so far —
254.9 ns/block (2x IDCT, 5x MC). 30fps@1080p floor margin: 4x
even on single NEON core in isolation.

M3 captured cleanly via dav1d_cdef_filter8_8bpc_neon. M1 bit-exact
gate failing due to tmp-layout mismatch between my standalone C
reference and dav1d's NEON expectation. The smoking gun: NEON output
appears at (+2 rows, -2 cols) shifted positions vs C ref output —
suggests NEON's padding-function output has a different convention
than my manual tmp construction.

Untangled in setup work:
- dav1d has TWO directions tables: stride-12 in src/tables.c
  (C-side), stride-16 in src/arm/64/cdef_tmpl.S (NEON-side).
  Initially vendored the C-side; should have used the NEON-side.
- dav1d's NEON expects tmp built by dav1d_cdef_padding8_8bpc_neon
  (a separate function with its own conventions), not the C-side
  padding() function from cdef_tmpl.c.
- Updated cdef_ref.c to use NEON-layout (stride 16) with table
  transcribed from cdef_tmpl.S. Algorithm matches — but bench's
  manual tmp construction doesn't match what NEON expects.

Resolution paths for next session (documented in
docs/k5_cdef_phase3_partial.md §'Resolution paths'):
1. Use dav1d_cdef_padding8_8bpc_neon to construct tmp (simplest)
2. Vendor dav1d's full C reference (most rigorous)
3. Reverse-engineer dav1d's padding output layout (hackiest)

Predicted R5 if/when QPU shader implemented: 0.02-0.05 (RED).
CDEF likely stays on CPU per cycle 3 lesson 7 (compute-bound
kernels don't benefit from QPU offload). 30fps floor still
passes regardless.

New artifacts:
- external/dav1d-snapshot/src/arm/64/cdef_tmpl.S (additional vendored)
- external/dav1d-snapshot/config.h — 14-define asm preamble shim
- tests/cdef_ref.c — standalone C ref (algorithmically correct,
                     layout mismatch with NEON known)
- tests/bench_neon_cdef.c — bench (M1 made warning, M3 captured)
- docs/k5_cdef_phase3_partial.md — phase 3 partial closure +
                                    resumption checklist

dav1d snapshot in PROVENANCE.md should be updated next session
with the new cdef_tmpl.S entry.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:21:24 +00:00

154 lines
6.5 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Standalone bit-exact C reference for AV1 CDEF filter, 8x8 luma 8bpc,
* combined primary + secondary path.
*
* Algorithm transcribed from dav1d's `cdef_filter_block_c` in
* src/cdef_tmpl.c (vendored at external/dav1d-snapshot/, tag 1.4.3).
*
* **Layout note (cycle 5 phase 3 finding):** dav1d's NEON expects
* tmp with stride 16 (uint16 elements), not stride 12 like the C
* reference uses. The NEON has its own directions table baked at
* stride 16 in src/arm/64/cdef_tmpl.S `dir_table 8, 16`. The C
* reference uses stride 12 and the table in src/tables.c.
*
* To compare bit-exact against NEON, this standalone C ref uses
* NEON's stride-16 layout + its embedded directions table. Same
* algorithm, different stride convention than dav1d's C path.
*
* Signature mirrors the dav1d NEON convention:
* void(uint8_t *dst, ptrdiff_t dst_stride, const uint16_t *tmp,
* int pri_strength, int sec_strength,
* int dir, int damping, int h);
*
* tmp is a (12 rows × 16 cols × uint16) padded buffer, stride 16.
* Center 8x8 region at tmp[r=2..9][c=2..9].
*
* License: BSD-2-Clause (matches dav1d upstream).
*
* Spec: AV1 specification §7.15 (CDEF).
*/
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#define TMP_STRIDE 16
/* dav1d's stride-16 directions table — verbatim from
* external/dav1d-snapshot/src/arm/64/cdef_tmpl.S `dir_table 8, 16`.
* 8 directions + 6 wrap-around copies (dir 0..5 repeated) = 14
* entries × 2 = 28 bytes. The asm needs ≥14 entries because for
* dir=7 the secondary-2 offset (+12 bytes = +6 entries) reads
* index 13 (which is wrap = dir 5). */
static const int8_t neon_directions8[14][2] = {
/* index 0 */ { -1 * TMP_STRIDE + 1, -2 * TMP_STRIDE + 2 },
/* index 1 */ { 0 * TMP_STRIDE + 1, -1 * TMP_STRIDE + 2 },
/* index 2 */ { 0 * TMP_STRIDE + 1, 0 * TMP_STRIDE + 2 },
/* index 3 */ { 0 * TMP_STRIDE + 1, 1 * TMP_STRIDE + 2 },
/* index 4 */ { 1 * TMP_STRIDE + 1, 2 * TMP_STRIDE + 2 },
/* index 5 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE + 1 },
/* index 6 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE + 0 },
/* index 7 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE - 1 },
/* wrap 8 = dir 0 */ { -1 * TMP_STRIDE + 1, -2 * TMP_STRIDE + 2 },
/* wrap 9 = dir 1 */ { 0 * TMP_STRIDE + 1, -1 * TMP_STRIDE + 2 },
/* wrap 10 = dir 2 */ { 0 * TMP_STRIDE + 1, 0 * TMP_STRIDE + 2 },
/* wrap 11 = dir 3 */ { 0 * TMP_STRIDE + 1, 1 * TMP_STRIDE + 2 },
/* wrap 12 = dir 4 */ { 1 * TMP_STRIDE + 1, 2 * TMP_STRIDE + 2 },
/* wrap 13 = dir 5 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE + 1 },
};
static inline int abs_i(int x) { return x < 0 ? -x : x; }
static inline int imin(int a, int b) { return a < b ? a : b; }
static inline int imax(int a, int b) { return a > b ? a : b; }
static inline int umin(int a, int b) { return (unsigned)a < (unsigned)b ? a : b; }
static inline int iclip(int v, int lo, int hi) {
return v < lo ? lo : v > hi ? hi : v;
}
static inline int apply_sign(int v, int s) { return s < 0 ? -v : v; }
static inline int constrain(int diff, int threshold, int shift)
{
int adiff = abs_i(diff);
return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))),
diff);
}
static inline int ulog2(unsigned x)
{
return 31 - __builtin_clz(x);
}
/* NEON-layout reference: tmp is (12 rows × 16 uint16 cols), center
* at [r=2..9][c=2..9]. dir is the precomputed direction [0..7].
* Direction lookups use NEON's table (stride-16-precomputed offsets).
*
* Note: dav1d's dispatcher branches dir+2, dir+4, dir+0 (after
* adjusting for the +2 leading offset in the table). With our 12-entry
* table indexed without the +2 lead, the equivalent is:
* primary: [dir][k] (was [dir + 2][k] with +2-prefixed table)
* secondary1: [(dir + 2) % 8][k] (was [dir + 4][k])
* secondary2: [(dir - 2 + 8) % 8][k] (was [dir + 0][k])
* Our `neon_directions8` includes 4 wrap-around entries (idx 8..11
* = idx 0..3) so [(dir+2)%8] is safe without explicit modulo.
*/
void daedalus_cdef_filter_8x8_pri_sec_ref(
uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp,
int pri_strength, int sec_strength,
int dir, int damping, int h)
{
const int pri_tap = 4 - (pri_strength & 1);
const int pri_shift = imax(0, damping - ulog2((unsigned) pri_strength));
const int sec_shift = damping - ulog2((unsigned) sec_strength);
/* Walk into the center 8x8 region of the 12×16 padded buffer. */
tmp = tmp + 2 * TMP_STRIDE + 2;
/* dav1d's dispatcher uses dir+2, dir+4, dir+0 with the C-side
* 2-prefixed directions table. Our table starts at index 0 = dir 0,
* so the equivalent indices are dir, (dir+2)%8, (dir-2+8)%8. */
const int pri_dir_idx = dir;
const int sec1_dir_idx = (dir + 2) & 7;
const int sec2_dir_idx = (dir + 6) & 7; /* (dir - 2) % 8 */
do {
for (int x = 0; x < 8; x++) {
int px = dst[x];
int sum = 0;
int max = px, min = px;
int pri_tap_k = pri_tap;
for (int k = 0; k < 2; k++) {
int off1 = neon_directions8[pri_dir_idx][k];
int p0 = tmp[x + off1];
int p1 = tmp[x - off1];
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
pri_tap_k = (pri_tap_k & 3) | 2;
min = umin(p0, min); max = imax(p0, max);
min = umin(p1, min); max = imax(p1, max);
int off2 = neon_directions8[sec1_dir_idx][k];
int off3 = neon_directions8[sec2_dir_idx][k];
int s0 = tmp[x + off2];
int s1 = tmp[x - off2];
int s2 = tmp[x + off3];
int s3 = tmp[x - off3];
int sec_tap = 2 - k;
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
min = umin(s0, min); max = imax(s0, max);
min = umin(s1, min); max = imax(s1, max);
min = umin(s2, min); max = imax(s2, max);
min = umin(s3, min); max = imax(s3, max);
}
dst[x] = (uint8_t) iclip(px + ((sum - (sum < 0) + 8) >> 4),
min, max);
}
dst += dst_stride;
tmp += TMP_STRIDE;
} while (--h);
}