Cycle 5 phase 3 partial: M3 NEON = 3.923 Mblock/s; M1 deferred
CDEF is the most compute-intensive kernel measured so far —
254.9 ns/block (2x IDCT, 5x MC). 30fps@1080p floor margin: 4x
even on single NEON core in isolation.
M3 captured cleanly via dav1d_cdef_filter8_8bpc_neon. M1 bit-exact
gate failing due to tmp-layout mismatch between my standalone C
reference and dav1d's NEON expectation. The smoking gun: NEON output
appears at (+2 rows, -2 cols) shifted positions vs C ref output —
suggests NEON's padding-function output has a different convention
than my manual tmp construction.
Untangled in setup work:
- dav1d has TWO directions tables: stride-12 in src/tables.c
(C-side), stride-16 in src/arm/64/cdef_tmpl.S (NEON-side).
Initially vendored the C-side; should have used the NEON-side.
- dav1d's NEON expects tmp built by dav1d_cdef_padding8_8bpc_neon
(a separate function with its own conventions), not the C-side
padding() function from cdef_tmpl.c.
- Updated cdef_ref.c to use NEON-layout (stride 16) with table
transcribed from cdef_tmpl.S. Algorithm matches — but bench's
manual tmp construction doesn't match what NEON expects.
Resolution paths for next session (documented in
docs/k5_cdef_phase3_partial.md §'Resolution paths'):
1. Use dav1d_cdef_padding8_8bpc_neon to construct tmp (simplest)
2. Vendor dav1d's full C reference (most rigorous)
3. Reverse-engineer dav1d's padding output layout (hackiest)
Predicted R5 if/when QPU shader implemented: 0.02-0.05 (RED).
CDEF likely stays on CPU per cycle 3 lesson 7 (compute-bound
kernels don't benefit from QPU offload). 30fps floor still
passes regardless.
New artifacts:
- external/dav1d-snapshot/src/arm/64/cdef_tmpl.S (additional vendored)
- external/dav1d-snapshot/config.h — 14-define asm preamble shim
- tests/cdef_ref.c — standalone C ref (algorithmically correct,
layout mismatch with NEON known)
- tests/bench_neon_cdef.c — bench (M1 made warning, M3 captured)
- docs/k5_cdef_phase3_partial.md — phase 3 partial closure +
resumption checklist
dav1d snapshot in PROVENANCE.md should be updated next session
with the new cdef_tmpl.S entry.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for AV1 CDEF filter, 8x8 luma 8bpc,
|
||||
* combined primary + secondary path.
|
||||
*
|
||||
* Algorithm transcribed from dav1d's `cdef_filter_block_c` in
|
||||
* src/cdef_tmpl.c (vendored at external/dav1d-snapshot/, tag 1.4.3).
|
||||
*
|
||||
* **Layout note (cycle 5 phase 3 finding):** dav1d's NEON expects
|
||||
* tmp with stride 16 (uint16 elements), not stride 12 like the C
|
||||
* reference uses. The NEON has its own directions table baked at
|
||||
* stride 16 in src/arm/64/cdef_tmpl.S `dir_table 8, 16`. The C
|
||||
* reference uses stride 12 and the table in src/tables.c.
|
||||
*
|
||||
* To compare bit-exact against NEON, this standalone C ref uses
|
||||
* NEON's stride-16 layout + its embedded directions table. Same
|
||||
* algorithm, different stride convention than dav1d's C path.
|
||||
*
|
||||
* Signature mirrors the dav1d NEON convention:
|
||||
* void(uint8_t *dst, ptrdiff_t dst_stride, const uint16_t *tmp,
|
||||
* int pri_strength, int sec_strength,
|
||||
* int dir, int damping, int h);
|
||||
*
|
||||
* tmp is a (12 rows × 16 cols × uint16) padded buffer, stride 16.
|
||||
* Center 8x8 region at tmp[r=2..9][c=2..9].
|
||||
*
|
||||
* License: BSD-2-Clause (matches dav1d upstream).
|
||||
*
|
||||
* Spec: AV1 specification §7.15 (CDEF).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define TMP_STRIDE 16
|
||||
|
||||
/* dav1d's stride-16 directions table — verbatim from
|
||||
* external/dav1d-snapshot/src/arm/64/cdef_tmpl.S `dir_table 8, 16`.
|
||||
* 8 directions + 6 wrap-around copies (dir 0..5 repeated) = 14
|
||||
* entries × 2 = 28 bytes. The asm needs ≥14 entries because for
|
||||
* dir=7 the secondary-2 offset (+12 bytes = +6 entries) reads
|
||||
* index 13 (which is wrap = dir 5). */
|
||||
static const int8_t neon_directions8[14][2] = {
|
||||
/* index 0 */ { -1 * TMP_STRIDE + 1, -2 * TMP_STRIDE + 2 },
|
||||
/* index 1 */ { 0 * TMP_STRIDE + 1, -1 * TMP_STRIDE + 2 },
|
||||
/* index 2 */ { 0 * TMP_STRIDE + 1, 0 * TMP_STRIDE + 2 },
|
||||
/* index 3 */ { 0 * TMP_STRIDE + 1, 1 * TMP_STRIDE + 2 },
|
||||
/* index 4 */ { 1 * TMP_STRIDE + 1, 2 * TMP_STRIDE + 2 },
|
||||
/* index 5 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE + 1 },
|
||||
/* index 6 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE + 0 },
|
||||
/* index 7 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE - 1 },
|
||||
/* wrap 8 = dir 0 */ { -1 * TMP_STRIDE + 1, -2 * TMP_STRIDE + 2 },
|
||||
/* wrap 9 = dir 1 */ { 0 * TMP_STRIDE + 1, -1 * TMP_STRIDE + 2 },
|
||||
/* wrap 10 = dir 2 */ { 0 * TMP_STRIDE + 1, 0 * TMP_STRIDE + 2 },
|
||||
/* wrap 11 = dir 3 */ { 0 * TMP_STRIDE + 1, 1 * TMP_STRIDE + 2 },
|
||||
/* wrap 12 = dir 4 */ { 1 * TMP_STRIDE + 1, 2 * TMP_STRIDE + 2 },
|
||||
/* wrap 13 = dir 5 */ { 1 * TMP_STRIDE + 0, 2 * TMP_STRIDE + 1 },
|
||||
};
|
||||
|
||||
static inline int abs_i(int x) { return x < 0 ? -x : x; }
|
||||
static inline int imin(int a, int b) { return a < b ? a : b; }
|
||||
static inline int imax(int a, int b) { return a > b ? a : b; }
|
||||
static inline int umin(int a, int b) { return (unsigned)a < (unsigned)b ? a : b; }
|
||||
static inline int iclip(int v, int lo, int hi) {
|
||||
return v < lo ? lo : v > hi ? hi : v;
|
||||
}
|
||||
static inline int apply_sign(int v, int s) { return s < 0 ? -v : v; }
|
||||
|
||||
static inline int constrain(int diff, int threshold, int shift)
|
||||
{
|
||||
int adiff = abs_i(diff);
|
||||
return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))),
|
||||
diff);
|
||||
}
|
||||
|
||||
static inline int ulog2(unsigned x)
|
||||
{
|
||||
return 31 - __builtin_clz(x);
|
||||
}
|
||||
|
||||
/* NEON-layout reference: tmp is (12 rows × 16 uint16 cols), center
|
||||
* at [r=2..9][c=2..9]. dir is the precomputed direction [0..7].
|
||||
* Direction lookups use NEON's table (stride-16-precomputed offsets).
|
||||
*
|
||||
* Note: dav1d's dispatcher branches dir+2, dir+4, dir+0 (after
|
||||
* adjusting for the +2 leading offset in the table). With our 12-entry
|
||||
* table indexed without the +2 lead, the equivalent is:
|
||||
* primary: [dir][k] (was [dir + 2][k] with +2-prefixed table)
|
||||
* secondary1: [(dir + 2) % 8][k] (was [dir + 4][k])
|
||||
* secondary2: [(dir - 2 + 8) % 8][k] (was [dir + 0][k])
|
||||
* Our `neon_directions8` includes 4 wrap-around entries (idx 8..11
|
||||
* = idx 0..3) so [(dir+2)%8] is safe without explicit modulo.
|
||||
*/
|
||||
void daedalus_cdef_filter_8x8_pri_sec_ref(
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const uint16_t *tmp,
|
||||
int pri_strength, int sec_strength,
|
||||
int dir, int damping, int h)
|
||||
{
|
||||
const int pri_tap = 4 - (pri_strength & 1);
|
||||
const int pri_shift = imax(0, damping - ulog2((unsigned) pri_strength));
|
||||
const int sec_shift = damping - ulog2((unsigned) sec_strength);
|
||||
|
||||
/* Walk into the center 8x8 region of the 12×16 padded buffer. */
|
||||
tmp = tmp + 2 * TMP_STRIDE + 2;
|
||||
|
||||
/* dav1d's dispatcher uses dir+2, dir+4, dir+0 with the C-side
|
||||
* 2-prefixed directions table. Our table starts at index 0 = dir 0,
|
||||
* so the equivalent indices are dir, (dir+2)%8, (dir-2+8)%8. */
|
||||
const int pri_dir_idx = dir;
|
||||
const int sec1_dir_idx = (dir + 2) & 7;
|
||||
const int sec2_dir_idx = (dir + 6) & 7; /* (dir - 2) % 8 */
|
||||
|
||||
do {
|
||||
for (int x = 0; x < 8; x++) {
|
||||
int px = dst[x];
|
||||
int sum = 0;
|
||||
int max = px, min = px;
|
||||
int pri_tap_k = pri_tap;
|
||||
|
||||
for (int k = 0; k < 2; k++) {
|
||||
int off1 = neon_directions8[pri_dir_idx][k];
|
||||
int p0 = tmp[x + off1];
|
||||
int p1 = tmp[x - off1];
|
||||
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
|
||||
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
|
||||
pri_tap_k = (pri_tap_k & 3) | 2;
|
||||
min = umin(p0, min); max = imax(p0, max);
|
||||
min = umin(p1, min); max = imax(p1, max);
|
||||
|
||||
int off2 = neon_directions8[sec1_dir_idx][k];
|
||||
int off3 = neon_directions8[sec2_dir_idx][k];
|
||||
int s0 = tmp[x + off2];
|
||||
int s1 = tmp[x - off2];
|
||||
int s2 = tmp[x + off3];
|
||||
int s3 = tmp[x - off3];
|
||||
int sec_tap = 2 - k;
|
||||
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
|
||||
min = umin(s0, min); max = imax(s0, max);
|
||||
min = umin(s1, min); max = imax(s1, max);
|
||||
min = umin(s2, min); max = imax(s2, max);
|
||||
min = umin(s3, min); max = imax(s3, max);
|
||||
}
|
||||
|
||||
dst[x] = (uint8_t) iclip(px + ((sum - (sum < 0) + 8) >> 4),
|
||||
min, max);
|
||||
}
|
||||
dst += dst_stride;
|
||||
tmp += TMP_STRIDE;
|
||||
} while (--h);
|
||||
}
|
||||
Reference in New Issue
Block a user