Cycle 5 Phase 3 closed: M1 PASS via bench pointer-convention fix
The previous "layout mismatch" deferral was a one-line bench bug: NEON expects the caller to pass tmp pointing at the 8x8 block origin (after the 2*16+2 padding skip), but the bench passed the raw padded-buffer origin. The C ref does the advance internally, so it filtered the correct block; NEON filtered a (+2 rows, +2 cols) shifted region. The diagonal-shift trace in the partial doc was exactly that.

Fix: tmps + i*TMP_INTS + (2*TMP_W + 2) for NEON calls.

Results:
- M1: 10000/10000 bit-exact (100.0000%), all 8 dirs balanced
- M3: 3.809 Mblock/s (consistent with 3.923 from longer window)

Phase 4 unblocked; predicted R5 = 0.02-0.05 (deep RED) per earlier analysis. Will build QPU CDEF anyway for cycle-completeness + V4L2 dispatch-path existence.

Files:
- tests/bench_neon_cdef.c: 3-line tmp pointer fix
- docs/k5_cdef_phase3.md: supersedes k5_cdef_phase3_partial.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -113,11 +113,16 @@ static int correctness_check(uint64_t seed, int n)
|
||||
tmp_center_to_dst(dst_a, tmp);
|
||||
memcpy(dst_b, dst_a, DST_BYTES);
|
||||
|
||||
/* C ref advances tmp internally by +2*stride+2.
|
||||
* NEON expects the caller to pass the already-advanced pointer
|
||||
* (i.e. pointer to the block-data origin, not the padded-buffer
|
||||
* origin). Hence the tmp + (2 * TMP_W + 2) for the NEON call. */
|
||||
daedalus_cdef_filter_8x8_pri_sec_ref(
|
||||
dst_a, DST_W, tmp, pri, sec, dir, damping, 8);
|
||||
dav1d_cdef_filter8_8bpc_neon(
|
||||
dst_b, DST_W, tmp, pri, sec, dir, damping, 8,
|
||||
/* edges = */ 0); /* != 0xf → non-edged path, uint16 tmp w/stride 12 */
|
||||
dst_b, DST_W, tmp + (2 * TMP_W + 2),
|
||||
pri, sec, dir, damping, 8,
|
||||
/* edges = */ 0); /* uint16 tmp non-edged path */
|
||||
|
||||
if (memcmp(dst_a, dst_b, DST_BYTES) != 0) {
|
||||
if (mismatches < 3) {
|
||||
@@ -180,7 +185,7 @@ static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
|
||||
for (int i = 0; i < n_blocks; i++)
|
||||
dav1d_cdef_filter8_8bpc_neon(
|
||||
work_dst + (size_t)i * DST_BYTES, DST_W,
|
||||
tmps + (size_t)i * TMP_INTS,
|
||||
tmps + (size_t)i * TMP_INTS + (2 * TMP_W + 2),
|
||||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||||
|
||||
double t0 = now_seconds();
|
||||
@@ -191,7 +196,7 @@ static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
|
||||
for (int i = 0; i < n_blocks; i++)
|
||||
dav1d_cdef_filter8_8bpc_neon(
|
||||
work_dst + (size_t)i * DST_BYTES, DST_W,
|
||||
tmps + (size_t)i * TMP_INTS,
|
||||
tmps + (size_t)i * TMP_INTS + (2 * TMP_W + 2),
|
||||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||||
done += n_blocks;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user