/*
 * Standalone bit-exact C reference for AV1 CDEF filter, 8x8 luma 8bpc,
 * combined primary + secondary path.
 *
 * Algorithm transcribed from dav1d's `cdef_filter_block_c` in
 * src/cdef_tmpl.c (vendored at external/dav1d-snapshot/, tag 1.4.3).
 *
 * **Layout note (cycle 5 phase 3 finding):** dav1d's NEON expects
 * tmp with stride 16 (uint16 elements), not stride 12 like the C
 * reference uses. The NEON has its own directions table baked at
 * stride 16 in src/arm/64/cdef_tmpl.S `dir_table 8, 16`. The C
 * reference uses stride 12 and the table in src/tables.c.
 *
 * To compare bit-exact against NEON, this standalone C ref uses
 * NEON's stride-16 layout + its embedded directions table. Same
 * algorithm, different stride convention than dav1d's C path.
 *
 * Signature mirrors the dav1d NEON convention:
 *   void(uint8_t *dst, ptrdiff_t dst_stride, const uint16_t *tmp,
 *        int pri_strength, int sec_strength,
 *        int dir, int damping, int h);
 *
 * tmp is a (12 rows x 16 cols x uint16) padded buffer, stride 16.
 * Center 8x8 region at tmp[r=2..9][c=2..9].
 *
 * License: BSD-2-Clause (matches dav1d upstream).
 *
 * Spec: AV1 specification §7.15 (CDEF).
 */

#include <stddef.h>
#include <stdint.h>

#define TMP_STRIDE 16

/* dav1d's stride-16 directions table — verbatim from
 * external/dav1d-snapshot/src/arm/64/cdef_tmpl.S `dir_table 8, 16`.
 * 8 directions + 6 wrap-around copies (dir 0..5 repeated) = 14
 * entries x 2 = 28 bytes. The asm needs >=14 entries because for
 * dir=7 the secondary-2 offset (+12 bytes = +6 entries) reads
 * index 13 (which is wrap = dir 5).
 *
 * The C filter below masks every index with `& 7`, so it only ever
 * reads entries 0..7; the wrap copies are kept so the table stays
 * identical to the asm one. Every offset fits in int8_t (range used
 * here: -30 .. +34).
 */
static const int8_t neon_directions8[14][2] = {
    /* index 0        */ { -1 * TMP_STRIDE + 1, -2 * TMP_STRIDE + 2 },
    /* index 1        */ {  0 * TMP_STRIDE + 1, -1 * TMP_STRIDE + 2 },
    /* index 2        */ {  0 * TMP_STRIDE + 1,  0 * TMP_STRIDE + 2 },
    /* index 3        */ {  0 * TMP_STRIDE + 1,  1 * TMP_STRIDE + 2 },
    /* index 4        */ {  1 * TMP_STRIDE + 1,  2 * TMP_STRIDE + 2 },
    /* index 5        */ {  1 * TMP_STRIDE + 0,  2 * TMP_STRIDE + 1 },
    /* index 6        */ {  1 * TMP_STRIDE + 0,  2 * TMP_STRIDE + 0 },
    /* index 7        */ {  1 * TMP_STRIDE + 0,  2 * TMP_STRIDE - 1 },
    /* wrap 8  = dir 0 */ { -1 * TMP_STRIDE + 1, -2 * TMP_STRIDE + 2 },
    /* wrap 9  = dir 1 */ {  0 * TMP_STRIDE + 1, -1 * TMP_STRIDE + 2 },
    /* wrap 10 = dir 2 */ {  0 * TMP_STRIDE + 1,  0 * TMP_STRIDE + 2 },
    /* wrap 11 = dir 3 */ {  0 * TMP_STRIDE + 1,  1 * TMP_STRIDE + 2 },
    /* wrap 12 = dir 4 */ {  1 * TMP_STRIDE + 1,  2 * TMP_STRIDE + 2 },
    /* wrap 13 = dir 5 */ {  1 * TMP_STRIDE + 0,  2 * TMP_STRIDE + 1 },
};

/* Absolute value of x. */
static inline int abs_i(int x)
{
    return x < 0 ? -x : x;
}

/* Signed minimum. */
static inline int imin(int a, int b)
{
    return a < b ? a : b;
}

/* Signed maximum. */
static inline int imax(int a, int b)
{
    return a > b ? a : b;
}

/* Minimum under unsigned comparison: a negative operand compares as a
 * huge unsigned value and therefore never wins. */
static inline int umin(int a, int b)
{
    return (unsigned) a < (unsigned) b ? a : b;
}

/* Clamps v to [lo, hi]. */
static inline int iclip(int v, int lo, int hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* Returns v carrying the sign of s (v is negated when s < 0). */
static inline int apply_sign(int v, int s)
{
    return s < 0 ? -v : v;
}

/* CDEF constrain function:
 *   sign(diff) * min(|diff|, max(0, threshold - (|diff| >> shift)))
 * With threshold == 0 the result is 0 for any non-negative shift. */
static inline int constrain(int diff, int threshold, int shift)
{
    int adiff = abs_i(diff);
    return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))),
                      diff);
}

/* floor(log2(x)). x must be non-zero: __builtin_clz(0) is undefined. */
static inline int ulog2(unsigned x)
{
    return 31 - __builtin_clz(x);
}

/* Loads one sample of the padded buffer as a signed 16-bit quantity.
 *
 * Values >= 0x8000 map to negative numbers, so a 0x8000 padding
 * sentinel becomes -32768: umin() then ignores it for `min`, imax()
 * ignores it for `max`, and constrain() maps its huge difference to 0.
 * Reading it as +32768 instead would silently disable the upper clip.
 * Done arithmetically rather than by casting to int16_t, because that
 * conversion is implementation-defined for out-of-range values.
 * NOTE(review): 0x8000 as the "pixel unavailable" marker is dav1d's
 * padding convention — confirm against the padding code feeding tmp.
 */
static inline int load_tap(const uint16_t *p)
{
    const int v = *p;
    return v >= 0x8000 ? v - 0x10000 : v;
}

/* NEON-layout reference: tmp is (12 rows x 16 uint16 cols), center
 * at [r=2..9][c=2..9]. dir is the precomputed direction [0..7].
 * Direction lookups use NEON's table (stride-16-precomputed offsets).
 *
 * Note: dav1d's C path indexes its table with dir+2, dir+4, dir+0
 * because that table carries a 2-entry lead. `neon_directions8` has no
 * lead (index 0 = dir 0), so the equivalent indices are:
 *   primary:    [dir & 7][k]          (was [dir + 2][k])
 *   secondary1: [(dir + 2) & 7][k]    (was [dir + 4][k])
 *   secondary2: [(dir + 6) & 7][k]    (was [dir + 0][k], i.e. dir - 2)
 * All three are masked explicitly, so the table's wrap-around entries
 * (idx 8..13) are not needed by this function.
 *
 * Parameters:
 *   dst          top-left of the 8-pixel-wide block; read as the
 *                center pixel values and overwritten in place
 *   dst_stride   distance between dst rows, in bytes (== pixels, 8bpc)
 *   tmp          base of the padded buffer (NOT the center; the
 *                function advances by 2 rows + 2 columns itself)
 *   pri_strength primary threshold; 0 makes the primary taps inert
 *   sec_strength secondary threshold; 0 makes the secondary taps inert
 *   dir          filter direction, taken modulo 8
 *   damping      damping; the secondary shift
 *                (damping - ulog2(sec_strength)) is not clamped, so it
 *                must not come out negative
 *   h            number of rows to filter; h <= 0 filters nothing.
 *                The 12-row buffer described above accommodates h <= 8.
 */
void daedalus_cdef_filter_8x8_pri_sec_ref(
    uint8_t *dst, ptrdiff_t dst_stride,
    const uint16_t *tmp,
    int pri_strength, int sec_strength,
    int dir, int damping, int h)
{
    const int pri_tap = 4 - (pri_strength & 1);

    /* ulog2(0) is undefined; a zero strength yields constrain() == 0
     * whatever the shift, so 0 is a safe stand-in. */
    const int pri_shift = pri_strength
        ? imax(0, damping - ulog2((unsigned) pri_strength))
        : 0;
    const int sec_shift = sec_strength
        ? damping - ulog2((unsigned) sec_strength)
        : 0;

    const int pri_dir_idx  = dir & 7;
    const int sec1_dir_idx = (dir + 2) & 7;
    const int sec2_dir_idx = (dir + 6) & 7; /* (dir - 2) mod 8 */

    /* Walk into the center 8x8 region of the 12x16 padded buffer. */
    tmp += 2 * TMP_STRIDE + 2;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            const int px = dst[x];
            int sum = 0;
            int max = px, min = px;
            int pri_tap_k = pri_tap;

            for (int k = 0; k < 2; k++) {
                const int off1 = neon_directions8[pri_dir_idx][k];
                const int p0 = load_tap(&tmp[x + off1]);
                const int p1 = load_tap(&tmp[x - off1]);

                sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
                sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
                /* if pri_tap_k == 4 it becomes 2, else it remains 3 */
                pri_tap_k = (pri_tap_k & 3) | 2;

                min = umin(p0, min);
                max = imax(p0, max);
                min = umin(p1, min);
                max = imax(p1, max);

                const int off2 = neon_directions8[sec1_dir_idx][k];
                const int off3 = neon_directions8[sec2_dir_idx][k];
                const int s0 = load_tap(&tmp[x + off2]);
                const int s1 = load_tap(&tmp[x - off2]);
                const int s2 = load_tap(&tmp[x + off3]);
                const int s3 = load_tap(&tmp[x - off3]);

                /* secondary tap weight: 2 for k == 0, 1 for k == 1 */
                const int sec_tap = 2 - k;
                sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
                sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
                sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
                sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);

                min = umin(s0, min);
                max = imax(s0, max);
                min = umin(s1, min);
                max = imax(s1, max);
                min = umin(s2, min);
                max = imax(s2, max);
                min = umin(s3, min);
                max = imax(s3, max);
            }

            /* Round sum/16 symmetrically around zero, then clip to the
             * range spanned by the center pixel and its valid taps. */
            dst[x] = (uint8_t) iclip(px + ((sum - (sum < 0) + 8) >> 4),
                                     min, max);
        }
        dst += dst_stride;
        tmp += TMP_STRIDE;
    }
}