Cycle 5 phase 3 partial: M3 NEON = 3.923 Mblock/s; M1 deferred
CDEF is the most compute-intensive kernel measured so far:
254.9 ns/block (about 2x the IDCT cost and 5x the MC cost). The
30fps@1080p floor is still met with a 4x margin, even on a single
NEON core in isolation.
M3 was captured cleanly via dav1d_cdef_filter8_8bpc_neon. The M1
bit-exact gate is failing because of a tmp-layout mismatch between my
standalone C reference and what dav1d's NEON code expects. The smoking
gun: the NEON output appears shifted by (+2 rows, -2 cols) relative to
the C reference output, which suggests that NEON's padding function
uses a different layout convention than my manual tmp construction.
Untangled in setup work:
- dav1d has TWO directions tables: stride-12 in src/tables.c
(C-side), stride-16 in src/arm/64/cdef_tmpl.S (NEON-side).
Initially vendored the C-side; should have used the NEON-side.
- dav1d's NEON expects tmp built by dav1d_cdef_padding8_8bpc_neon
(a separate function with its own conventions), not the C-side
padding() function from cdef_tmpl.c.
- Updated cdef_ref.c to use NEON-layout (stride 16) with table
transcribed from cdef_tmpl.S. Algorithm matches — but bench's
manual tmp construction doesn't match what NEON expects.
Resolution paths for next session (documented in
docs/k5_cdef_phase3_partial.md §'Resolution paths'):
1. Use dav1d_cdef_padding8_8bpc_neon to construct tmp (simplest)
2. Vendor dav1d's full C reference (most rigorous)
3. Reverse-engineer dav1d's padding output layout (hackiest)
Predicted R5 if/when QPU shader implemented: 0.02-0.05 (RED).
CDEF likely stays on CPU per cycle 3 lesson 7 (compute-bound
kernels don't benefit from QPU offload). 30fps floor still
passes regardless.
New artifacts:
- external/dav1d-snapshot/src/arm/64/cdef_tmpl.S (additional vendored)
- external/dav1d-snapshot/config.h — 15-define asm preamble shim
- tests/cdef_ref.c — standalone C ref (algorithmically correct,
layout mismatch with NEON known)
- tests/bench_neon_cdef.c — bench (M1 gate downgraded to a warning, M3 captured)
- docs/k5_cdef_phase3_partial.md — phase 3 partial closure +
resumption checklist
The dav1d snapshot entry in PROVENANCE.md should be updated next
session to list the newly vendored cdef_tmpl.S.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Vendored
+35
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Minimal config.h shim for assembling dav1d's vendored .S files
|
||||
* outside the dav1d build tree. Targets aarch64-Linux, A76 (no SVE).
|
||||
*
|
||||
* Defines collected by grep over src/arm/asm.S + src/arm/64/*.S.
|
||||
* See ../../docs/k5_cdef_phase1_2.md.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
/* Target selection: 64-bit ARM enabled; 32-bit ARM and Thumb disabled. */
#define ARCH_AARCH64 1
|
||||
#define ARCH_ARM 0
|
||||
#define CONFIG_THUMB 0
|
||||
|
||||
/* Assembler feature switches. The SVE and SVE2 directive switches are 0,
 * in line with the "no SVE" target named in the header comment of this file.
 * NOTE(review): the set of names was collected by grep (see header comment);
 * re-check it whenever further .S files are vendored. */
#define HAVE_AS_FUNC 1
|
||||
#define HAVE_AS_ARCH_DIRECTIVE 1
|
||||
#define AS_ARCH_LEVEL armv8-a
|
||||
#define HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE 1
|
||||
#define HAVE_AS_ARCHEXT_I8MM_DIRECTIVE 1
|
||||
#define HAVE_AS_ARCHEXT_SVE_DIRECTIVE 0
|
||||
#define HAVE_AS_ARCHEXT_SVE2_DIRECTIVE 0
|
||||
|
||||
/* PRIVATE_PREFIX is the symbol-name prefix dav1d uses. By convention
|
||||
* dav1d_ in the exported symbols (e.g. dav1d_cdef_filter8_8bpc_neon). */
|
||||
#define PRIVATE_PREFIX dav1d_
|
||||
|
||||
/* CdefEdgeFlags bit values — from dav1d include/dav1d/cdef.h (enum):
|
||||
* CDEF_HAVE_LEFT = 1
|
||||
* CDEF_HAVE_RIGHT = 2
|
||||
* CDEF_HAVE_TOP = 4
|
||||
* CDEF_HAVE_BOTTOM = 8
|
||||
* The asm references these as bit-test immediate values. */
|
||||
/* The four flags are distinct single bits, so all four set equals 0xf. */
#define CDEF_HAVE_LEFT 1
|
||||
#define CDEF_HAVE_RIGHT 2
|
||||
#define CDEF_HAVE_TOP 4
|
||||
#define CDEF_HAVE_BOTTOM 8
|
||||
+511
@@ -0,0 +1,511 @@
|
||||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2020, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
// dir_table w, stride
// Emits the constant table directions<w>: 14 entries of two bytes each.
// Every byte is an offset of the form dy * stride + dx, measured in tmp
// elements (load_px sign-extends the byte and scales it by 2 bytes).
// Within an entry, the first byte is read on the first tap round and the
// second byte on the second round (filter_func advances its pointer by one
// byte per round).
// filter_func points at entry dir and additionally reads entries dir + 2
// and dir + 6, so the first 6 of the 8 distinct entries are repeated at the
// end rather than wrapping the index.
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
.byte 1 * \stride + 0, 2 * \stride + 0
|
||||
.byte 1 * \stride + 0, 2 * \stride - 1
|
||||
// Repeated, to avoid & 7
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
endconst
|
||||
.endm
|
||||
|
||||
// tables
// Instantiates the constants used by filter_func: directions8 (built with a
// tmp stride of 16 elements), directions4 (stride of 8 elements) and
// pri_taps.
// pri_taps is read as two 2-byte pairs, {4, 2} and {3, 3}: filter_func
// selects the pair with bit 0 of pri_strength (taken after shifting by
// bitdepth_min_8 in the 16 bpc build) and then reads one byte per tap round.
// NOTE(review): this macro is not expanded anywhere in the visible part of
// the file; presumably the including .S file invokes it - confirm.
.macro tables
|
||||
dir_table 8, 16
|
||||
dir_table 4, 8
|
||||
|
||||
const pri_taps
|
||||
.byte 4, 2, 3, 3
|
||||
endconst
|
||||
.endm
|
||||
|
||||
// load_px d1, d2, w
// Loads the pair of neighbour vectors located at +off and -off from the
// current pixel position.
//   In:   x2 = pointer to the current px row in tmp (16-bit elements)
//         w9 = offset in elements; only the low byte is used, sign-extended
//   Out:  d1 = p0 (elements at x + off), d2 = p1 (elements at x - off)
//   Clobbers: x6, x9 (w9 is consumed, so it must be reloaded per call)
// For w == 8 one row of 8 elements is loaded per vector. Otherwise two rows
// of 4 elements are loaded, the second row 2*8 bytes after the first, into
// the low and high 64-bit halves of each vector.
.macro load_px d1, d2, w
|
||||
.if \w == 8
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().8h}, [x6] // p0
|
||||
ld1 {\d2\().8h}, [x9] // p1
|
||||
.else
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().4h}, [x6] // p0
|
||||
add x6, x6, #2*8 // += stride
|
||||
ld1 {\d2\().4h}, [x9] // p1
|
||||
add x9, x9, #2*8 // += stride
|
||||
ld1 {\d1\().d}[1], [x6] // p0
|
||||
ld1 {\d2\().d}[1], [x9] // p1
|
||||
.endif
|
||||
.endm
|
||||
// handle_pixel s1, s2, thresh_vec, shift, tap, min
// Accumulates one tap pair into the running filter sum.
//   In:   s1, s2     = neighbour vectors p0 and p1 (8 x 16 bit)
//         thresh_vec = threshold broadcast to all lanes
//         shift      = vector holding the NEGATED shift amount, so the ushl
//                      below acts as a right shift
//         tap        = general register holding taps[k]
//         min        = assemble-time flag: also update running min/max
//         v0 = px, v1 = sum (updated), v2/v3 = min/max (updated if min)
//   Clobbers: v16-v22
// The two neighbours are processed in interleaved instruction pairs
// (v16-v18 for s1, v20-v22 for s2); v19 holds the broadcast tap.
// NOTE(review): the running minimum uses an unsigned compare while the
// running maximum uses a signed compare. This looks deliberate - presumably
// so that a sentinel with only the top bit set is ignored by both - confirm
// against the padding routine, which is not part of this file.
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
umin v2.8h, v2.8h, \s1\().8h
|
||||
smax v3.8h, v3.8h, \s1\().8h
|
||||
umin v2.8h, v2.8h, \s2\().8h
|
||||
smax v3.8h, v3.8h, \s2\().8h
|
||||
.endif
|
||||
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
|
||||
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
|
||||
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
|
||||
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
|
||||
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
|
||||
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
|
||||
neg v16.8h, v17.8h // -clip
|
||||
neg v20.8h, v21.8h // -clip
|
||||
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
|
||||
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
|
||||
dup v19.8h, \tap // taps[k]
|
||||
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
|
||||
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h, size_t edges);
|
||||
// filter_func w, bpc, pri, sec, min, suffix
// Emits one non-exported filter variant named
// cdef_filter<w><suffix>_<bpc>bpc_neon.
//   w      = block width (8, otherwise the two-rows-of-4 path is used)
//   bpc    = 8 or 16
//   pri    = 1 to assemble the primary tap handling
//   sec    = 1 to assemble the secondary tap handling
//   min    = 1 to track min/max of the taps and clamp the result to them
// Register use as read by the code below:
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h,
//   [sp] = edges (read for 8 bpc), [sp, #8] = bitdepth_max (read for 16 bpc)
//   x5 = pointer into directions<w>, x8 = pointer into pri_taps
//   v25/v27 = pri/sec threshold, v24/v26 = negated pri/sec shift
//   w11 = secondary tap value, doubling as the tap-round counter
// Scratch: x6, x9-x12, v0-v7, v16-v22, v24-v28, v30.
// For 8 bpc, edges == 0xf diverts to cdef_filter<w><suffix>_edged_8bpc_neon,
// which is defined outside this file.
.macro filter_func w, bpc, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
|
||||
.if \bpc == 8
|
||||
ldr w8, [sp] // edges
|
||||
cmp w8, #0xf
|
||||
b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
|
||||
.endif
|
||||
.if \pri
|
||||
.if \bpc == 16
|
||||
ldr w9, [sp, #8] // bitdepth_max
|
||||
clz w9, w9
|
||||
sub w9, w9, #24 // -bitdepth_min_8
|
||||
neg w9, w9 // bitdepth_min_8
|
||||
.endif
|
||||
movrel x8, pri_taps
|
||||
.if \bpc == 16
|
||||
lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
|
||||
and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
|
||||
.else
|
||||
and w9, w3, #1
|
||||
.endif
|
||||
// x8 = pri_taps + 2 * (selected bit): start of the chosen 2-byte tap pair
add x8, x8, w9, uxtw #1
|
||||
.endif
|
||||
movrel x9, directions\w
|
||||
// x5 = directions + 2 * dir: each table entry is two bytes
add x5, x9, w5, uxtw #1
|
||||
movi v30.4h, #15
|
||||
dup v28.4h, w6 // damping
|
||||
|
||||
.if \pri
|
||||
dup v25.8h, w3 // threshold
|
||||
.endif
|
||||
.if \sec
|
||||
dup v27.8h, w4 // threshold
|
||||
.endif
|
||||
// Interleave so that lane 0 carries the pri threshold and lane 1 the sec
// threshold; both shifts are then derived with one instruction sequence.
// If pri or sec is disabled, its source register was not written above,
// but the matching lane is not read below either.
trn1 v24.4h, v25.4h, v27.4h
|
||||
clz v24.4h, v24.4h // clz(threshold)
|
||||
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
|
||||
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.4h, v24.4h // -shift
|
||||
.if \sec
|
||||
dup v26.8h, v24.h[1]
|
||||
.endif
|
||||
.if \pri
|
||||
dup v24.8h, v24.h[0]
|
||||
.endif
|
||||
|
||||
// Outer loop: one iteration per output row (w == 8) or per two rows.
1:
|
||||
.if \w == 8
|
||||
ld1 {v0.8h}, [x2] // px
|
||||
.else
|
||||
add x12, x2, #2*8
|
||||
ld1 {v0.4h}, [x2] // px
|
||||
ld1 {v0.d}[1], [x12] // px
|
||||
.endif
|
||||
|
||||
movi v1.8h, #0 // sum
|
||||
.if \min
|
||||
mov v2.16b, v0.16b // min
|
||||
mov v3.16b, v0.16b // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov w11, #2 // sec_taps[0]
|
||||
|
||||
// Inner loop: two tap rounds (w11 = 2, then 1).
2:
|
||||
.if \pri
|
||||
ldrb w9, [x5] // off1
|
||||
|
||||
load_px v4, v5, \w
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #4 // +2*2
|
||||
ldrb w9, [x5] // off2
|
||||
load_px v6, v7, \w
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb w10, [x8] // *pri_taps
|
||||
|
||||
handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #8 // +2*4
|
||||
ldrb w9, [x5] // off3
|
||||
// v4/v5 can be reused here: the primary taps were consumed above
load_px v4, v5, \w
|
||||
|
||||
handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
|
||||
|
||||
handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
|
||||
|
||||
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
|
||||
.else
|
||||
add x5, x5, #1 // x5 += 1
|
||||
.endif
|
||||
subs w11, w11, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
// add does not touch the flags set by subs, which b.ne consumes
add x8, x8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
b.ne 2b
|
||||
|
||||
cmlt v4.8h, v1.8h, #0 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
|
||||
.if \min
|
||||
smin v0.8h, v0.8h, v3.8h
|
||||
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
|
||||
.endif
|
||||
.if \bpc == 8
|
||||
// narrow the 16-bit results to 8-bit pixels for the store
xtn v0.8b, v0.8h
|
||||
.endif
|
||||
.if \w == 8
|
||||
add x2, x2, #2*16 // tmp += tmp_stride
|
||||
subs w7, w7, #1 // h--
|
||||
.if \bpc == 8
|
||||
st1 {v0.8b}, [x0], x1
|
||||
.else
|
||||
st1 {v0.8h}, [x0], x1
|
||||
.endif
|
||||
.else
|
||||
.if \bpc == 8
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
.else
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
.endif
|
||||
add x2, x2, #2*16 // tmp += 2*tmp_stride
|
||||
subs w7, w7, #2 // h -= 2
|
||||
.if \bpc == 8
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
.else
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// Reset pri_taps and directions back to the original point
|
||||
// (both pointers advanced by 1 in each of the two tap rounds; plain sub is
// used so the flags from the h decrement above survive until b.gt)
sub x5, x5, #2
|
||||
.if \pri
|
||||
sub x8, x8, #2
|
||||
.endif
|
||||
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// filter w, bpc
// Emits the three internal variants (_pri, _sec, _pri_sec) for one width and
// bit depth, followed by the exported entry point
// cdef_filter<w>_<bpc>bpc_neon. Only the combined variant is built with
// min=1, i.e. with min/max tracking and clamping.
// The entry point dispatches on w3 (pri_strength) and w4 (sec_strength):
//   pri_strength == 0                     -> _sec variant
//   pri_strength != 0, sec_strength == 0  -> _pri variant
//   both non-zero                         -> _pri_sec variant
// It uses plain branches (b), so the selected variant runs with the original
// arguments and its ret returns straight to the caller of the entry point.
.macro filter w, bpc
|
||||
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
|
||||
function cdef_filter\w\()_\bpc\()bpc_neon, export=1
|
||||
cbnz w3, 1f // pri_strength
|
||||
b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
|
||||
1:
|
||||
cbnz w4, 1f // sec_strength
|
||||
b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
|
||||
1:
|
||||
b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// div_table: eight 16-bit weights equal to 840 / n for n = 1..8.
// find_dir loads them as one 8h vector and widens them to scale the squared
// sum_diag partial sums.
const div_table
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
||||
// alt_fact: twelve 16-bit weights, loaded by find_dir as three 4h vectors
// (v29, v30, v31) and applied by cost_alt to the squared sum_alt partial
// sums. The last entry is 0, so the lane it is applied to contributes
// nothing to the cost.
const alt_fact
|
||||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
// cost_alt d1, d2, s1, s2, s3, s4
// Computes two weighted sum-of-squares costs at once.
//   In:   s1:s2 = first 16-bit partial-sum accumulator (all 8 lanes of s1,
//                 low 4 lanes of s2); s3:s4 = second accumulator, same shape
//         v29, v30, v31 = 32-bit weights for lanes 0-3, 4-7 and 8-11
//   Out:  d1, d2 = scalar (s-register) results, the sum across lanes of
//                  square * weight for each accumulator
//   Clobbers: v22-v27
.macro cost_alt d1, d2, s1, s2, s3, s4
|
||||
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v23.4s, \s1\().8h, \s1\().8h
|
||||
smull v24.4s, \s2\().4h, \s2\().4h
|
||||
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v26.4s, \s3\().8h, \s3\().8h
|
||||
smull v27.4s, \s4\().4h, \s4\().4h
|
||||
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v22.4s, v23.4s, v30.4s
|
||||
mla v22.4s, v24.4s, v31.4s
|
||||
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v25.4s, v26.4s, v30.4s
|
||||
mla v25.4s, v27.4s, v31.4s
|
||||
addv \d1, v22.4s // *cost_ptr
|
||||
addv \d2, v25.4s // *cost_ptr
|
||||
.endm
|
||||
|
||||
// find_best s1, s2, s3
// One step of the running arg-max over the cost values.
//   In:   w0 = best_dir, w1 = best_cost, w3 = n (index of the cost in w4),
//         w4 = cost[n], which must already have been moved out of s1
//         s2 (optional) = vector whose lane 0 holds cost[n + 1]
//         s3 (optional) = vector whose lane 0 holds cost[n + 2]; it is
//                         preloaded into w4 for the next invocation
//   Out:  w0/w1 updated; when s2 is given, w3 advanced by 2
//   Clobbers: w5, flags
// s1 is not referenced in the body; it only names the vector that w4 came
// from. Comparisons are signed greater-than, so on equal cost the earlier
// direction is kept.
.macro find_best s1, s2, s3
|
||||
.ifnb \s2
|
||||
mov w5, \s2\().s[0]
|
||||
.endif
|
||||
cmp w4, w1 // cost[n] > best_cost
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w4, w1, gt // best_cost = cost[n]
|
||||
.ifnb \s2
|
||||
add w3, w3, #1 // n++
|
||||
cmp w5, w1 // cost[n] > best_cost
|
||||
mov w4, \s3\().s[0]
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w5, w1, gt // best_cost = cost[n]
|
||||
add w3, w3, #1 // n++
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Steps for loading and preparing each row
|
||||
// The work is split into three macros so that find_dir can interleave the
// pieces with unrelated arithmetic.
// Step 1: load one row of 8 pixels from [x0] into s1 and post-increment x0
// by the stride in x1 (8 bytes are read for 8 bpc, 16 bytes otherwise).
.macro dir_load_step1 s1, bpc
|
||||
.if \bpc == 8
|
||||
ld1 {\s1\().8b}, [x0], x1
|
||||
.else
|
||||
ld1 {\s1\().8h}, [x0], x1
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Step 2 of row preparation.
// 8 bpc: widen the 8 bytes in s1 to 16 bit while subtracting v31 (which
// find_dir sets to 128 in every byte).
// Otherwise: shift each 16-bit lane by the per-lane amounts in v8; find_dir
// fills v8 with -bitdepth_min_8, so this acts as a right shift.
.macro dir_load_step2 s1, bpc
|
||||
.if \bpc == 8
|
||||
usubl \s1\().8h, \s1\().8b, v31.8b
|
||||
.else
|
||||
ushl \s1\().8h, \s1\().8h, v8.8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Step 3 of row preparation: for builds other than 8 bpc, subtract v31
// (128 in every 16-bit lane, set by find_dir) from the shifted row. The
// 8 bpc path already applied this subtraction in step 2.
.macro dir_load_step3 s1, bpc
|
||||
// Nothing for \bpc == 8
|
||||
.if \bpc != 8
|
||||
sub \s1\().8h, \s1\().8h, v31.8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
// find_dir bpc
// Emits the exported function cdef_find_dir_<bpc>bpc_neon.
//   In:   x0 = img, x1 = stride, x2 = var
//         w3 = bitdepth_max (read only by the 16 bpc build)
//   Out:  w0 = best direction index (0..7)
//         [x2] = (best_cost - cost[best_dir ^ 4]) >> 10
// Stack: 32 bytes hold cost[0..7] (8 x 32 bit). The 16 bpc build also
// spills d8 in a further 16-byte slot, because it uses v8 and the low half
// of v8 is callee-saved under AAPCS64.
// Accumulators (16-bit lanes, register pairs as in the inline comments):
//   v0-v1 sum_diag[0], v2-v3 sum_diag[1], v4 sum_hv[0], v5 sum_hv[1],
//   v6-v7 sum_alt[0], v16-v17 sum_alt[1], v18-v19 sum_alt[2],
//   v20-v21 sum_alt[3]
// v26 holds the current row, v30 is kept at zero during the row loop and
// v31 holds the constant 128 subtracted from every pixel.
.macro find_dir bpc
|
||||
function cdef_find_dir_\bpc\()bpc_neon, export=1
|
||||
.if \bpc == 16
|
||||
str d8, [sp, #-0x10]!
|
||||
clz w3, w3 // clz(bitdepth_max)
|
||||
sub w3, w3, #24 // -bitdepth_min_8
|
||||
dup v8.8h, w3
|
||||
.endif
|
||||
sub sp, sp, #32 // cost
|
||||
// NOTE(review): w3 is not read again before it is set to 1 (n) further
// down, so this value appears to be unused.
mov w3, #8
|
||||
.if \bpc == 8
|
||||
movi v31.16b, #128
|
||||
.else
|
||||
movi v31.8h, #128
|
||||
.endif
|
||||
movi v30.16b, #0
|
||||
movi v1.8h, #0 // v0-v1 sum_diag[0]
|
||||
movi v3.8h, #0 // v2-v3 sum_diag[1]
|
||||
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
|
||||
movi v7.8h, #0 // v6-v7 sum_alt[0]
|
||||
dir_load_step1 v26, \bpc // Setup first row early
|
||||
movi v17.8h, #0 // v16-v17 sum_alt[1]
|
||||
movi v18.8h, #0 // v18-v19 sum_alt[2]
|
||||
dir_load_step2 v26, \bpc
|
||||
movi v19.8h, #0
|
||||
dir_load_step3 v26, \bpc
|
||||
movi v21.8h, #0 // v20-v21 sum_alt[3]
|
||||
|
||||
// Fully unrolled loop over the 8 rows; i is the row index. Each ext
// against the zero vector v30 produces the row shifted by a whole number of
// lanes: the first ext of a pair gives the part that stays in the low
// register of an accumulator pair, the second the part that spills into the
// high register.
.irpc i, 01234567
|
||||
addv h25, v26.8h // [y]
|
||||
rev64 v27.8h, v26.8h
|
||||
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
|
||||
add v5.8h, v5.8h, v26.8h // sum_hv[1]
|
||||
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
|
||||
rev64 v29.4h, v28.4h // [-(x >> 1)]
|
||||
ins v4.h[\i], v25.h[0] // sum_hv[0]
|
||||
.if \i < 6
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
|
||||
add v18.8h, v18.8h, v22.8h // sum_alt[2]
|
||||
add v19.4h, v19.4h, v23.4h // sum_alt[2]
|
||||
.else
|
||||
add v18.8h, v18.8h, v26.8h // sum_alt[2]
|
||||
.endif
|
||||
.if \i == 0
|
||||
mov v20.16b, v26.16b // sum_alt[3]
|
||||
.elseif \i == 1
|
||||
add v20.8h, v20.8h, v26.8h // sum_alt[3]
|
||||
.else
|
||||
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
|
||||
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
|
||||
add v20.8h, v20.8h, v24.8h // sum_alt[3]
|
||||
add v21.4h, v21.4h, v25.4h // sum_alt[3]
|
||||
.endif
|
||||
.if \i == 0
|
||||
// Row 0 needs no shift: it initialises the low accumulators directly, and
// the load of the next row is interleaved between the moves.
mov v0.16b, v26.16b // sum_diag[0]
|
||||
dir_load_step1 v26, \bpc
|
||||
mov v2.16b, v27.16b // sum_diag[1]
|
||||
dir_load_step2 v26, \bpc
|
||||
mov v6.16b, v28.16b // sum_alt[0]
|
||||
dir_load_step3 v26, \bpc
|
||||
mov v16.16b, v29.16b // sum_alt[1]
|
||||
.else
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
|
||||
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
|
||||
.if \i != 7 // Nothing to load for the final row
|
||||
dir_load_step1 v26, \bpc // Start setting up the next row early.
|
||||
.endif
|
||||
add v0.8h, v0.8h, v22.8h // sum_diag[0]
|
||||
add v1.8h, v1.8h, v23.8h // sum_diag[0]
|
||||
add v2.8h, v2.8h, v24.8h // sum_diag[1]
|
||||
add v3.8h, v3.8h, v25.8h // sum_diag[1]
|
||||
.if \i != 7
|
||||
dir_load_step2 v26, \bpc
|
||||
.endif
|
||||
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
|
||||
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
.if \i != 7
|
||||
dir_load_step3 v26, \bpc
|
||||
.endif
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
// From here on v31 no longer holds the pixel bias; it is reused for weights.
movi v31.4s, #105
|
||||
|
||||
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
|
||||
smlal2 v26.4s, v4.8h, v4.8h
|
||||
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
|
||||
smlal2 v27.4s, v5.8h, v5.8h
|
||||
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
|
||||
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
|
||||
addv s4, v26.4s // cost[2]
|
||||
addv s5, v27.4s // cost[6]
|
||||
|
||||
rev64 v1.8h, v1.8h
|
||||
rev64 v3.8h, v3.8h
|
||||
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
|
||||
|
||||
str s4, [sp, #2*4] // cost[2]
|
||||
str s5, [sp, #6*4] // cost[6]
|
||||
|
||||
movrel x4, div_table
|
||||
ld1 {v31.8h}, [x4]
|
||||
|
||||
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
|
||||
smull2 v23.4s, v0.8h, v0.8h
|
||||
smlal v22.4s, v1.4h, v1.4h
|
||||
smlal2 v23.4s, v1.8h, v1.8h
|
||||
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
|
||||
smull2 v25.4s, v2.8h, v2.8h
|
||||
smlal v24.4s, v3.4h, v3.4h
|
||||
smlal2 v25.4s, v3.8h, v3.8h
|
||||
uxtl v30.4s, v31.4h // div_table
|
||||
uxtl2 v31.4s, v31.8h
|
||||
mul v22.4s, v22.4s, v30.4s // cost[0]
|
||||
mla v22.4s, v23.4s, v31.4s // cost[0]
|
||||
mul v24.4s, v24.4s, v30.4s // cost[4]
|
||||
mla v24.4s, v25.4s, v31.4s // cost[4]
|
||||
addv s0, v22.4s // cost[0]
|
||||
addv s2, v24.4s // cost[4]
|
||||
|
||||
movrel x5, alt_fact
|
||||
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
|
||||
|
||||
str s0, [sp, #0*4] // cost[0]
|
||||
str s2, [sp, #4*4] // cost[4]
|
||||
|
||||
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
|
||||
uxtl v30.4s, v30.4h
|
||||
uxtl v31.4s, v31.4h
|
||||
|
||||
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
|
||||
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
|
||||
str s6, [sp, #1*4] // cost[1]
|
||||
str s16, [sp, #3*4] // cost[3]
|
||||
|
||||
mov w0, #0 // best_dir
|
||||
mov w1, v0.s[0] // best_cost
|
||||
mov w3, #1 // n
|
||||
|
||||
str s18, [sp, #5*4] // cost[5]
|
||||
str s20, [sp, #7*4] // cost[7]
|
||||
|
||||
// Preload cost[1]; find_best expects the cost for index n in w4.
mov w4, v6.s[0]
|
||||
|
||||
// The costs live in lane 0 of v0, v6, v4, v16, v2, v18, v5, v20 for
// directions 0..7, so the chain below visits n = 1..7 in order.
find_best v6, v4, v16
|
||||
find_best v16, v2, v18
|
||||
find_best v18, v5, v20
|
||||
find_best v20
|
||||
|
||||
eor w3, w0, #4 // best_dir ^4
|
||||
// Fetch cost[best_dir ^ 4] from the cost array stored on the stack above.
ldr w4, [sp, w3, uxtw #2]
|
||||
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
|
||||
lsr w1, w1, #10
|
||||
str w1, [x2] // *var
|
||||
|
||||
add sp, sp, #32
|
||||
.if \bpc == 16
|
||||
ldr d8, [sp], 0x10
|
||||
.endif
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
Reference in New Issue
Block a user