Cycle 5 setup (Phase 1+2): vendor dav1d 1.4.3 CDEF sources

First AV1 kernel cycle and first dav1d-vendored sources. Phase 1+2
docs lay out the structural complexity (CDEF needs pre-padded 12x12
working buffer + external edge context + direction lookup +
constraint function — meaningfully more complex than cycles 1-4).

Phase 3+ deferred to next session — CDEF is the first cycle that
doesn't fit cleanly into a single autonomous run.

Vendored from dav1d 1.4.3 (BSD-2-Clause, cleaner license than
FFmpeg's LGPL-2.1+):

  src/arm/64/cdef.S            520 lines — NEON impl
  src/arm/64/util.S            278 lines — NEON helpers
  src/arm/asm.S                335 lines — GAS preamble
  src/cdef_tmpl.c              331 lines — C reference (templated)
  include/common/intops.h       84 lines — utility helpers
  src/tables_cdef_subset.c      hand-extracted — dav1d_cdef_directions
                                only (avoids dragging full 1013-line
                                tables.c + transitive includes)

Discovery from Phase 2 analysis:
- Filter type and shape: dav1d_cdef_filter8_pri_sec_8bpc_neon takes
  (dst, dst_stride, tmp, pri_strength, sec_strength, dir, damping, h).
  The 'tmp' arg is the pre-padded 12x12 buffer constructed externally
  by the dav1d C-side padding() function.
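- Tap weights in the C reference are inline-computed (not table):
  pri_tap = 4 or 3 (based on pri_strength bit), sec_tap = 2 or 1, so
  only dav1d_cdef_directions[12][2] is an external table there. Note
  that the vendored NEON edged 8bpc filter differs: it reads primary
  taps from a pri_taps table and offsets from directions8/directions4,
  and only generates the secondary tap (2, then 1) inline.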
- Constraint function: constrain(diff, threshold, shift) =
  apply_sign(min(abs(diff), max(0, threshold - (abs(diff) >> shift))),
             diff)

Predicted R5 band: 0.15-0.30 (ORANGE). CDEF is compute-heavier than
LPF (per-pixel min/max conditional logic), so likely worse R than
cycle 2/4 but better than cycle 3 MC. M4 gate likely required.

What Phase 3+ needs (next session):
1. config.h shim for dav1d's asm preamble (defines TBD on first build)
2. Standalone C reference for cdef_filter_block_8x8_c
   (cdef_tmpl.c references several dav1d private headers; cleaner to
   transcribe to a self-contained tests/cdef_ref.c)
3. tests/bench_neon_cdef.c — M1+M3 bench
4. Phase 4 plan, Phase 5 review (mandatory), Phase 6 shader, Phase 7 measure

PROVENANCE.md documents pin + per-file role + re-vendoring procedure.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 13:12:25 +00:00
parent 20e3d004ae
commit 2cd2258a7b
8 changed files with 1879 additions and 0 deletions
+520
View File
@@ -0,0 +1,520 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
// pad_top_bottom: write two padded rows (the top or the bottom border) of
// the 16-bit tmp buffer.
//   s1, s2  x-registers pointing at the two 8-bit source rows; both are
//           moved back by 2 pixels when CDEF_HAVE_LEFT is set
//   w       block width in pixels
//   stride  tmp row stride in 16-bit elements (byte step is 2*stride)
//   rn      register-size prefix used to load w source bytes (d or s in
//           the instantiations of padding_func)
//   rw      register-size prefix used to store w widened halfwords (q or d)
//   ret     1: return from the enclosing function once both rows are written
//           0: advance x0 past the two rows and continue at local label 3
// Reads the edge flags from w7, writes through x0, takes the fill value for
// columns without source data from s31 (set up by the caller) and clobbers
// v0-v3. Each output row is 2 + w + 2 halfwords wide.
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
tst w7, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
// Step both row pointers back so that the 2 left pixels are loaded first.
sub \s1, \s1, #2
sub \s2, \s2, #2
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
// Note: the bare s1/s3 below are SIMD registers, not the macro argument s1.
// First load: w bytes starting at the left border; second load: the
// remaining 4 bytes (last 2 block pixels + 2 right border pixels).
ldr \rn\()0, [\s1]
ldr s1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr s3, [\s2, #\w]
// Widen 8-bit pixels to 16 bits.
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str \rw\()0, [x0]
str d1, [x0, #2*\w]
add x0, x0, #2*\stride
str \rw\()2, [x0]
str d3, [x0, #2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Only 2 trailing source bytes exist; the 2 right columns get the fill value.
ldr \rn\()0, [\s1]
ldr h1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr h3, [\s2, #\w]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str \rw\()0, [x0]
str s1, [x0, #2*\w]
str s31, [x0, #2*\w+4]
add x0, x0, #2*\stride
str \rw\()2, [x0]
str s3, [x0, #2*\w]
str s31, [x0, #2*\w+4]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
2:
// !CDEF_HAVE_LEFT
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
// Row pointers were not moved: load w block pixels, then 2 right pixels.
// The 2 left columns get the fill value.
ldr \rn\()0, [\s1]
ldr h1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr h3, [\s2, #\w]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \rw\()2, [x0, #4]
str s3, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Fill value on both sides, block pixels in the middle.
ldr \rn\()0, [\s1]
ldr \rn\()1, [\s2]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \rw\()1, [x0, #4]
str s31, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
.endif
// Common continuation point for the ret == 0 expansions above.
3:
.endm
// load_n_incr: fetch one row of pixels from [src] into vector dst and
// post-increment src by incr.
//   w == 4      loads 4 bytes into lane s[0] of dst
//   otherwise   loads 8 bytes using the .8b arrangement of dst
.macro load_n_incr dst, src, incr, w
.if \w != 4
ld1 {\dst\().8b}, [\src], \incr
.else
ld1 {\dst\().s}[0], [\src], \incr
.endif
.endm
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top,
// const pixel *const bottom, int h,
// enum CdefEdgeFlags edges);
//
// Builds the padded 16-bit working buffer: 2 border rows above, h block
// rows, 2 border rows below, each row holding 2 + w + 2 halfwords.
// Register use, following the argument order above:
//   x0 = tmp, x1 = src, x2 = src_stride, x3 = left, x4 = top,
//   x5 = bottom, w6 = h, w7 = edges
// Edge flag bits tested here: 1 = LEFT, 2 = RIGHT, 4 = TOP, 8 = BOTTOM.
// Macro parameters: w = block width, stride = tmp row stride in 16-bit
// elements, rn/rw = register-size prefixes for loading w bytes and storing
// w halfwords.
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_8bpc_neon, export=1
// With all four edges available, hand over to the 8-bit "edged" variant.
cmp w7, #0xf // fully edged
b.eq cdef_padding\w\()_edged_8bpc_neon
// v30/v31 = 0x8000 in every halfword: fill value for unavailable pixels.
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
// Move tmp back by 2 rows and 2 columns (16-bit elements) so that x0
// points at the top-left corner of the padded area.
sub x0, x0, #2*(2*\stride+2)
tst w7, #4 // CDEF_HAVE_TOP
b.ne 1f
// !CDEF_HAVE_TOP
// Fill both top rows: 2*stride halfwords, 16 halfwords per st1.
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
b 3f
1:
// CDEF_HAVE_TOP
// x9 = second top row.
add x9, x4, x2
pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
// Middle section
// One iteration per block row; w6 (h) is the loop counter. The numeric
// labels 0 and 1 are reused by the four edge combinations below.
3:
tst w7, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
// 2 left pixels come from the left array, 2 right pixels from src + w.
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
subs w6, w6, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
str s0, [x0]
stur \rw\()1, [x0, #4]
str s2, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ld1 {v0.h}[0], [x3], #2
load_n_incr v1, x1, x2, \w
subs w6, w6, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s0, [x0]
stur \rw\()1, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
b 3f
2:
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldr h1, [x1, #\w]
load_n_incr v0, x1, x2, \w
subs w6, w6, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
load_n_incr v0, x1, x2, \w
subs w6, w6, #1
uxtl v0.8h, v0.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
// Bottom section
3:
tst w7, #8 // CDEF_HAVE_BOTTOM
b.ne 1f
// !CDEF_HAVE_BOTTOM
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
ret
1:
// CDEF_HAVE_BOTTOM
// ret=1: the macro expansion itself returns from the function.
add x9, x5, x2
pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1
endfunc
.endm
// Instantiate for 8-pixel-wide blocks (tmp stride 16) and 4-pixel-wide
// blocks (tmp stride 8).
padding_func 8, 16, d, q
padding_func 4, 8, s, d
// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top,
// const pixel *const bottom, int h,
// enum CdefEdgeFlags edges);
//
// Variant used when every edge is available: pixels are copied into an
// 8-bit tmp buffer without widening and without any fill value. The code
// never inspects the edges argument (w7); padding_func branches here only
// when edges == 0xf.
// Register use, following the argument order above:
//   x0 = tmp, x1 = src, x2 = src_stride, x3 = left, x4 = top,
//   x5 = bottom, w6 = h
// Macro parameters: w = block width, stride = tmp row stride in bytes,
// reg = register-size prefix that holds w bytes.
.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
// Include the 2 left border pixels of the top/bottom rows.
sub x4, x4, #2
sub x5, x5, #2
// Move tmp back by 2 rows and 2 columns (bytes).
sub x0, x0, #(2*\stride+2)
// Two top rows.
.if \w == 4
// Each padded row is 8 bytes, which equals the tmp stride, so both rows
// can be stored back to back.
ldr d0, [x4]
ldr d1, [x4, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
// Each padded row is 12 bytes (8 + 4) inside a 16-byte stride.
add x9, x4, x2
ldr d0, [x4]
ldr s1, [x4, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
add x0, x0, #2*\stride
.endif
// Block rows: 2 left bytes from the left array, w bytes from src, 2 right
// bytes from src + w. w6 (h) is the loop counter.
0:
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
subs w6, w6, #1
str h0, [x0]
stur \reg\()1, [x0, #2]
str h2, [x0, #2+\w]
add x0, x0, #\stride
b.gt 0b
// Two bottom rows, same layout as the top rows.
.if \w == 4
ldr d0, [x5]
ldr d1, [x5, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x5, x2
ldr d0, [x5]
ldr s1, [x5, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
.endif
ret
endfunc
.endm
// Instantiate for 8-pixel-wide blocks (tmp stride 16 bytes) and
// 4-pixel-wide blocks (tmp stride 8 bytes).
padding_func_edged 8, 16, d
padding_func_edged 4, 8, s
// Instantiations of macros that are defined neither in this file nor in
// asm.S/util.S; presumably they come from the included cdef_tmpl.S.
// NOTE(review): confirm cdef_tmpl.S is vendored alongside this file. The
// symbols pri_taps and directions8/directions4 that the edged filters in
// this file reference are likewise expected to be provided from there.
tables
filter 8, 8
filter 4, 8
find_dir 8
// load_px_8: gather the neighbour pixels at +off and -off for 16 pixels.
//   d1  vector receiving the pixels at tmp + off (p0)
//   d2  vector receiving the pixels at tmp - off (p1)
//   w   block width: 8 loads 2 rows of 8 bytes (row step 16 bytes),
//       otherwise 4 rows of 4 bytes (row step 8 bytes)
// Expects x2 = current position in the 8-bit tmp buffer and w9 = offset as
// a signed byte. Clobbers x6 and x9.
.macro load_px_8 d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().d}[0], [x6] // p0
add x6, x6, #16 // += stride
ld1 {\d2\().d}[0], [x9] // p1
add x9, x9, #16 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
.else
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().s}[0], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[0], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[1], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[1], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[2], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[2], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[3], [x6] // p0
ld1 {\d2\().s}[3], [x9] // p1
.endif
.endm
// handle_pixel_8: accumulate one filter tap for a pair of neighbour vectors.
//   s1, s2      vectors holding the neighbours at +off (p0) and -off (p1)
//   thresh_vec  per-byte threshold (strength) operand
//   shift       per-byte vector of negated shift counts; ushl with a
//               negative count shifts right
//   tap         w-register holding the tap weight
//   min         1: also fold s1/s2 into the running min (v3) and max (v4)
// Expects px in v0. Adds tap * constrain(p0 - px) into v1 and
// tap * constrain(p1 - px) into v2, keeping the two halves of the sum in
// separate 8-bit accumulators. Clobbers v16-v22.
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
umin v3.16b, v3.16b, \s1\().16b
umax v4.16b, v4.16b, \s1\().16b
umin v3.16b, v3.16b, \s2\().16b
umax v4.16b, v4.16b, \s2\().16b
.endif
uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
// Masks select the negated magnitude where px is larger than the neighbour,
// i.e. where diff = p - px is negative.
cmhi v18.16b, v0.16b, \s1\().16b // px > p0
cmhi v22.16b, v0.16b, \s2\().16b // px > p1
umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
dup v19.16b, \tap // taps[k]
neg v16.16b, v17.16b // -imin()
neg v20.16b, v21.16b // -imin()
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
.endm
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint8_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h);
//
// CDEF filter operating on the 8-bit tmp buffer.
// Register use, following the argument order above:
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h
// Macro parameters:
//   w       block width; 8 processes 2 rows per iteration (tmp row step
//           16 bytes), otherwise 4 rows per iteration (tmp row step 8 bytes)
//   pri     1: apply the primary taps
//   sec     1: apply the secondary taps
//   min     1: clamp the result to the min/max of px and all neighbours used
//   suffix  appended to the function name
// The function is emitted without export, so only a file-local label is
// defined. NOTE(review): presumably it is reached from code in cdef_tmpl.S;
// pri_taps and directions8/4 are not defined in this file either - confirm.
// The row loop tests h after the first pass, so h is assumed to be a
// positive multiple of the rows handled per iteration - TODO confirm.
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_8bpc_neon
.if \pri
// x8 = &pri_taps[(pri_strength & 1) * 2]
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
.endif
// x5 = directions table base + dir * 2
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.8b, #7
dup v28.8b, w6 // damping
.if \pri
dup v25.16b, w3 // threshold
.endif
.if \sec
dup v27.16b, w4 // threshold
.endif
// Interleave the primary and secondary thresholds so that both shift
// amounts are computed at once: byte 0 = primary, byte 1 = secondary.
trn1 v24.8b, v25.8b, v27.8b
clz v24.8b, v24.8b // clz(threshold)
sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
neg v24.8b, v24.8b // -shift
.if \sec
dup v26.16b, v24.b[1]
.endif
.if \pri
dup v24.16b, v24.b[0]
.endif
// Row loop: each pass filters 16 pixels held in v0.
1:
.if \w == 8
add x12, x2, #16
ld1 {v0.d}[0], [x2] // px
ld1 {v0.d}[1], [x12] // px
.else
add x12, x2, #1*8
add x13, x2, #2*8
add x14, x2, #3*8
ld1 {v0.s}[0], [x2] // px
ld1 {v0.s}[1], [x12] // px
ld1 {v0.s}[2], [x13] // px
ld1 {v0.s}[3], [x14] // px
.endif
// We need 9-bits or two 8-bit accumulators to fit the sum.
// Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
// Start sum at -1 instead of 0 to help handle rounding later.
movi v1.16b, #255 // sum
movi v2.16b, #0 // sum
.if \min
mov v3.16b, v0.16b // min
mov v4.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov w11, #2 // sec_taps[0]
// Tap loop: two rounds (k = 0, 1). x5 walks the offset table and x8 the
// primary tap table; both advance by one byte per round.
2:
.if \pri
ldrb w9, [x5] // off1
load_px_8 v5, v6, \w
.endif
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px_8 v28, v29, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px_8 v5, v6, \w
handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
// Perform halving adds since the value won't fit otherwise.
// To handle the offset for negative values, use both halving w/ and w/o rounding.
srhadd v5.16b, v1.16b, v2.16b // sum >> 1
shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
cmlt v1.16b, v5.16b, #0 // sum < 0
bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
.if \min
umin v0.16b, v0.16b, v4.16b
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
.endif
// Store the filtered rows; the subs on w7 sets the flags consumed by the
// b.gt at the end of the loop (the instructions in between leave them
// untouched).
.if \w == 8
st1 {v0.d}[0], [x0], x1
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
st1 {v0.d}[1], [x0], x1
.else
st1 {v0.s}[0], [x0], x1
add x2, x2, #4*8 // tmp += 4*tmp_stride
st1 {v0.s}[1], [x0], x1
subs w7, w7, #4 // h -= 4
st1 {v0.s}[2], [x0], x1
st1 {v0.s}[3], [x0], x1
.endif
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
// filter_8: emit the three edged 8bpc filter functions for block width w,
// in the order primary-only, secondary-only, primary+secondary. Only the
// combined variant enables the min/max clamp.
.macro filter_8 w
filter_func_8 w=\w, suffix=_pri, pri=1, sec=0, min=0
filter_func_8 w=\w, suffix=_sec, pri=0, sec=1, min=0
filter_func_8 w=\w, suffix=_pri_sec, pri=1, sec=1, min=1
.endm
// Instantiate for 8-pixel-wide and 4-pixel-wide blocks.
filter_8 8
filter_8 4
+278
View File
@@ -0,0 +1,278 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2015 Martin Storsjo
* Copyright © 2015 Janne Grunau
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#ifndef DAV1D_SRC_ARM_64_UTIL_S
#define DAV1D_SRC_ARM_64_UTIL_S
#include "config.h"
#include "src/arm/asm.S"
#ifndef __has_feature
#define __has_feature(x) 0
#endif
// movrel: load the address of symbol val plus a constant byte offset into
// register rd.
//   rd      destination x-register
//   val     symbol name
//   offset  constant byte offset, default 0
// The emitted sequence depends on the target and build mode selected by the
// preprocessor conditions below.
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
// Mach-O: page + page-offset relocations. A negative offset is applied with
// a separate sub instead of being folded into the relocation expression.
.if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
.endif
#elif defined(PIC) && defined(_WIN32)
// Windows PIC: adrp + :lo12:, again handling negative offsets separately.
.if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
#elif __has_feature(hwaddress_sanitizer)
// HWASan builds: an extra movk patches the top 16 bits of the address.
// NOTE(review): presumably this materialises the pointer tag; the movk
// operand does not include offset - confirm this is intended.
adrp \rd, :pg_hi21_nc:\val+(\offset)
movk \rd, #:prel_g3:\val+0x100000000
add \rd, \rd, :lo12:\val+(\offset)
#elif defined(PIC)
// Generic PIC: PC-relative page address plus low 12 bits.
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
#else
// Non-PIC: load the absolute address through the ldr = pseudo-instruction.
ldr \rd, =\val+\offset
#endif
.endm
// sub_sp: lower the stack pointer by the constant number of bytes given in
// space.
//   Windows: allocations above 4096 bytes load from the intermediate
//            address (sp - 4096) before moving sp the rest of the way;
//            more than 8192 bytes is rejected at assembly time.
//   Others:  the subtraction is split into a multiple of 4096 and a
//            remainder below 4096, so that each part fits the immediate
//            field of a single sub instruction.
// Clobbers x16 on the Windows path for 4096 < space <= 8192.
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp doesn't support values over 8K at the moment"
.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
// transpose_8x8b_xtl: transpose an 8x8 matrix of bytes and widen the result
// to 16-bit elements.
//   r0-r7  in: one 8-byte row each, in the low 64 bits
//          out: transposed rows 0-7 as .8h vectors
//   xtl    widening mnemonic used for the final step (its "2" form is used
//          for the upper halves)
// In the lane diagrams, rows are named a-h and the digit is the column.
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
// a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
zip1 \r0\().16b, \r0\().16b, \r1\().16b
// c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
zip1 \r2\().16b, \r2\().16b, \r3\().16b
// e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
zip1 \r4\().16b, \r4\().16b, \r5\().16b
// g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
zip1 \r6\().16b, \r6\().16b, \r7\().16b
// a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
trn1 \r1\().8h, \r0\().8h, \r2\().8h
// a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
trn2 \r3\().8h, \r0\().8h, \r2\().8h
// e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
trn1 \r5\().8h, \r4\().8h, \r6\().8h
// e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
trn2 \r7\().8h, \r4\().8h, \r6\().8h
// a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
trn1 \r0\().4s, \r1\().4s, \r5\().4s
// a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
trn2 \r2\().4s, \r1\().4s, \r5\().4s
// a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
trn1 \r1\().4s, \r3\().4s, \r7\().4s
// a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
trn2 \r3\().4s, \r3\().4s, \r7\().4s
// Each of r0-r3 now holds two transposed rows (n in the low half, n+4 in
// the high half). Widen the high half first, because the low-half widening
// overwrites the source register.
\xtl\()2 \r4\().8h, \r0\().16b
\xtl \r0\().8h, \r0\().8b
\xtl\()2 \r6\().8h, \r2\().16b
\xtl \r2\().8h, \r2\().8b
\xtl\()2 \r5\().8h, \r1\().16b
\xtl \r1\().8h, \r1\().8b
\xtl\()2 \r7\().8h, \r3\().16b
\xtl \r3\().8h, \r3\().8b
.endm
// transpose_8x8h: in-place transpose of an 8x8 matrix of 16-bit elements.
//   r0-r7  in: rows 0-7 (.8h); out: transposed rows 0-7
//   t8,t9  scratch vectors, clobbered
// Three rounds of trn1/trn2 at 16-, 32- and 64-bit granularity.
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
// Round 1: interleave 16-bit elements of adjacent row pairs.
trn1 \t8\().8h, \r0\().8h, \r1\().8h
trn2 \t9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h
// Round 2: interleave 32-bit pairs.
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \t9\().4s, \r3\().4s
trn2 \t9\().4s, \t9\().4s, \r3\().4s
trn1 \r3\().4s, \t8\().4s, \r1\().4s
trn2 \t8\().4s, \t8\().4s, \r1\().4s
// Round 3: interleave 64-bit halves, leaving the result in r0-r7.
trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2d, \r3\().2d, \r4\().2d
trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2d, \r5\().2d, \r6\().2d
trn2 \r6\().2d, \t8\().2d, \r2\().2d
trn1 \r2\().2d, \t8\().2d, \r2\().2d
trn1 \r3\().2d, \t9\().2d, \r7\().2d
trn2 \r7\().2d, \t9\().2d, \r7\().2d
.endm
// transpose_8x8h_mov: like transpose_8x8h, but the final round writes the
// transposed rows to separate output registers.
//   r0-r7  in: rows 0-7 (.8h); used as intermediates and clobbered
//   t8,t9  scratch vectors, clobbered
//   o0-o7  out: transposed rows 0-7
.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7
// Round 1: interleave 16-bit elements of adjacent row pairs.
trn1 \t8\().8h, \r0\().8h, \r1\().8h
trn2 \t9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h
// Round 2: interleave 32-bit pairs.
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \t9\().4s, \r3\().4s
trn2 \t9\().4s, \t9\().4s, \r3\().4s
trn1 \r3\().4s, \t8\().4s, \r1\().4s
trn2 \t8\().4s, \t8\().4s, \r1\().4s
// Round 3: interleave 64-bit halves into the output registers.
trn1 \o0\().2d, \r3\().2d, \r4\().2d
trn2 \o4\().2d, \r3\().2d, \r4\().2d
trn1 \o1\().2d, \r5\().2d, \r6\().2d
trn2 \o5\().2d, \r5\().2d, \r6\().2d
trn2 \o6\().2d, \t8\().2d, \r2\().2d
trn1 \o2\().2d, \t8\().2d, \r2\().2d
trn1 \o3\().2d, \t9\().2d, \r7\().2d
trn2 \o7\().2d, \t9\().2d, \r7\().2d
.endm
// transpose_8x16b: in-place byte transpose across eight 16-byte vectors.
// Same instruction pattern as transpose_8x8h at half the element size
// (8-, 16- and 32-bit rounds), so the 8x8 byte matrix in the low 64 bits of
// r0-r7 and the one in the high 64 bits are each transposed independently.
//   r0-r7  in/out: rows 0-7 (.16b)
//   t8,t9  scratch vectors, clobbered
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
// Round 1: interleave bytes of adjacent row pairs.
trn1 \t8\().16b, \r0\().16b, \r1\().16b
trn2 \t9\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b
// Round 2: interleave 16-bit pairs.
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \t9\().8h, \r3\().8h
trn2 \t9\().8h, \t9\().8h, \r3\().8h
trn1 \r3\().8h, \t8\().8h, \r1\().8h
trn2 \t8\().8h, \t8\().8h, \r1\().8h
// Round 3: interleave 32-bit groups, leaving the result in r0-r7.
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4s, \t8\().4s, \r2\().4s
trn1 \r2\().4s, \t8\().4s, \r2\().4s
trn1 \r3\().4s, \t9\().4s, \r7\().4s
trn2 \r7\().4s, \t9\().4s, \r7\().4s
.endm
// transpose_4x16b: in-place byte transpose across four 16-byte vectors.
// Each 32-bit column of r0-r3 forms a 4x4 byte matrix that is transposed
// independently of the others (8-bit round, then 16-bit round).
//   r0-r3  in/out: rows 0-3 (.16b)
//   t4-t7  scratch vectors, clobbered
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
// transpose_4x4h: in-place transpose of a 4x4 matrix of 16-bit elements
// held in the low 64 bits of r0-r3 (16-bit round, then 32-bit round).
//   r0-r3  in/out: rows 0-3 (.4h)
//   t4-t7  scratch vectors, clobbered
.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().4h, \r0\().4h, \r1\().4h
trn2 \t5\().4h, \r0\().4h, \r1\().4h
trn1 \t6\().4h, \r2\().4h, \r3\().4h
trn2 \t7\().4h, \r2\().4h, \r3\().4h
trn1 \r0\().2s, \t4\().2s, \t6\().2s
trn2 \r2\().2s, \t4\().2s, \t6\().2s
trn1 \r1\().2s, \t5\().2s, \t7\().2s
trn2 \r3\().2s, \t5\().2s, \t7\().2s
.endm
// transpose_4x4s: in-place transpose of a 4x4 matrix of 32-bit elements
// (32-bit round, then 64-bit round).
//   r0-r3  in/out: rows 0-3 (.4s)
//   t4-t7  scratch vectors, clobbered
.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().4s, \r0\().4s, \r1\().4s
trn2 \t5\().4s, \r0\().4s, \r1\().4s
trn1 \t6\().4s, \r2\().4s, \r3\().4s
trn2 \t7\().4s, \r2\().4s, \r3\().4s
trn1 \r0\().2d, \t4\().2d, \t6\().2d
trn2 \r2\().2d, \t4\().2d, \t6\().2d
trn1 \r1\().2d, \t5\().2d, \t7\().2d
trn2 \r3\().2d, \t5\().2d, \t7\().2d
.endm
// transpose_4x8h: in-place transpose of 16-bit elements across four .8h
// vectors. The 4x4 matrix in the low 64 bits of r0-r3 and the one in the
// high 64 bits are each transposed independently (16-bit round, then
// 32-bit round).
//   r0-r3  in/out: rows 0-3 (.8h)
//   t4-t7  scratch vectors, clobbered
.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().4s, \t4\().4s, \t6\().4s
trn2 \r2\().4s, \t4\().4s, \t6\().4s
trn1 \r1\().4s, \t5\().4s, \t7\().4s
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
// transpose_4x8h_mov: like transpose_4x8h, but the result is written to
// separate output registers and the inputs r0-r3 are left unmodified.
//   r0-r3  in: rows 0-3 (.8h)
//   t4-t7  scratch vectors, clobbered
//   o0-o3  out: transposed rows 0-3
.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8h, \r2\().8h, \r3\().8h
trn1 \o0\().4s, \t4\().4s, \t6\().4s
trn2 \o2\().4s, \t4\().4s, \t6\().4s
trn1 \o1\().4s, \t5\().4s, \t7\().4s
trn2 \o3\().4s, \t5\().4s, \t7\().4s
.endm
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
+335
View File
@@ -0,0 +1,335 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_ARM_ASM_S
#define DAV1D_SRC_ARM_ASM_S
#include "config.h"
#if ARCH_AARCH64
// Poison x18/w18: any use in the assembly sources expands to an undefined
// identifier and so fails to assemble.
#define x18 do_not_use_x18
#define w18 do_not_use_w18
// Select the base architecture level when the assembler supports .arch.
// HAVE_AS_* and AS_ARCH_LEVEL are not defined in this file; presumably
// they come from config.h.
#if HAVE_AS_ARCH_DIRECTIVE
.arch AS_ARCH_LEVEL
#endif
// For each optional extension, ENABLE_x / DISABLE_x expand to the matching
// .arch_extension directive when the assembler supports it, and to nothing
// otherwise.
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
#define ENABLE_DOTPROD .arch_extension dotprod
#define DISABLE_DOTPROD .arch_extension nodotprod
#else
#define ENABLE_DOTPROD
#define DISABLE_DOTPROD
#endif
#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
#define ENABLE_I8MM .arch_extension i8mm
#define DISABLE_I8MM .arch_extension noi8mm
#else
#define ENABLE_I8MM
#define DISABLE_I8MM
#endif
#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
#define ENABLE_SVE .arch_extension sve
#define DISABLE_SVE .arch_extension nosve
#else
#define ENABLE_SVE
#define DISABLE_SVE
#endif
#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
#define ENABLE_SVE2 .arch_extension sve2
#define DISABLE_SVE2 .arch_extension nosve2
#else
#define ENABLE_SVE2
#define DISABLE_SVE2
#endif
/* If we do support the .arch_extension directives, disable support for all
* the extensions that we may use, in case they were implicitly enabled by
* the .arch level. This makes it clear if we try to assemble an instruction
* from an unintended extension set; we only allow assembling such instructions
* within regions where we explicitly enable those extensions. */
DISABLE_DOTPROD
DISABLE_I8MM
DISABLE_SVE
DISABLE_SVE2
/* Support macros for
* - Armv8.3-A Pointer Authentication and
* - Armv8.5-A Branch Target Identification
* features which require emitting a .note.gnu.property section with the
* appropriate architecture-dependent feature bits set.
*
* |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
* PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
* used immediately before saving the LR register (x30) to the stack.
* |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
* it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
* with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
* have the same value at the two points. For example:
*
* .global f
* f:
* AARCH64_SIGN_LINK_REGISTER
* stp x29, x30, [sp, #-96]!
* mov x29, sp
* ...
* ldp x29, x30, [sp], #96
* AARCH64_VALIDATE_LINK_REGISTER
* ret
*
* |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
* |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
* indirect call target. In particular, all symbols exported from a file must
* begin with one of these macros. For example, a leaf function that does not
* save LR can instead use |AARCH64_VALID_CALL_TARGET|:
*
* .globl return_zero
* return_zero:
* AARCH64_VALID_CALL_TARGET
* mov x0, #0
* ret
*
* A non-leaf function which does not immediately save LR may need both macros
* because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
* may jump to an alternate implementation before setting up the stack:
*
* .globl with_early_jump
* with_early_jump:
* AARCH64_VALID_CALL_TARGET
* cmp x0, #128
* b.lt .Lwith_early_jump_128
* AARCH64_SIGN_LINK_REGISTER
* stp x29, x30, [sp, #-96]!
* mov x29, sp
* ...
* ldp x29, x30, [sp], #96
* AARCH64_VALIDATE_LINK_REGISTER
* ret
*
* .Lwith_early_jump_128:
* ...
* ret
*
* These annotations are only required with indirect calls. Private symbols that
* are only the target of direct calls do not require annotations. Also note
* that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
* indirect jumps (BR). Indirect jumps in assembly are supported through
* |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
* calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
*
* Although not necessary, it is safe to use these macros in 32-bit ARM
* assembly. This may be used to simplify dual 32-bit and 64-bit files.
*
* References:
* - "ELF for the Arm® 64-bit Architecture"
* https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
* - "Providing protection for complex software"
* https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
*/
// Branch Target Identification: when the compiler reports BTI as enabled by
// default, the landing-pad macros expand to the corresponding hint
// instructions; otherwise they expand to nothing.
#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification
#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc'
#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c'
#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j'
#else
#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification
#define AARCH64_VALID_JUMP_CALL_TARGET
#define AARCH64_VALID_CALL_TARGET
#define AARCH64_VALID_JUMP_TARGET
#endif
// Pointer Authentication: pick the sign/validate instructions from the key
// selected in __ARM_FEATURE_PAC_DEFAULT (bit 0 = key A, bit 1 = key B).
// Bit 2 (leaf-function signing) is rejected at build time.
#if defined(__ARM_FEATURE_PAC_DEFAULT)
#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
#define AARCH64_SIGN_LINK_REGISTER paciasp
#define AARCH64_VALIDATE_LINK_REGISTER autiasp
#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
#define AARCH64_SIGN_LINK_REGISTER pacibsp
#define AARCH64_VALIDATE_LINK_REGISTER autibsp
#else
#error Pointer authentication defines no valid key!
#endif
#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
#error Authentication of leaf functions is enabled but not supported in dav1d!
#endif
#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
// Apple arm64e without __ARM_FEATURE_PAC_DEFAULT: sign with key B, but do
// not advertise PAC through the GNU property bit.
#elif defined(__APPLE__) && defined(__arm64e__)
#define GNU_PROPERTY_AARCH64_PAC 0
#define AARCH64_SIGN_LINK_REGISTER pacibsp
#define AARCH64_VALIDATE_LINK_REGISTER autibsp
#else /* __ARM_FEATURE_PAC_DEFAULT */
#define GNU_PROPERTY_AARCH64_PAC 0
#define AARCH64_SIGN_LINK_REGISTER
#define AARCH64_VALIDATE_LINK_REGISTER
#endif /* !__ARM_FEATURE_PAC_DEFAULT */
// On ELF targets with BTI and/or PAC enabled, emit a .note.gnu.property
// section whose single property carries the combined feature bits.
// Layout of the words below: name size (4, for "GNU" plus NUL), descriptor
// size (0x10), note type (0x5), the name itself, then the descriptor:
// property type, data size (4), feature bits, and a zero padding word.
#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
.pushsection .note.gnu.property, "a"
.balign 8
.long 4
.long 0x10
.long 0x5
.asciz "GNU"
.long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
.long 4
.long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
.long 0
.popsection
#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
#endif /* ARCH_AARCH64 */
// 32-bit ARM preamble: unified syntax, ARMv7-A with NEON on ELF targets,
// and Thumb mode on Windows.
#if ARCH_ARM
.syntax unified
#ifdef __ELF__
.arch armv7-a
.fpu neon
.eabi_attribute 10, 0 // suppress Tag_FP_arch
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
#endif /* __ELF__ */
#ifdef _WIN32
#define CONFIG_THUMB 1
#else
#define CONFIG_THUMB 0
#endif
// A and T are line prefixes for mode-specific instructions: the prefix of
// the inactive mode expands to "@" (the ARM assembler comment character),
// so that line is commented out; the active one expands to nothing.
#if CONFIG_THUMB
.thumb
#define A @
#define T
#else
#define A
#define T @
#endif /* CONFIG_THUMB */
#endif /* ARCH_ARM */
// Normalise position-independent-code detection: if the build did not
// define PIC itself, mirror the compiler-provided __PIC__ or __pic__.
#if !defined(PIC)
#if defined(__PIC__)
#define PIC __PIC__
#elif defined(__pic__)
#define PIC __pic__
#endif
#endif
// Symbol naming for exported functions and constants.
// PRIVATE_PREFIX defaults to dav1d_ unless the build overrides it.
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif
// Two-level paste so that macro arguments are expanded before being joined.
#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)
// EXTERN is the prefix put in front of exported names; when PREFIX is
// defined it gains a leading underscore. NOTE(review): PREFIX is presumably
// set by the build for targets whose C symbols carry a leading underscore.
#ifdef PREFIX
#define EXTERN CONCAT(_,PRIVATE_PREFIX)
#else
#define EXTERN PRIVATE_PREFIX
#endif
// function: open an assembly function.
//   name    function name; the label of that name is always defined
//   export  nonzero: additionally define the EXTERN-prefixed symbol, make it
//           global with hidden (ELF) or private_extern (Mach-O) visibility,
//           and on AArch64 start the function with a call landing pad
//   align   argument passed to .align
// Also defines a one-shot endfunc macro for this function: it emits .size on
// ELF (and .endfunc when HAVE_AS_FUNC is set) and then purges itself, so the
// next use of function can define endfunc again.
.macro function name, export=0, align=2
.macro endfunc
#ifdef __ELF__
.size \name, . - \name
#endif
#if HAVE_AS_FUNC
.endfunc
#endif
.purgem endfunc
.endm
.text
.align \align
.if \export
.global EXTERN\name
#ifdef __ELF__
.type EXTERN\name, %function
.hidden EXTERN\name
#elif defined(__MACH__)
.private_extern EXTERN\name
#endif
#if HAVE_AS_FUNC
.func EXTERN\name
#endif
EXTERN\name:
.else
#ifdef __ELF__
.type \name, %function
#endif
#if HAVE_AS_FUNC
.func \name
#endif
.endif
// The unprefixed label exists for exported functions as well, so code in
// the same file can branch to it by its short name.
\name:
#if ARCH_AARCH64
.if \export
AARCH64_VALID_CALL_TARGET
.endif
#endif
.endm
// const: open a read-only data object.
//   name    object name; the label of that name is always defined
//   export  nonzero: additionally define the EXTERN-prefixed symbol as a
//           global with hidden (ELF) or private_extern (Mach-O) visibility
//   align   argument passed to .align
// The data goes to .rdata on Windows, .const_data on Mach-O and .rodata
// elsewhere. Also defines a one-shot endconst macro that emits .size on ELF
// and then purges itself.
.macro const name, export=0, align=2
.macro endconst
#ifdef __ELF__
.size \name, . - \name
#endif
.purgem endconst
.endm
#if defined(_WIN32)
.section .rdata
#elif !defined(__MACH__)
.section .rodata
#else
.const_data
#endif
.align \align
.if \export
.global EXTERN\name
#ifdef __ELF__
.hidden EXTERN\name
#elif defined(__MACH__)
.private_extern EXTERN\name
#endif
EXTERN\name:
.endif
\name:
.endm
// L(x): spell a local label name with the platform prefix, "L" on Apple
// targets and ".L" elsewhere.
#ifdef __APPLE__
#define L(x) L ## x
#else
#define L(x) .L ## x
#endif
// X(x): the EXTERN-prefixed form of symbol x.
#define X(x) CONCAT(EXTERN, x)
#endif /* DAV1D_SRC_ARM_ASM_S */