Files
daedalus-fourier/external/dav1d-snapshot/src/arm/asm.S
T
marfrit 2cd2258a7b Cycle 5 setup (Phase 1+2): vendor dav1d 1.4.3 CDEF sources
First AV1 kernel cycle and first dav1d-vendored sources. Phase 1+2
docs lay out the structural complexity (CDEF needs pre-padded 12x12
working buffer + external edge context + direction lookup +
constraint function — meaningfully more complex than cycles 1-4).

Phase 3+ deferred to next session — CDEF is the first cycle that
doesn't fit cleanly into a single autonomous run.

Vendored from dav1d 1.4.3 (BSD-2-Clause, cleaner license than
FFmpeg's LGPL-2.1+):

  src/arm/64/cdef.S            520 lines — NEON impl
  src/arm/64/util.S            278 lines — NEON helpers
  src/arm/asm.S                335 lines — GAS preamble
  src/cdef_tmpl.c              331 lines — C reference (templated)
  include/common/intops.h       84 lines — utility helpers
  src/tables_cdef_subset.c      hand-extracted — dav1d_cdef_directions
                                only (avoids dragging full 1013-line
                                tables.c + transitive includes)

Discovery from Phase 2 analysis:
- Filter type and shape: dav1d_cdef_filter8_pri_sec_8bpc_neon takes
  (dst, dst_stride, tmp, pri_strength, sec_strength, dir, damping, h).
  The 'tmp' arg is the pre-padded 12x12 buffer constructed externally
  by the dav1d C-side padding() function.
- Tap weights are inline-computed (not table): pri_tap = 4 or 3
  (based on pri_strength bit), sec_tap = 2 or 1. Only
  dav1d_cdef_directions[12][2] is an external table.
- Constraint function: constrain(diff, threshold, shift) =
  apply_sign(min(abs(diff), max(0, threshold - (abs(diff) >> shift))),
             diff)

Predicted R5 band: 0.15-0.30 (ORANGE). CDEF is compute-heavier than
LPF (per-pixel min/max conditional logic), so likely worse R than
cycle 2/4 but better than cycle 3 MC. M4 gate likely required.

What Phase 3+ needs (next session):
1. config.h shim for dav1d's asm preamble (defines TBD on first build)
2. Standalone C reference for cdef_filter_block_8x8_c
   (cdef_tmpl.c references several dav1d private headers; cleaner to
   transcribe to a self-contained tests/cdef_ref.c)
3. tests/bench_neon_cdef.c — M1+M3 bench
4. Phase 4 plan, Phase 5 review (mandatory), Phase 6 shader, Phase 7 measure

PROVENANCE.md documents pin + per-file role + re-vendoring procedure.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:12:25 +00:00

336 lines
9.8 KiB
ArmAsm

/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_ARM_ASM_S
#define DAV1D_SRC_ARM_ASM_S
#include "config.h"
#if ARCH_AARCH64
/* x18 is reserved as the platform register on some AArch64 targets. Renaming
 * it to an identifier that is not a register name is presumably meant to make
 * any accidental use stand out instead of being silently accepted. */
#define x18 do_not_use_x18
#define w18 do_not_use_w18
/* Emit the .arch level only when the assembler accepts the directive.
 * HAVE_AS_ARCH_DIRECTIVE and AS_ARCH_LEVEL are not defined in this file;
 * presumably they are provided by config.h. */
#if HAVE_AS_ARCH_DIRECTIVE
.arch AS_ARCH_LEVEL
#endif
/* For each optional extension below, ENABLE_x / DISABLE_x expand to the
 * matching ".arch_extension x" / ".arch_extension nox" directive when the
 * corresponding HAVE_AS_ARCHEXT_*_DIRECTIVE switch is set, and to nothing
 * otherwise, so they can be used unconditionally. */
/* dotprod */
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
#define ENABLE_DOTPROD .arch_extension dotprod
#define DISABLE_DOTPROD .arch_extension nodotprod
#else
#define ENABLE_DOTPROD
#define DISABLE_DOTPROD
#endif
/* i8mm */
#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
#define ENABLE_I8MM .arch_extension i8mm
#define DISABLE_I8MM .arch_extension noi8mm
#else
#define ENABLE_I8MM
#define DISABLE_I8MM
#endif
/* sve */
#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
#define ENABLE_SVE .arch_extension sve
#define DISABLE_SVE .arch_extension nosve
#else
#define ENABLE_SVE
#define DISABLE_SVE
#endif
/* sve2 */
#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
#define ENABLE_SVE2 .arch_extension sve2
#define DISABLE_SVE2 .arch_extension nosve2
#else
#define ENABLE_SVE2
#define DISABLE_SVE2
#endif
/* If we do support the .arch_extension directives, disable support for all
 * the extensions that we may use, in case they were implicitly enabled by
 * the .arch level. This makes it clear if we try to assemble an instruction
 * from an unintended extension set; we only allow assembling such instructions
 * within regions where we explicitly enable those extensions. */
DISABLE_DOTPROD
DISABLE_I8MM
DISABLE_SVE
DISABLE_SVE2
/* Support macros for
* - Armv8.3-A Pointer Authentication and
* - Armv8.5-A Branch Target Identification
* features which require emitting a .note.gnu.property section with the
* appropriate architecture-dependent feature bits set.
*
* |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
* PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
* used immediately before saving the LR register (x30) to the stack.
* |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
* it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
* with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
* have the same value at the two points. For example:
*
* .global f
* f:
* AARCH64_SIGN_LINK_REGISTER
* stp x29, x30, [sp, #-96]!
* mov x29, sp
* ...
* ldp x29, x30, [sp], #96
* AARCH64_VALIDATE_LINK_REGISTER
* ret
*
* |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
* |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
* indirect call target. In particular, all symbols exported from a file must
* begin with one of these macros. For example, a leaf function that does not
* save LR can instead use |AARCH64_VALID_CALL_TARGET|:
*
* .globl return_zero
* return_zero:
* AARCH64_VALID_CALL_TARGET
* mov x0, #0
* ret
*
* A non-leaf function which does not immediately save LR may need both macros
* because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
* may jump to an alternate implementation before setting up the stack:
*
* .globl with_early_jump
* with_early_jump:
* AARCH64_VALID_CALL_TARGET
* cmp x0, #128
* b.lt .Lwith_early_jump_128
* AARCH64_SIGN_LINK_REGISTER
* stp x29, x30, [sp, #-96]!
* mov x29, sp
* ...
* ldp x29, x30, [sp], #96
* AARCH64_VALIDATE_LINK_REGISTER
* ret
*
* .Lwith_early_jump_128:
* ...
* ret
*
* These annotations are only required with indirect calls. Private symbols that
* are only the target of direct calls do not require annotations. Also note
* that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
* indirect jumps (BR). Indirect jumps in assembly are supported through
* |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
* calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
*
* Although not necessary, it is safe to use these macros in 32-bit ARM
* assembly. This may be used to simplify dual 32-bit and 64-bit files.
*
* References:
* - "ELF for the Arm® 64-bit Architecture"
* https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
* - "Providing protection for complex software"
* https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
*/
/* Branch Target Identification.
 * When the compiler reports BTI as enabled by default, the three landing-pad
 * macros expand to HINT instructions (the BTI variant is noted on each line)
 * and GNU_PROPERTY_AARCH64_BTI carries the note bit. Otherwise the macros
 * expand to nothing and the note bit is 0. */
#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification
#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc'
#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c'
#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j'
#else
#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification
#define AARCH64_VALID_JUMP_CALL_TARGET
#define AARCH64_VALID_CALL_TARGET
#define AARCH64_VALID_JUMP_TARGET
#endif
/* Pointer authentication of the link register.
 * __ARM_FEATURE_PAC_DEFAULT is treated as a bit mask: bit 0 selects key A,
 * bit 1 selects key B (key A wins if both are set), and bit 2 (leaf-function
 * authentication) is rejected with an #error. A mask with neither key bit set
 * is rejected as well. */
#if defined(__ARM_FEATURE_PAC_DEFAULT)
#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
#define AARCH64_SIGN_LINK_REGISTER paciasp
#define AARCH64_VALIDATE_LINK_REGISTER autiasp
#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
#define AARCH64_SIGN_LINK_REGISTER pacibsp
#define AARCH64_VALIDATE_LINK_REGISTER autibsp
#else
#error Pointer authentication defines no valid key!
#endif
#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
#error Authentication of leaf functions is enabled but not supported in dav1d!
#endif
#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
/* Apple arm64e without __ARM_FEATURE_PAC_DEFAULT: sign/validate with key B,
 * but leave the note bit at 0. */
#elif defined(__APPLE__) && defined(__arm64e__)
#define GNU_PROPERTY_AARCH64_PAC 0
#define AARCH64_SIGN_LINK_REGISTER pacibsp
#define AARCH64_VALIDATE_LINK_REGISTER autibsp
/* No pointer authentication: both macros expand to nothing. */
#else /* __ARM_FEATURE_PAC_DEFAULT */
#define GNU_PROPERTY_AARCH64_PAC 0
#define AARCH64_SIGN_LINK_REGISTER
#define AARCH64_VALIDATE_LINK_REGISTER
#endif /* !__ARM_FEATURE_PAC_DEFAULT */
/* On ELF targets, when BTI and/or PAC is in use, emit a .note.gnu.property
 * entry advertising those features. The field names in the comments below
 * follow the usual ELF note / GNU property layout. */
#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
.pushsection .note.gnu.property, "a"
.balign 8
.long 4 /* name size: "GNU" plus its terminating NUL */
.long 0x10 /* descriptor size: the four .long words after the name */
.long 0x5 /* note type (NT_GNU_PROPERTY_TYPE_0) */
.asciz "GNU"
.long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
.long 4 /* size in bytes of the property value that follows */
.long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
.long 0 /* presumably padding up to the 8-byte alignment used above */
.popsection
#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
#endif /* ARCH_AARCH64 */
/* 32-bit ARM preamble: unified syntax, ARMv7-A + NEON on ELF targets, and
 * selection between ARM and Thumb encoding. */
#if ARCH_ARM
.syntax unified
#ifdef __ELF__
.arch armv7-a
.fpu neon
.eabi_attribute 10, 0 // suppress Tag_FP_arch
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
#endif /* __ELF__ */
/* Thumb encoding is selected only for Windows builds. */
#ifdef _WIN32
#define CONFIG_THUMB 1
#else
#define CONFIG_THUMB 0
#endif
/* A and T are presumably meant as line prefixes for instructions that exist
 * only in the ARM (A) or only in the Thumb (T) variant of a routine. The
 * prefix for the inactive mode expands to '@', which starts a comment in GNU
 * as ARM syntax, and the prefix for the active mode expands to nothing. */
#if CONFIG_THUMB
.thumb
#define A @
#define T
#else
#define A
#define T @
#endif /* CONFIG_THUMB */
#endif /* ARCH_ARM */
/* Unless the build already defines PIC, mirror the compiler's own
 * position-independence macro (__PIC__ is preferred over __pic__). */
#ifndef PIC
#ifdef __PIC__
#define PIC __PIC__
#elif defined(__pic__)
#define PIC __pic__
#endif
#endif
/* Default name prefix for symbols defined by this project's assembly. */
#if !defined(PRIVATE_PREFIX)
#define PRIVATE_PREFIX dav1d_
#endif
/* Two-step token pasting: CONCAT expands its arguments before PASTE glues
 * them together with ##. */
#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)
/* EXTERN is the prefix put in front of exported symbol names. When PREFIX is
 * defined, an extra leading underscore is prepended to PRIVATE_PREFIX. */
#if !defined(PREFIX)
#define EXTERN PRIVATE_PREFIX
#else
#define EXTERN CONCAT(_,PRIVATE_PREFIX)
#endif
/*
 * function name, export=0, align=2
 *
 * Opens a function in .text and defines a one-shot `endfunc` macro that
 * closes it again.
 *
 *   name   - local label placed at the function entry.
 *   export - when non-zero, EXTERN<name> is additionally emitted as a global
 *            symbol at the same address, with hidden (ELF) or private_extern
 *            (Mach-O) visibility; on AArch64 the entry then starts with
 *            AARCH64_VALID_CALL_TARGET.
 *   align  - operand handed to .align.
 *
 * endfunc records the ELF symbol size (for the local label and, when
 * exported, for the EXTERN<name> alias as well), closes .func when
 * HAVE_AS_FUNC is set, and finally purges itself so that the next `function`
 * can define it again.
 */
.macro function name, export=0, align=2
    .macro endfunc
#ifdef __ELF__
        .size   \name, . - \name
    .if \export
        /* The exported alias is a separate symbol table entry; without its
         * own .size it would be recorded with size 0. */
        .size   EXTERN\name, . - EXTERN\name
    .endif
#endif
#if HAVE_AS_FUNC
        .endfunc
#endif
        .purgem endfunc
    .endm
        .text
        .align  \align
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .type   EXTERN\name, %function
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
#if HAVE_AS_FUNC
        .func   EXTERN\name
#endif
EXTERN\name:
    .else
#ifdef __ELF__
        .type   \name, %function
#endif
#if HAVE_AS_FUNC
        .func   \name
#endif
    .endif
\name:
#if ARCH_AARCH64
    .if \export
        /* Exported functions may be reached through indirect calls. */
        AARCH64_VALID_CALL_TARGET
    .endif
#endif
.endm
/*
 * const name, export=0, align=2
 *
 * Opens a read-only data object and defines a one-shot `endconst` macro that
 * closes it again.
 *
 *   name   - local label placed at the start of the data.
 *   export - when non-zero, EXTERN<name> is additionally emitted as a global
 *            symbol at the same address, with hidden (ELF) or private_extern
 *            (Mach-O) visibility.
 *   align  - operand handed to .align.
 *
 * The data goes to .rdata on Windows, to .const_data on Mach-O and to .rodata
 * everywhere else. endconst records the ELF size of the local label and then
 * purges itself.
 */
.macro const name, export=0, align=2
    .macro endconst
#if defined(__ELF__)
        .size   \name, . - \name
#endif
        .purgem endconst
    .endm
#if defined(_WIN32)
        .section .rdata
#elif defined(__MACH__)
        .const_data
#else
        .section .rodata
#endif
        .align  \align
    .if \export
        .global EXTERN\name
#if defined(__ELF__)
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
EXTERN\name:
    .endif
\name:
.endm
/* L(x) builds a label name from x: ".L" + x in general, "L" + x when building
 * for Apple targets. */
#if !defined(__APPLE__)
#define L(x) .L ## x
#else
#define L(x) L ## x
#endif
/* X(x) is x with the EXTERN prefix applied. */
#define X(x) CONCAT(EXTERN, x)
#endif /* DAV1D_SRC_ARM_ASM_S */