dcbbc77038
This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.
Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.
Phases closed:
Phase 0 — substrate audit; Path A blocked, Path B open;
codec-back-end-fits-QPU finding (docs/phase0.md)
Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
publish-before-measure R = M2/M3 decision rules
(docs/phase1.md)
Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
under external/ffmpeg-snapshot/ (PROVENANCE.md pins
commit f46e514 + per-file SHA-256s) (docs/phase2.md)
Phase 3 — real baseline measurements on hertz (docs/phase3.md):
M1 bit-exact 100.0000 % (10000/10000)
M3 NEON IDCT8 single 8.171 Mblock/s (122.4 ns/block)
M5a empty Vulkan submit 22.66 us
M5b 1-WG noop dispatch 55.60 us
M5 delta 32.95 us/dispatch
=> one 1-WG dispatch (M5b, 55.60 us) costs ~455x one NEON IDCT8 block
(M3, 122.4 ns); Phase 4 must batch at frame level or close to it.
Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.
Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
174 lines
7.3 KiB
ArmAsm
174 lines
7.3 KiB
ArmAsm
/*
 * This file is part of FFmpeg.
 *
 * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * clip min, max, regs...
 *
 * Clamp every register listed in regs to the signed range [min, max]:
 * each register is first raised to at least min (smax), then lowered to
 * at most max (smin). All smax instructions are emitted before any smin.
 *
 * Operands are pasted textually, so min, max and every entry of regs must
 * be complete operands including the arrangement (e.g. v0.8h), and they
 * must all use the same arrangement.
 *
 * Each listed register is overwritten in place; min and max are only read.
 */
.macro clip min, max, regs:vararg
.irp x, \regs
        smax            \x, \x, \min
.endr
.irp x, \regs
        smin            \x, \x, \max
.endr
.endm

/*
 * transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
 *
 * In-place transpose of an 8x8 matrix of bytes held one row per register
 * in the .8b view of r0..r7. On exit r0..r7 hold columns 0..7 of the
 * input, in that order.
 *
 * Arguments are bare vector register names (e.g. v0); the macro appends
 * the arrangement suffix itself. r8 and r9 are scratch: they are
 * overwritten and must not alias any of r0..r7.
 */
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        /* Stage 1: interleave single bytes of each adjacent row pair. */
        trn1            \r8\().8b, \r0\().8b, \r1\().8b
        trn2            \r9\().8b, \r0\().8b, \r1\().8b
        trn1            \r1\().8b, \r2\().8b, \r3\().8b
        trn2            \r3\().8b, \r2\().8b, \r3\().8b
        trn1            \r0\().8b, \r4\().8b, \r5\().8b
        trn2            \r5\().8b, \r4\().8b, \r5\().8b
        trn1            \r2\().8b, \r6\().8b, \r7\().8b
        trn2            \r7\().8b, \r6\().8b, \r7\().8b

        /* Stage 2: interleave 16-bit byte pairs, giving 4-row column groups. */
        trn1            \r4\().4h, \r0\().4h, \r2\().4h
        trn2            \r2\().4h, \r0\().4h, \r2\().4h
        trn1            \r6\().4h, \r5\().4h, \r7\().4h
        trn2            \r7\().4h, \r5\().4h, \r7\().4h
        trn1            \r5\().4h, \r9\().4h, \r3\().4h
        trn2            \r9\().4h, \r9\().4h, \r3\().4h
        trn1            \r3\().4h, \r8\().4h, \r1\().4h
        trn2            \r8\().4h, \r8\().4h, \r1\().4h

        /* Stage 3: join the top and bottom 32-bit halves into full columns. */
        trn1            \r0\().2s, \r3\().2s, \r4\().2s
        trn2            \r4\().2s, \r3\().2s, \r4\().2s

        trn1            \r1\().2s, \r5\().2s, \r6\().2s
        trn2            \r5\().2s, \r5\().2s, \r6\().2s

        trn2            \r6\().2s, \r8\().2s, \r2\().2s
        trn1            \r2\().2s, \r8\().2s, \r2\().2s

        trn1            \r3\().2s, \r9\().2s, \r7\().2s
        trn2            \r7\().2s, \r9\().2s, \r7\().2s
.endm

/*
 * transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
 *
 * Transposes, in place, two side-by-side 8x8 byte matrices held in eight
 * 128-bit registers (one 16-byte row per register). The same three
 * interleave stages as an 8x8 byte transpose are applied to the full .16b
 * registers and there is no 64-bit interleave stage, so the two 64-bit
 * halves never mix: on exit the low half of r<i> holds input column i and
 * the high half holds input column i + 8.
 *
 * Arguments are bare vector register names (e.g. v0); the macro appends
 * the arrangement suffix itself. t0 and t1 are scratch: they are
 * overwritten and must not alias any of r0..r7.
 */
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
        /* Stage 1: interleave single bytes of each adjacent row pair. */
        trn1            \t0\().16b, \r0\().16b, \r1\().16b
        trn2            \t1\().16b, \r0\().16b, \r1\().16b
        trn1            \r1\().16b, \r2\().16b, \r3\().16b
        trn2            \r3\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().16b, \r4\().16b, \r5\().16b
        trn2            \r5\().16b, \r4\().16b, \r5\().16b
        trn1            \r2\().16b, \r6\().16b, \r7\().16b
        trn2            \r7\().16b, \r6\().16b, \r7\().16b

        /* Stage 2: interleave 16-bit byte pairs. */
        trn1            \r4\().8h, \r0\().8h, \r2\().8h
        trn2            \r2\().8h, \r0\().8h, \r2\().8h
        trn1            \r6\().8h, \r5\().8h, \r7\().8h
        trn2            \r7\().8h, \r5\().8h, \r7\().8h
        trn1            \r5\().8h, \t1\().8h, \r3\().8h
        trn2            \t1\().8h, \t1\().8h, \r3\().8h
        trn1            \r3\().8h, \t0\().8h, \r1\().8h
        trn2            \t0\().8h, \t0\().8h, \r1\().8h

        /* Stage 3: interleave 32-bit groups; each 64-bit half is now a column. */
        trn1            \r0\().4s, \r3\().4s, \r4\().4s
        trn2            \r4\().4s, \r3\().4s, \r4\().4s

        trn1            \r1\().4s, \r5\().4s, \r6\().4s
        trn2            \r5\().4s, \r5\().4s, \r6\().4s

        trn2            \r6\().4s, \t0\().4s, \r2\().4s
        trn1            \r2\().4s, \t0\().4s, \r2\().4s

        trn1            \r3\().4s, \t1\().4s, \r7\().4s
        trn2            \r7\().4s, \t1\().4s, \r7\().4s
.endm

/*
 * transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
 *
 * Transposes, in place, four side-by-side 4x4 byte matrices held in four
 * 128-bit registers (one 16-byte row per register). Only byte and 16-bit
 * interleaves are performed, so each 32-bit lane is handled independently:
 * on exit, 32-bit lane k of r<i> holds input column 4*k + i
 * (rows 0..3 in ascending byte order).
 *
 * Arguments are bare vector register names (e.g. v0); the macro appends
 * the arrangement suffix itself. t4..t7 are scratch: they are overwritten
 * and must not alias any of r0..r3.
 */
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
        /* Stage 1: interleave single bytes of rows 0/1 and rows 2/3. */
        trn1            \t4\().16b, \r0\().16b, \r1\().16b
        trn2            \t5\().16b, \r0\().16b, \r1\().16b
        trn1            \t6\().16b, \r2\().16b, \r3\().16b
        trn2            \t7\().16b, \r2\().16b, \r3\().16b

        /* Stage 2: interleave 16-bit byte pairs to complete each column. */
        trn1            \r0\().8h, \t4\().8h, \t6\().8h
        trn2            \r2\().8h, \t4\().8h, \t6\().8h
        trn1            \r1\().8h, \t5\().8h, \t7\().8h
        trn2            \r3\().8h, \t5\().8h, \t7\().8h
.endm

/*
 * transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
 *
 * Transposes, in place, two side-by-side 4x4 byte matrices held in the
 * .8b view of four registers (one 8-byte row per register). Only byte and
 * 16-bit interleaves are performed, so each 32-bit lane is handled
 * independently: on exit, 32-bit lane k of r<i> holds input column
 * 4*k + i (rows 0..3 in ascending byte order).
 *
 * Arguments are bare vector register names (e.g. v0); the macro appends
 * the arrangement suffix itself. t4..t7 are scratch: they are overwritten
 * and must not alias any of r0..r3.
 */
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
        /* Stage 1: interleave single bytes of rows 0/1 and rows 2/3. */
        trn1            \t4\().8b, \r0\().8b, \r1\().8b
        trn2            \t5\().8b, \r0\().8b, \r1\().8b
        trn1            \t6\().8b, \r2\().8b, \r3\().8b
        trn2            \t7\().8b, \r2\().8b, \r3\().8b

        /* Stage 2: interleave 16-bit byte pairs to complete each column. */
        trn1            \r0\().4h, \t4\().4h, \t6\().4h
        trn2            \r2\().4h, \t4\().4h, \t6\().4h
        trn1            \r1\().4h, \t5\().4h, \t7\().4h
        trn2            \r3\().4h, \t5\().4h, \t7\().4h
.endm

/*
 * transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
 *
 * In-place transpose of a 4x4 matrix of 16-bit elements held one row per
 * register in the .4h view of r0..r3. On exit r0..r3 hold columns 0..3 of
 * the input, in that order.
 *
 * Arguments are bare vector register names (e.g. v0); the macro appends
 * the arrangement suffix itself. r4..r7 are scratch: they are overwritten
 * and must not alias any of r0..r3.
 */
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
        /* Stage 1: interleave 16-bit elements of rows 0/1 and rows 2/3. */
        trn1            \r4\().4h, \r0\().4h, \r1\().4h
        trn2            \r5\().4h, \r0\().4h, \r1\().4h
        trn1            \r6\().4h, \r2\().4h, \r3\().4h
        trn2            \r7\().4h, \r2\().4h, \r3\().4h

        /* Stage 2: interleave 32-bit pairs to complete each column. */
        trn1            \r0\().2s, \r4\().2s, \r6\().2s
        trn2            \r2\().2s, \r4\().2s, \r6\().2s
        trn1            \r1\().2s, \r5\().2s, \r7\().2s
        trn2            \r3\().2s, \r5\().2s, \r7\().2s
.endm

/*
 * transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
 *
 * Transposes, in place, two side-by-side 4x4 matrices of 16-bit elements
 * held in four 128-bit registers (one 8-element row per register). Only
 * 16-bit and 32-bit interleaves are performed, so the two 64-bit halves
 * never mix: on exit the low half of r<i> holds input column i and the
 * high half holds input column i + 4.
 *
 * Arguments are bare vector register names (e.g. v0); the macro appends
 * the arrangement suffix itself. t4..t7 are scratch: they are overwritten
 * and must not alias any of r0..r3.
 */
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        /* Stage 1: interleave 16-bit elements of rows 0/1 and rows 2/3. */
        trn1            \t4\().8h, \r0\().8h, \r1\().8h
        trn2            \t5\().8h, \r0\().8h, \r1\().8h
        trn1            \t6\().8h, \r2\().8h, \r3\().8h
        trn2            \t7\().8h, \r2\().8h, \r3\().8h

        /* Stage 2: interleave 32-bit pairs to complete each column. */
        trn1            \r0\().4s, \t4\().4s, \t6\().4s
        trn2            \r2\().4s, \t4\().4s, \t6\().4s
        trn1            \r1\().4s, \t5\().4s, \t7\().4s
        trn2            \r3\().4s, \t5\().4s, \t7\().4s
.endm

/*
 * transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
 *
 * In-place transpose of an 8x8 matrix of 16-bit elements held one row per
 * 128-bit register in r0..r7. On exit r0..r7 hold columns 0..7 of the
 * input, in that order.
 *
 * Arguments are bare vector register names (e.g. v0); the macro appends
 * the arrangement suffix itself. r8 and r9 are scratch: they are
 * overwritten and must not alias any of r0..r7.
 */
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        /* Stage 1: interleave 16-bit elements of each adjacent row pair. */
        trn1            \r8\().8h, \r0\().8h, \r1\().8h
        trn2            \r9\().8h, \r0\().8h, \r1\().8h
        trn1            \r1\().8h, \r2\().8h, \r3\().8h
        trn2            \r3\().8h, \r2\().8h, \r3\().8h
        trn1            \r0\().8h, \r4\().8h, \r5\().8h
        trn2            \r5\().8h, \r4\().8h, \r5\().8h
        trn1            \r2\().8h, \r6\().8h, \r7\().8h
        trn2            \r7\().8h, \r6\().8h, \r7\().8h

        /* Stage 2: interleave 32-bit pairs, giving 4-row column groups. */
        trn1            \r4\().4s, \r0\().4s, \r2\().4s
        trn2            \r2\().4s, \r0\().4s, \r2\().4s
        trn1            \r6\().4s, \r5\().4s, \r7\().4s
        trn2            \r7\().4s, \r5\().4s, \r7\().4s
        trn1            \r5\().4s, \r9\().4s, \r3\().4s
        trn2            \r9\().4s, \r9\().4s, \r3\().4s
        trn1            \r3\().4s, \r8\().4s, \r1\().4s
        trn2            \r8\().4s, \r8\().4s, \r1\().4s

        /* Stage 3: join the top and bottom 64-bit halves into full columns. */
        trn1            \r0\().2d, \r3\().2d, \r4\().2d
        trn2            \r4\().2d, \r3\().2d, \r4\().2d

        trn1            \r1\().2d, \r5\().2d, \r6\().2d
        trn2            \r5\().2d, \r5\().2d, \r6\().2d

        trn2            \r6\().2d, \r8\().2d, \r2\().2d
        trn1            \r2\().2d, \r8\().2d, \r2\().2d

        trn1            \r3\().2d, \r9\().2d, \r7\().2d
        trn2            \r7\().2d, \r9\().2d, \r7\().2d
.endm