Path B pivot + Phase 0-3 closed with first baseline numbers

This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.

Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.

Phases closed:
  Phase 0 — substrate audit; Path A blocked, Path B open;
            codec-back-end-fits-QPU finding (docs/phase0.md)
  Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
            publish-before-measure R = M2/M3 decision rules
            (docs/phase1.md)
  Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
            under external/ffmpeg-snapshot/ (PROVENANCE.md pins
            commit f46e514 + per-file SHA-256s) (docs/phase2.md)
  Phase 3 — real baseline measurements on hertz (docs/phase3.md):
              M1 bit-exact            100.0000 % (10000/10000)
              M3 NEON IDCT8 single    8.171 Mblock/s (122.4 ns/block)
              M5a empty Vulkan submit 22.66 us
              M5b 1-WG noop dispatch  55.60 us
              M5 delta                32.95 us/dispatch
            => per-dispatch overhead is ~455x per-NEON-block cost;
               Phase 4 must batch at frame level or close to it.

Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.

Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 11:30:12 +00:00
commit dcbbc77038
22 changed files with 9030 additions and 0 deletions
+13
View File
@@ -0,0 +1,13 @@
build/
build-*/
*.o
*.spv
.cache/
.vscode/
.idea/
*.swp
*~
# Forensic snapshot of the corrupted .git from 2026-05-18 10:25
# working-tree wipe. Retained on disk for inspection; not tracked.
.git-broken-2026-05-18/
+103
View File
@@ -0,0 +1,103 @@
# daedalus-fourier — Phase 3 baseline + (later) Phase 6 implementation.
#
# Builds:
# bench_neon_idct — NEON throughput baseline (Phase 3 M3) +
# bit-exact correctness gate (Phase 1 M1).
# bench_vulkan_dispatch — Vulkan compute dispatch-overhead baseline (M5).
#
# Linkage note: bench_neon_idct statically links the vendored
# FFmpeg n7.1.3 NEON snapshot (LGPL-2.1+); see
# external/ffmpeg-snapshot/PROVENANCE.md.
cmake_minimum_required(VERSION 3.20)
project(daedalus-fourier LANGUAGES C ASM)

# Every C source in the tree is built as C11; a compiler that cannot
# provide it is a configure error rather than a silent downgrade.
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)

# Fall back to an optimised build when the caller did not choose one.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Refuse to configure for any processor other than aarch64.
if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message(FATAL_ERROR
    "daedalus-fourier targets aarch64 (Pi 5 / BCM2712). "
    "Cross-compile not yet wired.")
endif()

# Warning set shared by every target declared below.
add_compile_options(-Wall -Wextra -Wno-unused-parameter)
# ---- Vendored FFmpeg snapshot (LGPL-2.1+) -----------------------------------
set(FFSNAP ${CMAKE_SOURCE_DIR}/external/ffmpeg-snapshot)

# Include search path for the vendored .S file. Two directories cover the
# three includes it needs:
#   - FFSNAP/                     `#include "config.h"` finds our shim, and
#                                 `#include "libavutil/aarch64/asm.S"`
#                                 resolves against the vendored copy
#   - FFSNAP/libavcodec/aarch64/  `#include "neon.S"` finds the helper
# (FFSNAP/ was previously listed twice; one entry is sufficient.)
set(FFASM_FLAGS
  -I${FFSNAP}
  -I${FFSNAP}/libavcodec/aarch64
)

set(FFASM_SOURCES
  ${FFSNAP}/libavcodec/aarch64/vp9itxfm_neon.S
)

# Mark the vendored .S files as ASM and attach the include flags to those
# files only, so the project's C sources do not pick up the snapshot's
# include path.
set_source_files_properties(${FFASM_SOURCES} PROPERTIES
  COMPILE_OPTIONS "${FFASM_FLAGS}"
  LANGUAGE ASM)
# ---- NEON baseline microbench ----------------------------------------------
# Pure CPU baseline: this target has no Vulkan or DRM dependency.
add_executable(bench_neon_idct)
target_sources(bench_neon_idct PRIVATE
  tests/bench_neon_idct.c
  tests/vp9_idct8_ref.c
  ${FFASM_SOURCES}
)
target_compile_options(bench_neon_idct PRIVATE -O3 -march=armv8-a+simd)
# ---- Vulkan dispatch-overhead microbench -----------------------------------
# Builds tests/bench_vulkan_dispatch.c together with its no-op compute shader.
# Enabled by default; configure with -DDAEDALUS_BUILD_VULKAN=OFF to skip this
# section, in which case neither Vulkan nor a glslang binary is looked up.
option(DAEDALUS_BUILD_VULKAN "Build Vulkan compute-dispatch microbench" ON)
if (DAEDALUS_BUILD_VULKAN)
find_package(Vulkan REQUIRED)
# Compile GLSL compute shaders to SPIR-V via glslangValidator.
# The binary loads them at runtime from the build dir (cwd-relative).
# Either executable name is accepted; configuration fails if neither is found.
find_program(GLSLANG_VALIDATOR
NAMES glslangValidator glslang
REQUIRED)
# SPIR-V output lands at the top of the build tree.
set(NOOP_SPV ${CMAKE_BINARY_DIR}/noop.spv)
# Rebuild noop.spv whenever noop.comp changes.
add_custom_command(
OUTPUT ${NOOP_SPV}
COMMAND ${GLSLANG_VALIDATOR} -V -o ${NOOP_SPV}
${CMAKE_SOURCE_DIR}/tests/shaders/noop.comp
DEPENDS ${CMAKE_SOURCE_DIR}/tests/shaders/noop.comp
COMMENT "glslang: noop.comp -> noop.spv"
VERBATIM
)
# ALL makes the shader part of the default build; the add_dependencies()
# below additionally orders it before the benchmark executable.
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV})
add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c)
add_dependencies(bench_vulkan_dispatch daedalus_shaders)
target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan)
target_compile_options(bench_vulkan_dispatch PRIVATE -O2)
endif()
# ---- Summary ----------------------------------------------------------------
# message() runs at configure time and never evaluates generator expressions
# (a "$<...>" argument is printed literally), so the list of enabled targets
# is assembled with an ordinary if() instead.
set(daedalus_summary_targets bench_neon_idct)
if(DAEDALUS_BUILD_VULKAN)
  list(APPEND daedalus_summary_targets bench_vulkan_dispatch)
endif()
list(JOIN daedalus_summary_targets "; " daedalus_summary_targets_text)

message(STATUS "daedalus-fourier build configured for ${CMAKE_SYSTEM_PROCESSOR}")
message(STATUS "  FFmpeg snapshot: ${FFSNAP}")
message(STATUS "  Build type:      ${CMAKE_BUILD_TYPE}")
message(STATUS "  Targets:         ${daedalus_summary_targets_text}")
+177
View File
@@ -0,0 +1,177 @@
# daedalus-fourier
Community-built VP9 / AV1 software-decode back-end running on the
VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 /
Compute Module 5), via the existing Mesa `v3d` userspace driver.
ARM keeps the serial entropy front-end; the QPU takes the parallel
back-end (inverse transforms, deblocking, CDEF, loop restoration,
MC residual add).
> Daedalus built the Labyrinth for King Minos, then escaped from it
> by hand-forging flight firmware out of feathers and wax when no
> sanctioned exit existed.
That's the project shape. The Broadcom-locked VideoCore VII is the
Labyrinth; the Pi Foundation's "use the HEVC block and live with
software decode for everything else" is the official non-exit;
the QPU sits unused inside the labyrinth's walls.
**Status: Phase 0 closed (substrate audit). Phase 1 in progress
(first-kernel proof on hertz).** This is research-track work that
may take months or may yield a single proof-of-concept kernel that
loses to ARM NEON, in which case the negative result ships and the
project closes.
## Why this exists
higgs is a Raspberry Pi Compute Module 5 in a small portable
chassis with a battery. Watching nerds review *Star Wars* on YouTube
while putting Mac Studios into virtual shopping baskets is a
core workload for the higgs class of device.
YouTube serves H.264 (legacy), VP9 (typical 4K), and AV1 (newer
high-bitrate / high-resolution content). It does not serve HEVC.
Pi 5's BCM2712 has one HW decoder block: HEVC. The intersection
of {what YouTube serves} ∩ {what BCM2712 decodes in HW} = ∅.
Every YouTube frame on higgs today is software-decoded on Cortex-A76
cores at ~50–90% CPU per video stream. Offloading the parallel
back-end of that decode to the otherwise-idle QPU complex *might*
recover meaningful CPU time and battery on higgs. The honest
prior — measured in Phase 0 — is that the QPU has roughly equal
raw compute to the A76 cluster but a smaller slice of the shared
LPDDR4x bandwidth, so the win, if any, comes from offloading
*concurrent* work the CPU would have done anyway.
The Pi Foundation isn't going to do this work (per their own
statement: chromium-patch sustainment was too much; codec
sustainment would be more so). The kernel `rpi-hevc-dec` series has
been 17 months in review for one decoder block they DID write
themselves. Whatever ships here ships through the community.
## Architecture (Path B)
Phase 0 evaluated two paths; it closed Path A and left Path B open:
- **Path A — custom VPU firmware on the VC7 scalar cores.**
Blocked. BCM2712 has a silicon root of trust: the mask ROM
hardcodes RPi's public key and unconditionally verifies the
second-stage bootloader. `EXECUTE_CODE` mailbox removed on Pi 5.
No software-only bypass exists. See `docs/phase0.md §3`.
- **Path B — QPU compute kernels via the existing Mesa `v3d` /
DRM / Vulkan-compute path.** This is the path. The QPU is
reachable from userspace today on a stock signed Pi 5 / CM5
via `/dev/dri/card0`. No firmware loading. No signing fight.
`Idein/py-videocore7` (SGEMM 21 GFLOPS sustained) is the
existence proof.
The build:
```
┌───────────────────────────────┐
│ userspace VP9 / AV1 decoder │
│ (fork of dav1d / libvpx) │
├───────────────────────────────┤
│ ARM: entropy decode │ ← Cortex-A76 + NEON
│ (Bool coder / ANS) │ structurally serial
├───────────────────────────────┤
│ QPU: parallel back-end │ ← V3D 7.1 via Mesa v3dv
│ (IDCT, CDEF, │ Vulkan compute shaders
│ deblock, LR, MC) │ or direct DRM submit
├───────────────────────────────┤
│ V4L2 stateless wrapper │ ← out-of-tree kernel module
│ (eventual, kernel-agent) │ exposing /dev/videoN
└───────────────────────────────┘
```
The first deliverable is *not* the V4L2 wrapper. The first
deliverable is one back-end kernel running on the QPU, bit-exact
against a libavcodec reference, with measured throughput. If that
single kernel can't beat NEON or get within 50% of it, the project
closes here with a documented negative result.
## In scope
- A small set of codec back-end kernels (IDCT 8×8, CDEF, deblocking,
loop restoration filter, MC interpolation) compiled as SPIR-V
compute shaders for Mesa `v3dv`, dispatched via Vulkan compute
from userspace.
- A test harness on hertz that runs each kernel against libavcodec
reference outputs and measures throughput (megapixels/sec or
blocks/sec) against the equivalent NEON path.
- Phase 1 = one kernel, bit-exact, with numbers. Phase 2+ = more
kernels only if Phase 1 numbers justify it.
## Out of scope (for now)
- HEVC (Pi 5 has dedicated silicon; `rpi-hevc-dec` covers it).
- Pi 4 / BCM2711 / VideoCore VI. Different ISA, smaller compute
budget. Path B *could* extend but isn't the priority.
- Encode. Pi Foundation removed all HW encode in Pi 5; encode on
VC7 is a separate, larger project.
- Custom VPU firmware (Path A — blocked by silicon RoT, see
`docs/phase0.md`).
- V4L2 stateless driver wrapping the userspace decoder. Eventual
consumption point, but Phase 1 lives entirely in userspace.
- Beating ARM NEON unconditionally. The honest target is
*concurrent* work: QPU runs while CPU does something else.
## Dev substrate
- **hertz** (Pi 5, 8 GB, Debian Trixie, kernel `6.12.75+rpt-rpi-2712`,
Mesa 25.0.7 with v3dv, V3D 7.1.7) — the dev / test / measurement
host. Watchdog-protected for crash recovery. See
`docs/vulkaninfo_v3d_7_1_7_hertz.txt` for the inside-view device
profile.
- **higgs** (CM5 in portable battery chassis) — the eventual user
target. Not a dev unit; sealed chassis.
## Conventions
This project follows the 9(+1)-phase dev process. See
`docs/dev_process.md`. Phase 0 is closed (`docs/phase0.md`);
Phase 1 is `docs/phase1.md`.
Gitea identity: `claude-noether` (per
`feedback_gitea_as_claude_noether.md`). No `marfrit` pushes from
Claude sessions.
## Layout
```
daedalus-fourier/
├── README.md ← this file
├── docs/
│ ├── dev_process.md ← reference copy of the 9(+1)-phase loop
│ ├── phase0.md ← substrate audit (closes Path A, opens Path B)
│ ├── phase1.md ← first-kernel goal + measurement plan
│ └── vulkaninfo_v3d_7_1_7_hertz.txt
│ ← inside-view device profile from hertz
├── src/ ← kernels + Vulkan dispatch harness
└── tests/ ← bit-exact vs libavcodec, throughput
```
The build system is CMake: the top-level `CMakeLists.txt` builds the benchmarks under `tests/`.
## Sibling projects in the same orbit
- `libva-v4l2-request-fourier` — VA-API consumer-side backend.
Eventual consumer if daedalus produces a V4L2 stateless node.
- `firefox-fourier` — Firefox fork that routes stateless V4L2
through libavcodec's `v4l2_request` hwaccel. Same pickup point.
- `chromium-fourier` — sibling for Chromium.
- `kernel-agent` — would house the V4L2 driver wrapping the
userspace decoder, once one exists.
- `ampere-av1-enablement` — software-side AV1 bring-up on RK3588
(rkvdec / vpu981). Provides the userspace conformance harness
daedalus reuses for VC7-AV1 verification.
## Source attribution
Daedalus-the-myth is public domain. The wax-and-feathers
metaphor is older than software engineering.
Anyone wanting to fail at this project: please file your failures
under `branches/icarus/`. Built-in self-deprecation slot, with
honor.
+96
View File
@@ -0,0 +1,96 @@
---
name: Claude-Assisted Development Process (9(+1)-phase loop)
description: Default workflow for any non-trivial implementation — substrate/motivation/inventory, formulate, analyze, baseline, plan, second-model review, implement, verify, closing (package+ship), memory-update; with explicit loopback edges
type: feedback
originSessionId: 83898ac9-e61f-4c44-8429-0154cb12d124
---
Markus's standardized loop for our implementation work. Apply by default whenever a task is bigger than a one-liner. Skipping phases is a deliberate choice that should be flagged, not a default.
## Phase 0 — Substrate / Motivation / Inventory
Pre-formulation. Lock the research question and assemble the substrate *before* Phase 1 commits to a measurable goal. Output: a `phase0_findings.md` artifact that future phases can refer back to without re-deriving.
- **Research question + mechanism captured.** State the question in one sentence. Capture any operator-supplied mechanism (the "why this question, how does it work" insight) verbatim — it's the load-bearing claim Phase 1 binds against.
- **Predecessor carry-over: state vs data.** When a campaign succeeds another, categorize what transfers. *State* (installed packages, governor settings, system tweaks, source-read file:line pointers, protocol designs, parser scripts) carries forward. *Data* (drop counts, perf percentages, threshold values, baseline floors) does not — it is reference history only. Binding cells in this campaign anchor to in-session-acquired numbers, even if the predecessor measured an identical condition.
- **Tooling and measurement-instrument inventory.** What's installed, what would need installing, what extensions/protocols the live system actually supports. Live verification, not paper compatibility.
- **In-session baseline anchor.** Re-run the reference rep — N=3 minimum if the baseline is load-bearing for the campaign's premise — *before* any instrument changes. **If the predecessor's reference floor doesn't replicate at N=3 in the same session, that is the campaign result.** Don't build multi-phase infrastructure on an N=1 historical floor. See `feedback_replicate_baseline_first.md`.
- **Open questions tabled.** What's not known going into Phase 1. Phase 1 locks against the knowns; Phase 0 surfaces the unknowns explicitly so they don't slip into binding cells unverified.
## Phase 1 — Goal Formulation
Define the objective in measurable terms. State what success looks like *before* touching anything. The chosen metric is a **hypothesis** about what to measure, not an axiom — Phase 3 may invalidate it.
## Phase 2 — Situation Analysis
Document current state. Identify constraints, dependencies, known failure modes. **Reset context here** — do not carry assumptions from prior sessions; re-read CLAUDE.md, relevant memory files, run `git status`, re-verify reachability.
## Phase 3 — Baseline Measurements
Take concrete measurements *before* any changes. Paste raw output into DokuWiki at capture time — verbatim, not paraphrased. The Phase 5 artifact is the raw data, not Claude's summary.
**Real data, not theatre.** Phase 3 exists to use AI capacity for absorbing wide, low-level instrumentation a human reader would skim past. Attaching strace / perf / ftrace / eBPF / custom tripwires to the process under test is real Phase 3; scraping mpv's stdout dropped-frame counter is not. Discriminator: if a human with bash and grep could produce the same baseline, it isn't Phase 3 yet — go down to the syscall / call-path / MMIO / register layer. See `feedback_phase3_no_theatre.md`.
**Anti-fabrication:**
- Every cited value traces to a visible tool invocation or verbatim paste-in. If a measurement wasn't taken, write "not measured" — never an estimate, inference, or recall from training / prior sessions / sibling-host memory.
- Raw before derived. A derived number (FPS, p99, error rate) appears alongside the raw stream it came from, never alone.
- Rig failure is the finding. Empty strace, dead UART, perf counter that didn't increment → that *is* the Phase 3 result. Loop back to Phase 2 to fix the rig; do not synthesize plausible-looking baseline data to keep momentum.
- **If baseline reveals the Phase 1 metric was tracking the wrong thing → loop back to Phase 1** with the corrected target. (Example: "max H.264 FPS" Phase 1 metric, but baseline shows DMA-setup + sync overhead dwarfs decode → real metric is bytes-copied-per-second / EGL surface-import time, not FPS.)
**Measurements describe what the system *does*, not what it *should do*.** Baseline data is evidence, not a specification. Do NOT derive API call sequences, struct layouts, or parameter values from observed behaviour (strace, perf, example output). Observable behaviour may reflect bugs, workarounds, or implementation accidents — anything you copy from it inherits those.
## Phase 4 — Plan
Formulate the approach. Identify what will and will not be touched. State expected outcome of implementation in the *same* measurable terms used in Phase 1/3.
## Phase 5 — Second Model Review
Goal, situation, measurements, plan get pasted into **DokuWiki**. Markus reviews and redacts, then initiates the handover to a fresh model instance. **Claude does not curate the artifact going to the reviewer** — that would re-introduce the blind-spot accumulation the review is meant to escape. Do not summarize when handing over; paste the actual artifacts.
## Phase 6 — Implementation
Execute the plan. Scope strictly to what was planned — resist feature creep, refactor-creep, "while I'm here" cleanups, and over-eager scope expansion. If a plan revision is needed mid-implementation, surface it explicitly and re-enter Phase 4.
**Contract before code.** Before writing or modifying any call site:
- Read the API contract — kernel docs, header comments, and upstream source for every call touched.
- State the contract explicitly before implementing against it (in the plan, the commit message, or a comment — somewhere reviewable).
- If the contract cannot be found: stop and surface the gap. Don't infer it from baseline behaviour or sibling code.
**Copying from baseline measurements is not implementation. It is transcription of potentially broken behaviour.** A deliverable that matches baseline bytes but violates the API contract is not a deliverable — it is a deferred bug.
### What "state the contract explicitly" looks like
Worked example: `0012-h264-omit-scaling-matrix-frame-based.patch` in `~/src/ohm_gl_fix/phase6/step1/`. The commit message opens with the contract before any code:
> VAAPI signals "explicit scaling lists are present in the bitstream" implicitly: the consumer (ffmpeg-vaapi, mpv, etc.) sends a `VAIQMatrixBufferH264` alongside `RenderPicture` iff `sps_scaling_matrix_present_flag || pps_scaling_matrix_present_flag`. When the bitstream uses default (flat) scaling, no IQMatrixBuffer arrives […]
>
> Earlier draft of this patch unconditionally omitted SCALING_MATRIX in FRAME_BASED. That's **corpus-correct** (bbb has no explicit scaling lists) but the **wrong predicate**: the kernel-side gating is by "matrix-supplied vs. not," not by decode mode. […]
>
> Contract verification (audit_0008_decode_params_2026-05-01.md + hantro_h264.c::assemble_scaling_list): the kernel uses the supplied matrix when SCALING_MATRIX is in the control batch and falls back to spec-defined defaults when absent. Mode-independent.
What this gets right:
- **Contract first**: per-control rules cited from kernel doc (`ext-ctrls-codec-stateless.rst:752`), kernel driver (`hantro_h264.c::assemble_scaling_list`), and sibling implementation (gst-plugins-bad commit 9e3e775) — *before* any patch hunks.
- **Corpus-correct ≠ spec-correct, called out by name**: the rejected predicate ("omit SCALING_MATRIX in FRAME_BASED") *did* match the BBB baseline. It still got rejected, because the contract said the gate is "matrix-supplied vs. not," not "decode mode." This is exactly the Phase 3-derived-implementation trap.
- **Then** the diff implements one branch per contract clause: SPS/PPS/DECODE_PARAMS always, SCALING_MATRIX iff `matrix_set`, SLICE_PARAMS iff SLICE_BASED, PRED_WEIGHTS iff SLICE_BASED + `V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED`.
Mirror format anywhere reviewable: PR description, commit message body, plan section, or a header comment block. The shape is "contract clauses with citations → code that maps 1:1 to those clauses."
## Phase 7 — Verification Measurements
Repeat measurements from Phase 3. Compare explicitly against baseline.
- **If the delta does not match Phase 4's prediction → loop back to Phase 4** (re-plan). Do not declare success when the numbers say otherwise; an unexplained delta is a finding, not a footnote.
## Phase 8 — Closing (Package & Ship)
Ship the deliverable to its consumption point. Working code that lives only in a checkout is half a deliverable — the next session has to re-discover it, the fleet doesn't get the fix, and the loop's value evaporates.
- **Kernel patch → kernel-agent package.** Route through the kernel-agent flow (`fleet/<host>.yaml` + scope-tagged patches) so the kernel package gets properly built, signed, and published. Don't leave loose `.patch` files in a working tree. See `project_kernel_agent.md` for the manifest shape; `linux-ampere-fourier` and `linux-fresnel-fourier` are the canonical examples.
- **Program / library change → marfrit-packages.** Add or update a PKGBUILD (Arch/ALARM) or debian/ tree (deb), push to `git.reauktion.de/marfrit/marfrit-packages`, and let `.gitea/workflows/build.yml` produce + sign + publish to `packages.reauktion.de`. See `project_marfrit_packages.md`. Local-only fixes go upstream as PR-quality diffs into the same overlay.
- **Skipping is a deliberate choice.** If the change is one-shot scratch work (debugging tripwire, throw-away script), say so explicitly in the closing note. The default is: it gets packaged.
- **Re-verify on the deploy host with the packaged artifact.** A clean Phase 7 result from a hand-rolled dev build (e.g. `meson -Dbuildtype=release && ninja`) is **not** the same as the `.pkg.tar.zst` / `.deb` that the deploy host installs. Distro packaging flags (Arch makepkg's `-O2 + FORTIFY + stack-protector-strong + stack-clash-protection` vs meson's `-O3 -DNDEBUG`, debhelper's hardening defaults, lto toggles) vectorise / unroll loops differently and routinely unmask latent UB the dev build folded away. Pull the published package down via the package manager and re-run the Phase 7 success criterion against it before closing — until that PASSes, the loop is not done. See `feedback_package_build_flags_unmask_bugs.md` for the iter39 incident that codified this.
## Phase 9 — Memory Update
Loop terminates here. Distill the lesson into a memory entry — what was the mistake the loop caught, what's the rule that would shorten the next cycle. Do not let the lesson rot in chat history.
---
## Loopback edges (summary)
- Phase 3 → Phase 1 (metric was wrong)
- Phase 7 → Phase 4 (plan didn't deliver predicted delta)
- Any phase → Phase 0 (substrate was wrong: predecessor baseline didn't replicate, mechanism doesn't engage on this stack, or the data inverts the premise → re-anchor or honest close)
- Phase 9 closes the loop
## Why this exists
Several recurring failures in prior work codify into individual rules — observer-first, simulate-before-flash, three-strikes-then-verify, "trust eyes not vibes," scope-strictly-to-plan, no-fake-dry-run. Those are all symptoms; this loop is the structural fix. Use it as the spine and let those rules show up as rejection patterns inside the appropriate phases.
+239
View File
@@ -0,0 +1,239 @@
---
phase: 0
status: closed 2026-05-18
date_opened: 2026-05-17
date_closed: 2026-05-18
research_method: three rounds of parallel web research (Sonnet via Agent), plus hands-on hertz substrate inventory and live `vulkaninfo` capture
target_hardware: hertz (Pi 5 8 GB) for dev; higgs (CM5) eventual user target
---
# Phase 0 — Substrate / motivation / inventory
This is the consolidated Phase 0 record. Path A (custom VPU firmware)
is **closed at the silicon-RoT step**; Path B (QPU compute via the
existing Mesa `v3d` driver) is **open**. The remainder of the
project lives in Path B.
The earlier session produced two separate Phase 0 artifacts that
were lost when the working tree was wiped at 2026-05-18 10:25
(`.git-broken-2026-05-18/` retains the corrupted state if needed).
This document supersedes both.
---
## 1. Research question
Verbatim from `README.md`:
> Community-built VP9 / AV1 software-decode back-end running on the
> VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 /
> Compute Module 5), via the existing Mesa `v3d` userspace driver.
The load-bearing claim: *the QPU is programmable by us, on stock
production hardware, and the codec back-end is a workload class
where that programmability buys CPU time on the A76 cluster.*
Phase 0's job is to test that claim before Phase 1 binds a metric.
## 2. Substrate inventory — hertz
Captured live 2026-05-17 via SSH. Full `vulkaninfo` in
`vulkaninfo_v3d_7_1_7_hertz.txt`.
| | |
|---|---|
| Host | hertz, Pi 5, 8 GB, eMMC + 1 TB SATA |
| Role | LXD host for 11 containers (home-LAN spine — DNS / VPN / HA proxy / NCP / SMTP) |
| OS | Debian 13 Trixie |
| Kernel | `6.12.75+rpt-rpi-2712` (RPi Foundation kernel, 2026-03-11) |
| CPU | 4× Cortex-A76 @ 2.8 GHz |
| GPU clock | V3D 7.1 @ 1000 MHz (slight OC; spec 960 MHz) |
| Mesa | `25.0.7-2+rpt4` (`libvulkan_broadcom.so` v3dv ICD) |
| Vulkan loader | `1.4.309` |
| Vulkan device API | 1.3.305 (conformance 1.3.8.3) |
| DRM nodes | `card0 → v3d` (compute target), `card1 → vc4-drm` (display), `renderD128` |
| kernel uAPI hdr | `/usr/include/drm/v3d_drm.h` present |
| Build tools | cmake 3.31, ninja 1.12, libvulkan-dev 1.4.309, glslang-tools 15.1.0, spirv-tools 2025.1, libdrm-dev 2.4.131 (installed 2026-05-17) |
| User groups | mfritsche ∈ `render`, `video`, `lxd`, `sudo` |
| Memory pressure | 7.9 GiB RAM, ~3 GiB available; 6 GiB zram, ~2.8 GiB in use (cohabitation with LXD spine) |
| Watchdog | yes — power-cut reboot via Himbeere plug if hertz crashes (acknowledged dev cost: household DNS/VPN drops during each reboot cycle) |
**Inside-view V3D 7.1 compute envelope** (from
`vulkaninfo_v3d_7_1_7_hertz.txt`):
| Property | Value | Implication |
|---|---|---|
| `maxStorageBufferRange` | 1 GiB | Bounds single-tensor size; codec working sets (frames, planes) fit trivially |
| `maxPerStageDescriptorStorageBuffers` | 8 | Forces ≤8 SSBO bindings per dispatch — ggml-vulkan binds more, doesn't fit |
| `maxComputeSharedMemorySize` | 16 KiB | Small tiled kernels only; codec block work (8×8, 16×16) fits easily |
| `maxComputeWorkGroupInvocations` | 256 | Standard |
| `maxComputeWorkGroupSize` | 256 / 256 / ? | Standard |
| `subgroupSize` | 16 (fixed) | Matches QPU SIMD width |
| `subgroupSupportedOperations` | BASIC + VOTE only | No arithmetic reductions — accumulate via shared memory |
| `shaderFloat16` | **false** | Storage only; arithmetic runs FP32 |
| `shaderInt8` | **false** | Storage only; arithmetic on widened ints |
| `shaderInt16` | **false** | Same |
| `storageBuffer8/16BitAccess` | true | Can load tightly-packed quantized / packed pixel data |
| `subgroupSizeControl`, `computeFullSubgroups`, `synchronization2` | true | Modern compute features available |
**Throughput envelopes** (from prior community measurements,
not yet re-confirmed in-session):
| Metric | Value | Source |
|---|---|---|
| V3D 7.1 theoretical FP32 peak | ~92 GFLOPS at 960 MHz | 12 QPU × 4 ALU × 2 op/cycle |
| Direct-DRM SGEMM sustained | 21.4 GFLOPS (~23%) | `Idein/py-videocore7` |
| Vulkan-compute `vkpeak` fp32-vec4 | 6.9 GFLOPS (~7.5%) | RPi forum benchmark thread |
| A76 NEON sustained for matmul | ~50 GFLOPS | Multiple benchmark sources |
| Shared LPDDR4x bus | ~17 GB/s nominal | LPDDR4x-4267 × 32 bit / 8 |
| GPU-measured BW share | 4–7 GB/s | py-videocore7 scopy benchmark |
| CPU NEON BW achievable | 12–15 GB/s | Pi 5 STREAM benchmarks |
## 3. Path A — closed
**Custom VPU firmware loaded onto VC7 scalar cores.** This was the
README's original framing.
Blocked at the silicon-RoT step:
- **BCM2712 mask ROM hardcodes RPi's public key** and unconditionally
verifies the second-stage bootloader (`bootsys`) on every boot
path (SPI flash, USB rpiboot, SD recovery). RPi holds the
corresponding private key.
- `EXECUTE_CODE` mailbox tag (the only documented Pi 1–4 runtime
"run code on a VPU core" mechanism) **confirmed removed on Pi 5**
by Pi Foundation engineer (forum.raspberrypi.com).
- Pre-CRA EEPROM downgrade is possible (no anti-rollback fuse) but
only yields *older RPi-signed* EEPROMs — doesn't help.
- OTP fuse state on stock CM5 is already the most permissive
possible (customer key hash = zero); the RPi-key check is
silicon-unconditional, not gated by OTP.
- CM5 vs retail Pi 5: same silicon, same chain, no meaningful
security delta.
- One non-software escape exists: VPU JTAG via documented test
points (`schlae/cm5-reveng`, Dec 2025). Hardware mod only,
sealed-chassis higgs not the dev unit, novel research with no
published firmware-injection workflow. Out of scope for this
project.
Verdict: **structurally blocked for community use without RPi
cooperation or hardware-RE-grade work on a sacrificial CM5.**
## 4. Path B — open
**QPU compute kernels via the existing Mesa `v3d` driver.** Reachable
from userspace today on a stock signed Pi 5 / CM5 via
`/dev/dri/card0` (Vulkan compute through `v3dv`) or `renderD128`
(direct DRM submit, py-videocore7 style). No firmware loading.
No signing fight. mfritsche on hertz is in the `render` group and
can hit the device without sudo.
The substrate is real:
- `Idein/py-videocore7` runs SGEMM at 21 GFLOPS sustained on stock
Pi 5 with no special setup — existence proof of arbitrary QPU
programs.
- Mesa v3dv is Vulkan 1.3-conformant on V3D 7.1 (Mesa 24.3+;
hertz runs 25.0.7).
- The kernel `v3d` DRM driver is fully upstream and open.
Phase 0 does **not** assume Path B leads to a winning result. It
asserts only that Path B is *reachable*, where Path A isn't.
## 5. Why this isn't the same project as "v3d backend for llama.cpp"
A llama.cpp v3d backend was investigated mid-session and rejected
as structurally infeasible. The verdict was decisive: GPU loses
to CPU on raw FP32 (21 vs ~50 GFLOPS), on memory bandwidth share
(4–7 vs 12–15 GB/s), and on quantized instruction support (no
INT8 MAC vs A76 SDOT/UDOT). For LLM matmul, the QPU is the wrong
substrate.
**Codec back-end work is a different workload class** with
properties that fit the QPU substantively better:
| Property | LLM matmul | Codec back-end (post-entropy) |
|---|---|---|
| Working set per dispatch | Whole weight matrices (GB) | Per-block (8×8 / 16×16, hundreds of bytes) — fits in 16 KiB shared mem |
| Dominant op | INT8 MAC | Integer add / shift / small-constant multiply |
| Why GPU misses | No INT8 MAC | Less impact — fewer multiplies, mostly add/shift |
| Memory pattern | Full-tensor stream | Sequential plane reads, TMU-friendly |
| Parallelism | One big GEMM | Thousands of independent small blocks per frame |
| A76 advantage | NEON SDOT/UDOT crushing it | Less specialized; QPU advantage real |
| Bandwidth-bound? | Yes (kills the GPU) | Compute-bound at block scale |
This is the load-bearing reframe between the failed llama.cpp
investigation and the daedalus-fourier scope. Codec back-end
*might* live on the QPU. Phase 1 measures whether it actually does.
## 6. Honest probability assessment
A competent outside reviewer should rate the project as **hard but
viable**, with one concrete prior precedent (MulticoreWare /
Imagination PowerVR OpenCL VP9 decoder, 2014, achieved 1080p30 in
a hybrid model with CPU entropy + GPU back-end on a comparable
embedded GPU) and one concrete recent failure (FFmpeg 8.0 VP9-on-
Vulkan-compute, 2025, produced corrupted output on a much more
capable NVIDIA target — but the failure was in the *attempt to
move entropy onto GPU*, not the back-end).
The win condition is **not** "GPU beats CPU at the same work." The
win condition is **"GPU work overlaps with CPU work that has to
happen anyway"** — concurrent decode where ARM does entropy and
the QPU finishes the block-level back-end on the previous frame,
recovering CPU time for the rest of the system (browser, audio,
UI, the 11 LXD containers on hertz).
Phase 1 measures the building block: one kernel, bit-exact, with
numbers. Phase 2+ only if Phase 1 numbers justify it.
## 7. Open questions for Phase 1
1. **What's the actual single-kernel QPU throughput on a
codec-shaped workload?** SGEMM at 21 GFLOPS is the only public
number, and SGEMM is not block-IDCT-shaped. We need an in-session
N=3 measurement on a real codec kernel.
2. **What's the ARM NEON baseline for the same kernel on the same
hertz?** libavcodec ships highly-tuned NEON paths for IDCT,
deblocking, etc. Without measuring NEON in-session, "the QPU
wins" or "the QPU loses" is unverifiable.
3. **Vulkan compute vs direct DRM submit — which path?** Vulkan
has tooling, documentation, debuggability. Direct DRM has
~10–15% lower per-dispatch overhead and bypasses the
v3dv-imposed 16 KiB shared-mem / 8-SSBO limits, at the cost
of writing QPU asm against the NDA ISA. Phase 1 picks one.
4. **Memory bandwidth contention with concurrent ARM decode.**
The shared 17 GB/s bus is the floor. If QPU+ARM-NEON both
running collide for bandwidth, the "concurrent work" win
disappears. Needs in-session measurement once any kernel exists.
5. **VC7 thermal headroom under sustained mixed CPU+GPU load.**
Pi 5 throttles GPU at 85°C, CPU at 80°C. hertz idles at ~64°C
with the LXD spine; mixed compute will push higher. Whether hertz
can sustain this with or without active cooling is an open question.
These are Phase 1's burden, not Phase 0's. Phase 0 closes here.
## 8. Sources
Earlier session's web research produced ~7000 words of substrate
references across 6 parallel threads. The full source list lived
in the deleted `phase0_findings.md` and `phase0_wall1_bypass.md`.
The high-value pointers that should follow this project forward:
- [Mesa `src/broadcom/qpu/qpu_instr.h`](https://github.com/Mesa3D/mesa/blob/main/src/broadcom/qpu/qpu_instr.h) — de-facto VC7 QPU ISA reference (no Broadcom-published doc; ISA under NDA)
- [Mesa `src/broadcom/compiler/`](https://github.com/Mesa3D/mesa/tree/main/src/broadcom/compiler) — NIR→QPU compiler, the open ground truth for what V3D 7.1 can do
- [`Idein/py-videocore7`](https://github.com/Idein/py-videocore7) — working QPU GPGPU runtime via DRM; SGEMM benchmark; existence proof
- [`Towdo/py-videocore7`](https://github.com/Towdo/py-videocore7) — fork with more fixes
- [Mesa `v3dv` driver source](https://gitlab.freedesktop.org/mesa/mesa/-/tree/main/src/broadcom/vulkan) — Vulkan compute path
- [Pi 5 HEVC kernel driver patch series](https://patchwork.kernel.org) — closest architectural template for ARM-side V4L2 stateless wrapping a Pi-5 hardware accelerator (search "rpi-hevc-dec")
- [raspberrypi/usbboot secure-boot.md](https://github.com/raspberrypi/usbboot/blob/master/docs/secure-boot.md) — Wall 1 silicon-RoT confirmation
- [schlae/cm5-reveng](https://github.com/schlae/cm5-reveng) — CM5 PCB RE; VPU JTAG test points (Dec 2025; out of Path B scope, kept as escape hatch reference)
- [MulticoreWare / Imagination PowerVR VP9 OpenCL decoder press](https://www.design-reuse.com/news/34030/vp9-decoder-imagination-powervr-series6-gpus.html) — 2014 precedent for hybrid codec back-end on embedded GPU compute
- [FFmpeg 8.0 part-3 VP9 Vulkan failure post](https://www.rendi.dev/blog/ffmpeg-8-0-part-3-failed-attempts-to-use-vulkan-for-av1-encoding-vp9-decoding) — recent cautionary tale; failure was in entropy stage, not back-end
- [`Halide/Halide` Vulkan Pi 5 issue #8494](https://github.com/halide/Halide/issues/8494) — known runtime edge cases on Pi 5 Vulkan
- [Pi Forum p=2330030](https://forums.raspberrypi.com/viewtopic.php?p=2330030) — RPi engineer confirms VC7 ISA NDA + EU CRA signing rationale
Future phases should add citations here as they're consumed, not
re-derive Phase 0's substrate findings.
+128
View File
@@ -0,0 +1,128 @@
---
phase: 1
status: open
date_opened: 2026-05-18
parent: phase0.md
target_kernel: VP9 / AV1 8×8 inverse DCT (integer fixed-point)
dev_host: hertz
---
# Phase 1 — Goal formulation
Per `dev_process.md`:
> Define the objective in measurable terms. State what success looks
> like *before* touching anything. The chosen metric is a **hypothesis**
> about what to measure, not an axiom — Phase 3 may invalidate it.
## Kernel under test
**VP9 / AV1 8×8 inverse DCT (DCT_DCT variant), integer 16-bit
fixed-point input, 8-bit output, with reconstructed-block add.**
Mirrors the `ff_vp9_idct_idct_8x8_add_neon` shape in libavcodec
(see `libavcodec/aarch64/vp9itxfm_neon.S`) and the equivalent
dav1d / rav1d / libgav1 implementations for AV1's `IDTX_DCT` /
`DCT_DCT` 8×8 path.
I/O contract (per VP9 spec § 8.7 inverse transform process):
```
input: int16_t coeffs[64] // dequantized transform coefficients
input: uint8_t pred[64] // predicted block (intra/inter)
input: ptrdiff_t stride // typically 8 for an isolated test
output: uint8_t dst[64] // clamp(pred + idct(coeffs)) per pixel
```
Bit-exact: integer arithmetic per spec, no rounding ambiguity.
## Measurable success criteria
Three numbers must come out of Phase 7, all measured in-session on
hertz, all N≥3:
| ID | Measurement | What it tells us |
|---|---|---|
| **M1** | **Bit-exactness rate** vs libavcodec C reference, across ≥10 000 random coefficient blocks | Correctness gate. Must be 100.000 %. Anything less and the kernel is wrong, no other number matters. |
| **M2** | **QPU throughput** in million-blocks-per-second (Mblock/s), single-threaded host driver, sustained over ≥1 s | The substrate's actual delivered capacity for this kernel shape. |
| **M3** | **NEON throughput** in Mblock/s on the same hertz, single-threaded, running `ff_vp9_idct_idct_8x8_add_neon` via a microbench harness | The floor any GPU offload has to beat or get close to. |
Derived figure for go/no-go: **R = M2 / M3**.
## Decision rules (set before measuring, per `feedback_no_motivated_reasoning`)
| R | Interpretation | Next step |
|---|---|---|
| ≥ 1.0 | QPU beats NEON on this kernel in isolation. Strong substrate signal. | Phase 9 lessons → Phase 1 of next kernel (deblocking or CDEF). |
| 0.5 ≤ R < 1.0 | QPU loses in isolation but is in the same order of magnitude. *Concurrent-work* hypothesis becomes viable: at R≈0.5 the QPU can roughly handle half of decode while the CPU does the other half + everything else. | Add a Phase 1' measurement: M4 = combined CPU+QPU throughput when both run concurrently (does total system delivery exceed pure-CPU?). Then decide. |
| 0.1 ≤ R < 0.5 | QPU is materially slower. Concurrent-work win unlikely to be worth the integration cost. | Honest close. Phase 9 documents the negative result. |
| < 0.1 | QPU is structurally wrong for this kernel shape. | Honest close. Phase 9 documents the failure, project shelves. |
These thresholds are deliberately published *before* measurement so
the result can't be retroactively reframed.
## Secondary measurements (not gating, but recorded)
- **M5** — per-kernel-launch overhead in µs, isolated (run with 0
blocks, measure submit+wait round-trip). Tells us the floor below
which kernel batching is required.
- **M6** — workgroup-size sweep across {8, 16, 32, 64, 128, 256}
invocations to identify the v3dv-optimal launch shape for this
kernel. Records the Pareto curve, doesn't change R unless the
best-WG result invalidates M2.
- **M7** — power draw delta at the wall (via the Himbeere Fritz!DECT
plug telemetry, if reachable) under idle vs CPU-only vs QPU-only
vs CPU+QPU concurrent. Order-of-magnitude only; informs the higgs
battery argument that motivates the project.
## What Phase 1 does *not* lock
- The dispatch path (Vulkan compute via `v3dv` vs direct DRM
submit via `v3d_drm.h` ioctl). Phase 4 picks. Default for
Phase 1 = **Vulkan compute** unless Phase 4 has reason to flip:
documented, debuggable, doesn't require QPU asm against the
NDA ISA.
- The shader source (GLSL → glslang → SPIR-V) vs hand-written
SPIR-V. Default = GLSL.
- Workgroup partitioning (one-block-per-WG vs many-blocks-per-WG).
Phase 4 chooses based on subgroup width and tile cost; Phase 1
records the sweep (M6).
## Non-goals for Phase 1
- No V4L2 driver work.
- No end-to-end VP9 / AV1 decode (entropy + back-end). Just one
kernel, isolated, measured.
- No optimization beyond what's needed to hit the bit-exact gate
and produce a single throughput number. Tuning is Phase 7's
feedback if R is borderline.
- No build-system perfection. A CMakeLists that compiles the test
harness on hertz is enough.
## Phase 1 → Phase 2 hand-off conditions
Phase 1 closes when:
- The above metrics + decision rules are reviewed. (This is not the
second-model review of dev_process.md Phase 5 — that review comes
after the Phase 4 plan.)
- The metrics are recorded in this file or a sibling
`phase1_metrics.md` artifact (TBD).
The next phase (Phase 2 — situation analysis) inventories:
- libavcodec's NEON IDCT reference (file, function, calling
convention, expected I/O contract).
- VP9 spec § 8.7 transform process (which the C reference
implements verbatim).
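- AV1 spec § 7.7 (same transform structure, larger transform set;
the 8×8 DCT_DCT path shares VP9's 1-D butterfly structure at this
size, but its fixed-point scaling and rounding differ, so outputs
are not bit-identical).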
- Mesa v3dv's compute-shader compilation path and any known
v3dv-specific shader idioms that perform better on V3D 7.1.
- The hertz Vulkan dispatch overhead floor (M5 candidate, but
measured as part of Phase 3 baseline).
## Open questions Phase 1 hands forward
None new. Phase 0 § 7's open questions are the standing list;
Phase 1 picks off Q1 (single-kernel throughput) and Q2 (NEON
baseline) directly via M2 and M3.
+212
View File
@@ -0,0 +1,212 @@
---
phase: 2
status: closed 2026-05-18
date_opened: 2026-05-18
parent: phase1.md
target_kernel: VP9 8×8 inverse DCT (DCT_DCT variant, 8-bit pixels)
---
# Phase 2 — Situation analysis
Per `dev_process.md`:
> Document current state. Identify constraints, dependencies, known
> failure modes. Reset context here — do not carry assumptions from
> prior sessions; re-read CLAUDE.md, relevant memory files, run
> `git status`, re-verify reachability.
## 1. Context reset
- Working tree state: dirty (Phase 0/1/2 docs not yet committed).
`.git-broken-2026-05-18/` preserved as a forensic artifact of
the 2026-05-18 10:25 working-tree wipe (cause undetermined).
- CLAUDE.md re-read: no contradictions with the Path B scope set
in README §"Architecture (Path B)".
- hertz reachability: confirmed via SSH; `vcgencmd`, `vulkaninfo`,
`apt`, sudo NOPASSWD all working as of 2026-05-17 inventory.
Mesa 25.0.7 / Vulkan 1.3.305 / V3D 7.1.7 stable.
## 2. Reference implementations — VP9 8×8 IDCT (DCT_DCT)
The Phase 1 kernel has *two* canonical reference implementations
in FFmpeg n7.1.3 (the version installed on hertz). The harness
will link both: the C path as the bit-exact gate (M1), the NEON
path as the throughput baseline (M3).
### 2.1 C reference
- **Source**: `libavcodec/vp9dsp_template.c`, function `idct_idct_8x8_add_c`
- **Spec basis**: VP9 specification §8.7 — Inverse transform process
- **Signature**:
```c
static void idct_idct_8x8_add_c(uint8_t *_dst, ptrdiff_t stride,
int16_t *_block, int eob);
```
- **Algorithm** (8-bit path):
1. If `eob == 1` (DC-only): single `(coef * 11585 * 11585)` round, broadcast to 8×8 with `+pred, clamp[0,255]`.
2. Otherwise: 8 column passes through `idct8_1d` → tmp[64]. Zero the input block. 8 row passes through `idct8_1d` → out[8]. Per-element `(out + 16) >> 5`, add to `dst`, `av_clip_pixel`.
- **`idct8_1d`**: 1-D 8-point inverse DCT, 8 trigonometric multiply-add stages with Q14 fixed-point constants then 8-butterfly add/sub stages. All arithmetic is signed int32 (`dctint`).
- **Q14 constants** (matched against VP9 spec §8.7.1.4):
| symbol | value | trig identity |
|---|---|---|
| cospi_16_64 | 11585 | cos(π/4) × 2^14 ≈ 0.70711 |
| cospi_24_64 | 6270 | cos(3π/8) × 2^14 ≈ 0.38268 |
| cospi_8_64 | 15137 | sin(3π/8) × 2^14 ≈ 0.92388 |
| cospi_28_64 | 3196 | cos(7π/16) × 2^14 ≈ 0.19509 |
| cospi_4_64 | 16069 | sin(7π/16) × 2^14 ≈ 0.98079 |
| cospi_20_64 | 9102 | cos(5π/16) × 2^14 ≈ 0.55557 |
| cospi_12_64 | 13623 | sin(5π/16) × 2^14 ≈ 0.83147 |
Rounding convention: `(product + (1 << 13)) >> 14`, i.e. round-half-up at bit 14.
- **License**: LGPL-2.1-or-later (FFmpeg).
- **Side effect**: zeroes the input `block[]` (idempotency requirement; matches spec).
### 2.2 NEON reference
- **Source**: `libavcodec/aarch64/vp9itxfm_neon.S`, symbol `ff_vp9_idct_idct_8x8_add_neon`
- **Signature** (same as C):
```
void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
```
Registers: `x0=dst, x1=stride, x2=block, w3=eob`.
- **Internal dependencies** (must be copied alongside the .S):
| macro / symbol | location | role |
|---|---|---|
| `idct8` | `vp9itxfm_neon.S` | 1-D 8-pt IDCT, fully unrolled with `dmbutterfly*` |
| `dmbutterfly0` | `vp9itxfm_neon.S` | rotation by π/4 (the `cospi_16_64` case) |
| `dmbutterfly` | `vp9itxfm_neon.S` | general 2-input rotation `[a,b] → [a·c1−b·c2, a·c2+b·c1]` (`Q14`) |
| `dmbutterfly_l` | `vp9itxfm_neon.S` | wide-form (4×i32 acc) for `dmbutterfly` |
| `butterfly_8h` | `vp9itxfm_neon.S` | trivial `[a+b, a−b]` on `int16x8_t` |
| `transpose_8x8H` | `libavcodec/aarch64/neon.S` | in-place 8×8 i16 transpose |
| `idct_coeffs` | `vp9itxfm_neon.S` (`const`) | Q14 trig constants table, aligned 4 |
| `movrel` | `libavutil/aarch64/asm.S` | PIC-aware constant-pool relocation helper |
- **License**: LGPL-2.1-or-later (Google, 2016).
- **Performance shape**: full unrolled 8-pt butterfly with NEON `smull/smlsl/smlal` + `rshrn` for the Q14 round-shift; output uses `sqxtun` for saturated narrow to u8. Estimated ~80 NEON instructions for the steady state (non-DC) path.
### 2.3 AV1 equivalence note
AV1's 8×8 DCT_DCT transform (`av1_iidentity8_iidentity8_c` vs `av1_idct8_idct8_c` family in `libavcodec/av1dsp/...`) shares the same 1-D 8-point structure but with **different** scaling: AV1 uses 12-bit fixed-point (`>> 12`) and a slightly different rounding shift due to its different transform-stage bit growth model. Calling our VP9 IDCT shader on AV1 coefficients will produce wrong output. **AV1 support is out of scope for Phase 1.** A Phase-N variant can fork the shader with the AV1 constants once Phase 1 has proven the VP9 path.
## 3. Vulkan compute dispatch path
Hertz exposes V3D 7.1 via Mesa's v3dv driver as Vulkan
`PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU`, API 1.3.305, conformance
1.3.8.3. The compute-only dispatch path is:
```
host program
├─ vkCreateInstance / vkEnumeratePhysicalDevices (picks V3D 7.1.7.0)
├─ vkCreateDevice (queue family with COMPUTE_BIT, no graphics needed)
├─ vkCreateBuffer x N (SSBOs for block coeffs in / dst pixels in+out)
│ - buffer flags: STORAGE_BUFFER_BIT | TRANSFER_SRC/DST
│ - memory type: HOST_VISIBLE | HOST_COHERENT (zero-copy on shared LPDDR4x)
├─ vkCreateDescriptorSetLayout (≤8 SSBOs per layout — Pi 5 limit)
├─ vkCreateShaderModule (SPIR-V from glslang)
├─ vkCreateComputePipeline
├─ vkBeginCommandBuffer
│ vkCmdBindPipeline / vkCmdBindDescriptorSets / vkCmdPushConstants
│ vkCmdDispatch(group_count_x, 1, 1) # one WG per ~K blocks
├─ vkQueueSubmit + vkQueueWaitIdle (or fence) — this is the measured op
└─ (read back via the HOST_VISIBLE buffer, or alias it to the same memory the CPU populated)
```
Per Phase 0 §2 inside-view limits, the relevant constraints
for this kernel:
- ≤8 SSBOs per stage → group inputs/outputs into ≤8 bindings (we
only need 2: `block[]` in, `dst[]` in/out).
- Shared mem ≤16 KiB → each 8×8 block fits trivially (128 B of
i16 coefficients, or 256 B if widened to i32, plus 64 B in u8).
One WG can carry dozens of blocks of shared state if useful.
- Subgroup size = 16 (fixed). One workgroup of 64 invocations =
4 subgroups; one block per subgroup is a natural shape (each
16-lane subgroup processes 8×8 = 64 pixels in 4 cycles of
subgroup work).
## 4. Build path on hertz
Already installed (2026-05-17): cmake 3.31, ninja 1.12, gcc (Debian
trixie default), `libvulkan-dev 1.4.309`, `glslang-tools 15.1.0`,
`spirv-tools 2025.1`, `libdrm-dev 2.4.131`, `vulkan-tools 1.4.304`.
Missing but cheap:
- `libavcodec-dev` — only needed if the harness wants to link
against system libavcodec for cross-checks against the dynamic
dispatcher. *Not* needed for the source-copy approach (preferred,
see §5).
## 5. Reference-copy strategy (vs system-libavcodec link)
**Decision: source-copy the FFmpeg reference files (listed below) into `external/ffmpeg-snapshot/`.**
Rationale:
- System `libavcodec.so` on hertz is symbol-stripped (`nm` returns
empty for `ff_vp9_idct_*`). Internal NEON entry points are not
reachable via `dlsym`.
- The two reference implementations (C, NEON) plus their macro/
data dependencies total only a handful of files (see the table
below). Source-copy is smaller than the dlopen plumbing would be.
- LGPL-2.1-or-later (FFmpeg license) is propagation-compatible
with the harness binary if the harness binary itself is GPL
or LGPL. The kernel shaders and dispatch library stay
separately-licensed (BSD-2-Clause, default for this project).
- Pinning to `n7.1.3` matches hertz's runtime libavcodec version,
so any in-session sanity cross-check against the running Mesa
/ video tooling stays consistent.
Files to vendor:
| Source | License | Target path under `daedalus-fourier/` |
|---|---|---|
| `libavcodec/vp9dsp_template.c` | LGPL-2.1+ | `external/ffmpeg-snapshot/vp9dsp_template.c` |
| `libavcodec/aarch64/vp9itxfm_neon.S` | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/vp9itxfm_neon.S` |
| `libavcodec/aarch64/neon.S` (for `transpose_8x8H`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/neon.S` |
| `libavutil/aarch64/asm.S` (for `movrel`, `function`, `endfunc`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/asm.S` |
| (whatever else `vp9dsp_template.c` transitively needs) | LGPL-2.1+ | as required |
A `external/ffmpeg-snapshot/COPYING.LGPL` and `external/ffmpeg-snapshot/PROVENANCE.md` document the upstream commit (n7.1.3 tag, commit hash) and the verbatim-copy guarantee.
## 6. Known constraints / failure modes carried from Phase 0
Repeated here so Phase 4 (plan) can bind against them without
re-derivation:
- **C1**: shaderFloat16 = false → no fp16 arithmetic in shaders; the kernel is int32 throughout anyway, so no risk.
- **C2**: maxComputeSharedMemorySize = 16 KiB → kernel must not require more (8×8 IDCT trivially fits even with many blocks per WG).
- **C3**: maxPerStageDescriptorStorageBuffers = 8 → we need only 2 (coeffs + dst), no risk.
- **C4**: subgroupSupportedOperations = BASIC + VOTE only → no `subgroupAdd`/etc. for accumulator reductions. Workaround: the IDCT structure is fully data-parallel without reductions; this constraint doesn't bite.
- **C5**: VC7 has SMUL24 but no INT8 MAC. Our Q14 multiplies are i16×i16→i32 — the multiplicands fit in 17 bits, so SMUL24 covers it. No INT8/INT4 issues.
- **C6**: shared LPDDR4x bus; GPU sees ~4–7 GB/s vs CPU ~12–15 GB/s. For 8×8 IDCT, working set is tiny (≤320 B/block), so per-block bandwidth is not the bottleneck; per-dispatch submit overhead is.
- **C7**: VPM read-stall serialization. If we hand-write QPU asm (we won't, in Phase 1) this would matter; the Vulkan compute path lets the v3d_compiler schedule for us.
- **C8**: VC7 thermal throttle at 85°C GPU / 80°C CPU. Phase 7 measurements should record temp before/during/after to flag throttling.
## 7. What Phase 2 does *not* close
- The harness architecture (single binary? Two binaries — one for
bit-exact, one for throughput?). Phase 4 picks.
- Block-per-WG dispatch geometry. Phase 4 + Phase 6 sweep.
- Random-coefficient generation strategy (uniform i16 vs
realistic-distribution; the latter affects DC-only path
frequency). Phase 4 picks; Phase 7 may re-evaluate.
- Whether NEON measurement uses `clock_gettime(CLOCK_MONOTONIC_RAW)`
per-call (high overhead) or batched (more realistic for codec
use). Phase 3 picks during baseline collection.
## 8. Hand-off to Phase 3
Phase 3 measures:
- **M3-prelim**: NEON `ff_vp9_idct_idct_8x8_add_neon` throughput
on hertz, batched over 10⁶ random blocks, single-threaded,
4-thread, sched-isolated. This is the *floor*.
- **M5-prelim**: Vulkan dispatch overhead — pipeline create cost
(one-time), per-`vkCmdDispatch` cost (per-frame-equivalent),
per-`vkQueueSubmit + vkQueueWaitIdle` cost (per-completion).
Bound below which kernel batching is mandatory.
Both are measurements on the *existing* substrate. Neither
requires writing any shader code. Phase 3 closes before Phase 4
(plan) begins.
+105
View File
@@ -0,0 +1,105 @@
---
phase: 3
status: closed 2026-05-18
date_opened: 2026-05-18
date_closed: 2026-05-18
parent: phase2.md
host: hertz (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75+rpt-rpi-2712, Mesa 25.0.7-2+rpt4, V3D 7.1.7 @ 1 GHz, A76 @ 2.8 GHz)
artifacts: build/bench_neon_idct, build/bench_vulkan_dispatch, build/noop.spv
---
# Phase 3 — Baseline measurements
Per `dev_process.md`:
> Take concrete measurements *before* any changes. Raw before
> derived. Real data, not theatre.
These numbers anchor every Phase 4+ decision. Re-run with the
same harness on the same hertz before drawing any new conclusions
in later phases.
## M1 — bit-exact correctness gate (Phase 1)
| | |
|---|---|
| Method | 10 000 random VP9-plausible coefficient blocks + random `pred[64]`, compare `daedalus_vp9_idct_idct_8x8_add_ref` C output vs vendored FFmpeg `ff_vp9_idct_idct_8x8_add_neon` |
| Run | `./bench_neon_idct --blocks 1000000 --iters 5` (built 2026-05-18) |
| **Result** | **10 000 / 10 000 = 100.0000 %** |
| DC-only path frequency | 11 / 10 000 = 0.11 % |
| Notes | Random generator: xorshift64, biased toward 1–16 non-zero coeffs per block; eob mostly ∈ [4, 63]. DC-only frequency is incidental; Phase 7 may revisit if it materially affects the throughput number. |
**Gate passes. Throughput measurement was authorized to run.**
## M3 — NEON throughput (single-core)
| | |
|---|---|
| Kernel | `ff_vp9_idct_idct_8x8_add_neon` from FFmpeg n7.1.3 (vendored, see `external/ffmpeg-snapshot/PROVENANCE.md`) |
| Method | Pre-generate 1 M random blocks + preds. Per iteration: memcpy refresh of all blocks/preds (NEON path zeroes blocks), then call NEON kernel 1 M times. Subtract setup memcpy time from the measured wall-clock. 5 iterations, single thread, no CPU pinning. |
| Compiler flags | `-O3 -march=armv8-a+simd` |
| Run | `./bench_neon_idct --blocks 1000000 --iters 5` |
| **Throughput** | **8.171 Mblock/s** |
| Per-block | 122.4 ns |
| Equivalent 1080p frame rate | 252.2 FPS (32 400 blocks per 1080p frame, assuming pure 8×8 work) |
| Elapsed (kernel) | 0.612 s / 5 M blocks |
| Elapsed (setup-only) | 0.250 s / 5 M iters |
| Cross-check | Cycle estimate at 2.8 GHz: 122.4 ns × 2.8 GHz ≈ 342 cycles/block. Plausible for a fully-unrolled NEON 8-point IDCT with butterflies + saturated narrow stores; the FFmpeg implementation interleaves loads/computes/stores aggressively. |
### M3 implications
- A single A76 core handles ~8 M blocks/s = **252 FPS at 1080p**. Real decode needs ~60 FPS = 4.2× headroom on one core, ~16× headroom on all four cores. **NEON is not the bottleneck for current YouTube workloads on Pi 5.**
- The QPU offload story is not "make decode faster" — decode is already fast enough single-threaded. The story has to be "free CPU cycles for the rest of the system" (browser, audio, the 11 LXD containers on hertz).
- For a per-kernel R = QPU / NEON measurement (per `phase1.md §"Decision rules"`), the QPU has to hit ≥4 M blocks/s to score R ≥ 0.5. That's the gate.
## M5 — Vulkan compute dispatch overhead
| | |
|---|---|
| Method | Allocate empty pipeline (no descriptors, no push constants), bind+dispatch a `void main(){}` shader on `local_size_x=64`. Time `vkQueueSubmit` + `vkQueueWaitIdle` round-trip. 50 000 iterations, warm. |
| Device | V3D 7.1.7.0 via Mesa v3dv 25.0.7 (selected past llvmpipe by `strstr("V3D")`) |
| Run | `./bench_vulkan_dispatch --iters 50000` |
| **M5a — empty CB submit+wait** | **22.66 µs / op** |
| **M5b — 1-WG noop dispatch submit+wait** | **55.60 µs / op** |
| **M5 delta — per-vkCmdDispatch + pipeline-bind** | **32.95 µs** |
### M5 implications — the load-bearing finding for Phase 4
This is the single most important number from Phase 3.
- Per-dispatch cost (55.6 µs) is **~455× the NEON per-block cost** (122 ns).
- A per-block QPU dispatch is structurally impossible — overhead dominates by two-and-a-half orders of magnitude.
- Break-even batch size for a *hypothetical* zero-cost QPU kernel: **≥ 455 blocks per dispatch** (55.6 µs ÷ 122.4 ns). Real kernel cost comes on top of that.
- Frame-level batching is mandatory: a 1080p frame has 32 400 8×8 blocks; one dispatch per frame amortizes M5b to 1.7 ns/block — well below NEON's 122 ns.
- Tile-level batching is borderline: a typical VP9 64×64 superblock has 64 sub-blocks; 55.6 µs / 64 ≈ 870 ns/block, ~7× NEON. Probably too fine-grained — frame-level or full-plane is the right granularity.
### M5 measurement caveats
- `vkQueueWaitIdle` after each submit forces a full GPU sync, modelling the "submit and need the result now" case. Real decode pipelines can submit multiple frames ahead and wait less often — the per-dispatch cost in a pipelined deployment will be lower (probably bounded below by M5a ≈ 22.66 µs as the pure submit cost).
- Empty CB (M5a) at 22.66 µs is the *floor*. This is Mesa command-list construction + kernel `DRM_IOCTL_V3D_SUBMIT_CL` + scheduler RTT. Cannot be optimised at the userspace level without changing Mesa or kernel.
- Both numbers include `vkQueueWaitIdle` overhead; pure submit-without-wait would be lower. For Phase 1's threshold analysis the with-wait number is the right one to use because end-to-end frame decode must wait for its output to be readable.
## Phase 3 closure
Two anchor measurements captured, both with verbatim raw output
(see `bench_neon_idct` and `bench_vulkan_dispatch` source for the
print format). No estimates, no inferences, no recall from prior
sessions or sibling-host memory.
Phase 4 (plan) opens against these numbers. Its first decision:
**given the 55.60 µs per-dispatch submit+wait cost (M5b, of which
32.95 µs is the dispatch delta), what is the batch granularity for
the first kernel?** The answer is either frame-level (32 400
blocks/dispatch) or row-level (240 blocks/dispatch for one
1920-wide row of 8×8 → still ~230 ns/block overhead, ~1.9× NEON).
Frame-level is the only granularity that amortises overhead enough
to leave kernel compute room to win.
Open thread for a later phase (not blocking Phase 4):
- Multi-core NEON sweep (M3'): single-core NEON is the right
*competitor floor*, but the actual ARM headroom on hertz is
4× this number under load.
- Memory-bandwidth contention measurement (M6): does NEON's
rate change when concurrent QPU is reading the same LPDDR4x
bus? Needs the QPU kernel to exist first.
- Power-draw delta via Himbeere plug (M7): same — needs a real
GPU workload to differentiate from idle.
File diff suppressed because it is too large Load Diff
+502
View File
@@ -0,0 +1,502 @@
GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!
+92
View File
@@ -0,0 +1,92 @@
# FFmpeg source snapshot
Verbatim subset of FFmpeg source pinned for use as reference
implementations of the VP9 8×8 inverse DCT (Phase 1 target of
`daedalus-fourier`). See `../../docs/phase2.md §2` and `§5` for
the rationale.
## Upstream pin
- **Repository**: https://github.com/FFmpeg/FFmpeg
- **Tag**: `n7.1.3` (matches `libavcodec61 8:7.1.3-0+deb13u1+rpt1`
shipping in Debian Trixie on the dev host `hertz`)
- **Annotated tag object**: `0a9a757e96fdf053697084bbd1f620edeac9d084`
- **Commit object (tag target)**: `f46e514491172d15bd74b4abb1814cd2f05a763e`
- **Snapshot fetched**: 2026-05-18 (UTC), via
`https://raw.githubusercontent.com/FFmpeg/FFmpeg/n7.1.3/<path>`
## Files in this snapshot
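All files listed below are byte-for-byte copies of the upstream
source at the tagged commit, with no modifications. (The local
`config.h` shim in this directory is not an upstream file and is
therefore not listed.)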
| Path | Lines | Bytes | SHA-256 |
|---|---|---|---|
| `libavcodec/vp9dsp_template.c` | 2578 | 89045 | `41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f` |
| `libavcodec/aarch64/vp9itxfm_neon.S` | 1580 | 63534 | `82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6` |
| `libavcodec/aarch64/neon.S` | 173 | 7496 | `72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538` |
| `libavutil/aarch64/asm.S` | 260 | 8069 | `c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3` |
| `COPYING.LGPLv2.1` | 502 | — | `b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe` |
Verify with:
```sh
# Run from the repository root (the cd path below is relative to it).
# sha256sum -c checks every file named in the here-document against its
# recorded hash and exits non-zero if any file is missing or differs.
# The surrounding subshell keeps the cd from changing the caller's directory.
( cd external/ffmpeg-snapshot && sha256sum -c <<'EOF'
41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f libavcodec/vp9dsp_template.c
82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6 libavcodec/aarch64/vp9itxfm_neon.S
72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538 libavcodec/aarch64/neon.S
c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3 libavutil/aarch64/asm.S
b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe COPYING.LGPLv2.1
EOF
)
```
## License
LGPL-2.1-or-later. See `COPYING.LGPLv2.1`. Original copyright
holders include the FFmpeg authors and Google Inc. (2016) for
the aarch64 NEON paths. The snapshot inherits FFmpeg's license
in full.
## Why each file is in this snapshot
- `libavcodec/vp9dsp_template.c` — contains `idct_idct_8x8_add_c`,
the bit-exact C reference for the Phase 1 kernel under test (M1).
- `libavcodec/aarch64/vp9itxfm_neon.S` — contains
`ff_vp9_idct_idct_8x8_add_neon`, the NEON throughput baseline
(M3). Also defines `idct8`, `dmbutterfly0`, `dmbutterfly`,
`dmbutterfly_l`, `butterfly_8h`, and the `idct_coeffs` constant
table.
- `libavcodec/aarch64/neon.S` — defines `transpose_8x8H` used by
`vp9itxfm_neon.S`.
- `libavutil/aarch64/asm.S` — defines `function`, `endfunc`,
`movrel`, `const`, `endconst`, and other assembly preamble
macros required to assemble the above NEON files.
## Re-vendoring procedure
If the upstream pin needs to change (e.g., hertz updates to a
newer libavcodec):
```sh
# Re-fetch every vendored file at a new upstream tag and print its SHA-256.
# Run from the repository root.
#
# The whole procedure runs in a subshell so that `cd` and `set -eu` do not
# leak into an interactive session. `set -e` matters for correctness: if any
# download fails (for example a mistyped tag returning 404), the run stops
# instead of going on to hash the stale files from the previous pin.
(
  set -eu
  TAG=nX.Y.Z   # replace with the new upstream tag
  BASE="https://raw.githubusercontent.com/FFmpeg/FFmpeg/$TAG"
  cd external/ffmpeg-snapshot
  # Single source of truth for the file list: used for both the download
  # loop and the hash listing, so the two can never drift apart.
  set -- \
    libavcodec/vp9dsp_template.c \
    libavcodec/aarch64/vp9itxfm_neon.S \
    libavcodec/aarch64/neon.S \
    libavutil/aarch64/asm.S \
    COPYING.LGPLv2.1
  for f in "$@"; do
    # -f: fail on HTTP errors; --create-dirs: create missing parent
    # directories of the -o target (needed on a fresh checkout).
    curl -sSf --create-dirs -o "$f" "$BASE/$f"
  done
  sha256sum "$@"
)
# update this PROVENANCE.md with the new tag, commit hash, and hashes
```
After re-vendoring, re-run the bit-exact gate (M1) and throughput
baseline (M3) — both can shift across FFmpeg versions even when
the VP9 spec doesn't change (e.g., NEON micro-optimizations).
+27
View File
@@ -0,0 +1,27 @@
/*
* Minimal config.h shim for assembling the vendored FFmpeg .S files
* outside the FFmpeg build tree.
*
* The vendored .S files (vp9itxfm_neon.S, neon.S, asm.S) reference
* 7 feature/configuration symbols plus the EXTERN_ASM symbol prefix,
* all defined below. Values target aarch64-Linux with modern
* binutils (2.41), matching the Debian Trixie environment on hertz
* (the project's dev host).
*
* See ../../docs/phase2.md §5 for the source-copy rationale and
* PROVENANCE.md for the upstream pin (FFmpeg n7.1.3).
*/
#pragma once
/* Assembler-capability and build-configuration switches. Every
* HAVE_* and CONFIG_* symbol in this file is set to 1.
* NOTE(review): each presumably enables the matching code path in
* the vendored asm.S -- confirm there before changing a value. */
#define HAVE_AS_FUNC 1
#define HAVE_AS_ARCH_DIRECTIVE 1
/* Bare token, not a string literal. NOTE(review): presumably
* substituted into an assembler directive by asm.S -- confirm
* before adding quotes or changing the level. */
#define AS_ARCH_LEVEL armv8-a
#define HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE 1
#define HAVE_AS_ARCHEXT_I8MM_DIRECTIVE 1
#define HAVE_SECTION_DATA_REL_RO 1
#define CONFIG_PIC 1
/* Symbol prefix for exported labels. On ELF/Linux this is empty
* (no leading underscore). FFmpeg's configure script normally
* defines this in the generated config.h; we replicate the
* Linux-target value here. */
#define EXTERN_ASM
+173
View File
@@ -0,0 +1,173 @@
/*
* This file is part of FFmpeg.
*
* Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* clip min, max, regs...: clamp every operand listed in regs to the
 * range [min, max]. All operands first get smax against \min, then all
 * get smin against \max. Operands are substituted verbatim, so the
 * caller supplies complete register operands. */
.macro clip min, max, regs:vararg
.irp x, \regs
smax \x, \x, \min
.endr
.irp x, \regs
smin \x, \x, \max
.endr
.endm
/* transpose_8x8B r0..r7, r8, r9: per its name, an 8x8 byte transpose
 * over the low 64 bits of r0-r7. Three trn1/trn2 stages run at .8b,
 * then .4h, then .2s lane width. r8 and r9 are written as temporaries
 * (clobbered); the final stage writes every one of r0-r7. Arguments are
 * bare vector register names; the macro appends the arrangement. */
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8b, \r0\().8b, \r1\().8b
trn2 \r9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \r9\().4h, \r3\().4h
trn2 \r9\().4h, \r9\().4h, \r3\().4h
trn1 \r3\().4h, \r8\().4h, \r1\().4h
trn2 \r8\().4h, \r8\().4h, \r1\().4h
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2s, \r8\().2s, \r2\().2s
trn1 \r2\().2s, \r8\().2s, \r2\().2s
trn1 \r3\().2s, \r9\().2s, \r7\().2s
trn2 \r7\().2s, \r9\().2s, \r7\().2s
.endm
/* transpose_8x16B r0..r7, t0, t1: same three-stage trn1/trn2 network
 * as transpose_8x8B but on full 128-bit registers (.16b, then .8h,
 * then .4s). t0 and t1 are temporaries (clobbered); the final stage
 * writes every one of r0-r7. */
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
trn1 \t0\().16b, \r0\().16b, \r1\().16b
trn2 \t1\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \t1\().8h, \r3\().8h
trn2 \t1\().8h, \t1\().8h, \r3\().8h
trn1 \r3\().8h, \t0\().8h, \r1\().8h
trn2 \t0\().8h, \t0\().8h, \r1\().8h
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4s, \t0\().4s, \r2\().4s
trn1 \r2\().4s, \t0\().4s, \r2\().4s
trn1 \r3\().4s, \t1\().4s, \r7\().4s
trn2 \r7\().4s, \t1\().4s, \r7\().4s
.endm
/* transpose_4x16B r0..r3, t4..t7: two trn1/trn2 stages over four
 * 128-bit registers, first at .16b into the temporaries t4-t7, then at
 * .8h back into r0-r3. t4-t7 are clobbered. */
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
/* transpose_4x8B r0..r3, t4..t7: 64-bit variant of transpose_4x16B.
 * Stage one runs at .8b into the temporaries t4-t7, stage two at .4h
 * back into r0-r3. t4-t7 are clobbered. */
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8b, \r0\().8b, \r1\().8b
trn2 \t5\().8b, \r0\().8b, \r1\().8b
trn1 \t6\().8b, \r2\().8b, \r3\().8b
trn2 \t7\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().4h, \t4\().4h, \t6\().4h
trn2 \r2\().4h, \t4\().4h, \t6\().4h
trn1 \r1\().4h, \t5\().4h, \t7\().4h
trn2 \r3\().4h, \t5\().4h, \t7\().4h
.endm
/* transpose_4x4H r0..r3, r4..r7: per its name, a 4x4 transpose of
 * 16-bit lanes held in the low 64 bits of r0-r3. Stage one runs at .4h
 * into the temporaries r4-r7, stage two at .2s back into r0-r3.
 * r4-r7 are clobbered. */
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4h, \r0\().4h, \r1\().4h
trn2 \r5\().4h, \r0\().4h, \r1\().4h
trn1 \r6\().4h, \r2\().4h, \r3\().4h
trn2 \r7\().4h, \r2\().4h, \r3\().4h
trn1 \r0\().2s, \r4\().2s, \r6\().2s
trn2 \r2\().2s, \r4\().2s, \r6\().2s
trn1 \r1\().2s, \r5\().2s, \r7\().2s
trn2 \r3\().2s, \r5\().2s, \r7\().2s
.endm
/* transpose_4x8H r0..r3, t4..t7: 128-bit variant of transpose_4x4H.
 * Stage one runs at .8h into the temporaries t4-t7, stage two at .4s
 * back into r0-r3. t4-t7 are clobbered. */
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().4s, \t4\().4s, \t6\().4s
trn2 \r2\().4s, \t4\().4s, \t6\().4s
trn1 \r1\().4s, \t5\().4s, \t7\().4s
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
/* transpose_8x8H r0..r7, r8, r9: per its name, an 8x8 transpose of
 * 16-bit lanes over eight 128-bit registers. Same three-stage network
 * as transpose_8x8B, one lane size up: .8h, then .4s, then .2d.
 * r8 and r9 are temporaries (clobbered); the final stage writes every
 * one of r0-r7. */
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8h, \r0\().8h, \r1\().8h
trn2 \r9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \r9\().4s, \r3\().4s
trn2 \r9\().4s, \r9\().4s, \r3\().4s
trn1 \r3\().4s, \r8\().4s, \r1\().4s
trn2 \r8\().4s, \r8\().4s, \r1\().4s
trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2d, \r3\().2d, \r4\().2d
trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2d, \r5\().2d, \r6\().2d
trn2 \r6\().2d, \r8\().2d, \r2\().2d
trn1 \r2\().2d, \r8\().2d, \r2\().2d
trn1 \r3\().2d, \r9\().2d, \r7\().2d
trn2 \r7\().2d, \r9\().2d, \r7\().2d
.endm
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+260
View File
@@ -0,0 +1,260 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Build configuration: supplies the HAVE_*, CONFIG_PIC, AS_ARCH_LEVEL
 * and EXTERN_ASM symbols tested or expanded below. */
#include "config.h"
/* ELF is a line prefix used by the macros below ("ELF .size ...").
 * It expands to nothing when __ELF__ is defined, so the directive is
 * emitted; otherwise it expands to "#" (presumably turning the rest of
 * the line into an assembler comment; confirm for the target
 * assembler). */
#ifdef __ELF__
# define ELF
#else
# define ELF #
#endif
/* FUNC is the same kind of line prefix, gating .func / .endfunc on
 * HAVE_AS_FUNC. */
#if HAVE_AS_FUNC
# define FUNC
#else
# define FUNC #
#endif
/* Fallback so __has_feature(...) can be tested by preprocessors that
 * do not provide it; every feature then reads as absent. */
#ifndef __has_feature
# define __has_feature(x) 0
#endif
/* Select the base architecture level for everything that follows. */
#if HAVE_AS_ARCH_DIRECTIVE
.arch AS_ARCH_LEVEL
#endif
/* ENABLE_x / DISABLE_x expand to .arch_extension directives when the
 * assembler supports them, and to nothing otherwise. */
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
#define ENABLE_DOTPROD .arch_extension dotprod
#define DISABLE_DOTPROD .arch_extension nodotprod
#else
#define ENABLE_DOTPROD
#define DISABLE_DOTPROD
#endif
#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
#define ENABLE_I8MM .arch_extension i8mm
#define DISABLE_I8MM .arch_extension noi8mm
#else
#define ENABLE_I8MM
#define DISABLE_I8MM
#endif
/* Both extensions start out disabled; code that needs one is expected
 * to bracket itself with the matching ENABLE_ / DISABLE_ pair
 * (no such use is visible in this file). */
DISABLE_DOTPROD
DISABLE_I8MM
/* Support macros for
* - Armv8.3-A Pointer Authentication and
* - Armv8.5-A Branch Target Identification
* features which require emitting a .note.gnu.property section with the
* appropriate architecture-dependent feature bits set.
*
* |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
* PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
* used immediately before saving the LR register (x30) to the stack.
* |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
* it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
* with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
* have the same value at the two points. For example:
*
* .global f
* f:
* AARCH64_SIGN_LINK_REGISTER
* stp x29, x30, [sp, #-96]!
* mov x29, sp
* ...
* ldp x29, x30, [sp], #96
* AARCH64_VALIDATE_LINK_REGISTER
* ret
*
* |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
* |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
* indirect call target. In particular, all symbols exported from a file must
* begin with one of these macros. For example, a leaf function that does not
* save LR can instead use |AARCH64_VALID_CALL_TARGET|:
*
* .globl return_zero
* return_zero:
* AARCH64_VALID_CALL_TARGET
* mov x0, #0
* ret
*
* A non-leaf function which does not immediately save LR may need both macros
* because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
* may jump to an alternate implementation before setting up the stack:
*
* .globl with_early_jump
* with_early_jump:
* AARCH64_VALID_CALL_TARGET
* cmp x0, #128
* b.lt .Lwith_early_jump_128
* AARCH64_SIGN_LINK_REGISTER
* stp x29, x30, [sp, #-96]!
* mov x29, sp
* ...
* ldp x29, x30, [sp], #96
* AARCH64_VALIDATE_LINK_REGISTER
* ret
*
* .Lwith_early_jump_128:
* ...
* ret
*
* These annotations are only required with indirect calls. Private symbols that
* are only the target of direct calls do not require annotations. Also note
* that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
* indirect jumps (BR). Indirect jumps in assembly are supported through
* |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
* calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
*
* Although not necessary, it is safe to use these macros in 32-bit ARM
* assembly. This may be used to simplify dual 32-bit and 64-bit files.
*
* References:
* - "ELF for the Arm® 64-bit Architecture"
* https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
* - "Providing protection for complex software"
* https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
*/
/* BTI: when the compiler reports BTI as enabled by default, the two
 * landing-pad macros expand to hint instructions; otherwise they expand
 * to nothing and the BTI property bit is 0. */
#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
# define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has BTI
# define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c'
# define AARCH64_VALID_JUMP_TARGET hint #38 // BTI 'j'
#else
# define GNU_PROPERTY_AARCH64_BTI 0 // No BTI
# define AARCH64_VALID_CALL_TARGET
# define AARCH64_VALID_JUMP_TARGET
#endif
/* PAC: bit 0 of __ARM_FEATURE_PAC_DEFAULT selects key A, bit 1 key B
 * (key A wins if both are set). Bit 2 (leaf-function signing) is
 * rejected with #error. Without PAC the sign/validate macros expand to
 * nothing and the PAC property bit is 0. */
#if defined(__ARM_FEATURE_PAC_DEFAULT)
# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
# define AARCH64_SIGN_LINK_REGISTER paciasp
# define AARCH64_VALIDATE_LINK_REGISTER autiasp
# elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
# define AARCH64_SIGN_LINK_REGISTER pacibsp
# define AARCH64_VALIDATE_LINK_REGISTER autibsp
# else
# error Pointer authentication defines no valid key!
# endif
# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0)
# error Authentication of leaf functions is enabled but not supported in FFmpeg!
# endif
# define GNU_PROPERTY_AARCH64_PAC (1 << 1)
#else
# define GNU_PROPERTY_AARCH64_PAC 0
# define AARCH64_SIGN_LINK_REGISTER
# define AARCH64_VALIDATE_LINK_REGISTER
#endif
/* On ELF targets with BTI or PAC enabled, emit a .note.gnu.property
 * record whose feature word is the OR of the two property bits above.
 * The leading three words and the "GNU" string form the note header;
 * see the ELF reference cited in the comment above for the layout. */
#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
.pushsection .note.gnu.property, "a"
.balign 8
.long 4
.long 0x10
.long 0x5
.asciz "GNU"
.long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
.long 4
.long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
.long 0
.popsection
#endif
/* function name, export=0, align=2: open a function in .text.
 *
 *   name   - symbol name (without the EXTERN_ASM prefix)
 *   export - non-zero: define EXTERN_ASM\name as a .global symbol and
 *            start it with AARCH64_VALID_CALL_TARGET; zero: define the
 *            plain label \name with no .global and no landing pad
 *   align  - operand passed to .align
 *
 * Each use also defines a nested one-shot macro endfunc, which emits
 * the ELF .size for \name and the matching .endfunc, then purges itself
 * so the next function can define it again. Every function must
 * therefore be closed with endfunc before the next one is opened. */
.macro function name, export=0, align=2
.macro endfunc
ELF .size \name, . - \name
FUNC .endfunc
.purgem endfunc
.endm
.text
.align \align
.if \export
.global EXTERN_ASM\name
ELF .type EXTERN_ASM\name, %function
FUNC .func EXTERN_ASM\name
EXTERN_ASM\name:
AARCH64_VALID_CALL_TARGET
.else
ELF .type \name, %function
FUNC .func \name
\name:
.endif
.endm
/* const name, align=2, relocate=0: open a read-only data object.
 *
 *   name     - label to define
 *   align    - operand passed to .align
 *   relocate - only consulted when HAVE_SECTION_DATA_REL_RO is set:
 *              non-zero places the object in .data.rel.ro, zero in
 *              .rodata
 *
 * Without HAVE_SECTION_DATA_REL_RO the section is chosen by platform:
 * .rdata on _WIN32, .const_data on __MACH__, .rodata elsewhere.
 * Each use defines a nested one-shot macro endconst, which emits the
 * ELF .size for \name and then purges itself. */
.macro const name, align=2, relocate=0
.macro endconst
ELF .size \name, . - \name
.purgem endconst
.endm
#if HAVE_SECTION_DATA_REL_RO
.if \relocate
.section .data.rel.ro
.else
.section .rodata
.endif
#elif defined(_WIN32)
.section .rdata
#elif !defined(__MACH__)
.section .rodata
#else
.const_data
#endif
.align \align
\name:
.endm
/* movrel rd, val, offset=0: load the address of \val plus \offset
 * into register \rd.
 *
 * The instruction sequence is picked at preprocessing time:
 *   CONFIG_PIC on __APPLE__ - adrp/add with @PAGE / @PAGEOFF operands
 *   CONFIG_PIC on _WIN32    - adrp/add with :lo12:
 *   CONFIG_PIC otherwise    - adrp/add with :lo12: (the adrp operand
 *                             gains :pg_hi21_nc: under hwaddress
 *                             sanitizer)
 *   no CONFIG_PIC           - ldr \rd, =\val+\offset
 * On the Apple and Windows paths a negative offset is not folded into
 * the relocation: the base address is formed first and the offset is
 * applied with a separate sub. */
.macro movrel rd, val, offset=0
#if CONFIG_PIC && defined(__APPLE__)
.if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
.endif
#elif CONFIG_PIC && defined(_WIN32)
.if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
#elif CONFIG_PIC
# if __has_feature(hwaddress_sanitizer)
adrp \rd, :pg_hi21_nc:\val+(\offset)
# else
adrp \rd, \val+(\offset)
# endif
add \rd, \rd, :lo12:\val+(\offset)
#else
ldr \rd, =\val+\offset
#endif
.endm
/* X(s) pastes the EXTERN_ASM prefix onto s. The GLUE/JOIN pair adds
 * one level of indirection so EXTERN_ASM is macro-expanded before the
 * ## paste. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)
/* Any later mention of x18 / w18 is rewritten to a do_not_use_* name
 * (presumably so that accidental use of that register fails to
 * assemble; confirm against upstream intent). */
#define x18 do_not_use_x18
#define w18 do_not_use_w18
View File
View File
+248
View File
@@ -0,0 +1,248 @@
/*
* Phase 3 NEON baseline microbench for VP9 8×8 DCT_DCT IDCT add.
*
* Reports two numbers:
* M1 (correctness): bit-exact match rate, our C reference vs
* FFmpeg's NEON, across N random blocks.
* M3 (throughput): NEON sustained Mblock/s on this host.
*
* Both are gating measurements for Phase 1 (see docs/phase1.md).
* No QPU work happens here; that comes in later phases.
*
* Build: see CMakeLists.txt at project root.
* Run: ./bench_neon_idct [--blocks N] [--iters K] [--seed S]
*
* License: BSD-2-Clause (daedalus-fourier). Note, however, that this
* binary statically links the LGPL-2.1+ FFmpeg NEON snapshot, so
* the binary itself must be distributed under LGPL-2.1+.
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <time.h>
#include <getopt.h>
/* Our C reference (tests/vp9_idct8_ref.c). */
extern void daedalus_vp9_idct_idct_8x8_add_ref(
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
/* FFmpeg NEON entry point (vendored vp9itxfm_neon.S). */
extern void ff_vp9_idct_idct_8x8_add_neon(
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
/* ---- Random-block generation ----------------------------------- */
/* xorshift64 PRNG (shift triple 13 / 7 / 17). The sequence is fully
 * determined by the value stored in xs64_state, and a draw is cheap
 * enough not to dominate the measurement. A state of zero is a fixed
 * point (every draw returns zero), so callers must seed it non-zero. */
static uint64_t xs64_state;
static inline uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs64_state = s;
    return s;
}
/* Random coefficient block for the benchmark.
 *
 * The 64 entries are zeroed, then 1..16 draws each store one
 * coefficient at a position chosen UNIFORMLY over the whole block.
 * Draws may land on the same position, so fewer distinct entries can
 * end up written. There is no low-frequency bias: eob therefore skews
 * high and the DC-only case (eob == 1) needs every draw to hit
 * position 0, which is rare, so nearly all blocks take the general
 * (non-DC) path.
 *
 * For the Phase 3 baseline this isn't load-balanced against a real
 * bitstream distribution; Phase 7 may revisit. Changing the
 * distribution changes the PRNG stream and invalidates published
 * M1/M3 numbers, so re-baseline if you do.
 *
 * block: receives the 64 coefficients.
 * Returns eob = 1 + the highest index written; always in [1, 64]. */
static int gen_block(int16_t block[64])
{
    memset(block, 0, 64 * sizeof(*block));
    int eob = 0;
    int n_nonzero = 1 + (int)(xs64() % 16);
    for (int i = 0; i < n_nonzero; i++) {
        /* Uniform over all 64 positions. */
        int pos = (int)(xs64() % 64);
        /* Coefficient range: [-4096, 4095], i.e. signed 13-bit. A draw
         * of exactly 0 is possible and still counts toward eob. */
        int16_t coef = (int16_t)((int)(xs64() % 8192) - 4096);
        block[pos] = coef;
        if (pos + 1 > eob) eob = pos + 1;
    }
    /* n_nonzero >= 1, so the loop ran at least once and eob >= 1. */
    return eob;
}
/* Fill a 64-byte prediction block with random pixels: one PRNG draw
 * per byte, keeping the low 8 bits of each draw. */
static void gen_pred(uint8_t pred[64])
{
    uint8_t *px = pred;
    uint8_t *const end = pred + 64;

    while (px != end)
        *px++ = (uint8_t)(xs64() & 0xff);
}
/* ---- Wall-clock timing (CLOCK_MONOTONIC_RAW) ------------------- */
/* Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus nanoseconds scaled by 1e-9). The return value of
 * clock_gettime is not checked. */
static double now_seconds(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);

    const double frac = now.tv_nsec * 1e-9;
    return now.tv_sec + frac;
}
/* ---- Phase 1 M1: bit-exact gate -------------------------------- */
/* M1 gate: run the C reference and the FFmpeg NEON routine on the same
 * random coefficient block and prediction, then compare the two 8x8
 * outputs byte for byte.
 *
 * seed:     PRNG seed; 0 selects a fixed built-in seed.
 * n_blocks: number of random blocks to test.
 *
 * The first four mismatching blocks are dumped row by row to stderr.
 * A summary (match rate and how often eob == 1 occurred) goes to
 * stdout. Returns the number of mismatching blocks. */
static int correctness_check(uint64_t seed, int n_blocks)
{
    enum { MAX_DUMPED = 4 };

    int16_t coef_ref[64], coef_neon[64];
    uint8_t pred[64];
    uint8_t out_ref[64], out_neon[64];
    int bad = 0;
    int dc_only = 0;

    xs64_state = seed ? seed : 0xdeadbeefcafebabeULL;

    for (int blk = 0; blk < n_blocks; blk++) {
        /* Identical inputs for both implementations; each gets its own
         * copy because both are handed a writable coefficient block. */
        int eob = gen_block(coef_ref);
        memcpy(coef_neon, coef_ref, sizeof(coef_ref));
        gen_pred(pred);
        memcpy(out_ref, pred, 64);
        memcpy(out_neon, pred, 64);

        daedalus_vp9_idct_idct_8x8_add_ref(out_ref, 8, coef_ref, eob);
        ff_vp9_idct_idct_8x8_add_neon(out_neon, 8, coef_neon, eob);

        if (eob == 1)
            dc_only++;

        if (memcmp(out_ref, out_neon, 64) == 0)
            continue;

        if (bad < MAX_DUMPED) {
            fprintf(stderr, "MISMATCH block %d eob=%d:\n", blk, eob);
            for (int row = 0; row < 8; row++) {
                const uint8_t *ref_row = out_ref + row * 8;
                const uint8_t *neon_row = out_neon + row * 8;

                fprintf(stderr, " row %d ref ", row);
                for (int col = 0; col < 8; col++)
                    fprintf(stderr, "%3u ", ref_row[col]);
                fprintf(stderr, " neon ");
                for (int col = 0; col < 8; col++)
                    fprintf(stderr, "%3u ", neon_row[col]);
                fprintf(stderr, "\n");
            }
        }
        bad++;
    }

    const int good = n_blocks - bad;
    printf("M1 correctness: %d / %d blocks bit-exact match (%.4f%%)\n",
           good, n_blocks, 100.0 * good / n_blocks);
    printf(" dc-only path frequency: %d / %d (%.2f%%)\n",
           dc_only, n_blocks, 100.0 * dc_only / n_blocks);
    return bad;
}
/* ---- Phase 1 M3: NEON throughput ------------------------------- */
/* M3: sustained throughput of the FFmpeg NEON 8x8 IDCT-add.
 *
 * seed:     PRNG seed; 0 selects a fixed built-in seed.
 * n_blocks: number of distinct random blocks in the working set (> 0).
 * iters:    number of timed passes over the working set (> 0).
 *
 * All blocks and predictions are generated up front so generation cost
 * stays outside the timed region. Every pass restores the working
 * coefficients and destinations from the masters. The cost of that
 * restore is measured in a separate memcpy-only loop and subtracted.
 * Results are printed to stdout. Invalid arguments or allocation
 * failure terminate the process with exit(1).
 *
 * NOTE(review): nothing reads blocks_work/dsts after the memcpy-only
 * loop, so an aggressive optimizer could in principle drop those
 * copies and under-report the setup cost. Check the "elapsed (setup)"
 * line for plausibility when changing compilers or flags. */
static void throughput_neon(uint64_t seed, int n_blocks, int iters)
{
    if (n_blocks <= 0 || iters <= 0) {
        fprintf(stderr,
                "throughput: blocks (%d) and iters (%d) must be positive\n",
                n_blocks, iters);
        exit(1);
    }

    /* All sizes and offsets are computed in size_t: `n_blocks * 64` in
     * int overflows once n_blocks reaches 2^25. */
    const size_t nb = (size_t)n_blocks;
    if (nb > SIZE_MAX / (64 * sizeof(int16_t))) {
        fprintf(stderr, "alloc failed\n");
        exit(1);
    }
    const size_t coef_bytes = nb * 64 * sizeof(int16_t);
    const size_t pix_bytes = nb * 64;

    xs64_state = seed ? seed : 0xfeedfacecafebeefULL;

    /* Pre-generate all blocks + preds so generation cost is excluded
     * from the timed region. Each block is consumed once per iteration;
     * the working copy is restored from the master before every pass. */
    int16_t *blocks_master = malloc(coef_bytes);
    int16_t *blocks_work = malloc(coef_bytes);
    uint8_t *preds = malloc(pix_bytes);
    uint8_t *dsts = malloc(pix_bytes);
    int *eobs = malloc(nb * sizeof(int));
    if (!blocks_master || !blocks_work || !preds || !dsts || !eobs) {
        fprintf(stderr, "alloc failed\n");
        exit(1);
    }
    for (size_t i = 0; i < nb; i++) {
        eobs[i] = gen_block(blocks_master + i * 64);
        gen_pred(preds + i * 64);
    }

    /* Warm-up. */
    memcpy(blocks_work, blocks_master, coef_bytes);
    memcpy(dsts, preds, pix_bytes);
    for (size_t i = 0; i < nb; i++)
        ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8,
                                      blocks_work + i * 64, eobs[i]);

    /* Timed region. */
    double t0 = now_seconds();
    for (int it = 0; it < iters; it++) {
        memcpy(blocks_work, blocks_master, coef_bytes);
        memcpy(dsts, preds, pix_bytes);
        for (size_t i = 0; i < nb; i++)
            ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8,
                                          blocks_work + i * 64, eobs[i]);
    }
    double t1 = now_seconds();

    /* memcpy cost-only run, to subtract setup overhead. */
    double s0 = now_seconds();
    for (int it = 0; it < iters; it++) {
        memcpy(blocks_work, blocks_master, coef_bytes);
        memcpy(dsts, preds, pix_bytes);
    }
    double s1 = now_seconds();

    double total_seconds = (t1 - t0) - (s1 - s0);
    double total_blocks = (double) n_blocks * iters;

    printf("M3 NEON throughput:\n");
    printf(" blocks=%d iters=%d total=%.0f\n", n_blocks, iters, total_blocks);
    printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", total_seconds);
    printf(" elapsed (setup) =%.6f s\n", s1 - s0);
    if (total_seconds > 0.0) {
        double mblocks_s = total_blocks / total_seconds / 1e6;
        printf(" throughput = %.3f Mblock/s\n", mblocks_s);
        printf(" per-block = %.1f ns\n", total_seconds / total_blocks * 1e9);
        /* Equivalent at 1920x1080: 32 400 blocks/frame -> FPS. */
        printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n",
               mblocks_s * 1e6 / 32400.0);
    } else {
        /* With a tiny working set, timer noise can make the subtracted
         * time zero or negative; derived rates would be meaningless. */
        fprintf(stderr,
                "M3: setup-subtracted time is not positive; "
                "increase --blocks/--iters for a usable measurement.\n");
    }

    free(blocks_master); free(blocks_work); free(preds);
    free(dsts); free(eobs);
}
/* ---- CLI ------------------------------------------------------- */
/* Print the command-line synopsis and defaults to stderr.
 * p: program name to show in the synopsis (typically argv[0]). */
static void usage(const char *p)
{
    static const char synopsis[] =
        "Usage: %s [--blocks N] [--iters K] [--seed S] [--no-correctness]\n"
        "Defaults: N=1000000, K=10, S=0 (uses fixed default).\n";

    fprintf(stderr, synopsis, p);
}
/* Parse a strictly positive decimal int from text.
 * Returns 0 and stores the value in *out on success. Returns -1 when
 * text is empty, has trailing characters, is <= 0, or does not fit in
 * an int. */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;
    long v = strtol(text, &end, 10);

    if (end == text || *end != '\0' || v <= 0 || (long)(int)v != v)
        return -1;
    *out = (int)v;
    return 0;
}

/* Entry point: parse options, run the M1 bit-exact gate (unless
 * --no-correctness), then the M3 throughput measurement.
 *
 * Exit status: 0 on success, 1 if the correctness gate fails,
 * 2 on a usage error (unknown option or invalid numeric argument). */
int main(int argc, char **argv)
{
    int n_blocks = 1000000;
    int iters = 10;
    uint64_t seed = 0;
    int do_correctness = 1;
    static struct option opts[] = {
        {"blocks", required_argument, 0, 'b'},
        {"iters", required_argument, 0, 'i'},
        {"seed", required_argument, 0, 's'},
        {"no-correctness", no_argument, 0, 'C'},
        {"help", no_argument, 0, 'h'},
        {0,0,0,0}
    };
    for (int c; (c = getopt_long(argc, argv, "b:i:s:Ch", opts, 0)) != -1;) {
        switch (c) {
        case 'b':
            /* atoi() would silently turn garbage into 0 and accept
             * negative counts; both break the allocation and timing
             * arithmetic downstream. */
            if (parse_positive_int(optarg, &n_blocks) != 0) {
                fprintf(stderr, "%s: --blocks needs a positive integer, got '%s'\n",
                        argv[0], optarg);
                usage(argv[0]);
                return 2;
            }
            break;
        case 'i':
            if (parse_positive_int(optarg, &iters) != 0) {
                fprintf(stderr, "%s: --iters needs a positive integer, got '%s'\n",
                        argv[0], optarg);
                usage(argv[0]);
                return 2;
            }
            break;
        case 's': seed = strtoull(optarg, 0, 0); break;
        case 'C': do_correctness = 0; break;
        case 'h': usage(argv[0]); return 0;
        default: usage(argv[0]); return 2;
        }
    }
    if (do_correctness) {
        printf("=== M1: bit-exact correctness (10000 random blocks) ===\n");
        int miss = correctness_check(seed, 10000);
        if (miss != 0) {
            /* Throughput of a kernel that produces wrong output is not
             * a meaningful baseline. */
            fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
            return 1;
        }
        printf("\n");
    }
    printf("=== M3: NEON throughput ===\n");
    throughput_neon(seed, n_blocks, iters);
    return 0;
}
+279
View File
@@ -0,0 +1,279 @@
/*
* Phase 3 Vulkan compute dispatch-overhead microbench (M5).
*
* Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa
* v3dv: vkQueueSubmit + vkQueueWaitIdle round-trip cost for a
* noop compute shader. Establishes the floor below which kernel
* batching is mandatory.
*
* Two measurements:
* M5a: empty command-buffer submit (no dispatch at all)
* M5b: 1-workgroup dispatch of an empty shader
*
* The delta M5b - M5a isolates the per-vkCmdDispatch cost from
* the per-vkQueueSubmit cost.
*
* Build: cmake -DDAEDALUS_BUILD_VULKAN=ON ..
* Run: ./bench_vulkan_dispatch [--iters N] [--spv PATH]
*
* License: BSD-2-Clause (daedalus-fourier).
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>
/* CHK(call): evaluate a Vulkan call exactly once. If the result is not
 * VK_SUCCESS, print the numeric VkResult, the source file and line, and
 * the text of the call to stderr, then exit(1). The do/while(0)
 * wrapper makes the macro behave as a single statement. */
#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
fprintf(stderr, "vulkan error %d at %s:%d (%s)\n", r__, __FILE__, __LINE__, #call); \
exit(1); } } while (0)
/* Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus nanoseconds scaled by 1e-9). The return value of
 * clock_gettime is not checked. */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double nanos_as_seconds = stamp.tv_nsec * 1e-9;
    return stamp.tv_sec + nanos_as_seconds;
}
/* Load a whole SPIR-V binary into a freshly malloc'd buffer.
 *
 * path:     file to read.
 * out_size: receives the file size in bytes.
 *
 * Returns the buffer; the caller frees it. Any failure (open error,
 * empty file, size not a multiple of 4, allocation or read failure)
 * prints a message to stderr and terminates the process with
 * exit(1), so the function never returns NULL. */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        perror(path);
        exit(1);
    }

    /* Size the file by seeking to its end, then rewind for the read. */
    fseek(fp, 0, SEEK_END);
    long nbytes = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    /* The buffer is handed on as 32-bit words, so the byte count must
     * be a positive multiple of 4. */
    const int size_ok = nbytes > 0 && (nbytes & 3) == 0;
    if (!size_ok) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, nbytes);
        exit(1);
    }

    uint32_t *words = malloc(nbytes);
    if (words == NULL || fread(words, 1, nbytes, fp) != (size_t)nbytes) {
        perror("read");
        exit(1);
    }

    fclose(fp);
    *out_size = nbytes;
    return words;
}
/* Entry point for the M5 dispatch-overhead benchmark.
 *
 * Creates a Vulkan instance and device on the physical device whose
 * name contains "V3D", builds a compute pipeline from the noop SPIR-V
 * module, records two command buffers once (one empty, one with a
 * single 1x1x1 dispatch) and times `iters` submit + wait-idle round
 * trips of each.
 *
 * Options: --iters N (positive integer), --spv PATH, --help.
 * Exit status: 0 on success, 1 on a runtime/Vulkan failure,
 * 2 on a usage error. */
int main(int argc, char **argv)
{
    enum { WARMUP_SUBMITS = 100 };
    int iters = 100000;
    const char *spv_path = "noop.spv";
    static struct option opts[] = {
        {"iters", required_argument, 0, 'i'},
        {"spv", required_argument, 0, 's'},
        {"help", no_argument, 0, 'h'},
        {0,0,0,0}
    };
    for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) {
        switch (c) {
        case 'i': {
            /* iters is the divisor of both per-op averages below, so
             * zero, negative or non-numeric input must be rejected. */
            char *end = NULL;
            long v = strtol(optarg, &end, 10);
            if (end == optarg || *end != '\0' || v <= 0 || (long)(int)v != v) {
                fprintf(stderr, "%s: --iters needs a positive integer, got '%s'\n",
                        argv[0], optarg);
                return 2;
            }
            iters = (int)v;
            break;
        }
        case 's': spv_path = optarg; break;
        case 'h':
            fprintf(stderr,
                    "Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]);
            return 0;
        default:
            return 2;
        }
    }

    /* ---- Instance ---- */
    VkApplicationInfo app = {
        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
        .pApplicationName = "daedalus-fourier-bench",
        .apiVersion = VK_API_VERSION_1_3,
    };
    VkInstanceCreateInfo ici = {
        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
        .pApplicationInfo = &app,
    };
    VkInstance instance;
    CHK(vkCreateInstance(&ici, NULL, &instance));

    /* ---- Pick V3D physical device (skip llvmpipe) ---- */
    uint32_t pd_count = 0;
    CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL));
    VkPhysicalDevice *pds = NULL;
    if (pd_count > 0) {
        pds = malloc(pd_count * sizeof(*pds));
        if (!pds) {
            fprintf(stderr, "alloc failed\n");
            return 1;
        }
        CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds));
    }
    VkPhysicalDevice phys = VK_NULL_HANDLE;
    VkPhysicalDeviceProperties props = {0};
    /* Every device is listed; if several names contain "V3D" the last
     * one listed is selected. */
    for (uint32_t i = 0; i < pd_count; i++) {
        vkGetPhysicalDeviceProperties(pds[i], &props);
        printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n",
               i, props.deviceName,
               VK_VERSION_MAJOR(props.apiVersion),
               VK_VERSION_MINOR(props.apiVersion),
               VK_VERSION_PATCH(props.apiVersion),
               props.vendorID);
        if (strstr(props.deviceName, "V3D") != NULL) {
            phys = pds[i];
        }
    }
    /* The handle has been copied out, so the array can go before the
     * early-return below. */
    free(pds);
    if (phys == VK_NULL_HANDLE) {
        fprintf(stderr, "no V3D device found; bailing.\n");
        return 1;
    }
    vkGetPhysicalDeviceProperties(phys, &props);
    printf("selected: %s\n", props.deviceName);

    /* ---- Compute queue family ---- */
    uint32_t qfc = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL);
    VkQueueFamilyProperties *qfp = NULL;
    if (qfc > 0) {
        qfp = malloc(qfc * sizeof(*qfp));
        if (!qfp) {
            fprintf(stderr, "alloc failed\n");
            return 1;
        }
        vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp);
    }
    uint32_t qfi = (uint32_t) -1;
    for (uint32_t i = 0; i < qfc; i++) {
        if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
            qfi = i; break;
        }
    }
    free(qfp);
    if (qfi == (uint32_t) -1) {
        fprintf(stderr, "no compute queue family\n");
        return 1;
    }

    /* ---- Logical device ---- */
    float qprio = 1.0f;
    VkDeviceQueueCreateInfo dqci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
        .queueFamilyIndex = qfi,
        .queueCount = 1,
        .pQueuePriorities = &qprio,
    };
    VkDeviceCreateInfo dci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
        .queueCreateInfoCount = 1,
        .pQueueCreateInfos = &dqci,
    };
    VkDevice dev;
    CHK(vkCreateDevice(phys, &dci, NULL, &dev));
    VkQueue queue;
    vkGetDeviceQueue(dev, qfi, 0, &queue);

    /* ---- Command pool + buffers ---- */
    VkCommandPoolCreateInfo cpci = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
        .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
        .queueFamilyIndex = qfi,
    };
    VkCommandPool pool;
    CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool));
    VkCommandBuffer cb_empty, cb_dispatch;
    VkCommandBufferAllocateInfo cbai = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = pool,
        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };
    CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty));
    CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch));

    /* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */
    VkPipelineLayoutCreateInfo plci = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
    };
    VkPipelineLayout playout;
    CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout));

    /* ---- Compute pipeline from noop SPIR-V ---- */
    size_t spv_size = 0;
    uint32_t *spv = read_spv(spv_path, &spv_size);
    VkShaderModuleCreateInfo smci = {
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .codeSize = spv_size,
        .pCode = spv,
    };
    VkShaderModule shader;
    CHK(vkCreateShaderModule(dev, &smci, NULL, &shader));
    free(spv);
    VkComputePipelineCreateInfo cpci2 = {
        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
        .stage = {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
            .module = shader,
            .pName = "main",
        },
        .layout = playout,
    };
    VkPipeline pipe;
    CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe));

    /* ---- Record both command buffers once, reuse for every iteration ---- */
    VkCommandBufferBeginInfo cbbi = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    };
    CHK(vkBeginCommandBuffer(cb_empty, &cbbi));
    CHK(vkEndCommandBuffer(cb_empty));
    CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi));
    vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe);
    vkCmdDispatch(cb_dispatch, 1, 1, 1);
    CHK(vkEndCommandBuffer(cb_dispatch));
    VkSubmitInfo si_empty = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .commandBufferCount = 1, .pCommandBuffers = &cb_empty,
    };
    VkSubmitInfo si_disp = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .commandBufferCount = 1, .pCommandBuffers = &cb_dispatch,
    };

    /* ---- Warm-up ---- */
    for (int i = 0; i < WARMUP_SUBMITS; i++) {
        CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
        CHK(vkQueueWaitIdle(queue));
    }

    /* ---- M5a: empty CB submit+wait ---- */
    double t0 = now_seconds();
    for (int i = 0; i < iters; i++) {
        CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE));
        CHK(vkQueueWaitIdle(queue));
    }
    double t1 = now_seconds();
    double m5a_per = (t1 - t0) / iters * 1e6; /* µs */

    /* ---- M5b: 1-WG noop dispatch submit+wait ---- */
    double t2 = now_seconds();
    for (int i = 0; i < iters; i++) {
        CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
        CHK(vkQueueWaitIdle(queue));
    }
    double t3 = now_seconds();
    double m5b_per = (t3 - t2) / iters * 1e6; /* µs */

    /* Dispatch round trip expressed in units of a hypothetical
     * 100 ns/block kernel: µs -> ns, then divide by 100 ns. */
    double overhead_ratio = m5b_per * 1e3 / 100.0;

    printf("\n=== M5: Vulkan compute dispatch overhead ===\n");
    printf(" iters per measurement: %d\n", iters);
    printf(" M5a empty CB submit+wait: %.2f µs/op\n", m5a_per);
    printf(" M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per);
    printf(" delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n",
           m5b_per - m5a_per);
    printf("\n");
    printf(" Implication for kernel batching:\n");
    printf(" if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n");
    printf(" a single-block dispatch costs %.0fx more in overhead\n",
           overhead_ratio);
    printf(" -> batch at least %.0f blocks per dispatch to break even.\n",
           overhead_ratio);

    /* ---- Tear down (minimal — process exit handles the rest) ---- */
    vkDestroyPipeline(dev, pipe, NULL);
    vkDestroyShaderModule(dev, shader, NULL);
    vkDestroyPipelineLayout(dev, playout, NULL);
    vkDestroyCommandPool(dev, pool, NULL);
    vkDestroyDevice(dev, NULL);
    vkDestroyInstance(instance, NULL);
    return 0;
}
+5
View File
@@ -0,0 +1,5 @@
#version 450
// Empty compute shader for measuring Vulkan dispatch overhead (M5).
// Reads nothing, writes nothing — pure dispatch round-trip floor.
// Each workgroup is declared as 64 x 1 x 1 invocations; main() has an
// empty body, so no invocation touches any resource.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
void main() {}
+114
View File
@@ -0,0 +1,114 @@
/*
* Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
* transform + add (8-bit pixels), transcribed from the spec
* structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
* (vendored under external/ffmpeg-snapshot/ at commit f46e514).
*
* Provided as a self-contained translation unit so the harness
* doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
* expansion. Cross-checked at runtime against FFmpeg's NEON
* implementation (see correctness_check() in bench_neon_idct.c).
*
* License: LGPL-2.1-or-later (matches the upstream reference).
*
* Spec source: VP9 specification §8.7 Inverse transform process.
*/
#include <stdint.h>
#include <stddef.h>
#include <string.h>
/* Q14 trig constants — VP9 spec table 8.7.1.4.
 * Naming: COSPI_k_64 = round(cos(k * pi / 64) * 2^14). The sin() forms
 * in the per-line comments are the same values written via
 * sin(x) = cos(pi/2 - x). */
#define COSPI_16_64 11585 /* cos(pi/4) * 2^14 */
#define COSPI_24_64 6270 /* cos(3pi/8) * 2^14 */
#define COSPI_8_64 15137 /* sin(3pi/8) * 2^14 */
#define COSPI_28_64 3196 /* cos(7pi/16)* 2^14 */
#define COSPI_4_64 16069 /* sin(7pi/16)* 2^14 */
#define COSPI_20_64 9102 /* cos(5pi/16)* 2^14 */
#define COSPI_12_64 13623 /* sin(5pi/16)* 2^14 */
/* Q14 rounding shift: add half an LSB (1 << 13), then shift right by
 * 14. The addition is carried out in 64 bits; the shifted result is
 * narrowed to int32. */
static inline int32_t qround14(int64_t x)
{
    const int64_t half = 1 << 13;
    const int64_t biased = x + half;

    return (int32_t)(biased >> 14);
}
/* Clamp x to the 8-bit pixel range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    if (x < 0)
        return 0;
    if (x > 255)
        return 255;
    return (uint8_t)x;
}
/* 1-D 8-point inverse DCT on int32 values. Intended to mirror idct8_1d
 * in libavcodec/vp9dsp_template.c, with the strided access replaced by
 * plain indexing.
 *
 * in:  8 input coefficients.
 * out: 8 output samples. All intermediates are computed before the
 *      first store to out.
 *
 * Every multiply is widened to 64 bits and rounded back to Q0 with
 * qround14. Upstream temporary names are noted beside each value. */
static void idct8_1d(const int32_t in[8], int32_t out[8])
{
    /* Even half: inputs 0, 4 (sum/difference) and 2, 6 (rotation). */
    const int32_t sum04  = qround14((int64_t)(in[0] + in[4]) * COSPI_16_64);   /* t0a */
    const int32_t diff04 = qround14((int64_t)(in[0] - in[4]) * COSPI_16_64);   /* t1a */
    const int32_t rot26a = qround14((int64_t)in[2] * COSPI_24_64
                                  - (int64_t)in[6] * COSPI_8_64);              /* t2a */
    const int32_t rot26b = qround14((int64_t)in[2] * COSPI_8_64
                                  + (int64_t)in[6] * COSPI_24_64);             /* t3a */

    /* Odd half, first stage: rotations of (1, 7) and (5, 3). */
    const int32_t rot17a = qround14((int64_t)in[1] * COSPI_28_64
                                  - (int64_t)in[7] * COSPI_4_64);              /* t4a */
    const int32_t rot53a = qround14((int64_t)in[5] * COSPI_12_64
                                  - (int64_t)in[3] * COSPI_20_64);             /* t5a */
    const int32_t rot53b = qround14((int64_t)in[5] * COSPI_20_64
                                  + (int64_t)in[3] * COSPI_12_64);             /* t6a */
    const int32_t rot17b = qround14((int64_t)in[1] * COSPI_4_64
                                  + (int64_t)in[7] * COSPI_28_64);             /* t7a */

    /* Even butterflies: t0, t1, t2, t3. */
    const int32_t even[4] = {
        sum04 + rot26b,
        diff04 + rot26a,
        diff04 - rot26a,
        sum04 - rot26b,
    };

    /* Odd half, second stage. */
    const int32_t odd_lo   = rot17a + rot53a;   /* t4 */
    const int32_t odd_lo_d = rot17a - rot53a;   /* t5a (reused) */
    const int32_t odd_hi   = rot17b + rot53b;   /* t7 */
    const int32_t odd_hi_d = rot17b - rot53b;   /* t6a (reused) */
    const int32_t mid_lo = qround14((int64_t)(odd_hi_d - odd_lo_d) * COSPI_16_64); /* t5 */
    const int32_t mid_hi = qround14((int64_t)(odd_hi_d + odd_lo_d) * COSPI_16_64); /* t6 */

    /* Odd terms in the order they pair with even[0..3]: t7, t6, t5, t4. */
    const int32_t odd[4] = { odd_hi, mid_hi, mid_lo, odd_lo };

    /* Final butterflies: out[k] = even + odd, out[7 - k] = even - odd. */
    for (int k = 0; k < 4; k++) {
        out[k]     = even[k] + odd[k];
        out[7 - k] = even[k] - odd[k];
    }
}
/* Public reference entry point: 8x8 inverse DCT of `block`, added to
 * the 8-bit pixels at `dst` with clipping to [0, 255]. The signature
 * matches ff_vp9_idct_idct_8x8_add_neon.
 *
 * dst:    top-left pixel of the 8x8 destination; updated in place.
 * stride: distance in bytes between destination rows.
 * block:  64 coefficients. On return the coefficients that were
 *         consumed are zeroed: only block[0] on the DC-only path, all
 *         64 otherwise.
 * eob:    exactly 1 selects the DC-only shortcut; any other value runs
 *         the full two-pass transform.
 *
 * NOTE(review): intermediates are held in int32 here. Confirm against
 * the vendored template whether upstream stores the first-pass result
 * in a narrower type, which would only matter for inputs large enough
 * to overflow it. */
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
                                        int16_t *block, int eob)
{
    int32_t tmp[64];
    int32_t vec[8];
    int32_t res[8];

    /* DC-only fast path: scale the DC coefficient by 11585 in Q14
     * twice (once per pass), then add (dc + 16) >> 5 to every pixel. */
    if (eob == 1) {
        const int32_t once = qround14((int64_t)block[0] * COSPI_16_64);
        const int32_t dc = qround14(once * (int64_t) COSPI_16_64);
        const int32_t add = (dc + 16) >> 5;

        block[0] = 0;
        for (int y = 0; y < 8; y++) {
            uint8_t *row = dst + y * stride;
            for (int x = 0; x < 8; x++)
                row[x] = clip_u8(row[x] + add);
        }
        return;
    }

    /* First pass: transform each column of block and store the result
     * as a ROW of tmp (column i -> tmp[i*8 .. i*8+7]). The transpose
     * is therefore implicit in the store pattern. */
    for (int i = 0; i < 8; i++) {
        for (int k = 0; k < 8; k++)
            vec[k] = block[k * 8 + i];
        idct8_1d(vec, res);
        memcpy(&tmp[i * 8], res, sizeof(res));
    }

    /* Coefficients are consumed; leave the block zeroed for the caller. */
    memset(block, 0, 64 * sizeof(*block));

    /* Second pass: transform each column of tmp and accumulate it into
     * the same column of dst, rounding with (v + 16) >> 5. */
    for (int i = 0; i < 8; i++) {
        for (int k = 0; k < 8; k++)
            vec[k] = tmp[k * 8 + i];
        idct8_1d(vec, res);
        for (int k = 0; k < 8; k++) {
            uint8_t *px = &dst[k * stride + i];
            *px = clip_u8(*px + ((res[k] + 16) >> 5));
        }
    }
}