Path B pivot + Phase 0-3 closed with first baseline numbers
This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.
Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.
Phases closed:
Phase 0 — substrate audit; Path A blocked, Path B open;
codec-back-end-fits-QPU finding (docs/phase0.md)
Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
publish-before-measure R = M2/M3 decision rules
(docs/phase1.md)
Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
under external/ffmpeg-snapshot/ (PROVENANCE.md pins
commit f46e514 + per-file SHA-256s) (docs/phase2.md)
Phase 3 — real baseline measurements on hertz (docs/phase3.md):
M1 bit-exact 100.0000 % (10000/10000)
M3 NEON IDCT8 single 8.171 Mblock/s (122.4 ns/block)
M5a empty Vulkan submit 22.66 us
M5b 1-WG noop dispatch 55.60 us
M5 delta 32.95 us/dispatch
=> per-dispatch overhead is ~455x per-NEON-block cost;
Phase 4 must batch at frame level or close to it.
Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.
Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+13
@@ -0,0 +1,13 @@
|
||||
build/
|
||||
build-*/
|
||||
*.o
|
||||
*.spv
|
||||
.cache/
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*~
|
||||
|
||||
# Forensic snapshot of the corrupted .git from 2026-05-18 10:25
|
||||
# working-tree wipe. Retained on disk for inspection; not tracked.
|
||||
.git-broken-2026-05-18/
|
||||
+103
@@ -0,0 +1,103 @@
|
||||
# daedalus-fourier — Phase 3 baseline + (later) Phase 6 implementation.
|
||||
#
|
||||
# Builds:
|
||||
# bench_neon_idct — NEON throughput baseline (Phase 3 M3) +
|
||||
# bit-exact correctness gate (Phase 1 M1).
|
||||
# bench_vulkan_dispatch — Vulkan compute dispatch-overhead baseline (M5).
|
||||
#
|
||||
# Linkage note: bench_neon_idct statically links the vendored
|
||||
# FFmpeg n7.1.3 NEON snapshot (LGPL-2.1+); see
|
||||
# external/ffmpeg-snapshot/PROVENANCE.md.
|
||||
|
||||
cmake_minimum_required(VERSION 3.20)
|
||||
project(daedalus-fourier C ASM)
|
||||
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED ON)
|
||||
|
||||
if (NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif()
|
||||
|
||||
if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
message(FATAL_ERROR
|
||||
"daedalus-fourier targets aarch64 (Pi 5 / BCM2712). "
|
||||
"Cross-compile not yet wired.")
|
||||
endif()
|
||||
|
||||
add_compile_options(-Wall -Wextra -Wno-unused-parameter)
|
||||
|
||||
# ---- Vendored FFmpeg snapshot (LGPL-2.1+) -----------------------------------
|
||||
|
||||
set(FFSNAP ${CMAKE_SOURCE_DIR}/external/ffmpeg-snapshot)
|
||||
|
||||
# Assembly preamble (config.h shim + FFmpeg's asm helpers) used by the
|
||||
# vendored .S file. -I flags expose:
|
||||
# - FFSNAP/ so `#include "config.h"` finds our shim
|
||||
# - FFSNAP/libavcodec/aarch64/ so `#include "neon.S"` finds the helper
|
||||
# - FFSNAP/ so `#include "libavutil/aarch64/asm.S"`
|
||||
# resolves against the vendored copy
|
||||
# Include paths handed to the assembler driver for the vendored .S file.
# A single -I${FFSNAP} serves both `#include "config.h"` (our shim) and
# `#include "libavutil/aarch64/asm.S"`; the aarch64 directory is needed so
# `#include "neon.S"` resolves. Listing -I${FFSNAP} twice was redundant.
set(FFASM_FLAGS
    -I${FFSNAP}
    -I${FFSNAP}/libavcodec/aarch64
)
|
||||
|
||||
set(FFASM_SOURCES
|
||||
${FFSNAP}/libavcodec/aarch64/vp9itxfm_neon.S
|
||||
)
|
||||
|
||||
# Tell CMake/gas to preprocess .S sources.
|
||||
set_source_files_properties(${FFASM_SOURCES} PROPERTIES
|
||||
COMPILE_OPTIONS "${FFASM_FLAGS}"
|
||||
LANGUAGE ASM)
|
||||
|
||||
# ---- NEON baseline microbench ----------------------------------------------
|
||||
|
||||
add_executable(bench_neon_idct
|
||||
tests/bench_neon_idct.c
|
||||
tests/vp9_idct8_ref.c
|
||||
${FFASM_SOURCES}
|
||||
)
|
||||
target_compile_options(bench_neon_idct PRIVATE -O3 -march=armv8-a+simd)
|
||||
# bench_neon_idct doesn't need vulkan/drm — pure CPU baseline.
|
||||
|
||||
# ---- Vulkan dispatch-overhead microbench (M5) -------------------------------
# Built by default. Pass -DDAEDALUS_BUILD_VULKAN=OFF to skip it on hosts
# without the Vulkan loader/headers or glslang.
|
||||
|
||||
option(DAEDALUS_BUILD_VULKAN "Build Vulkan compute-dispatch microbench" ON)
|
||||
|
||||
if (DAEDALUS_BUILD_VULKAN)
|
||||
find_package(Vulkan REQUIRED)
|
||||
|
||||
# Compile GLSL compute shaders to SPIR-V via glslangValidator.
|
||||
# The binary loads them at runtime from the build dir (cwd-relative).
|
||||
find_program(GLSLANG_VALIDATOR
|
||||
NAMES glslangValidator glslang
|
||||
REQUIRED)
|
||||
|
||||
set(NOOP_SPV ${CMAKE_BINARY_DIR}/noop.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${NOOP_SPV}
|
||||
COMMAND ${GLSLANG_VALIDATOR} -V -o ${NOOP_SPV}
|
||||
${CMAKE_SOURCE_DIR}/tests/shaders/noop.comp
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/tests/shaders/noop.comp
|
||||
COMMENT "glslang: noop.comp -> noop.spv"
|
||||
VERBATIM
|
||||
)
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV})
|
||||
|
||||
add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c)
|
||||
add_dependencies(bench_vulkan_dispatch daedalus_shaders)
|
||||
target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan)
|
||||
target_compile_options(bench_vulkan_dispatch PRIVATE -O2)
|
||||
endif()
|
||||
|
||||
# ---- Summary ----------------------------------------------------------------
|
||||
|
||||
message(STATUS "daedalus-fourier build configured for ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
message(STATUS " FFmpeg snapshot: ${FFSNAP}")
|
||||
message(STATUS " Build type: ${CMAKE_BUILD_TYPE}")
|
||||
# message() runs at configure time and never evaluates generator expressions
# (a $<BOOL:...> genex would be printed verbatim), so pick the target list
# with an ordinary if() on the option instead.
if (DAEDALUS_BUILD_VULKAN)
    message(STATUS " Targets: bench_neon_idct; bench_vulkan_dispatch")
else()
    message(STATUS " Targets: bench_neon_idct")
endif()
|
||||
@@ -0,0 +1,177 @@
|
||||
# daedalus-fourier
|
||||
|
||||
Community-built VP9 / AV1 software-decode back-end running on the
|
||||
VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 /
|
||||
Compute Module 5), via the existing Mesa `v3d` userspace driver.
|
||||
ARM keeps the serial entropy front-end; the QPU takes the parallel
|
||||
back-end (inverse transforms, deblocking, CDEF, loop restoration,
|
||||
MC residual add).
|
||||
|
||||
> Daedalus built the Labyrinth for King Minos, then escaped from it
|
||||
> by hand-forging flight firmware out of feathers and wax when no
|
||||
> sanctioned exit existed.
|
||||
|
||||
That's the project shape. The Broadcom-locked VideoCore VII is the
|
||||
Labyrinth; the Pi Foundation's "use the HEVC block and live with
|
||||
software decode for everything else" is the official non-exit;
|
||||
the QPU sits unused inside the labyrinth's walls.
|
||||
|
||||
**Status: Phases 0–3 closed (substrate audit, first-kernel goal,
reference mapping, baseline measurements on hertz). Phase 4 (planning
the first QPU kernel) is next.** This is research-track work that
|
||||
may take months or may yield a single proof-of-concept kernel that
|
||||
loses to ARM NEON, in which case the negative result ships and the
|
||||
project closes.
|
||||
|
||||
## Why this exists
|
||||
|
||||
higgs is a Raspberry Pi Compute Module 5 in a small portable
|
||||
chassis with a battery. Watching nerds review *Star Wars* on YouTube
|
||||
while putting Mac Studios into virtual shopping baskets is a
|
||||
core workload for the higgs class of device.
|
||||
|
||||
YouTube serves H.264 (legacy), VP9 (typical 4K), and AV1 (newer
|
||||
high-bitrate / high-resolution content). It does not serve HEVC.
|
||||
Pi 5's BCM2712 has one HW decoder block: HEVC. The intersection
|
||||
of {what YouTube serves} ∩ {what BCM2712 decodes in HW} = ∅.
|
||||
|
||||
Every YouTube frame on higgs today is software-decoded on Cortex-A76
|
||||
cores at ~50–90% CPU per video stream. Offloading the parallel
|
||||
back-end of that decode to the otherwise-idle QPU complex *might*
|
||||
recover meaningful CPU time and battery on higgs. The honest
|
||||
prior — measured in Phase 0 — is that the QPU has roughly equal
|
||||
raw compute to the A76 cluster but a smaller slice of the shared
|
||||
LPDDR4x bandwidth, so the win, if any, comes from offloading
|
||||
*concurrent* work the CPU would have done anyway.
|
||||
|
||||
The Pi Foundation isn't going to do this work (per their own
|
||||
statement: chromium-patch sustainment was too much; codec
|
||||
sustainment would be even more so). The kernel `rpi-hevc-dec` series has
|
||||
been 17 months in review for one decoder block they DID write
|
||||
themselves. Whatever ships here ships through the community.
|
||||
|
||||
## Architecture (Path B)
|
||||
|
||||
Phase 0 evaluated two paths:
|
||||
|
||||
- **Path A — custom VPU firmware on the VC7 scalar cores.**
|
||||
Blocked. BCM2712 has a silicon root of trust: the mask ROM
|
||||
hardcodes RPi's public key and unconditionally verifies the
|
||||
second-stage bootloader. `EXECUTE_CODE` mailbox removed on Pi 5.
|
||||
No software-only bypass exists. See `docs/phase0.md §3`.
|
||||
|
||||
- **Path B — QPU compute kernels via the existing Mesa `v3d` /
|
||||
DRM / Vulkan-compute path.** This is the path. The QPU is
|
||||
reachable from userspace today on a stock signed Pi 5 / CM5
|
||||
via `/dev/dri/card0`. No firmware loading. No signing fight.
|
||||
`Idein/py-videocore7` (SGEMM 21 GFLOPS sustained) is the
|
||||
existence proof.
|
||||
|
||||
The build:
|
||||
|
||||
```
|
||||
┌───────────────────────────────┐
|
||||
│ userspace VP9 / AV1 decoder │
|
||||
│ (fork of dav1d / libvpx) │
|
||||
├───────────────────────────────┤
|
||||
│ ARM: entropy decode │ ← Cortex-A76 + NEON
|
||||
│ (Bool coder / ANS) │ structurally serial
|
||||
├───────────────────────────────┤
|
||||
│ QPU: parallel back-end │ ← V3D 7.1 via Mesa v3dv
|
||||
│ (IDCT, CDEF, │ Vulkan compute shaders
|
||||
│ deblock, LR, MC) │ or direct DRM submit
|
||||
├───────────────────────────────┤
|
||||
│ V4L2 stateless wrapper │ ← out-of-tree kernel module
|
||||
│ (eventual, kernel-agent) │ exposing /dev/videoN
|
||||
└───────────────────────────────┘
|
||||
```
|
||||
|
||||
The first deliverable is *not* the V4L2 wrapper. The first
|
||||
deliverable is one back-end kernel running on the QPU, bit-exact
|
||||
against a libavcodec reference, with measured throughput. If that
|
||||
single kernel can't beat NEON or get within 50% of it, the project
|
||||
closes here with a documented negative result.
|
||||
|
||||
## In scope
|
||||
|
||||
- A small set of codec back-end kernels (IDCT 8×8, CDEF, deblocking,
|
||||
loop restoration filter, MC interpolation) compiled as SPIR-V
|
||||
compute shaders for Mesa `v3dv`, dispatched via Vulkan compute
|
||||
from userspace.
|
||||
- A test harness on hertz that runs each kernel against libavcodec
|
||||
reference outputs and measures throughput (megapixels/sec or
|
||||
blocks/sec) against the equivalent NEON path.
|
||||
- Phase 1 = one kernel, bit-exact, with numbers. Phase 2+ = more
|
||||
kernels only if Phase 1 numbers justify it.
|
||||
|
||||
## Out of scope (for now)
|
||||
|
||||
- HEVC (Pi 5 has dedicated silicon; `rpi-hevc-dec` covers it).
|
||||
- Pi 4 / BCM2711 / VideoCore VI. Different ISA, smaller compute
|
||||
budget. Path B *could* extend but isn't the priority.
|
||||
- Encode. Pi Foundation removed all HW encode in Pi 5; encode on
|
||||
VC7 is a separate, larger project.
|
||||
- Custom VPU firmware (Path A — blocked by silicon RoT, see
|
||||
`docs/phase0.md`).
|
||||
- V4L2 stateless driver wrapping the userspace decoder. Eventual
|
||||
consumption point, but Phase 1 lives entirely in userspace.
|
||||
- Beating ARM NEON unconditionally. The honest target is
|
||||
*concurrent* work: QPU runs while CPU does something else.
|
||||
|
||||
## Dev substrate
|
||||
|
||||
- **hertz** (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75-rpt-rpi-2712,
|
||||
Mesa 25.0.7 with v3dv, V3D 7.1.7) — the dev / test / measurement
|
||||
host. Watchdog-protected for crash recovery. See
|
||||
`docs/vulkaninfo_v3d_7_1_7_hertz.txt` for the inside-view device
|
||||
profile.
|
||||
- **higgs** (CM5 in portable battery chassis) — the eventual user
|
||||
target. Not a dev unit; sealed chassis.
|
||||
|
||||
## Conventions
|
||||
|
||||
This project follows the 9(+1)-phase dev process. See
|
||||
`docs/dev_process.md`. Phase 0 is closed (`docs/phase0.md`);
|
||||
Phase 1 is `docs/phase1.md`.
|
||||
|
||||
Gitea identity: `claude-noether` (per
|
||||
`feedback_gitea_as_claude_noether.md`). No `marfrit` pushes from
|
||||
Claude sessions.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
daedalus-fourier/
|
||||
├── README.md ← this file
|
||||
├── docs/
|
||||
│ ├── dev_process.md ← reference copy of the 9(+1)-phase loop
|
||||
│ ├── phase0.md ← substrate audit (closes Path A, opens Path B)
|
||||
│ ├── phase1.md ← first-kernel goal + measurement plan
|
||||
│ └── vulkaninfo_v3d_7_1_7_hertz.txt
|
||||
│ ← inside-view device profile from hertz
|
||||
├── src/ ← kernels + Vulkan dispatch harness
|
||||
└── tests/ ← bit-exact vs libavcodec, throughput
|
||||
```
|
||||
|
||||
The build system is CMake (`CMakeLists.txt` at the repository root); it currently builds the Phase 3 baseline benchmarks under `tests/`.
|
||||
|
||||
## Sibling projects in the same orbit
|
||||
|
||||
- `libva-v4l2-request-fourier` — VA-API consumer-side backend.
|
||||
Eventual consumer if daedalus produces a V4L2 stateless node.
|
||||
- `firefox-fourier` — Firefox fork that routes stateless V4L2
|
||||
through libavcodec's `v4l2_request` hwaccel. Same pickup point.
|
||||
- `chromium-fourier` — sibling for Chromium.
|
||||
- `kernel-agent` — would house the V4L2 driver wrapping the
|
||||
userspace decoder, once one exists.
|
||||
- `ampere-av1-enablement` — software-side AV1 bring-up on RK3588
|
||||
(rkvdec / vpu981). Provides the userspace conformance harness
|
||||
daedalus reuses for VC7-AV1 verification.
|
||||
|
||||
## Source attribution
|
||||
|
||||
Daedalus-the-myth is public domain. The wax-and-feathers
|
||||
metaphor is older than software engineering.
|
||||
|
||||
Anyone wanting to fail at this project: please file your failures
|
||||
under `branches/icarus/`. Built-in self-deprecation slot, with
|
||||
honor.
|
||||
@@ -0,0 +1,96 @@
|
||||
---
|
||||
name: Claude-Assisted Development Process (9(+1)-phase loop)
|
||||
description: Default workflow for any non-trivial implementation — substrate/motivation/inventory, formulate, analyze, baseline, plan, second-model review, implement, verify, closing (package+ship), memory-update; with explicit loopback edges
|
||||
type: feedback
|
||||
originSessionId: 83898ac9-e61f-4c44-8429-0154cb12d124
|
||||
---
|
||||
Markus's standardized loop for our implementation work. Apply by default whenever a task is bigger than a one-liner. Skipping phases is a deliberate choice that should be flagged, not a default.
|
||||
|
||||
## Phase 0 — Substrate / Motivation / Inventory
|
||||
|
||||
Pre-formulation. Lock the research question and assemble the substrate *before* Phase 1 commits to a measurable goal. Output: a `phase0_findings.md` artifact that future phases can refer back to without re-deriving.
|
||||
|
||||
- **Research question + mechanism captured.** State the question in one sentence. Capture any operator-supplied mechanism (the "why this question, how does it work" insight) verbatim — it's the load-bearing claim Phase 1 binds against.
|
||||
- **Predecessor carry-over: state vs data.** When a campaign succeeds another, categorize what transfers. *State* (installed packages, governor settings, system tweaks, source-read file:line pointers, protocol designs, parser scripts) carries forward. *Data* (drop counts, perf percentages, threshold values, baseline floors) does not — it is reference history only. Binding cells in this campaign anchor to in-session-acquired numbers, even if the predecessor measured an identical condition.
|
||||
- **Tooling and measurement-instrument inventory.** What's installed, what would need installing, what extensions/protocols the live system actually supports. Live verification, not paper compatibility.
|
||||
- **In-session baseline anchor.** Re-run the reference rep — N=3 minimum if the baseline is load-bearing for the campaign's premise — *before* any instrument changes. **If the predecessor's reference floor doesn't replicate at N=3 in the same session, that is the campaign result.** Don't build multi-phase infrastructure on an N=1 historical floor. See `feedback_replicate_baseline_first.md`.
|
||||
- **Open questions tabled.** What's not known going into Phase 1. Phase 1 locks against the knowns; Phase 0 surfaces the unknowns explicitly so they don't slip into binding cells unverified.
|
||||
|
||||
## Phase 1 — Goal Formulation
|
||||
Define the objective in measurable terms. State what success looks like *before* touching anything. The chosen metric is a **hypothesis** about what to measure, not an axiom — Phase 3 may invalidate it.
|
||||
|
||||
## Phase 2 — Situation Analysis
|
||||
Document current state. Identify constraints, dependencies, known failure modes. **Reset context here** — do not carry assumptions from prior sessions; re-read CLAUDE.md, relevant memory files, run `git status`, re-verify reachability.
|
||||
|
||||
## Phase 3 — Baseline Measurements
|
||||
Take concrete measurements *before* any changes. Paste raw output into DokuWiki at capture time — verbatim, not paraphrased. The Phase 5 artifact is the raw data, not Claude's summary.
|
||||
|
||||
**Real data, not theatre.** Phase 3 exists to use AI capacity for absorbing wide, low-level instrumentation a human reader would skim past. Attaching strace / perf / ftrace / eBPF / custom tripwires to the process under test is real Phase 3; scraping mpv's stdout dropped-frame counter is not. Discriminator: if a human with bash and grep could produce the same baseline, it isn't Phase 3 yet — go down to the syscall / call-path / MMIO / register layer. See `feedback_phase3_no_theatre.md`.
|
||||
|
||||
**Anti-fabrication:**
|
||||
- Every cited value traces to a visible tool invocation or verbatim paste-in. If a measurement wasn't taken, write "not measured" — never an estimate, inference, or recall from training / prior sessions / sibling-host memory.
|
||||
- Raw before derived. A derived number (FPS, p99, error rate) appears alongside the raw stream it came from, never alone.
|
||||
- Rig failure is the finding. Empty strace, dead UART, perf counter that didn't increment → that *is* the Phase 3 result. Loop back to Phase 2 to fix the rig; do not synthesize plausible-looking baseline data to keep momentum.
|
||||
|
||||
- **If baseline reveals the Phase 1 metric was tracking the wrong thing → loop back to Phase 1** with the corrected target. (Example: "max H.264 FPS" Phase 1 metric, but baseline shows DMA-setup + sync overhead dwarfs decode → real metric is bytes-copied-per-second / EGL surface-import time, not FPS.)
|
||||
|
||||
**Measurements describe what the system *does*, not what it *should do*.** Baseline data is evidence, not a specification. Do NOT derive API call sequences, struct layouts, or parameter values from observed behaviour (strace, perf, example output). Observable behaviour may reflect bugs, workarounds, or implementation accidents — anything you copy from it inherits those.
|
||||
|
||||
## Phase 4 — Plan
|
||||
Formulate the approach. Identify what will and will not be touched. State expected outcome of implementation in the *same* measurable terms used in Phase 1/3.
|
||||
|
||||
## Phase 5 — Second Model Review
|
||||
Goal, situation, measurements, plan get pasted into **DokuWiki**. Markus reviews and redacts, then initiates the handover to a fresh model instance. **Claude does not curate the artifact going to the reviewer** — that would re-introduce the blind-spot accumulation the review is meant to escape. Do not summarize when handing over; paste the actual artifacts.
|
||||
|
||||
## Phase 6 — Implementation
|
||||
Execute the plan. Scope strictly to what was planned — resist feature creep, refactor-creep, "while I'm here" cleanups, and over-eager scope expansion. If a plan revision is needed mid-implementation, surface it explicitly and re-enter Phase 4.
|
||||
|
||||
**Contract before code.** Before writing or modifying any call site:
|
||||
- Read the API contract — kernel docs, header comments, and upstream source for every call touched.
|
||||
- State the contract explicitly before implementing against it (in the plan, the commit message, or a comment — somewhere reviewable).
|
||||
- If the contract cannot be found: stop and surface the gap. Don't infer it from baseline behaviour or sibling code.
|
||||
|
||||
**Copying from baseline measurements is not implementation. It is transcription of potentially broken behaviour.** A deliverable that matches baseline bytes but violates the API contract is not a deliverable — it is a deferred bug.
|
||||
|
||||
### What "state the contract explicitly" looks like
|
||||
|
||||
Worked example: `0012-h264-omit-scaling-matrix-frame-based.patch` in `~/src/ohm_gl_fix/phase6/step1/`. The commit message opens with the contract before any code:
|
||||
|
||||
> VAAPI signals "explicit scaling lists are present in the bitstream" implicitly: the consumer (ffmpeg-vaapi, mpv, etc.) sends a `VAIQMatrixBufferH264` alongside `RenderPicture` iff `sps_scaling_matrix_present_flag || pps_scaling_matrix_present_flag`. When the bitstream uses default (flat) scaling, no IQMatrixBuffer arrives […]
|
||||
>
|
||||
> Earlier draft of this patch unconditionally omitted SCALING_MATRIX in FRAME_BASED. That's **corpus-correct** (bbb has no explicit scaling lists) but the **wrong predicate**: the kernel-side gating is by "matrix-supplied vs. not," not by decode mode. […]
|
||||
>
|
||||
> Contract verification (audit_0008_decode_params_2026-05-01.md + hantro_h264.c::assemble_scaling_list): the kernel uses the supplied matrix when SCALING_MATRIX is in the control batch and falls back to spec-defined defaults when absent. Mode-independent.
|
||||
|
||||
What this gets right:
|
||||
- **Contract first**: per-control rules cited from kernel doc (`ext-ctrls-codec-stateless.rst:752`), kernel driver (`hantro_h264.c::assemble_scaling_list`), and sibling implementation (gst-plugins-bad commit 9e3e775) — *before* any patch hunks.
|
||||
- **Corpus-correct ≠ spec-correct, called out by name**: the rejected predicate ("omit SCALING_MATRIX in FRAME_BASED") *did* match the BBB baseline. It still got rejected, because the contract said the gate is "matrix-supplied vs. not," not "decode mode." This is exactly the Phase 3-derived-implementation trap.
|
||||
- **Then** the diff implements one branch per contract clause: SPS/PPS/DECODE_PARAMS always, SCALING_MATRIX iff `matrix_set`, SLICE_PARAMS iff SLICE_BASED, PRED_WEIGHTS iff SLICE_BASED + `V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED`.
|
||||
|
||||
Mirror format anywhere reviewable: PR description, commit message body, plan section, or a header comment block. The shape is "contract clauses with citations → code that maps 1:1 to those clauses."
|
||||
|
||||
## Phase 7 — Verification Measurements
|
||||
Repeat measurements from Phase 3. Compare explicitly against baseline.
|
||||
- **If the delta does not match Phase 4's prediction → loop back to Phase 4** (re-plan). Do not declare success when the numbers say otherwise; an unexplained delta is a finding, not a footnote.
|
||||
|
||||
## Phase 8 — Closing (Package & Ship)
|
||||
Ship the deliverable to its consumption point. Working code that lives only in a checkout is half a deliverable — the next session has to re-discover it, the fleet doesn't get the fix, and the loop's value evaporates.
|
||||
|
||||
- **Kernel patch → kernel-agent package.** Route through the kernel-agent flow (`fleet/<host>.yaml` + scope-tagged patches) so the kernel package gets properly built, signed, and published. Don't leave loose `.patch` files in a working tree. See `project_kernel_agent.md` for the manifest shape; `linux-ampere-fourier` and `linux-fresnel-fourier` are the canonical examples.
|
||||
- **Program / library change → marfrit-packages.** Add or update a PKGBUILD (Arch/ALARM) or debian/ tree (deb), push to `git.reauktion.de/marfrit/marfrit-packages`, and let `.gitea/workflows/build.yml` produce + sign + publish to `packages.reauktion.de`. See `project_marfrit_packages.md`. Local-only fixes go upstream as PR-quality diffs into the same overlay.
|
||||
- **Skipping is a deliberate choice.** If the change is one-shot scratch work (debugging tripwire, throw-away script), say so explicitly in the closing note. The default is: it gets packaged.
|
||||
- **Re-verify on the deploy host with the packaged artifact.** A clean Phase 7 result from a hand-rolled dev build (e.g. `meson -Dbuildtype=release && ninja`) is **not** the same as the `.pkg.tar.zst` / `.deb` that the deploy host installs. Distro packaging flags (Arch makepkg's `-O2 + FORTIFY + stack-protector-strong + stack-clash-protection` vs meson's `-O3 -DNDEBUG`, debhelper's hardening defaults, lto toggles) vectorise / unroll loops differently and routinely unmask latent UB the dev build folded away. Pull the published package down via the package manager and re-run the Phase 7 success criterion against it before closing — until that PASSes, the loop is not done. See `feedback_package_build_flags_unmask_bugs.md` for the iter39 incident that codified this.
|
||||
|
||||
## Phase 9 — Memory Update
|
||||
Loop terminates here. Distill the lesson into a memory entry — what was the mistake the loop caught, what's the rule that would shorten the next cycle. Do not let the lesson rot in chat history.
|
||||
|
||||
---
|
||||
|
||||
## Loopback edges (summary)
|
||||
- Phase 3 → Phase 1 (metric was wrong)
|
||||
- Phase 7 → Phase 4 (plan didn't deliver predicted delta)
|
||||
- Any phase → Phase 0 (substrate was wrong: predecessor baseline didn't replicate, mechanism doesn't engage on this stack, or the data inverts the premise → re-anchor or honest close)
|
||||
- Phase 9 closes the loop
|
||||
|
||||
## Why this exists
|
||||
Several recurring failures in prior work codify into individual rules — observer-first, simulate-before-flash, three-strikes-then-verify, "trust eyes not vibes," scope-strictly-to-plan, no-fake-dry-run. Those are all symptoms; this loop is the structural fix. Use it as the spine and let those rules show up as rejection patterns inside the appropriate phases.
|
||||
+239
@@ -0,0 +1,239 @@
|
||||
---
|
||||
phase: 0
|
||||
status: closed 2026-05-18
|
||||
date_opened: 2026-05-17
|
||||
date_closed: 2026-05-18
|
||||
research_method: three rounds of parallel web research (Sonnet via Agent), plus hands-on hertz substrate inventory and live `vulkaninfo` capture
|
||||
target_hardware: hertz (Pi 5 8 GB) for dev; higgs (CM5) eventual user target
|
||||
---
|
||||
|
||||
# Phase 0 — Substrate / motivation / inventory
|
||||
|
||||
This is the consolidated Phase 0 record. Path A (custom VPU firmware)
|
||||
is **closed at the silicon-RoT step**; Path B (QPU compute via the
|
||||
existing Mesa `v3d` driver) is **open**. The remainder of the
|
||||
project lives in Path B.
|
||||
|
||||
The earlier session produced two separate Phase 0 artifacts that
|
||||
were lost when the working tree was wiped at 2026-05-18 10:25
|
||||
(`.git-broken-2026-05-18/` retains the corrupted state if needed).
|
||||
This document supersedes both.
|
||||
|
||||
---
|
||||
|
||||
## 1. Research question
|
||||
|
||||
Verbatim from `README.md`:
|
||||
|
||||
> Community-built VP9 / AV1 software-decode back-end running on the
|
||||
> VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 /
|
||||
> Compute Module 5), via the existing Mesa `v3d` userspace driver.
|
||||
|
||||
The load-bearing claim: *the QPU is programmable by us, on stock
|
||||
production hardware, and the codec back-end is a workload class
|
||||
where that programmability buys CPU time on the A76 cluster.*
|
||||
Phase 0's job is to test that claim before Phase 1 binds a metric.
|
||||
|
||||
## 2. Substrate inventory — hertz
|
||||
|
||||
Captured live 2026-05-17 via SSH. Full `vulkaninfo` in
|
||||
`vulkaninfo_v3d_7_1_7_hertz.txt`.
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Host | hertz, Pi 5, 8 GB, eMMC + 1 TB SATA |
|
||||
| Role | LXD host for 11 containers (home-LAN spine — DNS / VPN / HA proxy / NCP / SMTP) |
|
||||
| OS | Debian 13 Trixie |
|
||||
| Kernel | `6.12.75+rpt-rpi-2712` (RPi Foundation kernel, 2026-03-11) |
|
||||
| CPU | 4× Cortex-A76 @ 2.8 GHz |
|
||||
| GPU clock | V3D 7.1 @ 1000 MHz (slight OC; spec 960 MHz) |
|
||||
| Mesa | `25.0.7-2+rpt4` (`libvulkan_broadcom.so` v3dv ICD) |
|
||||
| Vulkan loader | `1.4.309` |
|
||||
| Vulkan device API | 1.3.305 (conformance 1.3.8.3) |
|
||||
| DRM nodes | `card0 → v3d` (compute target), `card1 → vc4-drm` (display), `renderD128` |
|
||||
| kernel uAPI hdr | `/usr/include/drm/v3d_drm.h` present |
|
||||
| Build tools | cmake 3.31, ninja 1.12, libvulkan-dev 1.4.309, glslang-tools 15.1.0, spirv-tools 2025.1, libdrm-dev 2.4.131 (installed 2026-05-17) |
|
||||
| User groups | mfritsche ∈ `render`, `video`, `lxd`, `sudo` |
|
||||
| Memory pressure | 7.9 GiB RAM, ~3 GiB available; 6 GiB zram, ~2.8 GiB in use (cohabitation with LXD spine) |
|
||||
| Watchdog | yes — power-cut reboot via Himbeere plug if hertz crashes (acknowledged dev cost: household DNS/VPN drops during each reboot cycle) |
|
||||
|
||||
**Inside-view V3D 7.1 compute envelope** (from
|
||||
`vulkaninfo_v3d_7_1_7_hertz.txt`):
|
||||
|
||||
| Property | Value | Implication |
|
||||
|---|---|---|
|
||||
| `maxStorageBufferRange` | 1 GiB | Bounds single-tensor size; codec working sets (frames, planes) fit trivially |
|
||||
| `maxPerStageDescriptorStorageBuffers` | 8 | Forces ≤8 SSBO bindings per dispatch — ggml-vulkan binds more, doesn't fit |
|
||||
| `maxComputeSharedMemorySize` | 16 KiB | Small tiled kernels only; codec block work (8×8, 16×16) fits easily |
|
||||
| `maxComputeWorkGroupInvocations` | 256 | Standard |
|
||||
| `maxComputeWorkGroupSize` | 256 / 256 / ? | Standard |
|
||||
| `subgroupSize` | 16 (fixed) | Matches QPU SIMD width |
|
||||
| `subgroupSupportedOperations` | BASIC + VOTE only | No arithmetic reductions — accumulate via shared memory |
|
||||
| `shaderFloat16` | **false** | Storage only; arithmetic runs FP32 |
|
||||
| `shaderInt8` | **false** | Storage only; arithmetic on widened ints |
|
||||
| `shaderInt16` | **false** | Same |
|
||||
| `storageBuffer8/16BitAccess` | true | Can load tightly-packed quantized / packed pixel data |
|
||||
| `subgroupSizeControl`, `computeFullSubgroups`, `synchronization2` | true | Modern compute features available |
|
||||
|
||||
**Throughput envelopes** (from prior community measurements,
|
||||
not yet re-confirmed in-session):
|
||||
|
||||
| Metric | Value | Source |
|
||||
|---|---|---|
|
||||
| V3D 7.1 theoretical FP32 peak | ~92 GFLOPS at 960 MHz | 12 QPU × 4 ALU × 2 op/cycle |
|
||||
| Direct-DRM SGEMM sustained | 21.4 GFLOPS (~23%) | `Idein/py-videocore7` |
|
||||
| Vulkan-compute `vkpeak` fp32-vec4 | 6.9 GFLOPS (~7.5%) | RPi forum benchmark thread |
|
||||
| A76 NEON sustained for matmul | ~50 GFLOPS | Multiple benchmark sources |
|
||||
| Shared LPDDR4x bus | ~17 GB/s nominal | LPDDR4x-4267 × 32 bit / 8 |
|
||||
| GPU-measured BW share | 4–7 GB/s | py-videocore7 scopy benchmark |
|
||||
| CPU NEON BW achievable | 12–15 GB/s | Pi 5 STREAM benchmarks |
|
||||
|
||||
## 3. Path A — closed
|
||||
|
||||
**Custom VPU firmware loaded onto VC7 scalar cores.** This was the
|
||||
README's original framing.
|
||||
|
||||
Blocked at the silicon-RoT step:
|
||||
|
||||
- **BCM2712 mask ROM hardcodes RPi's public key** and unconditionally
|
||||
verifies the second-stage bootloader (`bootsys`) on every boot
|
||||
path (SPI flash, USB rpiboot, SD recovery). RPi holds the
|
||||
corresponding private key.
|
||||
- `EXECUTE_CODE` mailbox tag (the only documented Pi 1–4 runtime
|
||||
"run code on a VPU core" mechanism) **confirmed removed on Pi 5**
|
||||
by Pi Foundation engineer (forum.raspberrypi.com).
|
||||
- Pre-CRA EEPROM downgrade is possible (no anti-rollback fuse) but
|
||||
only yields *older RPi-signed* EEPROMs — doesn't help.
|
||||
- OTP fuse state on stock CM5 is already the most permissive
|
||||
possible (customer key hash = zero); the RPi-key check is
|
||||
silicon-unconditional, not gated by OTP.
|
||||
- CM5 vs retail Pi 5: same silicon, same chain, no meaningful
|
||||
security delta.
|
||||
- One non-software escape exists: VPU JTAG via documented test
|
||||
points (`schlae/cm5-reveng`, Dec 2025). Hardware mod only,
|
||||
sealed-chassis higgs not the dev unit, novel research with no
|
||||
published firmware-injection workflow. Out of scope for this
|
||||
project.
|
||||
|
||||
Verdict: **structurally blocked for community use without RPi
|
||||
cooperation or hardware-RE-grade work on a sacrificial CM5.**
|
||||
|
||||
## 4. Path B — open
|
||||
|
||||
**QPU compute kernels via the existing Mesa `v3d` driver.** Reachable
|
||||
from userspace today on a stock signed Pi 5 / CM5 via
|
||||
`/dev/dri/card0` (Vulkan compute through `v3dv`) or `renderD128`
|
||||
(direct DRM submit, py-videocore7 style). No firmware loading.
|
||||
No signing fight. mfritsche on hertz is in the `render` group and
|
||||
can hit the device without sudo.
|
||||
|
||||
The substrate is real:
|
||||
- `Idein/py-videocore7` runs SGEMM at 21 GFLOPS sustained on stock
|
||||
Pi 5 with no special setup — existence proof of arbitrary QPU
|
||||
programs.
|
||||
- Mesa v3dv is Vulkan 1.3-conformant on V3D 7.1 (Mesa 24.3+;
|
||||
hertz runs 25.0.7).
|
||||
- The kernel `v3d` DRM driver is fully upstream and open.
|
||||
|
||||
Phase 0 does **not** assume Path B leads to a winning result. It
|
||||
asserts only that Path B is *reachable*, where Path A isn't.
|
||||
|
||||
## 5. Why this isn't the same project as "v3d backend for llama.cpp"
|
||||
|
||||
A llama.cpp v3d backend was investigated mid-session and rejected
|
||||
as structurally infeasible. The verdict was decisive: GPU loses
|
||||
to CPU on raw FP32 (21 vs ~50 GFLOPS), on memory bandwidth share
|
||||
(4–7 vs 12–15 GB/s), and on quantized instruction support (no
|
||||
INT8 MAC vs A76 SDOT/UDOT). For LLM matmul, the QPU is the wrong
|
||||
substrate.
|
||||
|
||||
**Codec back-end work is a different workload class** with
|
||||
properties that fit the QPU substantively better:
|
||||
|
||||
| Property | LLM matmul | Codec back-end (post-entropy) |
|
||||
|---|---|---|
|
||||
| Working set per dispatch | Whole weight matrices (GB) | Per-block (8×8 / 16×16, hundreds of bytes) — fits in 16 KiB shared mem |
|
||||
| Dominant op | INT8 MAC | Integer add / shift / small-constant multiply |
|
||||
| Why GPU misses | No INT8 MAC | Less impact — fewer multiplies, mostly add/shift |
|
||||
| Memory pattern | Full-tensor stream | Sequential plane reads, TMU-friendly |
|
||||
| Parallelism | One big GEMM | Thousands of independent small blocks per frame |
|
||||
| A76 advantage | NEON SDOT/UDOT crushing it | Less specialized; QPU advantage real |
|
||||
| Bandwidth-bound? | Yes (kills the GPU) | Compute-bound at block scale |
|
||||
|
||||
This is the load-bearing reframe between the failed llama.cpp
|
||||
investigation and the daedalus-fourier scope. Codec back-end
|
||||
*might* live on the QPU. Phase 1 measures whether it actually does.
|
||||
|
||||
## 6. Honest probability assessment
|
||||
|
||||
A competent outside reviewer should rate the project as **hard but
|
||||
viable**, with one concrete prior precedent (MulticoreWare /
|
||||
Imagination PowerVR OpenCL VP9 decoder, 2014, achieved 1080p30 in
|
||||
a hybrid model with CPU entropy + GPU back-end on a comparable
|
||||
embedded GPU) and one concrete recent failure (FFmpeg 8.0 VP9-on-
|
||||
Vulkan-compute, 2025, produced corrupted output on a much more
|
||||
capable NVIDIA target — but the failure was in the *attempt to
|
||||
move entropy onto GPU*, not the back-end).
|
||||
|
||||
The win condition is **not** "GPU beats CPU at the same work." The
|
||||
win condition is **"GPU work overlaps with CPU work that has to
|
||||
happen anyway"** — concurrent decode where ARM does entropy and
|
||||
the QPU finishes the block-level back-end on the previous frame,
|
||||
recovering CPU time for the rest of the system (browser, audio,
|
||||
UI, the 11 LXD containers on hertz).
|
||||
|
||||
Phase 1 measures the building block: one kernel, bit-exact, with
|
||||
numbers. Phase 2+ only if Phase 1 numbers justify it.
|
||||
|
||||
## 7. Open questions for Phase 1
|
||||
|
||||
1. **What's the actual single-kernel QPU throughput on a
|
||||
codec-shaped workload?** SGEMM at 21 GFLOPS is the only public
|
||||
number, and SGEMM is not block-IDCT-shaped. We need an in-session
|
||||
N=3 measurement on a real codec kernel.
|
||||
|
||||
2. **What's the ARM NEON baseline for the same kernel on the same
|
||||
hertz?** libavcodec ships highly-tuned NEON paths for IDCT,
|
||||
deblocking, etc. Without measuring NEON in-session, "the QPU
|
||||
wins" or "the QPU loses" is unverifiable.
|
||||
|
||||
3. **Vulkan compute vs direct DRM submit — which path?** Vulkan
|
||||
has tooling, documentation, debuggability. Direct DRM has
|
||||
~10–15% lower per-dispatch overhead and bypasses the
|
||||
v3dv-imposed 16 KiB shared-mem / 8-SSBO limits, at the cost
|
||||
of writing QPU asm against the NDA ISA. Phase 1 picks one.
|
||||
|
||||
4. **Memory bandwidth contention with concurrent ARM decode.**
|
||||
The shared ~17 GB/s bus is the hard ceiling. If the QPU and ARM NEON
|
||||
collide for bandwidth when running together, the "concurrent work" win
|
||||
disappears. Needs in-session measurement once any kernel exists.
|
||||
|
||||
5. **VC7 thermal headroom under sustained mixed CPU+GPU load.**
|
||||
Pi 5 throttles GPU at 85°C, CPU at 80°C. hertz idles at ~64°C
|
||||
with the LXD spine; mixed compute will push higher. With or
|
||||
without active cooling on hertz is an open question.
|
||||
|
||||
These are Phase 1's burden, not Phase 0's. Phase 0 closes here.
|
||||
|
||||
## 8. Sources
|
||||
|
||||
Earlier session's web research produced ~7000 words of substrate
|
||||
references across 6 parallel threads. The full source list lived
|
||||
in the deleted `phase0_findings.md` and `phase0_wall1_bypass.md`.
|
||||
The high-value pointers that should follow this project forward:
|
||||
|
||||
- [Mesa `src/broadcom/qpu/qpu_instr.h`](https://github.com/Mesa3D/mesa/blob/main/src/broadcom/qpu/qpu_instr.h) — de-facto VC7 QPU ISA reference (no Broadcom-published doc; ISA under NDA)
|
||||
- [Mesa `src/broadcom/compiler/`](https://github.com/Mesa3D/mesa/tree/main/src/broadcom/compiler) — NIR→QPU compiler, the open ground truth for what V3D 7.1 can do
|
||||
- [`Idein/py-videocore7`](https://github.com/Idein/py-videocore7) — working QPU GPGPU runtime via DRM; SGEMM benchmark; existence proof
|
||||
- [`Towdo/py-videocore7`](https://github.com/Towdo/py-videocore7) — fork with more fixes
|
||||
- [Mesa `v3dv` driver source](https://gitlab.freedesktop.org/mesa/mesa/-/tree/main/src/broadcom/vulkan) — Vulkan compute path
|
||||
- [Pi 5 HEVC kernel driver patch series](https://patchwork.kernel.org) — closest architectural template for ARM-side V4L2 stateless wrapping a Pi-5 hardware accelerator (search "rpi-hevc-dec")
|
||||
- [raspberrypi/usbboot secure-boot.md](https://github.com/raspberrypi/usbboot/blob/master/docs/secure-boot.md) — Wall 1 silicon-RoT confirmation
|
||||
- [schlae/cm5-reveng](https://github.com/schlae/cm5-reveng) — CM5 PCB RE; VPU JTAG test points (Dec 2025; out of Path B scope, kept as escape hatch reference)
|
||||
- [MulticoreWare / Imagination PowerVR VP9 OpenCL decoder press](https://www.design-reuse.com/news/34030/vp9-decoder-imagination-powervr-series6-gpus.html) — 2014 precedent for hybrid codec back-end on embedded GPU compute
|
||||
- [FFmpeg 8.0 part-3 VP9 Vulkan failure post](https://www.rendi.dev/blog/ffmpeg-8-0-part-3-failed-attempts-to-use-vulkan-for-av1-encoding-vp9-decoding) — recent cautionary tale; failure was in entropy stage, not back-end
|
||||
- [`Halide/Halide` Vulkan Pi 5 issue #8494](https://github.com/halide/Halide/issues/8494) — known runtime edge cases on Pi 5 Vulkan
|
||||
- [Pi Forum p=2330030](https://forums.raspberrypi.com/viewtopic.php?p=2330030) — RPi engineer confirms VC7 ISA NDA + EU CRA signing rationale
|
||||
|
||||
Future phases should add citations here as they're consumed, not
|
||||
re-derive Phase 0's substrate findings.
|
||||
+128
@@ -0,0 +1,128 @@
|
||||
---
|
||||
phase: 1
|
||||
status: open
|
||||
date_opened: 2026-05-18
|
||||
parent: phase0.md
|
||||
target_kernel: VP9 / AV1 8×8 inverse DCT (integer fixed-point)
|
||||
dev_host: hertz
|
||||
---
|
||||
|
||||
# Phase 1 — Goal formulation
|
||||
|
||||
Per `dev_process.md`:
|
||||
|
||||
> Define the objective in measurable terms. State what success looks
|
||||
> like *before* touching anything. The chosen metric is a **hypothesis**
|
||||
> about what to measure, not an axiom — Phase 3 may invalidate it.
|
||||
|
||||
## Kernel under test
|
||||
|
||||
**VP9 / AV1 8×8 inverse DCT (DCT_DCT variant), integer 16-bit
|
||||
fixed-point input, 8-bit output, with reconstructed-block add.**
|
||||
|
||||
Mirrors the `ff_vp9_idct_idct_8x8_add_neon` shape in libavcodec
|
||||
(see `libavcodec/aarch64/vp9itxfm_neon.S`) and the equivalent
|
||||
dav1d / rav1d / libgav1 implementations for AV1's `IDTX_DCT` /
|
||||
`DCT_DCT` 8×8 path.
|
||||
|
||||
I/O contract (per VP9 spec § 8.7 inverse transform process):
|
||||
|
||||
```
|
||||
input: int16_t coeffs[64] // dequantized transform coefficients
|
||||
input: uint8_t pred[64] // predicted block (intra/inter)
|
||||
input: ptrdiff_t stride // typically 8 for an isolated test
|
||||
output: uint8_t dst[64] // clamp(pred + idct(coeffs)) per pixel
|
||||
```
|
||||
|
||||
Bit-exact: integer arithmetic per spec, no rounding ambiguity.
|
||||
|
||||
## Measurable success criteria
|
||||
|
||||
Three numbers must come out of Phase 7, all measured in-session on
|
||||
hertz, all N≥3:
|
||||
|
||||
| ID | Measurement | What it tells us |
|
||||
|---|---|---|
|
||||
| **M1** | **Bit-exactness rate** vs libavcodec C reference, across ≥10 000 random coefficient blocks | Correctness gate. Must be 100.000 %. Anything less and the kernel is wrong, no other number matters. |
|
||||
| **M2** | **QPU throughput** in million-blocks-per-second (MblockS), single-threaded host driver, sustained over ≥1 s | The substrate's actual delivered capacity for this kernel shape. |
|
||||
| **M3** | **NEON throughput** in MblockS on the same hertz, single-threaded, running `ff_vp9_idct_idct_8x8_add_neon` via a microbench harness | The floor any GPU offload has to beat or get close to. |
|
||||
|
||||
Derived figure for go/no-go: **R = M2 / M3**.
|
||||
|
||||
## Decision rules (set before measuring, per `feedback_no_motivated_reasoning`)
|
||||
|
||||
| R | Interpretation | Next step |
|
||||
|---|---|---|
|
||||
| ≥ 1.0 | QPU beats NEON on this kernel in isolation. Strong substrate signal. | Phase 9 lessons → Phase 1 of next kernel (deblocking or CDEF). |
|
||||
| 0.5 ≤ R < 1.0 | QPU loses in isolation but is in the same order of magnitude. *Concurrent-work* hypothesis becomes viable: at R≈0.5 the QPU can roughly handle half of decode while the CPU does the other half + everything else. | Add a Phase 1' measurement: M4 = combined CPU+QPU throughput when both run concurrently (does total system delivery exceed pure-CPU?). Then decide. |
|
||||
| 0.1 ≤ R < 0.5 | QPU is materially slower. Concurrent-work win unlikely to be worth the integration cost. | Honest close. Phase 9 documents the negative result. |
|
||||
| < 0.1 | QPU is structurally wrong for this kernel shape. | Honest close. Phase 9 documents the failure, project shelves. |
|
||||
|
||||
These thresholds are deliberately published *before* measurement so
|
||||
the result can't be retroactively reframed.
|
||||
|
||||
## Secondary measurements (not gating, but recorded)
|
||||
|
||||
- **M5** — per-kernel-launch overhead in µs, isolated (run with 0
|
||||
blocks, measure submit+wait round-trip). Tells us the floor below
|
||||
which kernel batching is required.
|
||||
- **M6** — workgroup-size sweep across {8, 16, 32, 64, 128, 256}
|
||||
invocations to identify the v3dv-optimal launch shape for this
|
||||
kernel. Records the Pareto curve, doesn't change R unless the
|
||||
best-WG result invalidates M2.
|
||||
- **M7** — power draw delta at the wall (via the Himbeere Fritz!DECT
|
||||
plug telemetry, if reachable) under idle vs CPU-only vs QPU-only
|
||||
vs CPU+QPU concurrent. Order-of-magnitude only; informs the higgs
|
||||
battery argument that motivates the project.
|
||||
|
||||
## What Phase 1 does *not* lock
|
||||
|
||||
- The dispatch path (Vulkan compute via `v3dv` vs direct DRM
|
||||
submit via `v3d_drm.h` ioctl). Phase 4 picks. Default for
|
||||
Phase 1 = **Vulkan compute** unless Phase 4 has reason to flip:
|
||||
documented, debuggable, doesn't require QPU asm against the
|
||||
NDA ISA.
|
||||
- The shader source (GLSL → glslang → SPIR-V) vs hand-written
|
||||
SPIR-V. Default = GLSL.
|
||||
- Workgroup partitioning (one-block-per-WG vs many-blocks-per-WG).
|
||||
Phase 4 chooses based on subgroup width and tile cost; Phase 1
|
||||
records the sweep (M6).
|
||||
|
||||
## Non-goals for Phase 1
|
||||
|
||||
- No V4L2 driver work.
|
||||
- No end-to-end VP9 / AV1 decode (entropy + back-end). Just one
|
||||
kernel, isolated, measured.
|
||||
- No optimization beyond what's needed to hit the bit-exact gate
|
||||
and produce a single throughput number. Tuning is Phase 7's
|
||||
feedback if R is borderline.
|
||||
- No build-system perfection. A CMakeLists that compiles the test
|
||||
harness on hertz is enough.
|
||||
|
||||
## Phase 1 → Phase 2 hand-off conditions
|
||||
|
||||
Phase 1 closes when:
|
||||
- The above metrics + decision rules are reviewed. Note that this
|
||||
is an ordinary Phase 1 review, not the second-model review of
|
||||
dev_process.md Phase 5; that review comes after the Phase 4
|
||||
plan.
|
||||
- The metrics are recorded in this file or a sibling
|
||||
`phase1_metrics.md` artifact (TBD).
|
||||
|
||||
The next phase (Phase 2 — situation analysis) inventories:
|
||||
- libavcodec's NEON IDCT reference (file, function, calling
|
||||
convention, expected I/O contract).
|
||||
- VP9 spec § 8.7 transform process (which the C reference
|
||||
implements verbatim).
|
||||
- AV1 spec § 7.13 (same transform structure, larger transform set;
|
||||
whether the 8×8 DCT_DCT path is bit-identical to VP9's must be verified).
|
||||
- Mesa v3dv's compute-shader compilation path and any known
|
||||
v3dv-specific shader idioms that perform better on V3D 7.1.
|
||||
- The hertz Vulkan dispatch overhead floor (M5 candidate, but
|
||||
measured as part of Phase 3 baseline).
|
||||
|
||||
## Open questions Phase 1 hands forward
|
||||
|
||||
None new. Phase 0 § 7's open questions are the standing list;
|
||||
Phase 1 picks off Q1 (single-kernel throughput) and Q2 (NEON
|
||||
baseline) directly via M2 and M3.
|
||||
+212
@@ -0,0 +1,212 @@
|
||||
---
|
||||
phase: 2
|
||||
status: closed 2026-05-18
|
||||
date_opened: 2026-05-18
|
||||
parent: phase1.md
|
||||
target_kernel: VP9 8×8 inverse DCT (DCT_DCT variant, 8-bit pixels)
|
||||
---
|
||||
|
||||
# Phase 2 — Situation analysis
|
||||
|
||||
Per `dev_process.md`:
|
||||
|
||||
> Document current state. Identify constraints, dependencies, known
|
||||
> failure modes. Reset context here — do not carry assumptions from
|
||||
> prior sessions; re-read CLAUDE.md, relevant memory files, run
|
||||
> `git status`, re-verify reachability.
|
||||
|
||||
## 1. Context reset
|
||||
|
||||
- Working tree state: dirty (Phase 0/1/2 docs not yet committed).
|
||||
`.git-broken-2026-05-18/` preserved as a forensic artifact of
|
||||
the 2026-05-18 10:25 working-tree wipe (cause undetermined).
|
||||
- CLAUDE.md re-read: no contradictions with the Path B scope set
|
||||
in README §"Architecture (Path B)".
|
||||
- hertz reachability: confirmed via SSH; `vcgencmd`, `vulkaninfo`,
|
||||
`apt`, sudo NOPASSWD all working as of 2026-05-17 inventory.
|
||||
Mesa 25.0.7 / Vulkan 1.3.305 / V3D 7.1.7 stable.
|
||||
|
||||
## 2. Reference implementations — VP9 8×8 IDCT (DCT_DCT)
|
||||
|
||||
The Phase 1 kernel has *two* canonical reference implementations
|
||||
in FFmpeg n7.1.3 (the version installed on hertz). The harness
|
||||
will link both: the C path as the bit-exact gate (M1), the NEON
|
||||
path as the throughput baseline (M3).
|
||||
|
||||
### 2.1 C reference
|
||||
|
||||
- **Source**: `libavcodec/vp9dsp_template.c`, function `idct_idct_8x8_add_c`
|
||||
- **Spec basis**: VP9 specification §8.7 — Inverse transform process
|
||||
- **Signature**:
|
||||
|
||||
```c
|
||||
static void idct_idct_8x8_add_c(uint8_t *_dst, ptrdiff_t stride,
|
||||
int16_t *_block, int eob);
|
||||
```
|
||||
|
||||
- **Algorithm** (8-bit path):
|
||||
1. If `eob == 1` (DC-only): single `(coef * 11585 * 11585)` round, broadcast to 8×8 with `+pred, clamp[0,255]`.
|
||||
2. Otherwise: 8 column passes through `idct8_1d` → tmp[64]. Zero the input block. 8 row passes through `idct8_1d` → out[8]. Per-element `(out + 16) >> 5`, add to `dst`, `av_clip_pixel`.
|
||||
- **`idct8_1d`**: 1-D 8-point inverse DCT, 8 trigonometric multiply-add stages with Q14 fixed-point constants then 8-butterfly add/sub stages. All arithmetic is signed int32 (`dctint`).
|
||||
- **Q14 constants** (matched against VP9 spec §8.7.1.4):
|
||||
| symbol | value | trig identity |
|
||||
|---|---|---|
|
||||
| cospi_16_64 | 11585 | cos(π/4) × 2^14 ≈ 0.70711 |
|
||||
| cospi_24_64 | 6270 | cos(3π/8) × 2^14 ≈ 0.38268 |
|
||||
| cospi_8_64 | 15137 | sin(3π/8) × 2^14 ≈ 0.92388 |
|
||||
| cospi_28_64 | 3196 | cos(7π/16) × 2^14 ≈ 0.19509 |
|
||||
| cospi_4_64 | 16069 | sin(7π/16) × 2^14 ≈ 0.98079 |
|
||||
| cospi_20_64 | 9102 | cos(5π/16) × 2^14 ≈ 0.55557 |
|
||||
| cospi_12_64 | 13623 | sin(5π/16) × 2^14 ≈ 0.83147 |
|
||||
|
||||
Rounding convention: `(product + (1 << 13)) >> 14`, i.e. round-half-up at bit 14.
|
||||
|
||||
- **License**: LGPL-2.1-or-later (FFmpeg).
|
||||
- **Side effect**: zeroes the input `block[]` (idempotency requirement; matches spec).
|
||||
|
||||
### 2.2 NEON reference
|
||||
|
||||
- **Source**: `libavcodec/aarch64/vp9itxfm_neon.S`, symbol `ff_vp9_idct_idct_8x8_add_neon`
|
||||
- **Signature** (same as C):
|
||||
```c
|
||||
void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
int16_t *block, int eob);
|
||||
```
|
||||
Registers: `x0=dst, x1=stride, x2=block, w3=eob`.
|
||||
- **Internal dependencies** (must be copied alongside the .S):
|
||||
| macro / symbol | location | role |
|
||||
|---|---|---|
|
||||
| `idct8` | `vp9itxfm_neon.S` | 1-D 8-pt IDCT, fully unrolled with `dmbutterfly*` |
|
||||
| `dmbutterfly0` | `vp9itxfm_neon.S` | rotation by π/4 (the `cospi_16_64` case) |
|
||||
| `dmbutterfly` | `vp9itxfm_neon.S` | general 2-input rotation `[a,b] → [a·c1−b·c2, a·c2+b·c1]` (`Q14`) |
|
||||
| `dmbutterfly_l` | `vp9itxfm_neon.S` | wide-form (4×i32 acc) for `dmbutterfly` |
|
||||
| `butterfly_8h` | `vp9itxfm_neon.S` | trivial `[a+b, a−b]` on `int16x8_t` |
|
||||
| `transpose_8x8H` | `libavcodec/aarch64/neon.S` | in-place 8×8 i16 transpose |
|
||||
| `idct_coeffs` | `vp9itxfm_neon.S` (`const`) | Q14 trig constants table, aligned 4 |
|
||||
| `movrel` | `libavutil/aarch64/asm.S` | PIC-aware constant-pool relocation helper |
|
||||
- **License**: LGPL-2.1-or-later (Google, 2016).
|
||||
- **Performance shape**: fully unrolled 8-pt butterfly with NEON `smull/smlsl/smlal` + `rshrn` for the Q14 round-shift; output uses `sqxtun` for saturated narrow to u8. Estimated ~80 NEON instructions for the steady-state (non-DC) path.
|
||||
|
||||
### 2.3 AV1 equivalence note
|
||||
|
||||
AV1's 8×8 DCT_DCT transform (`av1_iidentity8_iidentity8_c` vs `av1_idct8_idct8_c` family in `libavcodec/av1dsp/...`) shares the same 1-D 8-point structure but with **different** scaling: AV1 uses 12-bit fixed-point (`>> 12`) and a slightly different rounding shift due to its different transform-stage bit growth model. Calling our VP9 IDCT shader on AV1 coefficients will produce wrong output. **AV1 support is out of scope for Phase 1.** A Phase-N variant can fork the shader with the AV1 constants once Phase 1 has proven the VP9 path.
|
||||
|
||||
## 3. Vulkan compute dispatch path
|
||||
|
||||
Hertz exposes V3D 7.1 via Mesa's v3dv driver as Vulkan
|
||||
`PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU`, API 1.3.305, conformance
|
||||
1.3.8.3. The compute-only dispatch path is:
|
||||
|
||||
```
|
||||
host program
|
||||
├─ vkCreateInstance / vkEnumeratePhysicalDevices (picks V3D 7.1.7.0)
|
||||
├─ vkCreateDevice (queue family with COMPUTE_BIT, no graphics needed)
|
||||
├─ vkCreateBuffer x N (SSBOs for block coeffs in / dst pixels in+out)
|
||||
│ - buffer flags: STORAGE_BUFFER_BIT | TRANSFER_SRC/DST
|
||||
│ - memory type: HOST_VISIBLE | HOST_COHERENT (zero-copy on shared LPDDR4x)
|
||||
├─ vkCreateDescriptorSetLayout (≤8 SSBOs per layout — Pi 5 limit)
|
||||
├─ vkCreateShaderModule (SPIR-V from glslang)
|
||||
├─ vkCreateComputePipeline
|
||||
├─ vkBeginCommandBuffer
|
||||
│ vkCmdBindPipeline / vkCmdBindDescriptorSets / vkCmdPushConstants
|
||||
│ vkCmdDispatch(group_count_x, 1, 1) # one WG per ~K blocks
|
||||
├─ vkQueueSubmit + vkQueueWaitIdle (or fence) — this is the measured op
|
||||
└─ (read back via the HOST_VISIBLE buffer, or alias it to the same memory the CPU populated)
|
||||
```
|
||||
|
||||
Per Phase 0 §2 inside-view limits, the relevant constraints
|
||||
for this kernel:
|
||||
|
||||
- ≤8 SSBOs per stage → group inputs/outputs into ≤8 bindings (we
|
||||
only need 2: `block[]` in, `dst[]` in/out).
|
||||
- Shared mem ≤16 KiB → each 8×8 block fits trivially (256 B in
|
||||
i32 working form, 128 B as raw i16, plus 64 B in u8). One WG can carry dozens of blocks of
|
||||
shared state if useful.
|
||||
- Subgroup size = 16 (fixed). One workgroup of 64 invocations =
|
||||
4 subgroups; one block per subgroup is a natural shape (each
|
||||
16-lane subgroup processes 8×8 = 64 pixels in 4 cycles of
|
||||
subgroup work).
|
||||
|
||||
## 4. Build path on hertz
|
||||
|
||||
Already installed (2026-05-17): cmake 3.31, ninja 1.12, gcc (Debian
|
||||
trixie default), `libvulkan-dev 1.4.309`, `glslang-tools 15.1.0`,
|
||||
`spirv-tools 2025.1`, `libdrm-dev 2.4.131`, `vulkan-tools 1.4.304`.
|
||||
|
||||
Missing but cheap:
|
||||
- `libavcodec-dev` — only needed if the harness wants to link
|
||||
against system libavcodec for cross-checks against the dynamic
|
||||
dispatcher. *Not* needed for the source-copy approach (preferred,
|
||||
see §5).
|
||||
|
||||
## 5. Reference-copy strategy (vs system-libavcodec link)
|
||||
|
||||
**Decision: source-copy the 4 FFmpeg files into `external/ffmpeg-snapshot/`.**
|
||||
|
||||
Rationale:
|
||||
- System `libavcodec.so` on hertz is symbol-stripped (`nm` returns
|
||||
empty for `ff_vp9_idct_*`). Internal NEON entry points are not
|
||||
reachable via `dlsym`.
|
||||
- The two reference implementations (C, NEON) plus their macro/
|
||||
data dependencies total ~4 files / ~600 lines. Source-copy is
|
||||
smaller than the dlopen plumbing would be.
|
||||
- LGPL-2.1-or-later (FFmpeg license) is propagation-compatible
|
||||
with the harness binary if the harness binary itself is GPL
|
||||
or LGPL. The kernel shaders and dispatch library stay
|
||||
separately-licensed (BSD-2-Clause, default for this project).
|
||||
- Pinning to `n7.1.3` matches hertz's runtime libavcodec version,
|
||||
so any in-session sanity cross-check against the running Mesa
|
||||
/ video tooling stays consistent.
|
||||
|
||||
Files to vendor:
|
||||
|
||||
| Source | License | Target path under `daedalus-fourier/` |
|
||||
|---|---|---|
|
||||
| `libavcodec/vp9dsp_template.c` | LGPL-2.1+ | `external/ffmpeg-snapshot/vp9dsp_template.c` |
|
||||
| `libavcodec/aarch64/vp9itxfm_neon.S` | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/vp9itxfm_neon.S` |
|
||||
| `libavcodec/aarch64/neon.S` (for `transpose_8x8H`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/neon.S` |
|
||||
| `libavutil/aarch64/asm.S` (for `movrel`, `function`, `endfunc`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/asm.S` |
|
||||
| (whatever else `vp9dsp_template.c` transitively needs) | LGPL-2.1+ | as required |
|
||||
|
||||
A `external/ffmpeg-snapshot/COPYING.LGPL` and `external/ffmpeg-snapshot/PROVENANCE.md` document the upstream commit (n7.1.3 tag, commit hash) and the verbatim-copy guarantee.
|
||||
|
||||
## 6. Known constraints / failure modes carried from Phase 0
|
||||
|
||||
Repeated here so Phase 4 (plan) can bind against them without
|
||||
re-derivation:
|
||||
|
||||
- **C1**: shaderFloat16 = false → all shader arithmetic must be int32 (we are int anyway — no risk).
|
||||
- **C2**: maxComputeSharedMemorySize = 16 KiB → kernel must not require more (8×8 IDCT trivially fits even with many blocks per WG).
|
||||
- **C3**: maxPerStageDescriptorStorageBuffers = 8 → we need only 2 (coeffs + dst), no risk.
|
||||
- **C4**: subgroupSupportedOperations = BASIC + VOTE only → no `subgroupAdd`/etc. for accumulator reductions. Workaround: the IDCT structure is fully data-parallel without reductions; this constraint doesn't bite.
|
||||
- **C5**: VC7 has SMUL24 but no INT8 MAC. Our Q14 multiplies are i16×i16→i32 — the multiplicands fit in 17 bits, so SMUL24 covers it. No INT8/INT4 issues.
|
||||
- **C6**: shared LPDDR4x bus; GPU sees ~4–7 GB/s vs CPU ~12–15 GB/s. For 8×8 IDCT, working set is tiny (≤320 B/block), so per-block bandwidth is not the bottleneck; per-dispatch submit overhead is.
|
||||
- **C7**: VPM read-stall serialization. If we hand-write QPU asm (we won't, in Phase 1) this would matter; the Vulkan compute path lets the v3d_compiler schedule for us.
|
||||
- **C8**: VC7 thermal throttle at 85°C GPU / 80°C CPU. Phase 7 measurements should record temp before/during/after to flag throttling.
|
||||
|
||||
## 7. What Phase 2 does *not* close
|
||||
|
||||
- The harness architecture (single binary? Two binaries — one for
|
||||
bit-exact, one for throughput?). Phase 4 picks.
|
||||
- Block-per-WG dispatch geometry. Phase 4 + Phase 6 sweep.
|
||||
- Random-coefficient generation strategy (uniform i16 vs
|
||||
realistic-distribution; the latter affects DC-only path
|
||||
frequency). Phase 4 picks; Phase 7 may re-evaluate.
|
||||
- Whether NEON measurement uses `clock_gettime(CLOCK_MONOTONIC_RAW)`
|
||||
per-call (high overhead) or batched (more realistic for codec
|
||||
use). Phase 3 picks during baseline collection.
|
||||
|
||||
## 8. Hand-off to Phase 3
|
||||
|
||||
Phase 3 measures:
|
||||
- **M3-prelim**: NEON `ff_vp9_idct_idct_8x8_add_neon` throughput
|
||||
on hertz, batched over 10⁶ random blocks, single-threaded,
|
||||
4-thread, sched-isolated. This is the *floor*.
|
||||
- **M5-prelim**: Vulkan dispatch overhead — pipeline create cost
|
||||
(one-time), per-`vkCmdDispatch` cost (per-frame-equivalent),
|
||||
per-`vkQueueSubmit + vkQueueWaitIdle` cost (per-completion).
|
||||
Bound below which kernel batching is mandatory.
|
||||
|
||||
Both are measurements on the *existing* substrate. Neither
|
||||
requires writing any shader code. Phase 3 closes before Phase 4
|
||||
(plan) begins.
|
||||
+105
@@ -0,0 +1,105 @@
|
||||
---
|
||||
phase: 3
|
||||
status: closed 2026-05-18
|
||||
date_opened: 2026-05-18
|
||||
date_closed: 2026-05-18
|
||||
parent: phase2.md
|
||||
host: hertz (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75+rpt-rpi-2712, Mesa 25.0.7-2+rpt4, V3D 7.1.7 @ 1 GHz, A76 @ 2.8 GHz)
|
||||
artifacts: build/bench_neon_idct, build/bench_vulkan_dispatch, build/noop.spv
|
||||
---
|
||||
|
||||
# Phase 3 — Baseline measurements
|
||||
|
||||
Per `dev_process.md`:
|
||||
|
||||
> Take concrete measurements *before* any changes. Raw before
|
||||
> derived. Real data, not theatre.
|
||||
|
||||
These numbers anchor every Phase 4+ decision. Re-run with the
|
||||
same harness on the same hertz before drawing any new conclusions
|
||||
in later phases.
|
||||
|
||||
## M1 — bit-exact correctness gate (Phase 1)
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Method | 10 000 random VP9-plausible coefficient blocks + random `pred[64]`, compare `daedalus_vp9_idct_idct_8x8_add_ref` C output vs vendored FFmpeg `ff_vp9_idct_idct_8x8_add_neon` |
|
||||
| Run | `./bench_neon_idct --blocks 1000000 --iters 5` (built 2026-05-18) |
|
||||
| **Result** | **10 000 / 10 000 = 100.0000 %** |
|
||||
| DC-only path frequency | 11 / 10 000 = 0.11 % |
|
||||
| Notes | Random generator: xorshift64, biased toward 1–16 non-zero coeffs per block; eob mostly ∈ [4, 63]. DC-only frequency is incidental; Phase 7 may revisit if it materially affects the throughput number. |
|
||||
|
||||
**Gate passes. Throughput measurement was authorized to run.**
|
||||
|
||||
## M3 — NEON throughput (single-core)
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Kernel | `ff_vp9_idct_idct_8x8_add_neon` from FFmpeg n7.1.3 (vendored, see `external/ffmpeg-snapshot/PROVENANCE.md`) |
|
||||
| Method | Pre-generate 1 M random blocks + preds. Per iteration: memcpy refresh of all blocks/preds (NEON path zeroes blocks), then call NEON kernel 1 M times. Subtract setup memcpy time from the measured wall-clock. 5 iterations, single thread, no CPU pinning. |
|
||||
| Compiler flags | `-O3 -march=armv8-a+simd` |
|
||||
| Run | `./bench_neon_idct --blocks 1000000 --iters 5` |
|
||||
| **Throughput** | **8.171 Mblock/s** |
|
||||
| Per-block | 122.4 ns |
|
||||
| Equivalent 1080p frame rate | 252.2 FPS (32 400 blocks per 1080p frame, assuming pure 8×8 work) |
|
||||
| Elapsed (kernel) | 0.612 s / 5 M blocks |
|
||||
| Elapsed (setup-only) | 0.250 s / 5 M iters |
|
||||
| Cross-check | Cycle estimate at 2.8 GHz: 122.4 ns × 2.8 GHz ≈ 342 cycles/block. Plausible for a fully-unrolled NEON 8-point IDCT with butterflies + saturated narrow stores; the FFmpeg implementation interleaves loads/computes/stores aggressively. |
|
||||
|
||||
### M3 implications
|
||||
|
||||
- A single A76 core handles ~8 M blocks/s = **252 FPS at 1080p**. Real decode needs ~60 FPS = 4.2× headroom on one core, ~16× headroom on all four cores. **NEON is not the bottleneck for current YouTube workloads on Pi 5.**
|
||||
- The QPU offload story is not "make decode faster" — decode is already fast enough single-threaded. The story has to be "free CPU cycles for the rest of the system" (browser, audio, the 11 LXD containers on hertz).
|
||||
- For a per-kernel R = QPU / NEON measurement (per `phase1.md §"Decision rules"`), the QPU has to hit ≥4 M blocks/s to score R ≥ 0.5. That's the gate.
|
||||
|
||||
## M5 — Vulkan compute dispatch overhead
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Method | Allocate empty pipeline (no descriptors, no push constants), bind+dispatch a `void main(){}` shader on `local_size_x=64`. Time `vkQueueSubmit` + `vkQueueWaitIdle` round-trip. 50 000 iterations, warm. |
|
||||
| Device | V3D 7.1.7.0 via Mesa v3dv 25.0.7 (selected past llvmpipe by `strstr("V3D")`) |
|
||||
| Run | `./bench_vulkan_dispatch --iters 50000` |
|
||||
| **M5a — empty CB submit+wait** | **22.66 µs / op** |
|
||||
| **M5b — 1-WG noop dispatch submit+wait** | **55.60 µs / op** |
|
||||
| **M5 delta — per-vkCmdDispatch + pipeline-bind** | **32.95 µs** |
|
||||
|
||||
### M5 implications — the load-bearing finding for Phase 4
|
||||
|
||||
This is the single most important number from Phase 3.
|
||||
|
||||
- Per-dispatch cost (55.6 µs) is **~455× the NEON per-block cost** (122 ns).
|
||||
- A per-block QPU dispatch is structurally impossible — overhead dominates by two-and-a-half orders of magnitude.
|
||||
- Break-even batch size for a *hypothetical* zero-cost QPU kernel: **≥ 455 blocks per dispatch** (55.60 µs ÷ 122.4 ns). Real kernel cost on top of that.
|
||||
- Frame-level batching is mandatory: a 1080p frame has 32 400 8×8 blocks; one dispatch per frame amortizes M5b to 1.7 ns/block — well below NEON's 122 ns.
|
||||
- Superblock-level batching is borderline at best: a typical VP9 64×64 superblock has 64 8×8 sub-blocks; 55.6 µs / 64 ≈ 870 ns/block, ~7× NEON. Probably too fine-grained — frame-level or full-plane is the right granularity.
|
||||
|
||||
### M5 measurement caveats
|
||||
|
||||
- `vkQueueWaitIdle` after each submit forces a full GPU sync, modelling the "submit and need the result now" case. Real decode pipelines can submit multiple frames ahead and wait less often — the per-dispatch cost in a pipelined deployment will be lower (probably bounded below by M5a ≈ 22.66 µs as the pure submit cost).
|
||||
- Empty CB (M5a) at 22.66 µs is the *floor*. This is Mesa command-list construction + kernel `DRM_IOCTL_V3D_SUBMIT_CL` + scheduler RTT. Cannot be optimised at the userspace level without changing Mesa or kernel.
|
||||
- Both numbers include `vkQueueWaitIdle` overhead; pure submit-without-wait would be lower. For Phase 1's threshold analysis the with-wait number is the right one to use because end-to-end frame decode must wait for its output to be readable.
|
||||
|
||||
## Phase 3 closure
|
||||
|
||||
Two anchor measurements captured, both with verbatim raw output
|
||||
(see `bench_neon_idct` and `bench_vulkan_dispatch` source for the
|
||||
print format). No estimates, no inferences, no recall from prior
|
||||
sessions or sibling-host memory.
|
||||
|
||||
Phase 4 (plan) opens against these numbers. Its first decision:
|
||||
**given the 55.60 µs per-dispatch round-trip (M5b), what is the
|
||||
batch granularity for the first kernel?** The answer is either
|
||||
frame-level (32 400 blocks/dispatch) or row-level (~240
|
||||
blocks/dispatch for one 1920-wide row of 8×8 → still ~230 ns/block
|
||||
overhead at M5b, ~1.9× NEON). Frame-level is the only granularity that
|
||||
amortises overhead enough to leave kernel compute room to win.
|
||||
|
||||
Open thread for a later phase (not blocking Phase 4):
|
||||
- Multi-core NEON sweep (M3'): single-core NEON is the right
|
||||
*competitor floor*, but the actual ARM headroom on hertz is
|
||||
4× this number under load.
|
||||
- Memory-bandwidth contention measurement (M8; M6 is already the workgroup-size sweep in `phase1.md`): does NEON's
|
||||
rate change when concurrent QPU is reading the same LPDDR4x
|
||||
bus? Needs the QPU kernel to exist first.
|
||||
- Power-draw delta via Himbeere plug (M7): same — needs a real
|
||||
GPU workload to differentiate from idle.
|
||||
File diff suppressed because it is too large
Load Diff
+502
@@ -0,0 +1,502 @@
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
Version 2.1, February 1999
|
||||
|
||||
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
[This is the first released version of the Lesser GPL. It also counts
|
||||
as the successor of the GNU Library Public License, version 2, hence
|
||||
the version number 2.1.]
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
Licenses are intended to guarantee your freedom to share and change
|
||||
free software--to make sure the software is free for all its users.
|
||||
|
||||
This license, the Lesser General Public License, applies to some
|
||||
specially designated software packages--typically libraries--of the
|
||||
Free Software Foundation and other authors who decide to use it. You
|
||||
can use it too, but we suggest you first think carefully about whether
|
||||
this license or the ordinary General Public License is the better
|
||||
strategy to use in any particular case, based on the explanations below.
|
||||
|
||||
When we speak of free software, we are referring to freedom of use,
|
||||
not price. Our General Public Licenses are designed to make sure that
|
||||
you have the freedom to distribute copies of free software (and charge
|
||||
for this service if you wish); that you receive source code or can get
|
||||
it if you want it; that you can change the software and use pieces of
|
||||
it in new free programs; and that you are informed that you can do
|
||||
these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
distributors to deny you these rights or to ask you to surrender these
|
||||
rights. These restrictions translate to certain responsibilities for
|
||||
you if you distribute copies of the library or if you modify it.
|
||||
|
||||
For example, if you distribute copies of the library, whether gratis
|
||||
or for a fee, you must give the recipients all the rights that we gave
|
||||
you. You must make sure that they, too, receive or can get the source
|
||||
code. If you link other code with the library, you must provide
|
||||
complete object files to the recipients, so that they can relink them
|
||||
with the library after making changes to the library and recompiling
|
||||
it. And you must show them these terms so they know their rights.
|
||||
|
||||
We protect your rights with a two-step method: (1) we copyright the
|
||||
library, and (2) we offer you this license, which gives you legal
|
||||
permission to copy, distribute and/or modify the library.
|
||||
|
||||
To protect each distributor, we want to make it very clear that
|
||||
there is no warranty for the free library. Also, if the library is
|
||||
modified by someone else and passed on, the recipients should know
|
||||
that what they have is not the original version, so that the original
|
||||
author's reputation will not be affected by problems that might be
|
||||
introduced by others.
|
||||
|
||||
Finally, software patents pose a constant threat to the existence of
|
||||
any free program. We wish to make sure that a company cannot
|
||||
effectively restrict the users of a free program by obtaining a
|
||||
restrictive license from a patent holder. Therefore, we insist that
|
||||
any patent license obtained for a version of the library must be
|
||||
consistent with the full freedom of use specified in this license.
|
||||
|
||||
Most GNU software, including some libraries, is covered by the
|
||||
ordinary GNU General Public License. This license, the GNU Lesser
|
||||
General Public License, applies to certain designated libraries, and
|
||||
is quite different from the ordinary General Public License. We use
|
||||
this license for certain libraries in order to permit linking those
|
||||
libraries into non-free programs.
|
||||
|
||||
When a program is linked with a library, whether statically or using
|
||||
a shared library, the combination of the two is legally speaking a
|
||||
combined work, a derivative of the original library. The ordinary
|
||||
General Public License therefore permits such linking only if the
|
||||
entire combination fits its criteria of freedom. The Lesser General
|
||||
Public License permits more lax criteria for linking other code with
|
||||
the library.
|
||||
|
||||
We call this license the "Lesser" General Public License because it
|
||||
does Less to protect the user's freedom than the ordinary General
|
||||
Public License. It also provides other free software developers Less
|
||||
of an advantage over competing non-free programs. These disadvantages
|
||||
are the reason we use the ordinary General Public License for many
|
||||
libraries. However, the Lesser license provides advantages in certain
|
||||
special circumstances.
|
||||
|
||||
For example, on rare occasions, there may be a special need to
|
||||
encourage the widest possible use of a certain library, so that it becomes
|
||||
a de-facto standard. To achieve this, non-free programs must be
|
||||
allowed to use the library. A more frequent case is that a free
|
||||
library does the same job as widely used non-free libraries. In this
|
||||
case, there is little to gain by limiting the free library to free
|
||||
software only, so we use the Lesser General Public License.
|
||||
|
||||
In other cases, permission to use a particular library in non-free
|
||||
programs enables a greater number of people to use a large body of
|
||||
free software. For example, permission to use the GNU C Library in
|
||||
non-free programs enables many more people to use the whole GNU
|
||||
operating system, as well as its variant, the GNU/Linux operating
|
||||
system.
|
||||
|
||||
Although the Lesser General Public License is Less protective of the
|
||||
users' freedom, it does ensure that the user of a program that is
|
||||
linked with the Library has the freedom and the wherewithal to run
|
||||
that program using a modified version of the Library.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow. Pay close attention to the difference between a
|
||||
"work based on the library" and a "work that uses the library". The
|
||||
former contains code derived from the library, whereas the latter must
|
||||
be combined with the library in order to run.
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License Agreement applies to any software library or other
|
||||
program which contains a notice placed by the copyright holder or
|
||||
other authorized party saying it may be distributed under the terms of
|
||||
this Lesser General Public License (also called "this License").
|
||||
Each licensee is addressed as "you".
|
||||
|
||||
A "library" means a collection of software functions and/or data
|
||||
prepared so as to be conveniently linked with application programs
|
||||
(which use some of those functions and data) to form executables.
|
||||
|
||||
The "Library", below, refers to any such software library or work
|
||||
which has been distributed under these terms. A "work based on the
|
||||
Library" means either the Library or any derivative work under
|
||||
copyright law: that is to say, a work containing the Library or a
|
||||
portion of it, either verbatim or with modifications and/or translated
|
||||
straightforwardly into another language. (Hereinafter, translation is
|
||||
included without limitation in the term "modification".)
|
||||
|
||||
"Source code" for a work means the preferred form of the work for
|
||||
making modifications to it. For a library, complete source code means
|
||||
all the source code for all modules it contains, plus any associated
|
||||
interface definition files, plus the scripts used to control compilation
|
||||
and installation of the library.
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running a program using the Library is not restricted, and output from
|
||||
such a program is covered only if its contents constitute a work based
|
||||
on the Library (independent of the use of the Library in a tool for
|
||||
writing it). Whether that is true depends on what the Library does
|
||||
and what the program that uses the Library does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Library's
|
||||
complete source code as you receive it, in any medium, provided that
|
||||
you conspicuously and appropriately publish on each copy an
|
||||
appropriate copyright notice and disclaimer of warranty; keep intact
|
||||
all the notices that refer to this License and to the absence of any
|
||||
warranty; and distribute a copy of this License along with the
|
||||
Library.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy,
|
||||
and you may at your option offer warranty protection in exchange for a
|
||||
fee.
|
||||
|
||||
2. You may modify your copy or copies of the Library or any portion
|
||||
of it, thus forming a work based on the Library, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) The modified work must itself be a software library.
|
||||
|
||||
b) You must cause the files modified to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
c) You must cause the whole of the work to be licensed at no
|
||||
charge to all third parties under the terms of this License.
|
||||
|
||||
d) If a facility in the modified Library refers to a function or a
|
||||
table of data to be supplied by an application program that uses
|
||||
the facility, other than as an argument passed when the facility
|
||||
is invoked, then you must make a good faith effort to ensure that,
|
||||
in the event an application does not supply such function or
|
||||
table, the facility still operates, and performs whatever part of
|
||||
its purpose remains meaningful.
|
||||
|
||||
(For example, a function in a library to compute square roots has
|
||||
a purpose that is entirely well-defined independent of the
|
||||
application. Therefore, Subsection 2d requires that any
|
||||
application-supplied function or table used by this function must
|
||||
be optional: if the application does not supply it, the square
|
||||
root function must still compute square roots.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Library,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Library, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote
|
||||
it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Library.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Library
|
||||
with the Library (or with a work based on the Library) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may opt to apply the terms of the ordinary GNU General Public
|
||||
License instead of this License to a given copy of the Library. To do
|
||||
this, you must alter all the notices that refer to this License, so
|
||||
that they refer to the ordinary GNU General Public License, version 2,
|
||||
instead of to this License. (If a newer version than version 2 of the
|
||||
ordinary GNU General Public License has appeared, then you can specify
|
||||
that version instead if you wish.) Do not make any other change in
|
||||
these notices.
|
||||
|
||||
Once this change is made in a given copy, it is irreversible for
|
||||
that copy, so the ordinary GNU General Public License applies to all
|
||||
subsequent copies and derivative works made from that copy.
|
||||
|
||||
This option is useful when you wish to copy part of the code of
|
||||
the Library into a program that is not a library.
|
||||
|
||||
4. You may copy and distribute the Library (or a portion or
|
||||
derivative of it, under Section 2) in object code or executable form
|
||||
under the terms of Sections 1 and 2 above provided that you accompany
|
||||
it with the complete corresponding machine-readable source code, which
|
||||
must be distributed under the terms of Sections 1 and 2 above on a
|
||||
medium customarily used for software interchange.
|
||||
|
||||
If distribution of object code is made by offering access to copy
|
||||
from a designated place, then offering equivalent access to copy the
|
||||
source code from the same place satisfies the requirement to
|
||||
distribute the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
5. A program that contains no derivative of any portion of the
|
||||
Library, but is designed to work with the Library by being compiled or
|
||||
linked with it, is called a "work that uses the Library". Such a
|
||||
work, in isolation, is not a derivative work of the Library, and
|
||||
therefore falls outside the scope of this License.
|
||||
|
||||
However, linking a "work that uses the Library" with the Library
|
||||
creates an executable that is a derivative of the Library (because it
|
||||
contains portions of the Library), rather than a "work that uses the
|
||||
library". The executable is therefore covered by this License.
|
||||
Section 6 states terms for distribution of such executables.
|
||||
|
||||
When a "work that uses the Library" uses material from a header file
|
||||
that is part of the Library, the object code for the work may be a
|
||||
derivative work of the Library even though the source code is not.
|
||||
Whether this is true is especially significant if the work can be
|
||||
linked without the Library, or if the work is itself a library. The
|
||||
threshold for this to be true is not precisely defined by law.
|
||||
|
||||
If such an object file uses only numerical parameters, data
|
||||
structure layouts and accessors, and small macros and small inline
|
||||
functions (ten lines or less in length), then the use of the object
|
||||
file is unrestricted, regardless of whether it is legally a derivative
|
||||
work. (Executables containing this object code plus portions of the
|
||||
Library will still fall under Section 6.)
|
||||
|
||||
Otherwise, if the work is a derivative of the Library, you may
|
||||
distribute the object code for the work under the terms of Section 6.
|
||||
Any executables containing that work also fall under Section 6,
|
||||
whether or not they are linked directly with the Library itself.
|
||||
|
||||
6. As an exception to the Sections above, you may also combine or
|
||||
link a "work that uses the Library" with the Library to produce a
|
||||
work containing portions of the Library, and distribute that work
|
||||
under terms of your choice, provided that the terms permit
|
||||
modification of the work for the customer's own use and reverse
|
||||
engineering for debugging such modifications.
|
||||
|
||||
You must give prominent notice with each copy of the work that the
|
||||
Library is used in it and that the Library and its use are covered by
|
||||
this License. You must supply a copy of this License. If the work
|
||||
during execution displays copyright notices, you must include the
|
||||
copyright notice for the Library among them, as well as a reference
|
||||
directing the user to the copy of this License. Also, you must do one
|
||||
of these things:
|
||||
|
||||
a) Accompany the work with the complete corresponding
|
||||
machine-readable source code for the Library including whatever
|
||||
changes were used in the work (which must be distributed under
|
||||
Sections 1 and 2 above); and, if the work is an executable linked
|
||||
with the Library, with the complete machine-readable "work that
|
||||
uses the Library", as object code and/or source code, so that the
|
||||
user can modify the Library and then relink to produce a modified
|
||||
executable containing the modified Library. (It is understood
|
||||
that the user who changes the contents of definitions files in the
|
||||
Library will not necessarily be able to recompile the application
|
||||
to use the modified definitions.)
|
||||
|
||||
b) Use a suitable shared library mechanism for linking with the
|
||||
Library. A suitable mechanism is one that (1) uses at run time a
|
||||
copy of the library already present on the user's computer system,
|
||||
rather than copying library functions into the executable, and (2)
|
||||
will operate properly with a modified version of the library, if
|
||||
the user installs one, as long as the modified version is
|
||||
interface-compatible with the version that the work was made with.
|
||||
|
||||
c) Accompany the work with a written offer, valid for at
|
||||
least three years, to give the same user the materials
|
||||
specified in Subsection 6a, above, for a charge no more
|
||||
than the cost of performing this distribution.
|
||||
|
||||
d) If distribution of the work is made by offering access to copy
|
||||
from a designated place, offer equivalent access to copy the above
|
||||
specified materials from the same place.
|
||||
|
||||
e) Verify that the user has already received a copy of these
|
||||
materials or that you have already sent this user a copy.
|
||||
|
||||
For an executable, the required form of the "work that uses the
|
||||
Library" must include any data and utility programs needed for
|
||||
reproducing the executable from it. However, as a special exception,
|
||||
the materials to be distributed need not include anything that is
|
||||
normally distributed (in either source or binary form) with the major
|
||||
components (compiler, kernel, and so on) of the operating system on
|
||||
which the executable runs, unless that component itself accompanies
|
||||
the executable.
|
||||
|
||||
It may happen that this requirement contradicts the license
|
||||
restrictions of other proprietary libraries that do not normally
|
||||
accompany the operating system. Such a contradiction means you cannot
|
||||
use both them and the Library together in an executable that you
|
||||
distribute.
|
||||
|
||||
7. You may place library facilities that are a work based on the
|
||||
Library side-by-side in a single library together with other library
|
||||
facilities not covered by this License, and distribute such a combined
|
||||
library, provided that the separate distribution of the work based on
|
||||
the Library and of the other library facilities is otherwise
|
||||
permitted, and provided that you do these two things:
|
||||
|
||||
a) Accompany the combined library with a copy of the same work
|
||||
based on the Library, uncombined with any other library
|
||||
facilities. This must be distributed under the terms of the
|
||||
Sections above.
|
||||
|
||||
b) Give prominent notice with the combined library of the fact
|
||||
that part of it is a work based on the Library, and explaining
|
||||
where to find the accompanying uncombined form of the same work.
|
||||
|
||||
8. You may not copy, modify, sublicense, link with, or distribute
|
||||
the Library except as expressly provided under this License. Any
|
||||
attempt otherwise to copy, modify, sublicense, link with, or
|
||||
distribute the Library is void, and will automatically terminate your
|
||||
rights under this License. However, parties who have received copies,
|
||||
or rights, from you under this License will not have their licenses
|
||||
terminated so long as such parties remain in full compliance.
|
||||
|
||||
9. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Library or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Library (or any work based on the
|
||||
Library), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Library or works based on it.
|
||||
|
||||
10. Each time you redistribute the Library (or any work based on the
|
||||
Library), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute, link with or modify the Library
|
||||
subject to these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties with
|
||||
this License.
|
||||
|
||||
11. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Library at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Library by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Library.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under any
|
||||
particular circumstance, the balance of the section is intended to apply,
|
||||
and the section as a whole is intended to apply in other circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
12. If the distribution and/or use of the Library is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Library under this License may add
|
||||
an explicit geographical distribution limitation excluding those countries,
|
||||
so that distribution is permitted only in or among countries not thus
|
||||
excluded. In such case, this License incorporates the limitation as if
|
||||
written in the body of this License.
|
||||
|
||||
13. The Free Software Foundation may publish revised and/or new
|
||||
versions of the Lesser General Public License from time to time.
|
||||
Such new versions will be similar in spirit to the present version,
|
||||
but may differ in detail to address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Library
|
||||
specifies a version number of this License which applies to it and
|
||||
"any later version", you have the option of following the terms and
|
||||
conditions either of that version or of any later version published by
|
||||
the Free Software Foundation. If the Library does not specify a
|
||||
license version number, you may choose any version ever published by
|
||||
the Free Software Foundation.
|
||||
|
||||
14. If you wish to incorporate parts of the Library into other free
|
||||
programs whose distribution conditions are incompatible with these,
|
||||
write to the author to ask for permission. For software which is
|
||||
copyrighted by the Free Software Foundation, write to the Free
|
||||
Software Foundation; we sometimes make exceptions for this. Our
|
||||
decision will be guided by the two goals of preserving the free status
|
||||
of all derivatives of our free software and of promoting the sharing
|
||||
and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
|
||||
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
|
||||
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
|
||||
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
|
||||
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
|
||||
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
|
||||
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
|
||||
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
|
||||
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
|
||||
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Libraries
|
||||
|
||||
If you develop a new library, and you want it to be of the greatest
|
||||
possible use to the public, we recommend making it free software that
|
||||
everyone can redistribute and change. You can do so by permitting
|
||||
redistribution under these terms (or, alternatively, under the terms of the
|
||||
ordinary General Public License).
|
||||
|
||||
To apply these terms, attach the following notices to the library. It is
|
||||
safest to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least the
|
||||
"copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the library's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the library, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the
|
||||
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1990
|
||||
Ty Coon, President of Vice
|
||||
|
||||
That's all there is to it!
|
||||
+92
@@ -0,0 +1,92 @@
|
||||
# FFmpeg source snapshot
|
||||
|
||||
Verbatim subset of FFmpeg source pinned for use as reference
|
||||
implementations of the VP9 8×8 inverse DCT (Phase 1 target of
|
||||
`daedalus-fourier`). See `../../docs/phase2.md §2` and `§5` for
|
||||
the rationale.
|
||||
|
||||
## Upstream pin
|
||||
|
||||
- **Repository**: https://github.com/FFmpeg/FFmpeg
|
||||
- **Tag**: `n7.1.3` (matches `libavcodec61 8:7.1.3-0+deb13u1+rpt1`
|
||||
shipping in Debian Trixie on the dev host `hertz`)
|
||||
- **Annotated tag object**: `0a9a757e96fdf053697084bbd1f620edeac9d084`
|
||||
- **Commit object (tag target)**: `f46e514491172d15bd74b4abb1814cd2f05a763e`
|
||||
- **Snapshot fetched**: 2026-05-18 (UTC), via
|
||||
`https://raw.githubusercontent.com/FFmpeg/FFmpeg/n7.1.3/<path>`
|
||||
|
||||
## Files in this snapshot
|
||||
|
||||
All files are byte-for-byte copies of the upstream source at the
|
||||
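tagged commit, with no modifications. The project-authored `config.h` shim that sits alongside them is not part of the upstream snapshot and is not listed here.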
|
||||
|
||||
| Path | Lines | Bytes | SHA-256 |
|
||||
|---|---|---|---|
|
||||
| `libavcodec/vp9dsp_template.c` | 2578 | 89045 | `41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f` |
|
||||
| `libavcodec/aarch64/vp9itxfm_neon.S` | 1580 | 63534 | `82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6` |
|
||||
| `libavcodec/aarch64/neon.S` | 173 | 7496 | `72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538` |
|
||||
| `libavutil/aarch64/asm.S` | 260 | 8069 | `c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3` |
|
||||
| `COPYING.LGPLv2.1` | 502 | — | `b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe` |
|
||||
|
||||
Verify with:
|
||||
|
||||
```sh
|
||||
( cd external/ffmpeg-snapshot && sha256sum -c <<'EOF'
|
||||
41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f libavcodec/vp9dsp_template.c
|
||||
82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6 libavcodec/aarch64/vp9itxfm_neon.S
|
||||
72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538 libavcodec/aarch64/neon.S
|
||||
c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3 libavutil/aarch64/asm.S
|
||||
b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe COPYING.LGPLv2.1
|
||||
EOF
|
||||
)
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
LGPL-2.1-or-later. See `COPYING.LGPLv2.1`. Original copyright
|
||||
holders include the FFmpeg authors and Google Inc. (2016) for
|
||||
the aarch64 VP9 NEON paths, plus J. Dekker (2023) for `neon.S`. The snapshot inherits FFmpeg's license
|
||||
in full.
|
||||
|
||||
## Why each file is in this snapshot
|
||||
|
||||
- `libavcodec/vp9dsp_template.c` — contains `idct_idct_8x8_add_c`,
|
||||
the bit-exact C reference for the Phase 1 kernel under test (M1).
|
||||
- `libavcodec/aarch64/vp9itxfm_neon.S` — contains
|
||||
`ff_vp9_idct_idct_8x8_add_neon`, the NEON throughput baseline
|
||||
(M3). Also defines `idct8`, `dmbutterfly0`, `dmbutterfly`,
|
||||
`dmbutterfly_l`, `butterfly_8h`, and the `idct_coeffs` constant
|
||||
table.
|
||||
- `libavcodec/aarch64/neon.S` — defines `transpose_8x8H` used by
|
||||
`vp9itxfm_neon.S`.
|
||||
- `libavutil/aarch64/asm.S` — defines `function`, `endfunc`,
|
||||
`movrel`, `const`, `endconst`, and other assembly preamble
|
||||
macros required to assemble the above NEON files.
|
||||
|
||||
## Re-vendoring procedure
|
||||
|
||||
If the upstream pin needs to change (e.g., hertz updates to a
|
||||
newer libavcodec):
|
||||
|
||||
```sh
|
||||
TAG=nX.Y.Z
|
||||
BASE=https://raw.githubusercontent.com/FFmpeg/FFmpeg/$TAG
|
||||
cd external/ffmpeg-snapshot
|
||||
for f in libavcodec/vp9dsp_template.c \
|
||||
libavcodec/aarch64/vp9itxfm_neon.S \
|
||||
libavcodec/aarch64/neon.S \
|
||||
libavutil/aarch64/asm.S \
|
||||
COPYING.LGPLv2.1; do
|
||||
curl -sSf -o "$f" "$BASE/$f"
|
||||
done
|
||||
sha256sum libavcodec/vp9dsp_template.c \
|
||||
libavcodec/aarch64/vp9itxfm_neon.S \
|
||||
libavcodec/aarch64/neon.S \
|
||||
libavutil/aarch64/asm.S \
|
||||
COPYING.LGPLv2.1
|
||||
# then update this PROVENANCE.md with the new tag, tag/commit object hashes, line/byte counts, and SHA-256 hashes
|
||||
```
|
||||
|
||||
After re-vendoring, re-run the bit-exact gate (M1) and throughput
|
||||
baseline (M3) — both can shift across FFmpeg versions even when
|
||||
the VP9 spec doesn't change (e.g., NEON micro-optimizations).
|
||||
Vendored
+27
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Minimal config.h shim for assembling the vendored FFmpeg .S files
|
||||
* outside the FFmpeg build tree.
|
||||
*
|
||||
 * The vendored .S files (vp9itxfm_neon.S, neon.S, asm.S) reference
 * 7 configure-time feature symbols plus the EXTERN_ASM symbol prefix,
 * all defined below. Values target aarch64-Linux with modern binutils
 * (≥2.41) — matches the Debian Trixie environment on hertz (the
 * project's dev host).
|
||||
*
|
||||
* See ../../docs/phase2.md §5 for the source-copy rationale and
|
||||
* PROVENANCE.md for the upstream pin (FFmpeg n7.1.3).
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
/* asm.S: when non-zero, FUNC-prefixed lines (.func / .endfunc) are kept. */
#define HAVE_AS_FUNC                      1

/* asm.S: when non-zero, emits ".arch AS_ARCH_LEVEL" at the top of each file. */
#define HAVE_AS_ARCH_DIRECTIVE            1
#define AS_ARCH_LEVEL                     armv8-a

/* asm.S: when non-zero, ENABLE_x / DISABLE_x expand to
 * ".arch_extension x" / ".arch_extension nox"; otherwise they are empty. */
#define HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE 1
#define HAVE_AS_ARCHEXT_I8MM_DIRECTIVE    1

/* asm.S "const" macro: when non-zero, tables declared with relocate=1 go to
 * .data.rel.ro and all others to .rodata. */
#define HAVE_SECTION_DATA_REL_RO          1

/* asm.S "movrel" macro: when non-zero, addresses are formed with
 * adrp + add :lo12: instead of a literal-pool ldr. */
#define CONFIG_PIC                        1

/* Prefix prepended to exported labels by asm.S. Deliberately empty here:
 * ELF/Linux symbols carry no leading underscore. FFmpeg's configure
 * normally writes this into the generated config.h; this shim replicates
 * the Linux-target value. */
#define EXTERN_ASM
|
||||
+173
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/* clip min, max, regs...
 * Clamp every register in the variadic list to [min, max]: first an smax
 * against min on each register, then an smin against max on each register.
 * Operands are passed through verbatim, so the caller supplies the full
 * vector arrangement (the macro appends no suffix). */
.macro clip min, max, regs:vararg
    /* Lower bound pass. */
    .irp x, \regs
        smax            \x, \x, \min
    .endr
    /* Upper bound pass. */
    .irp x, \regs
        smin            \x, \x, \max
    .endr
.endm
|
||||
|
||||
/* transpose_8x8B r0..r7, r8, r9
 * Byte-element transpose built from three trn1/trn2 passes at doubling
 * element width (.8b, .4h, .2s) over the 64-bit halves of r0-r7.
 * Every one of r0-r7 is overwritten; r8 and r9 are written as temporaries
 * and hold no meaningful value afterwards. */
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        /* Pass 1: 8-bit lanes, adjacent register pairs. */
        trn1            \r8\().8b,  \r0\().8b,  \r1\().8b
        trn2            \r9\().8b,  \r0\().8b,  \r1\().8b
        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b

        /* Pass 2: 16-bit lanes. */
        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
        trn1            \r5\().4h,  \r9\().4h,  \r3\().4h
        trn2            \r9\().4h,  \r9\().4h,  \r3\().4h
        trn1            \r3\().4h,  \r8\().4h,  \r1\().4h
        trn2            \r8\().4h,  \r8\().4h,  \r1\().4h

        /* Pass 3: 32-bit lanes; results land in r0-r7. */
        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s

        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s

        trn2            \r6\().2s,  \r8\().2s,  \r2\().2s
        trn1            \r2\().2s,  \r8\().2s,  \r2\().2s

        trn1            \r3\().2s,  \r9\().2s,  \r7\().2s
        trn2            \r7\().2s,  \r9\().2s,  \r7\().2s
.endm
|
||||
|
||||
/* transpose_8x16B r0..r7, t0, t1
 * Same instruction pattern as the 64-bit byte transpose, applied to full
 * 128-bit registers: three trn1/trn2 passes at .16b, .8h and .4s.
 * Every one of r0-r7 is overwritten; t0 and t1 are temporaries. */
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
        /* Pass 1: 8-bit lanes, adjacent register pairs. */
        trn1            \t0\().16b, \r0\().16b, \r1\().16b
        trn2            \t1\().16b, \r0\().16b, \r1\().16b
        trn1            \r1\().16b, \r2\().16b, \r3\().16b
        trn2            \r3\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().16b, \r4\().16b, \r5\().16b
        trn2            \r5\().16b, \r4\().16b, \r5\().16b
        trn1            \r2\().16b, \r6\().16b, \r7\().16b
        trn2            \r7\().16b, \r6\().16b, \r7\().16b

        /* Pass 2: 16-bit lanes. */
        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
        trn1            \r5\().8h,  \t1\().8h,  \r3\().8h
        trn2            \t1\().8h,  \t1\().8h,  \r3\().8h
        trn1            \r3\().8h,  \t0\().8h,  \r1\().8h
        trn2            \t0\().8h,  \t0\().8h,  \r1\().8h

        /* Pass 3: 32-bit lanes; results land in r0-r7. */
        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s

        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s

        trn2            \r6\().4s,  \t0\().4s,  \r2\().4s
        trn1            \r2\().4s,  \t0\().4s,  \r2\().4s

        trn1            \r3\().4s,  \t1\().4s,  \r7\().4s
        trn2            \r7\().4s,  \t1\().4s,  \r7\().4s
.endm
|
||||
|
||||
/* transpose_4x16B r0..r3, t4..t7
 * Two trn1/trn2 passes (.16b then .8h) over four 128-bit registers.
 * Pass 1 writes only the temporaries t4-t7; pass 2 writes the results
 * back into r0-r3. */
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
        /* Pass 1: 8-bit lanes into temporaries. */
        trn1            \t4\().16b, \r0\().16b, \r1\().16b
        trn2            \t5\().16b, \r0\().16b, \r1\().16b
        trn1            \t6\().16b, \r2\().16b, \r3\().16b
        trn2            \t7\().16b, \r2\().16b, \r3\().16b

        /* Pass 2: 16-bit lanes back into r0-r3. */
        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
.endm
|
||||
|
||||
/* transpose_4x8B r0..r3, t4..t7
 * 64-bit counterpart of the four-register byte transpose: trn1/trn2 at
 * .8b into t4-t7, then at .4h back into r0-r3. */
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
        /* Pass 1: 8-bit lanes into temporaries. */
        trn1            \t4\().8b,  \r0\().8b,  \r1\().8b
        trn2            \t5\().8b,  \r0\().8b,  \r1\().8b
        trn1            \t6\().8b,  \r2\().8b,  \r3\().8b
        trn2            \t7\().8b,  \r2\().8b,  \r3\().8b

        /* Pass 2: 16-bit lanes back into r0-r3. */
        trn1            \r0\().4h,  \t4\().4h,  \t6\().4h
        trn2            \r2\().4h,  \t4\().4h,  \t6\().4h
        trn1            \r1\().4h,  \t5\().4h,  \t7\().4h
        trn2            \r3\().4h,  \t5\().4h,  \t7\().4h
.endm
|
||||
|
||||
/* transpose_4x4H r0..r3, r4..r7
 * Halfword-element transpose over the 64-bit halves of r0-r3:
 * trn1/trn2 at .4h into r4-r7 (used as temporaries), then at .2s back
 * into r0-r3. */
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
        /* Pass 1: 16-bit lanes into temporaries. */
        trn1            \r4\().4h,  \r0\().4h,  \r1\().4h
        trn2            \r5\().4h,  \r0\().4h,  \r1\().4h
        trn1            \r6\().4h,  \r2\().4h,  \r3\().4h
        trn2            \r7\().4h,  \r2\().4h,  \r3\().4h

        /* Pass 2: 32-bit lanes back into r0-r3. */
        trn1            \r0\().2s,  \r4\().2s,  \r6\().2s
        trn2            \r2\().2s,  \r4\().2s,  \r6\().2s
        trn1            \r1\().2s,  \r5\().2s,  \r7\().2s
        trn2            \r3\().2s,  \r5\().2s,  \r7\().2s
.endm
|
||||
|
||||
/* transpose_4x8H r0..r3, t4..t7
 * 128-bit counterpart of the four-register halfword transpose:
 * trn1/trn2 at .8h into t4-t7, then at .4s back into r0-r3. */
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        /* Pass 1: 16-bit lanes into temporaries. */
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        /* Pass 2: 32-bit lanes back into r0-r3. */
        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
.endm
|
||||
|
||||
/* transpose_8x8H r0..r7, r8, r9
 * Halfword-element transpose over eight 128-bit registers, built from
 * three trn1/trn2 passes at .8h, .4s and .2d.
 * Every one of r0-r7 is overwritten; r8 and r9 are temporaries. */
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        /* Pass 1: 16-bit lanes, adjacent register pairs. */
        trn1            \r8\().8h,  \r0\().8h,  \r1\().8h
        trn2            \r9\().8h,  \r0\().8h,  \r1\().8h
        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

        /* Pass 2: 32-bit lanes. */
        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
        trn1            \r5\().4s,  \r9\().4s,  \r3\().4s
        trn2            \r9\().4s,  \r9\().4s,  \r3\().4s
        trn1            \r3\().4s,  \r8\().4s,  \r1\().4s
        trn2            \r8\().4s,  \r8\().4s,  \r1\().4s

        /* Pass 3: 64-bit lanes; results land in r0-r7. */
        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d

        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d

        trn2            \r6\().2d,  \r8\().2d,  \r2\().2d
        trn1            \r2\().2d,  \r8\().2d,  \r2\().2d

        trn1            \r3\().2d,  \r9\().2d,  \r7\().2d
        trn2            \r7\().2d,  \r9\().2d,  \r7\().2d

.endm
|
||||
File diff suppressed because it is too large
Load Diff
+2578
File diff suppressed because it is too large
Load Diff
+260
@@ -0,0 +1,260 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
/* ELF and FUNC are line prefixes used by the macros further down.
 * Each expands to nothing when the feature is available (the directive
 * that follows is kept) and to "#" otherwise (presumably turning the
 * rest of the line into an assembler comment). */
#ifdef __ELF__
#   define ELF
#else
#   define ELF #
#endif

#if HAVE_AS_FUNC
#   define FUNC
#else
#   define FUNC #
#endif

/* Compilers without __has_feature: treat every feature query as false. */
#ifndef __has_feature
#   define __has_feature(x) 0
#endif

/* Select the baseline architecture level supplied by config.h. */
#if HAVE_AS_ARCH_DIRECTIVE
        .arch           AS_ARCH_LEVEL
#endif

/* ENABLE_x / DISABLE_x wrap .arch_extension when the assembler supports
 * the directive for that extension, and expand to nothing otherwise. */
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
#define ENABLE_DOTPROD  .arch_extension dotprod
#define DISABLE_DOTPROD .arch_extension nodotprod
#else
#define ENABLE_DOTPROD
#define DISABLE_DOTPROD
#endif

#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
#define ENABLE_I8MM  .arch_extension i8mm
#define DISABLE_I8MM .arch_extension noi8mm
#else
#define ENABLE_I8MM
#define DISABLE_I8MM
#endif

/* Both extensions start out disabled for every file including this one. */
DISABLE_DOTPROD
DISABLE_I8MM
|
||||
|
||||
|
||||
/* Support macros for
|
||||
* - Armv8.3-A Pointer Authentication and
|
||||
* - Armv8.5-A Branch Target Identification
|
||||
* features which require emitting a .note.gnu.property section with the
|
||||
* appropriate architecture-dependent feature bits set.
|
||||
*
|
||||
* |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
|
||||
* PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
|
||||
* used immediately before saving the LR register (x30) to the stack.
|
||||
* |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
|
||||
* it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
|
||||
* with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
|
||||
* have the same value at the two points. For example:
|
||||
*
|
||||
* .global f
|
||||
* f:
|
||||
* AARCH64_SIGN_LINK_REGISTER
|
||||
* stp x29, x30, [sp, #-96]!
|
||||
* mov x29, sp
|
||||
* ...
|
||||
* ldp x29, x30, [sp], #96
|
||||
* AARCH64_VALIDATE_LINK_REGISTER
|
||||
* ret
|
||||
*
|
||||
* |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
|
||||
* |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
|
||||
* indirect call target. In particular, all symbols exported from a file must
|
||||
* begin with one of these macros. For example, a leaf function that does not
|
||||
* save LR can instead use |AARCH64_VALID_CALL_TARGET|:
|
||||
*
|
||||
* .globl return_zero
|
||||
* return_zero:
|
||||
* AARCH64_VALID_CALL_TARGET
|
||||
* mov x0, #0
|
||||
* ret
|
||||
*
|
||||
* A non-leaf function which does not immediately save LR may need both macros
|
||||
* because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
|
||||
* may jump to an alternate implementation before setting up the stack:
|
||||
*
|
||||
* .globl with_early_jump
|
||||
* with_early_jump:
|
||||
* AARCH64_VALID_CALL_TARGET
|
||||
* cmp x0, #128
|
||||
* b.lt .Lwith_early_jump_128
|
||||
* AARCH64_SIGN_LINK_REGISTER
|
||||
* stp x29, x30, [sp, #-96]!
|
||||
* mov x29, sp
|
||||
* ...
|
||||
* ldp x29, x30, [sp], #96
|
||||
* AARCH64_VALIDATE_LINK_REGISTER
|
||||
* ret
|
||||
*
|
||||
* .Lwith_early_jump_128:
|
||||
* ...
|
||||
* ret
|
||||
*
|
||||
* These annotations are only required with indirect calls. Private symbols that
|
||||
* are only the target of direct calls do not require annotations. Also note
|
||||
* that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
|
||||
* indirect jumps (BR). Indirect jumps in assembly are supported through
|
||||
* |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
|
||||
* calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
|
||||
*
|
||||
* Although not necessary, it is safe to use these macros in 32-bit ARM
|
||||
* assembly. This may be used to simplify dual 32-bit and 64-bit files.
|
||||
*
|
||||
* References:
|
||||
* - "ELF for the Arm® 64-bit Architecture"
|
||||
 *   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
|
||||
* - "Providing protection for complex software"
|
||||
* https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
|
||||
*/
|
||||
/* Branch Target Identification: landing-pad macros are real hint
 * instructions only when the compiler enables BTI by default. */
#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
#   define GNU_PROPERTY_AARCH64_BTI     (1 << 0)   // Has BTI
#   define AARCH64_VALID_CALL_TARGET    hint #34   // BTI 'c'
#   define AARCH64_VALID_JUMP_TARGET    hint #38   // BTI 'j'
#else
#   define GNU_PROPERTY_AARCH64_BTI     0          // No BTI
#   define AARCH64_VALID_CALL_TARGET
#   define AARCH64_VALID_JUMP_TARGET
#endif

/* Pointer authentication: pick the sign/validate pair from the key bit
 * in __ARM_FEATURE_PAC_DEFAULT (bit 0 = key A, bit 1 = key B); bit 2
 * (leaf-function signing) is rejected outright. */
#if defined(__ARM_FEATURE_PAC_DEFAULT)
#   if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
#       define AARCH64_SIGN_LINK_REGISTER      paciasp
#       define AARCH64_VALIDATE_LINK_REGISTER  autiasp
#   elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
#       define AARCH64_SIGN_LINK_REGISTER      pacibsp
#       define AARCH64_VALIDATE_LINK_REGISTER  autibsp
#   else
#       error Pointer authentication defines no valid key!
#   endif
#   if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0)
#       error Authentication of leaf functions is enabled but not supported in FFmpeg!
#   endif
#   define GNU_PROPERTY_AARCH64_PAC     (1 << 1)
#else
#   define GNU_PROPERTY_AARCH64_PAC     0
#   define AARCH64_SIGN_LINK_REGISTER
#   define AARCH64_VALIDATE_LINK_REGISTER
#endif


/* On ELF targets with BTI or PAC active, emit a .note.gnu.property entry
 * whose feature word is the OR of the two GNU_PROPERTY_AARCH64_* bits. */
#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
        .pushsection .note.gnu.property, "a"
        .balign 8
        .long 4
        .long 0x10
        .long 0x5
        .asciz "GNU"
        .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
        .long 4
        .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
        .long 0
        .popsection
#endif
|
||||
|
||||
/* function name, export=0, align=2
 * Open a function in .text at the requested alignment and define its
 * label. With export non-zero the label is EXTERN_ASM-prefixed, made
 * .global, and followed by AARCH64_VALID_CALL_TARGET; otherwise the
 * plain local label is emitted with no landing pad.
 * Each invocation also defines a one-shot "endfunc" macro that records
 * the symbol size (always of the unprefixed name) and then purges
 * itself so the next function can define it again. */
.macro  function name, export=0, align=2
    .macro endfunc
ELF     .size   \name, . - \name
FUNC    .endfunc
        .purgem endfunc
    .endm
        .text
        .align          \align
    .if \export
        .global EXTERN_ASM\name
ELF     .type   EXTERN_ASM\name, %function
FUNC    .func   EXTERN_ASM\name
EXTERN_ASM\name:
        AARCH64_VALID_CALL_TARGET
    .else
ELF     .type   \name, %function
FUNC    .func   \name
\name:
    .endif
.endm
|
||||
|
||||
/* const name, align=2, relocate=0
 * Open a read-only data object and define its label. Section choice:
 *   - HAVE_SECTION_DATA_REL_RO: .data.rel.ro when relocate is non-zero,
 *     .rodata otherwise;
 *   - else _WIN32: .rdata;
 *   - else non-Mach-O: .rodata;
 *   - else: .const_data.
 * Each invocation also defines a one-shot "endconst" macro that records
 * the object size and then purges itself. */
.macro  const name, align=2, relocate=0
    .macro endconst
ELF     .size   \name, . - \name
        .purgem endconst
    .endm
#if HAVE_SECTION_DATA_REL_RO
    .if \relocate
        .section        .data.rel.ro
    .else
        .section        .rodata
    .endif
#elif defined(_WIN32)
        .section        .rdata
#elif !defined(__MACH__)
        .section        .rodata
#else
        .const_data
#endif
        .align          \align
\name:
.endm
|
||||
|
||||
/* movrel rd, val, offset=0
 * Load the address val+offset into rd.
 *   - PIC on Apple:   adrp/add with @PAGE / @PAGEOFF relocations;
 *   - PIC on Windows: adrp/add with :lo12:;
 *     (on both, a negative offset is applied with a separate sub after
 *      forming the address of val itself)
 *   - other PIC:      adrp/add with :lo12:, using :pg_hi21_nc: for the
 *                     adrp when built with hwaddress_sanitizer;
 *   - non-PIC:        literal-pool ldr. */
.macro  movrel rd, val, offset=0
#if CONFIG_PIC && defined(__APPLE__)
    .if \offset < 0
        adrp            \rd, \val@PAGE
        add             \rd, \rd, \val@PAGEOFF
        sub             \rd, \rd, -(\offset)
    .else
        adrp            \rd, \val+(\offset)@PAGE
        add             \rd, \rd, \val+(\offset)@PAGEOFF
    .endif
#elif CONFIG_PIC && defined(_WIN32)
    .if \offset < 0
        adrp            \rd, \val
        add             \rd, \rd, :lo12:\val
        sub             \rd, \rd, -(\offset)
    .else
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
    .endif
#elif CONFIG_PIC
#   if __has_feature(hwaddress_sanitizer)
        adrp            \rd, :pg_hi21_nc:\val+(\offset)
#   else
        adrp            \rd, \val+(\offset)
#   endif
        add             \rd, \rd, :lo12:\val+(\offset)
#else
        ldr             \rd, =\val+\offset
#endif
.endm
|
||||
|
||||
/* X(s): the symbol s with the EXTERN_ASM prefix attached. The paste goes
 * through JOIN -> GLUE so that EXTERN_ASM is macro-expanded before "##"
 * is applied. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)

/* Any later use of x18 / w18 is rewritten to a do_not_use_* name
 * (presumably so that accidental use fails at assembly time). */
#define x18 do_not_use_x18
#define w18 do_not_use_w18
|
||||
@@ -0,0 +1,248 @@
|
||||
/*
|
||||
* Phase 3 — NEON baseline microbench for VP9 8×8 DCT_DCT IDCT add.
|
||||
*
|
||||
* Reports two numbers:
|
||||
* M1 (correctness): bit-exact match rate, our C reference vs
|
||||
* FFmpeg's NEON, across N random blocks.
|
||||
 *   M3 (throughput):  NEON sustained throughput in Mblock/s on this host.
|
||||
*
|
||||
* Both are gating measurements for Phase 1 (see docs/phase1.md).
|
||||
* NO QPU work happens here — that's later phases.
|
||||
*
|
||||
* Build: see CMakeLists.txt at project root.
|
||||
 * Run:   ./bench_neon_idct [--blocks N] [--iters K] [--seed S] [--no-correctness]
|
||||
*
|
||||
* License: BSD-2-Clause (daedalus-fourier), but this binary
|
||||
* statically links the LGPL-2.1+ FFmpeg NEON snapshot
|
||||
* — distribute the binary under LGPL-2.1+ in that case.
|
||||
*/
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stddef.h>
|
||||
#include <time.h>
|
||||
#include <getopt.h>
|
||||
|
||||
/* Our C reference (tests/vp9_idct8_ref.c). */
|
||||
extern void daedalus_vp9_idct_idct_8x8_add_ref(
|
||||
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
||||
|
||||
/* FFmpeg NEON entry point (vendored vp9itxfm_neon.S). */
|
||||
extern void ff_vp9_idct_idct_8x8_add_neon(
|
||||
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
||||
|
||||
/* ---- Random-block generation ----------------------------------- */
|
||||
|
||||
/* xorshift64 generator (shift triple 13 / 7 / 17) over a single global
 * state word. The sequence is fully determined by the value stored in
 * xs64_state before the first call. A zero state stays zero forever,
 * which is why callers substitute a non-zero default seed. */
static uint64_t xs64_state;

/* Advance the generator one step and return the new state. */
static inline uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs64_state = s;
    return s;
}
|
||||
|
||||
/* Fill `block` with a sparse random coefficient set and return its eob.
 *
 * The block is zeroed, then 1..16 coefficients are written. Each write
 * picks an array index uniformly from 0..63 (a later write may land on
 * an earlier one) and a value in [-4096, 4095], i.e. 13-bit signed; the
 * value itself may be 0.
 *
 * Returns the highest written array index plus one, which is always in
 * [1, 64] because at least one write happens. eob == 1 (only block[0]
 * touched) is therefore rare, and eob is usually well above 32.
 *
 * NOTE(review): earlier comments here described a low-frequency bias
 * ("xs64() % (xs64() % 64 + 1)") and "eob typically in [4, 32]". That
 * bias was never implemented; the uniform draw is kept so that
 * already-published Phase 3 numbers stay reproducible from the same
 * seed. The distribution is also not matched to a real bitstream —
 * Phase 7 may revisit both points.
 * NOTE(review): eob is derived from the array index, not from a VP9
 * scan-order position — confirm the callee only distinguishes
 * eob == 1 from eob > 1 for this block size. */
static int gen_block(int16_t block[64])
{
    memset(block, 0, 64 * sizeof(*block));

    /* Draw order (count, then position/value per write) is part of the
     * reproducible sequence; do not reorder the xs64() calls. */
    const int n_writes = 1 + (int)(xs64() % 16);
    int eob = 0;

    for (int i = 0; i < n_writes; i++) {
        const int pos = (int)(xs64() % 64);
        const int16_t coef = (int16_t)((int)(xs64() % 8192) - 4096);

        block[pos] = coef;
        if (pos + 1 > eob)
            eob = pos + 1;
    }

    /* n_writes >= 1, so eob >= 1 here; no zero fix-up is needed. */
    return eob;
}
|
||||
|
||||
/* Fill the 64-byte prediction buffer with pseudo-random pixel values,
 * taking the low 8 bits of one xs64() draw per byte. */
static void gen_pred(uint8_t pred[64])
{
    uint8_t *cursor = pred;
    uint8_t *const stop = pred + 64;

    while (cursor != stop)
        *cursor++ = (uint8_t)(xs64() & 0xff);
}
|
||||
|
||||
/* ---- Wall-clock timing (CLOCK_MONOTONIC_RAW) ------------------- */
|
||||
|
||||
/* Current CLOCK_MONOTONIC_RAW reading, in seconds, as a double.
 * The return value of clock_gettime() is not checked. */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole = stamp.tv_sec;
    const double frac  = stamp.tv_nsec * 1e-9;
    return whole + frac;
}
|
||||
|
||||
/* ---- Phase 1 M1: bit-exact gate -------------------------------- */
|
||||
|
||||
static int correctness_check(uint64_t seed, int n_blocks)
|
||||
{
|
||||
xs64_state = seed ? seed : 0xdeadbeefcafebabeULL;
|
||||
int mismatches = 0;
|
||||
int dc_only_seen = 0;
|
||||
|
||||
int16_t block_a[64], block_b[64];
|
||||
uint8_t pred[64];
|
||||
uint8_t dst_a[64], dst_b[64];
|
||||
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
int eob = gen_block(block_a);
|
||||
memcpy(block_b, block_a, sizeof(block_a));
|
||||
gen_pred(pred);
|
||||
memcpy(dst_a, pred, 64);
|
||||
memcpy(dst_b, pred, 64);
|
||||
|
||||
daedalus_vp9_idct_idct_8x8_add_ref(dst_a, 8, block_a, eob);
|
||||
ff_vp9_idct_idct_8x8_add_neon(dst_b, 8, block_b, eob);
|
||||
|
||||
if (memcmp(dst_a, dst_b, 64) != 0) {
|
||||
if (mismatches < 4) {
|
||||
fprintf(stderr, "MISMATCH block %d eob=%d:\n", i, eob);
|
||||
for (int r = 0; r < 8; r++) {
|
||||
fprintf(stderr, " row %d ref ", r);
|
||||
for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_a[r * 8 + c]);
|
||||
fprintf(stderr, " neon ");
|
||||
for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_b[r * 8 + c]);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
mismatches++;
|
||||
}
|
||||
if (eob == 1) dc_only_seen++;
|
||||
}
|
||||
|
||||
printf("M1 correctness: %d / %d blocks bit-exact match (%.4f%%)\n",
|
||||
n_blocks - mismatches, n_blocks,
|
||||
100.0 * (n_blocks - mismatches) / n_blocks);
|
||||
printf(" dc-only path frequency: %d / %d (%.2f%%)\n",
|
||||
dc_only_seen, n_blocks, 100.0 * dc_only_seen / n_blocks);
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
/* ---- Phase 1 M3: NEON throughput ------------------------------- */
|
||||
|
||||
static void throughput_neon(uint64_t seed, int n_blocks, int iters)
|
||||
{
|
||||
xs64_state = seed ? seed : 0xfeedfacecafebeefULL;
|
||||
|
||||
/* Pre-generate all blocks + preds so generation cost is excluded
|
||||
* from the timed region. Each block is consumed once per iteration
|
||||
* (NEON path zeroes the block, so we restore from the master). */
|
||||
int16_t *blocks_master = malloc(n_blocks * 64 * sizeof(int16_t));
|
||||
int16_t *blocks_work = malloc(n_blocks * 64 * sizeof(int16_t));
|
||||
uint8_t *preds = malloc(n_blocks * 64);
|
||||
uint8_t *dsts = malloc(n_blocks * 64);
|
||||
int *eobs = malloc(n_blocks * sizeof(int));
|
||||
if (!blocks_master || !blocks_work || !preds || !dsts || !eobs) {
|
||||
fprintf(stderr, "alloc failed\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
eobs[i] = gen_block(blocks_master + i * 64);
|
||||
gen_pred(preds + i * 64);
|
||||
}
|
||||
|
||||
/* Warm-up. */
|
||||
memcpy(blocks_work, blocks_master, n_blocks * 64 * sizeof(int16_t));
|
||||
memcpy(dsts, preds, n_blocks * 64);
|
||||
for (int i = 0; i < n_blocks; i++)
|
||||
ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8,
|
||||
blocks_work + i * 64, eobs[i]);
|
||||
|
||||
/* Timed region. */
|
||||
double t0 = now_seconds();
|
||||
for (int it = 0; it < iters; it++) {
|
||||
memcpy(blocks_work, blocks_master, n_blocks * 64 * sizeof(int16_t));
|
||||
memcpy(dsts, preds, n_blocks * 64);
|
||||
for (int i = 0; i < n_blocks; i++)
|
||||
ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8,
|
||||
blocks_work + i * 64, eobs[i]);
|
||||
}
|
||||
double t1 = now_seconds();
|
||||
|
||||
/* memcpy cost-only run, to subtract setup overhead. */
|
||||
double s0 = now_seconds();
|
||||
for (int it = 0; it < iters; it++) {
|
||||
memcpy(blocks_work, blocks_master, n_blocks * 64 * sizeof(int16_t));
|
||||
memcpy(dsts, preds, n_blocks * 64);
|
||||
}
|
||||
double s1 = now_seconds();
|
||||
|
||||
double total_seconds = (t1 - t0) - (s1 - s0);
|
||||
double total_blocks = (double) n_blocks * iters;
|
||||
double mblocks_s = total_blocks / total_seconds / 1e6;
|
||||
|
||||
printf("M3 NEON throughput:\n");
|
||||
printf(" blocks=%d iters=%d total=%.0f\n", n_blocks, iters, total_blocks);
|
||||
printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", total_seconds);
|
||||
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
|
||||
printf(" throughput = %.3f Mblock/s\n", mblocks_s);
|
||||
printf(" per-block = %.1f ns\n", total_seconds / total_blocks * 1e9);
|
||||
|
||||
/* Equivalent at 1920x1080: 32 400 blocks/frame -> FPS. */
|
||||
printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n",
|
||||
mblocks_s * 1e6 / 32400.0);
|
||||
|
||||
free(blocks_master); free(blocks_work); free(preds);
|
||||
free(dsts); free(eobs);
|
||||
}
|
||||
|
||||
/* ---- CLI ------------------------------------------------------- */
|
||||
|
||||
/* Print the command-line synopsis and the default option values to
 * stderr. `p` is the program name shown in the synopsis. */
static void usage(const char *p)
{
    fprintf(stderr,
            "Usage: %s [--blocks N] [--iters K] [--seed S] [--no-correctness]\n",
            p);
    fputs("Defaults: N=1000000, K=10, S=0 (uses fixed default).\n", stderr);
}
|
||||
|
||||
/* Parse `text` as a strictly positive base-10 integer that fits in a
 * 32-bit int. On success stores it in *out and returns 0. On failure
 * (empty, trailing junk, zero, negative, or too large) prints a
 * diagnostic naming `opt_name` to stderr and returns -1. */
static int parse_positive_int(const char *text, const char *opt_name, int *out)
{
    char *end = NULL;
    const long v = strtol(text, &end, 10);

    if (end == text || *end != '\0' || v < 1 || v > INT32_MAX) {
        fprintf(stderr, "%s: expected a positive integer, got '%s'\n",
                opt_name, text);
        return -1;
    }
    *out = (int)v;
    return 0;
}

/* Entry point: parse options, run the M1 bit-exact gate (unless
 * --no-correctness), then the M3 throughput measurement.
 *
 * Exit status: 0 on success (or --help), 1 if the correctness gate
 * found any mismatch, 2 on a command-line error. */
int main(int argc, char **argv)
{
    /* Block count for the M1 gate; independent of --blocks. */
    enum { CORRECTNESS_BLOCKS = 10000 };

    int n_blocks = 1000000;
    int iters = 10;
    uint64_t seed = 0;
    int do_correctness = 1;

    static struct option opts[] = {
        {"blocks",         required_argument, 0, 'b'},
        {"iters",          required_argument, 0, 'i'},
        {"seed",           required_argument, 0, 's'},
        {"no-correctness", no_argument,       0, 'C'},
        {"help",           no_argument,       0, 'h'},
        {0,0,0,0}
    };
    for (int c; (c = getopt_long(argc, argv, "b:i:s:Ch", opts, 0)) != -1;) {
        switch (c) {
        case 'b':
            /* atoi() silently accepted garbage, zero and negatives,
             * which later produced division by zero or failed mallocs. */
            if (parse_positive_int(optarg, "--blocks", &n_blocks) != 0) {
                usage(argv[0]);
                return 2;
            }
            break;
        case 'i':
            if (parse_positive_int(optarg, "--iters", &iters) != 0) {
                usage(argv[0]);
                return 2;
            }
            break;
        case 's': seed = strtoull(optarg, 0, 0); break;
        case 'C': do_correctness = 0; break;
        case 'h': usage(argv[0]); return 0;
        default:  usage(argv[0]); return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1: bit-exact correctness (%d random blocks) ===\n",
               CORRECTNESS_BLOCKS);
        int miss = correctness_check(seed, CORRECTNESS_BLOCKS);
        if (miss != 0) {
            fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
            return 1;
        }
        printf("\n");
    }

    printf("=== M3: NEON throughput ===\n");
    throughput_neon(seed, n_blocks, iters);
    return 0;
}
|
||||
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
* Phase 3 — Vulkan compute dispatch-overhead microbench (M5).
|
||||
*
|
||||
* Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa
|
||||
* v3dv: vkQueueSubmit + vkQueueWaitIdle round-trip cost for a
|
||||
* noop compute shader. Establishes the floor below which kernel
|
||||
* batching is mandatory.
|
||||
*
|
||||
* Two measurements:
|
||||
* M5a: empty command-buffer submit (no dispatch at all)
|
||||
* M5b: 1-workgroup dispatch of an empty shader
|
||||
*
|
||||
* The delta M5b - M5a isolates the per-vkCmdDispatch cost from
|
||||
* the per-vkQueueSubmit cost.
|
||||
*
|
||||
* Build: cmake -DDAEDALUS_BUILD_VULKAN=ON ..
|
||||
* Run: ./bench_vulkan_dispatch [--iters N] [--spv PATH]
|
||||
*
|
||||
* License: BSD-2-Clause (daedalus-fourier).
|
||||
*/
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <getopt.h>
|
||||
#include <vulkan/vulkan.h>
|
||||
|
||||
/* Evaluate a Vulkan call once; if it returns anything other than
 * VK_SUCCESS, report the numeric result, the call site and the call
 * text on stderr and terminate the process with status 1. */
#define CHK(call)                                                       \
    do {                                                                \
        VkResult r__ = (call);                                          \
        if (r__ != VK_SUCCESS) {                                        \
            fprintf(stderr, "vulkan error %d at %s:%d (%s)\n",          \
                    r__, __FILE__, __LINE__, #call);                    \
            exit(1);                                                    \
        }                                                               \
    } while (0)
|
||||
|
||||
/* Read CLOCK_MONOTONIC_RAW and return it as fractional seconds.
 * The clock_gettime() result code is ignored. */
static double now_seconds(void)
{
    struct timespec t;
    double seconds;

    clock_gettime(CLOCK_MONOTONIC_RAW, &t);
    seconds  = t.tv_sec;
    seconds += t.tv_nsec * 1e-9;
    return seconds;
}
|
||||
|
||||
/* Load an entire SPIR-V binary into a freshly malloc'd buffer.
 *
 * path      file to read.
 * out_size  receives the file size in bytes.
 *
 * Returns the buffer; the caller frees it. Any failure terminates the
 * process with status 1 after printing a diagnostic: the file cannot be
 * opened, its size is not a positive multiple of 4, the allocation
 * fails, or the read comes up short. */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        perror(path);
        exit(1);
    }

    /* File length = offset reported after seeking to the end. */
    fseek(fp, 0, SEEK_END);
    const long n_bytes = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    /* A non-positive size (this also covers an ftell() error) or a size
     * that is not a whole number of 32-bit words is rejected. */
    const int size_ok = n_bytes > 0 && (n_bytes & 3) == 0;
    if (!size_ok) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, n_bytes);
        exit(1);
    }

    uint32_t *words = malloc(n_bytes);
    const int loaded = words != NULL &&
                       fread(words, 1, n_bytes, fp) == (size_t)n_bytes;
    if (!loaded) {
        perror("read");
        exit(1);
    }

    fclose(fp);
    *out_size = n_bytes;
    return words;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int iters = 100000;
|
||||
const char *spv_path = "noop.spv";
|
||||
|
||||
static struct option opts[] = {
|
||||
{"iters", required_argument, 0, 'i'},
|
||||
{"spv", required_argument, 0, 's'},
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{0,0,0,0}
|
||||
};
|
||||
for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) {
|
||||
switch (c) {
|
||||
case 'i': iters = atoi(optarg); break;
|
||||
case 's': spv_path = optarg; break;
|
||||
case 'h':
|
||||
fprintf(stderr,
|
||||
"Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]);
|
||||
return 0;
|
||||
default:
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* ---- Instance ---- */
|
||||
VkApplicationInfo app = {
|
||||
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
|
||||
.pApplicationName = "daedalus-fourier-bench",
|
||||
.apiVersion = VK_API_VERSION_1_3,
|
||||
};
|
||||
VkInstanceCreateInfo ici = {
|
||||
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
|
||||
.pApplicationInfo = &app,
|
||||
};
|
||||
VkInstance instance;
|
||||
CHK(vkCreateInstance(&ici, NULL, &instance));
|
||||
|
||||
/* ---- Pick V3D physical device (skip llvmpipe) ---- */
|
||||
uint32_t pd_count = 0;
|
||||
CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL));
|
||||
VkPhysicalDevice *pds = malloc(pd_count * sizeof(*pds));
|
||||
CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds));
|
||||
VkPhysicalDevice phys = VK_NULL_HANDLE;
|
||||
VkPhysicalDeviceProperties props = {0};
|
||||
for (uint32_t i = 0; i < pd_count; i++) {
|
||||
vkGetPhysicalDeviceProperties(pds[i], &props);
|
||||
printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n",
|
||||
i, props.deviceName,
|
||||
VK_VERSION_MAJOR(props.apiVersion),
|
||||
VK_VERSION_MINOR(props.apiVersion),
|
||||
VK_VERSION_PATCH(props.apiVersion),
|
||||
props.vendorID);
|
||||
if (strstr(props.deviceName, "V3D") != NULL) {
|
||||
phys = pds[i];
|
||||
}
|
||||
}
|
||||
if (phys == VK_NULL_HANDLE) {
|
||||
fprintf(stderr, "no V3D device found; bailing.\n");
|
||||
return 1;
|
||||
}
|
||||
vkGetPhysicalDeviceProperties(phys, &props);
|
||||
printf("selected: %s\n", props.deviceName);
|
||||
free(pds);
|
||||
|
||||
/* ---- Compute queue family ---- */
|
||||
uint32_t qfc = 0;
|
||||
vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL);
|
||||
VkQueueFamilyProperties *qfp = malloc(qfc * sizeof(*qfp));
|
||||
vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp);
|
||||
uint32_t qfi = (uint32_t) -1;
|
||||
for (uint32_t i = 0; i < qfc; i++) {
|
||||
if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
|
||||
qfi = i; break;
|
||||
}
|
||||
}
|
||||
if (qfi == (uint32_t) -1) {
|
||||
fprintf(stderr, "no compute queue family\n");
|
||||
return 1;
|
||||
}
|
||||
free(qfp);
|
||||
|
||||
/* ---- Logical device ---- */
|
||||
float qprio = 1.0f;
|
||||
VkDeviceQueueCreateInfo dqci = {
|
||||
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
|
||||
.queueFamilyIndex = qfi,
|
||||
.queueCount = 1,
|
||||
.pQueuePriorities = &qprio,
|
||||
};
|
||||
VkDeviceCreateInfo dci = {
|
||||
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
|
||||
.queueCreateInfoCount = 1,
|
||||
.pQueueCreateInfos = &dqci,
|
||||
};
|
||||
VkDevice dev;
|
||||
CHK(vkCreateDevice(phys, &dci, NULL, &dev));
|
||||
VkQueue queue;
|
||||
vkGetDeviceQueue(dev, qfi, 0, &queue);
|
||||
|
||||
/* ---- Command pool + buffers ---- */
|
||||
VkCommandPoolCreateInfo cpci = {
|
||||
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
|
||||
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
|
||||
.queueFamilyIndex = qfi,
|
||||
};
|
||||
VkCommandPool pool;
|
||||
CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool));
|
||||
|
||||
VkCommandBuffer cb_empty, cb_dispatch;
|
||||
VkCommandBufferAllocateInfo cbai = {
|
||||
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
|
||||
.commandPool = pool,
|
||||
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
|
||||
.commandBufferCount = 1,
|
||||
};
|
||||
CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty));
|
||||
CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch));
|
||||
|
||||
/* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */
|
||||
VkPipelineLayoutCreateInfo plci = {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
|
||||
};
|
||||
VkPipelineLayout playout;
|
||||
CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout));
|
||||
|
||||
/* ---- Compute pipeline from noop SPIR-V ---- */
|
||||
size_t spv_size = 0;
|
||||
uint32_t *spv = read_spv(spv_path, &spv_size);
|
||||
VkShaderModuleCreateInfo smci = {
|
||||
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
|
||||
.codeSize = spv_size,
|
||||
.pCode = spv,
|
||||
};
|
||||
VkShaderModule shader;
|
||||
CHK(vkCreateShaderModule(dev, &smci, NULL, &shader));
|
||||
free(spv);
|
||||
|
||||
VkComputePipelineCreateInfo cpci2 = {
|
||||
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
|
||||
.stage = {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
|
||||
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.module = shader,
|
||||
.pName = "main",
|
||||
},
|
||||
.layout = playout,
|
||||
};
|
||||
VkPipeline pipe;
|
||||
CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe));
|
||||
|
||||
/* ---- Record both command buffers once, reuse for every iteration ---- */
|
||||
VkCommandBufferBeginInfo cbbi = {
|
||||
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
|
||||
};
|
||||
|
||||
CHK(vkBeginCommandBuffer(cb_empty, &cbbi));
|
||||
CHK(vkEndCommandBuffer(cb_empty));
|
||||
|
||||
CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi));
|
||||
vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe);
|
||||
vkCmdDispatch(cb_dispatch, 1, 1, 1);
|
||||
CHK(vkEndCommandBuffer(cb_dispatch));
|
||||
|
||||
VkSubmitInfo si_empty = {
|
||||
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
|
||||
.commandBufferCount = 1, .pCommandBuffers = &cb_empty,
|
||||
};
|
||||
VkSubmitInfo si_disp = {
|
||||
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
|
||||
.commandBufferCount = 1, .pCommandBuffers = &cb_dispatch,
|
||||
};
|
||||
|
||||
/* ---- Warm-up ---- */
|
||||
for (int i = 0; i < 100; i++) {
|
||||
CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
|
||||
CHK(vkQueueWaitIdle(queue));
|
||||
}
|
||||
|
||||
/* ---- M5a: empty CB submit+wait ---- */
|
||||
double t0 = now_seconds();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE));
|
||||
CHK(vkQueueWaitIdle(queue));
|
||||
}
|
||||
double t1 = now_seconds();
|
||||
double m5a_per = (t1 - t0) / iters * 1e6; /* µs */
|
||||
|
||||
/* ---- M5b: 1-WG noop dispatch submit+wait ---- */
|
||||
double t2 = now_seconds();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
|
||||
CHK(vkQueueWaitIdle(queue));
|
||||
}
|
||||
double t3 = now_seconds();
|
||||
double m5b_per = (t3 - t2) / iters * 1e6; /* µs */
|
||||
|
||||
printf("\n=== M5: Vulkan compute dispatch overhead ===\n");
|
||||
printf(" iters per measurement: %d\n", iters);
|
||||
printf(" M5a empty CB submit+wait: %.2f µs/op\n", m5a_per);
|
||||
printf(" M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per);
|
||||
printf(" delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n",
|
||||
m5b_per - m5a_per);
|
||||
printf("\n");
|
||||
printf(" Implication for kernel batching:\n");
|
||||
printf(" if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n");
|
||||
printf(" a single-block dispatch costs %.0fx more in overhead\n",
|
||||
m5b_per * 1e3 / 100.0);
|
||||
printf(" -> batch at least %.0f blocks per dispatch to break even.\n",
|
||||
m5b_per * 1e3 / 100.0);
|
||||
|
||||
/* ---- Tear down (minimal — process exit handles the rest) ---- */
|
||||
vkDestroyPipeline(dev, pipe, NULL);
|
||||
vkDestroyShaderModule(dev, shader, NULL);
|
||||
vkDestroyPipelineLayout(dev, playout, NULL);
|
||||
vkDestroyCommandPool(dev, pool, NULL);
|
||||
vkDestroyDevice(dev, NULL);
|
||||
vkDestroyInstance(instance, NULL);
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#version 450

// No-op compute kernel used by the M5 Vulkan dispatch-overhead benchmark.
// It declares no resources and has an empty entry point, so a dispatch of
// this shader measures only the submit/dispatch round trip.
layout(local_size_x = 64,
       local_size_y = 1,
       local_size_z = 1) in;

void main()
{
}
|
||||
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
|
||||
* transform + add (8-bit pixels), transcribed from the spec
|
||||
* structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
|
||||
* (vendored under external/ffmpeg-snapshot/ at commit f46e514).
|
||||
*
|
||||
* Provided as a self-contained translation unit so the harness
|
||||
* doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
|
||||
* expansion. Cross-checked against the vendored reference at
|
||||
* runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()).
|
||||
*
|
||||
* License: LGPL-2.1-or-later (matches the upstream reference).
|
||||
*
|
||||
* Spec source: VP9 specification §8.7 — Inverse transform process.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Q14 trig constants — VP9 spec table 8.7.1.4.
 *
 * COSPI_N_64 == round(cos(N * pi / 64) * 2^14). Listed in ascending N;
 * the sin() form in each trailing comment is the equivalent identity
 * used by the rotation that pairs it with its partner constant. */
#define COSPI_4_64  16069   /* cos( pi/16) == sin(7pi/16), * 2^14 */
#define COSPI_8_64  15137   /* cos( pi/8 ) == sin(3pi/8 ), * 2^14 */
#define COSPI_12_64 13623   /* cos(3pi/16) == sin(5pi/16), * 2^14 */
#define COSPI_16_64 11585   /* cos( pi/4 ),                * 2^14 */
#define COSPI_20_64  9102   /* cos(5pi/16),                * 2^14 */
#define COSPI_24_64  6270   /* cos(3pi/8 ),                * 2^14 */
#define COSPI_28_64  3196   /* cos(7pi/16),                * 2^14 */
|
||||
|
||||
/* Round a Q14 fixed-point value to the nearest integer.
 *
 * Adds half an LSB (1 << 13) and shifts right by 14. The argument is
 * 64-bit so callers can form the product before rounding without
 * overflowing 32 bits; the result is narrowed back to int32_t.
 *
 * Negative inputs rely on >> being an arithmetic shift, which is
 * implementation-defined in C but holds on GCC and Clang. */
static inline int32_t qround14(int64_t x)
{
    const int64_t half = 1 << 13;
    const int64_t biased = x + half;

    return (int32_t) (biased >> 14);
}
|
||||
|
||||
/* Saturate a signed value to the 8-bit pixel range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    if (x < 0)
        return 0;
    if (x > 255)
        return 255;
    return (uint8_t) x;
}
|
||||
|
||||
/* 1-D 8-point inverse DCT, signed int32 throughout. Matches
|
||||
* idct8_1d in libavcodec/vp9dsp_template.c (with the stride
|
||||
* collapsed to indexed access; identical arithmetic). */
|
||||
static void idct8_1d(const int32_t in[8], int32_t out[8])
|
||||
{
|
||||
int32_t t0a = qround14((int64_t)(in[0] + in[4]) * COSPI_16_64);
|
||||
int32_t t1a = qround14((int64_t)(in[0] - in[4]) * COSPI_16_64);
|
||||
int32_t t2a = qround14((int64_t)in[2] * COSPI_24_64 - (int64_t)in[6] * COSPI_8_64);
|
||||
int32_t t3a = qround14((int64_t)in[2] * COSPI_8_64 + (int64_t)in[6] * COSPI_24_64);
|
||||
int32_t t4a = qround14((int64_t)in[1] * COSPI_28_64 - (int64_t)in[7] * COSPI_4_64);
|
||||
int32_t t5a = qround14((int64_t)in[5] * COSPI_12_64 - (int64_t)in[3] * COSPI_20_64);
|
||||
int32_t t6a = qround14((int64_t)in[5] * COSPI_20_64 + (int64_t)in[3] * COSPI_12_64);
|
||||
int32_t t7a = qround14((int64_t)in[1] * COSPI_4_64 + (int64_t)in[7] * COSPI_28_64);
|
||||
|
||||
int32_t t0 = t0a + t3a, t1 = t1a + t2a;
|
||||
int32_t t2 = t1a - t2a, t3 = t0a - t3a;
|
||||
int32_t t4 = t4a + t5a;
|
||||
int32_t t5p = t4a - t5a;
|
||||
int32_t t7 = t7a + t6a;
|
||||
int32_t t6p = t7a - t6a;
|
||||
|
||||
int32_t t5 = qround14((int64_t)(t6p - t5p) * COSPI_16_64);
|
||||
int32_t t6 = qround14((int64_t)(t6p + t5p) * COSPI_16_64);
|
||||
|
||||
out[0] = t0 + t7; out[1] = t1 + t6;
|
||||
out[2] = t2 + t5; out[3] = t3 + t4;
|
||||
out[4] = t3 - t4; out[5] = t2 - t5;
|
||||
out[6] = t1 - t6; out[7] = t0 - t7;
|
||||
}
|
||||
|
||||
/* Public reference entry point. Signature matches
|
||||
* ff_vp9_idct_idct_8x8_add_neon. After the call, *block is
|
||||
* zeroed (matches FFmpeg behaviour). */
|
||||
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
|
||||
int16_t *block, int eob)
|
||||
{
|
||||
int32_t tmp[64];
|
||||
int32_t out[8];
|
||||
int32_t col[8];
|
||||
|
||||
/* DC-only fast path: (((coef * 11585) Q14) * 11585) Q14, then
|
||||
* broadcast (+16) >> 5 added to every pixel. */
|
||||
if (eob == 1) {
|
||||
int32_t dc = qround14(qround14((int64_t)block[0] * COSPI_16_64)
|
||||
* (int64_t) COSPI_16_64);
|
||||
block[0] = 0;
|
||||
int32_t add = (dc + 16) >> 5;
|
||||
for (int r = 0; r < 8; r++)
|
||||
for (int c = 0; c < 8; c++)
|
||||
dst[r * stride + c] = clip_u8(dst[r * stride + c] + add);
|
||||
return;
|
||||
}
|
||||
|
||||
/* 8 column passes, transposed write: IDCT of block column i lands
|
||||
* in row i of tmp. This matches FFmpeg's idct_idct_8x8_add_c which
|
||||
* uses `tmp + i*8` as the column-pass output base — the transpose
|
||||
* is implicit in the offset pattern, making the row pass below
|
||||
* read columns of tmp and write columns of dst. */
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (int r = 0; r < 8; r++) col[r] = block[r * 8 + i];
|
||||
idct8_1d(col, out);
|
||||
for (int r = 0; r < 8; r++) tmp[i * 8 + r] = out[r];
|
||||
}
|
||||
memset(block, 0, 64 * sizeof(*block));
|
||||
|
||||
/* 8 row passes: column i of tmp -> column i of dst (matches
|
||||
* FFmpeg's `dst[j*stride] = out[j]; dst++` pattern). */
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (int r = 0; r < 8; r++) col[r] = tmp[r * 8 + i];
|
||||
idct8_1d(col, out);
|
||||
for (int r = 0; r < 8; r++)
|
||||
dst[r * stride + i] = clip_u8(dst[r * stride + i]
|
||||
+ ((out[r] + 16) >> 5));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user