commit dcbbc77038f3489e8fc7221de5d9c96b62fa2f4a
Author: Markus Fritsche <mfritsche@reauktion.de>
Date:   Mon May 18 11:30:12 2026 +0000

    Path B pivot + Phase 0-3 closed with first baseline numbers
    
    This is a from-scratch initial commit on a fresh .git. The original
    scaffold commit (7510b56) and the earlier session's working-tree
    docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
    .git is preserved at .git-broken-2026-05-18/ (gitignored) for
    forensic inspection.
    
    Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
    cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
    to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
    direct DRM, on stock signed Pi 5 / CM5). See README.md and
    docs/phase0.md for the substrate audit that closed Path A.
    
    Phases closed:
      Phase 0 — substrate audit; Path A blocked, Path B open;
                codec-back-end-fits-QPU finding (docs/phase0.md)
      Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
                publish-before-measure R = M2/M3 decision rules
                (docs/phase1.md)
      Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
                under external/ffmpeg-snapshot/ (PROVENANCE.md pins
                commit f46e514 + per-file SHA-256s) (docs/phase2.md)
      Phase 3 — real baseline measurements on hertz (docs/phase3.md):
                  M1 bit-exact            100.0000 % (10000/10000)
                  M3 NEON IDCT8 single    8.171 Mblock/s (122.4 ns/block)
                  M5a empty Vulkan submit 22.66 us
                  M5b 1-WG noop dispatch  55.60 us
                  M5 delta                32.95 us/dispatch
                => per-dispatch overhead is ~455x per-NEON-block cost;
                   Phase 4 must batch at frame level or close to it.
    
    Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
    vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
    external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
    Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
    libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
    assembles via the config.h shim.
    
    Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
    constraint) -> Phase 5 second-model review -> Phase 6 implement.
    
    Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7a6eee8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,13 @@
+build/
+build-*/
+*.o
+*.spv
+.cache/
+.vscode/
+.idea/
+*.swp
+*~
+
+# Forensic snapshot of the corrupted .git from 2026-05-18 10:25
+# working-tree wipe. Retained on disk for inspection; not tracked.
+.git-broken-2026-05-18/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a6b5125
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,103 @@
+# daedalus-fourier — Phase 3 baseline + (later) Phase 6 implementation.
+#
+# Builds:
+#   bench_neon_idct  — NEON throughput baseline (Phase 3 M3) +
+#                      bit-exact correctness gate (Phase 1 M1).
+#   bench_vulkan_dispatch — Vulkan compute dispatch-overhead baseline (M5).
+#
+# Linkage note: bench_neon_idct statically links the vendored
+# FFmpeg n7.1.3 NEON snapshot (LGPL-2.1+); see
+# external/ffmpeg-snapshot/PROVENANCE.md.
+
+cmake_minimum_required(VERSION 3.20)
+project(daedalus-fourier LANGUAGES C ASM)
+
+set(CMAKE_C_STANDARD_REQUIRED ON)
+set(CMAKE_C_STANDARD 11)
+# Refuse to configure for any target processor other than aarch64.
+if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    message(FATAL_ERROR
+        "daedalus-fourier targets aarch64 (Pi 5 / BCM2712). "
+        "Cross-compile not yet wired.")
+endif()
+# Fall back to a Release build when no build type was requested.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+# Warning flags shared by every target declared in this directory.
+add_compile_options(-Wall -Wextra -Wno-unused-parameter)
+
+# ---- Vendored FFmpeg snapshot (LGPL-2.1+) -----------------------------------
+
+set(FFSNAP ${PROJECT_SOURCE_DIR}/external/ffmpeg-snapshot)
+
+# Include search path for the vendored .S file. Two roots cover all three
+# includes it makes:
+#   - FFSNAP/                    resolves `#include "config.h"` (our shim) and
+#                                `#include "libavutil/aarch64/asm.S"` (the
+#                                vendored copy of FFmpeg's asm helpers)
+#   - FFSNAP/libavcodec/aarch64/ resolves `#include "neon.S"`
+set(FFASM_FLAGS
+    -I${FFSNAP}
+    -I${FFSNAP}/libavcodec/aarch64
+)
+
+set(FFASM_SOURCES
+    ${FFSNAP}/libavcodec/aarch64/vp9itxfm_neon.S
+)
+
+# Mark the vendored .S as ASM and hand it the include path above so its
+# `#include` lines resolve when the source is preprocessed.
+set_source_files_properties(${FFASM_SOURCES} PROPERTIES
+    COMPILE_OPTIONS "${FFASM_FLAGS}"
+    LANGUAGE ASM)
+
+# ---- NEON baseline microbench ----------------------------------------------
+# Pure CPU baseline: this target needs no Vulkan or DRM linkage.
+add_executable(bench_neon_idct)
+target_sources(bench_neon_idct PRIVATE
+    tests/bench_neon_idct.c
+    tests/vp9_idct8_ref.c
+    ${FFASM_SOURCES}
+)
+target_compile_options(bench_neon_idct PRIVATE -O3 -march=armv8-a+simd)
+
+# ---- Vulkan dispatch-overhead microbench (Phase 3 M5) -----------------------
+# Built by default; configure with -DDAEDALUS_BUILD_VULKAN=OFF to skip it on
+# hosts that lack the Vulkan loader or a glslang compiler.
+
+option(DAEDALUS_BUILD_VULKAN "Build Vulkan compute-dispatch microbench" ON)
+
+if (DAEDALUS_BUILD_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    # Compile GLSL compute shaders to SPIR-V via glslangValidator.
+    # The binary loads them at runtime from the build dir (cwd-relative).
+    find_program(GLSLANG_VALIDATOR
+        NAMES glslangValidator glslang
+        REQUIRED)
+
+    set(NOOP_SPV ${PROJECT_BINARY_DIR}/noop.spv)
+    add_custom_command(
+        OUTPUT ${NOOP_SPV}
+        COMMAND ${GLSLANG_VALIDATOR} -V -o ${NOOP_SPV}
+                ${PROJECT_SOURCE_DIR}/tests/shaders/noop.comp
+        DEPENDS ${PROJECT_SOURCE_DIR}/tests/shaders/noop.comp
+        COMMENT "glslang: noop.comp -> noop.spv"
+        VERBATIM
+    )
+    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV})
+
+    add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c)
+    add_dependencies(bench_vulkan_dispatch daedalus_shaders)
+    target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan)
+    target_compile_options(bench_vulkan_dispatch PRIVATE -O2)
+endif()
+
+# ---- Summary (configure-time; message() cannot expand genexes) --------------
+get_property(DAEDALUS_TARGETS DIRECTORY PROPERTY BUILDSYSTEM_TARGETS)
+list(JOIN DAEDALUS_TARGETS "; " DAEDALUS_TARGETS)
+message(STATUS "daedalus-fourier build configured for ${CMAKE_SYSTEM_PROCESSOR}")
+message(STATUS "  FFmpeg snapshot: ${FFSNAP}")
+message(STATUS "  Build type:      ${CMAKE_BUILD_TYPE}")
+message(STATUS "  Targets:         ${DAEDALUS_TARGETS}")
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8d9d223
--- /dev/null
+++ b/README.md
@@ -0,0 +1,177 @@
+# daedalus-fourier
+
+Community-built VP9 / AV1 software-decode back-end running on the
+VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 /
+Compute Module 5), via the existing Mesa `v3d` userspace driver.
+ARM keeps the serial entropy front-end; the QPU takes the parallel
+back-end (inverse transforms, deblocking, CDEF, loop restoration,
+MC residual add).
+
+> Daedalus built the Labyrinth for King Minos, then escaped from it
+> by hand-forging flight firmware out of feathers and wax when no
+> sanctioned exit existed.
+
+That's the project shape. The Broadcom-locked VideoCore VII is the
+Labyrinth; the Pi Foundation's "use the HEVC block and live with
+software decode for everything else" is the official non-exit;
+the QPU sits unused inside the labyrinth's walls.
+
+**Status: Phases 0–3 closed (substrate audit through baseline
+numbers on hertz). Phase 4 (plan) is next.** This is research-track work that
+may take months or may yield a single proof-of-concept kernel that
+loses to ARM NEON, in which case the negative result ships and the
+project closes.
+
+## Why this exists
+
+higgs is a Raspberry Pi Compute Module 5 in a small portable
+chassis with a battery. Watching nerds review *Star Wars* on YouTube
+while putting Mac Studios into virtual shopping baskets is a
+core workload for the higgs class of device.
+
+YouTube serves H.264 (legacy), VP9 (typical 4K), and AV1 (newer
+high-bitrate / high-resolution content). It does not serve HEVC.
+Pi 5's BCM2712 has one HW decoder block: HEVC. In set terms:
+{what YouTube serves} ∩ {what BCM2712 decodes in HW} = ∅.
+
+Every YouTube frame on higgs today is software-decoded on Cortex-A76
+cores at ~50–90% CPU per video stream. Offloading the parallel
+back-end of that decode to the otherwise-idle QPU complex *might*
+recover meaningful CPU time and battery on higgs. The honest
+prior — measured in Phase 0 — is that the QPU has roughly equal
+raw compute to the A76 cluster but a smaller slice of the shared
+LPDDR4x bandwidth, so the win, if any, comes from offloading
+*concurrent* work the CPU would have done anyway.
+
+The Pi Foundation isn't going to do this work (per their own
+statement: chromium-patch sustainment was too much; codec
+sustainment would be more so). The kernel `rpi-hevc-dec` series has
+been 17 months in review for one decoder block they DID write
+themselves. Whatever ships here ships through the community.
+
+## Architecture (Path B)
+
+Phase 0 evaluated two paths:
+
+- **Path A — custom VPU firmware on the VC7 scalar cores.**
+  Blocked. BCM2712 has a silicon root of trust: the mask ROM
+  hardcodes RPi's public key and unconditionally verifies the
+  second-stage bootloader. `EXECUTE_CODE` mailbox removed on Pi 5.
+  No software-only bypass exists. See `docs/phase0.md §3`.
+
+- **Path B — QPU compute kernels via the existing Mesa `v3d` /
+  DRM / Vulkan-compute path.** This is the path. The QPU is
+  reachable from userspace today on a stock signed Pi 5 / CM5
+  via `/dev/dri/card0`. No firmware loading. No signing fight.
+  `Idein/py-videocore7` (SGEMM 21 GFLOPS sustained) is the
+  existence proof.
+
+The build:
+
+```
+┌───────────────────────────────┐
+│ userspace VP9 / AV1 decoder   │
+│  (fork of dav1d / libvpx)     │
+├───────────────────────────────┤
+│  ARM:    entropy decode       │ ← Cortex-A76 + NEON
+│          (Bool coder / ANS)   │   structurally serial
+├───────────────────────────────┤
+│  QPU:    parallel back-end    │ ← V3D 7.1 via Mesa v3dv
+│          (IDCT, CDEF,         │   Vulkan compute shaders
+│           deblock, LR, MC)    │   or direct DRM submit
+├───────────────────────────────┤
+│ V4L2 stateless wrapper        │ ← out-of-tree kernel module
+│  (eventual, kernel-agent)     │   exposing /dev/videoN
+└───────────────────────────────┘
+```
+
+The first deliverable is *not* the V4L2 wrapper. The first
+deliverable is one back-end kernel running on the QPU, bit-exact
+against a libavcodec reference, with measured throughput. If that
+single kernel can't beat NEON or get within 50% of it, the project
+closes here with a documented negative result.
+
+## In scope
+
+- A small set of codec back-end kernels (IDCT 8×8, CDEF, deblocking,
+  loop restoration filter, MC interpolation) compiled as SPIR-V
+  compute shaders for Mesa `v3dv`, dispatched via Vulkan compute
+  from userspace.
+- A test harness on hertz that runs each kernel against libavcodec
+  reference outputs and measures throughput (megapixels/sec or
+  blocks/sec) against the equivalent NEON path.
+- Phase 1 = one kernel, bit-exact, with numbers. Phase 2+ = more
+  kernels only if Phase 1 numbers justify it.
+
+## Out of scope (for now)
+
+- HEVC (Pi 5 has dedicated silicon; `rpi-hevc-dec` covers it).
+- Pi 4 / BCM2711 / VideoCore VI. Different ISA, smaller compute
+  budget. Path B *could* extend but isn't the priority.
+- Encode. Pi Foundation removed all HW encode in Pi 5; encode on
+  VC7 is a separate, larger project.
+- Custom VPU firmware (Path A — blocked by silicon RoT, see
+  `docs/phase0.md`).
+- V4L2 stateless driver wrapping the userspace decoder. Eventual
+  consumption point, but Phase 1 lives entirely in userspace.
+- Beating ARM NEON unconditionally. The honest target is
+  *concurrent* work: QPU runs while CPU does something else.
+
+## Dev substrate
+
+- **hertz** (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75+rpt-rpi-2712,
+  Mesa 25.0.7 with v3dv, V3D 7.1.7) — the dev / test / measurement
+  host. Watchdog-protected for crash recovery. See
+  `docs/vulkaninfo_v3d_7_1_7_hertz.txt` for the inside-view device
+  profile.
+- **higgs** (CM5 in portable battery chassis) — the eventual user
+  target. Not a dev unit; sealed chassis.
+
+## Conventions
+
+This project follows the 9(+1)-phase dev process. See
+`docs/dev_process.md`. Phase 0 is closed (`docs/phase0.md`);
+Phase 1 is `docs/phase1.md`.
+
+Gitea identity: `claude-noether` (per
+`feedback_gitea_as_claude_noether.md`). No `marfrit` pushes from
+Claude sessions.
+
+## Layout
+
+```
+daedalus-fourier/
+├── README.md             ← this file
+├── docs/
+│   ├── dev_process.md    ← reference copy of the 9(+1)-phase loop
+│   ├── phase0.md         ← substrate audit (closes Path A, opens Path B)
+│   ├── phase1.md         ← first-kernel goal + measurement plan
+│   └── vulkaninfo_v3d_7_1_7_hertz.txt
+│                          ← inside-view device profile from hertz
+├── src/                  ← kernels + Vulkan dispatch harness
+└── tests/                ← bit-exact vs libavcodec, throughput
+```
+
+Build with CMake (`CMakeLists.txt`); it currently covers the Phase 3 benches.
+
+## Sibling projects in the same orbit
+
+- `libva-v4l2-request-fourier` — VA-API consumer-side backend.
+  Eventual consumer if daedalus produces a V4L2 stateless node.
+- `firefox-fourier` — Firefox fork that routes stateless V4L2
+  through libavcodec's `v4l2_request` hwaccel. Same pickup point.
+- `chromium-fourier` — sibling for Chromium.
+- `kernel-agent` — would house the V4L2 driver wrapping the
+  userspace decoder, once one exists.
+- `ampere-av1-enablement` — software-side AV1 bring-up on RK3588
+  (rkvdec / vpu981). Provides the userspace conformance harness
+  daedalus reuses for VC7-AV1 verification.
+
+## Source attribution
+
+Daedalus-the-myth is public domain. The wax-and-feathers
+metaphor is older than software engineering.
+
+Anyone wanting to fail at this project: please file your failures
+under `branches/icarus/`. Built-in self-deprecation slot, with
+honor.
diff --git a/docs/dev_process.md b/docs/dev_process.md
new file mode 100644
index 0000000..ae9715c
--- /dev/null
+++ b/docs/dev_process.md
@@ -0,0 +1,96 @@
+---
+name: Claude-Assisted Development Process (9(+1)-phase loop)
+description: Default workflow for any non-trivial implementation — substrate/motivation/inventory, formulate, analyze, baseline, plan, second-model review, implement, verify, closing (package+ship), memory-update; with explicit loopback edges
+type: feedback
+originSessionId: 83898ac9-e61f-4c44-8429-0154cb12d124
+---
+Markus's standardized loop for our implementation work. Apply by default whenever a task is bigger than a one-liner. Skipping phases is a deliberate choice that should be flagged, not a default.
+
+## Phase 0 — Substrate / Motivation / Inventory
+
+Pre-formulation. Lock the research question and assemble the substrate *before* Phase 1 commits to a measurable goal. Output: a `phase0_findings.md` artifact that future phases can refer back to without re-deriving.
+
+- **Research question + mechanism captured.** State the question in one sentence. Capture any operator-supplied mechanism (the "why this question, how does it work" insight) verbatim — it's the load-bearing claim Phase 1 binds against.
+- **Predecessor carry-over: state vs data.** When a campaign succeeds another, categorize what transfers. *State* (installed packages, governor settings, system tweaks, source-read file:line pointers, protocol designs, parser scripts) carries forward. *Data* (drop counts, perf percentages, threshold values, baseline floors) does not — it is reference history only. Binding cells in this campaign anchor to in-session-acquired numbers, even if the predecessor measured an identical condition.
+- **Tooling and measurement-instrument inventory.** What's installed, what would need installing, what extensions/protocols the live system actually supports. Live verification, not paper compatibility.
+- **In-session baseline anchor.** Re-run the reference rep — N=3 minimum if the baseline is load-bearing for the campaign's premise — *before* any instrument changes. **If the predecessor's reference floor doesn't replicate at N=3 in the same session, that is the campaign result.** Don't build multi-phase infrastructure on an N=1 historical floor. See `feedback_replicate_baseline_first.md`.
+- **Open questions tabled.** What's not known going into Phase 1. Phase 1 locks against the knowns; Phase 0 surfaces the unknowns explicitly so they don't slip into binding cells unverified.
+
+## Phase 1 — Goal Formulation
+Define the objective in measurable terms. State what success looks like *before* touching anything. The chosen metric is a **hypothesis** about what to measure, not an axiom — Phase 3 may invalidate it.
+
+## Phase 2 — Situation Analysis
+Document current state. Identify constraints, dependencies, known failure modes. **Reset context here** — do not carry assumptions from prior sessions; re-read CLAUDE.md, relevant memory files, run `git status`, re-verify reachability.
+
+## Phase 3 — Baseline Measurements
+Take concrete measurements *before* any changes. Paste raw output into DokuWiki at capture time — verbatim, not paraphrased. The Phase 5 artifact is the raw data, not Claude's summary.
+
+**Real data, not theatre.** Phase 3 exists to use AI capacity for absorbing wide, low-level instrumentation a human reader would skim past. Attaching strace / perf / ftrace / eBPF / custom tripwires to the process under test is real Phase 3; scraping mpv's stdout dropped-frame counter is not. Discriminator: if a human with bash and grep could produce the same baseline, it isn't Phase 3 yet — go down to the syscall / call-path / MMIO / register layer. See `feedback_phase3_no_theatre.md`.
+
+**Anti-fabrication:**
+- Every cited value traces to a visible tool invocation or verbatim paste-in. If a measurement wasn't taken, write "not measured" — never an estimate, inference, or recall from training / prior sessions / sibling-host memory.
+- Raw before derived. A derived number (FPS, p99, error rate) appears alongside the raw stream it came from, never alone.
+- Rig failure is the finding. Empty strace, dead UART, perf counter that didn't increment → that *is* the Phase 3 result. Loop back to Phase 2 to fix the rig; do not synthesize plausible-looking baseline data to keep momentum.
+
+- **If baseline reveals the Phase 1 metric was tracking the wrong thing → loop back to Phase 1** with the corrected target. (Example: "max H.264 FPS" Phase 1 metric, but baseline shows DMA-setup + sync overhead dwarfs decode → real metric is bytes-copied-per-second / EGL surface-import time, not FPS.)
+
+**Measurements describe what the system *does*, not what it *should do*.** Baseline data is evidence, not a specification. Do NOT derive API call sequences, struct layouts, or parameter values from observed behaviour (strace, perf, example output). Observable behaviour may reflect bugs, workarounds, or implementation accidents — anything you copy from it inherits those.
+
+## Phase 4 — Plan
+Formulate the approach. Identify what will and will not be touched. State expected outcome of implementation in the *same* measurable terms used in Phase 1/3.
+
+## Phase 5 — Second Model Review
+Goal, situation, measurements, plan get pasted into **DokuWiki**. Markus reviews and redacts, then initiates the handover to a fresh model instance. **Claude does not curate the artifact going to the reviewer** — that would re-introduce the blind-spot accumulation the review is meant to escape. Do not summarize when handing over; paste the actual artifacts.
+
+## Phase 6 — Implementation
+Execute the plan. Scope strictly to what was planned — resist feature creep, refactor-creep, "while I'm here" cleanups, and over-eager scope expansion. If a plan revision is needed mid-implementation, surface it explicitly and re-enter Phase 4.
+
+**Contract before code.** Before writing or modifying any call site:
+- Read the API contract — kernel docs, header comments, and upstream source for every call touched.
+- State the contract explicitly before implementing against it (in the plan, the commit message, or a comment — somewhere reviewable).
+- If the contract cannot be found: stop and surface the gap. Don't infer it from baseline behaviour or sibling code.
+
+**Copying from baseline measurements is not implementation. It is transcription of potentially broken behaviour.** A deliverable that matches baseline bytes but violates the API contract is not a deliverable — it is a deferred bug.
+
+### What "state the contract explicitly" looks like
+
+Worked example: `0012-h264-omit-scaling-matrix-frame-based.patch` in `~/src/ohm_gl_fix/phase6/step1/`. The commit message opens with the contract before any code:
+
+> VAAPI signals "explicit scaling lists are present in the bitstream" implicitly: the consumer (ffmpeg-vaapi, mpv, etc.) sends a `VAIQMatrixBufferH264` alongside `RenderPicture` iff `sps_scaling_matrix_present_flag || pps_scaling_matrix_present_flag`. When the bitstream uses default (flat) scaling, no IQMatrixBuffer arrives […]
+>
+> Earlier draft of this patch unconditionally omitted SCALING_MATRIX in FRAME_BASED. That's **corpus-correct** (bbb has no explicit scaling lists) but the **wrong predicate**: the kernel-side gating is by "matrix-supplied vs. not," not by decode mode. […]
+>
+> Contract verification (audit_0008_decode_params_2026-05-01.md + hantro_h264.c::assemble_scaling_list): the kernel uses the supplied matrix when SCALING_MATRIX is in the control batch and falls back to spec-defined defaults when absent. Mode-independent.
+
+What this gets right:
+- **Contract first**: per-control rules cited from kernel doc (`ext-ctrls-codec-stateless.rst:752`), kernel driver (`hantro_h264.c::assemble_scaling_list`), and sibling implementation (gst-plugins-bad commit 9e3e775) — *before* any patch hunks.
+- **Corpus-correct ≠ spec-correct, called out by name**: the rejected predicate ("omit SCALING_MATRIX in FRAME_BASED") *did* match the BBB baseline. It still got rejected, because the contract said the gate is "matrix-supplied vs. not," not "decode mode." This is exactly the Phase 3-derived-implementation trap.
+- **Then** the diff implements one branch per contract clause: SPS/PPS/DECODE_PARAMS always, SCALING_MATRIX iff `matrix_set`, SLICE_PARAMS iff SLICE_BASED, PRED_WEIGHTS iff SLICE_BASED + `V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED`.
+
+Mirror format anywhere reviewable: PR description, commit message body, plan section, or a header comment block. The shape is "contract clauses with citations → code that maps 1:1 to those clauses."
+
+## Phase 7 — Verification Measurements
+Repeat measurements from Phase 3. Compare explicitly against baseline.
+- **If the delta does not match Phase 4's prediction → loop back to Phase 4** (re-plan). Do not declare success when the numbers say otherwise; an unexplained delta is a finding, not a footnote.
+
+## Phase 8 — Closing (Package & Ship)
+Ship the deliverable to its consumption point. Working code that lives only in a checkout is half a deliverable — the next session has to re-discover it, the fleet doesn't get the fix, and the loop's value evaporates.
+
+- **Kernel patch → kernel-agent package.** Route through the kernel-agent flow (`fleet/<host>.yaml` + scope-tagged patches) so the kernel package gets properly built, signed, and published. Don't leave loose `.patch` files in a working tree. See `project_kernel_agent.md` for the manifest shape; `linux-ampere-fourier` and `linux-fresnel-fourier` are the canonical examples.
+- **Program / library change → marfrit-packages.** Add or update a PKGBUILD (Arch/ALARM) or debian/ tree (deb), push to `git.reauktion.de/marfrit/marfrit-packages`, and let `.gitea/workflows/build.yml` produce + sign + publish to `packages.reauktion.de`. See `project_marfrit_packages.md`. Local-only fixes go upstream as PR-quality diffs into the same overlay.
+- **Skipping is a deliberate choice.** If the change is one-shot scratch work (debugging tripwire, throw-away script), say so explicitly in the closing note. The default is: it gets packaged.
+- **Re-verify on the deploy host with the packaged artifact.** A clean Phase 7 result from a hand-rolled dev build (e.g. `meson -Dbuildtype=release && ninja`) is **not** the same as the `.pkg.tar.zst` / `.deb` that the deploy host installs. Distro packaging flags (Arch makepkg's `-O2 + FORTIFY + stack-protector-strong + stack-clash-protection` vs meson's `-O3 -DNDEBUG`, debhelper's hardening defaults, lto toggles) vectorise / unroll loops differently and routinely unmask latent UB the dev build folded away. Pull the published package down via the package manager and re-run the Phase 7 success criterion against it before closing — until that PASSes, the loop is not done. See `feedback_package_build_flags_unmask_bugs.md` for the iter39 incident that codified this.
+
+## Phase 9 — Memory Update
+Loop terminates here. Distill the lesson into a memory entry — what was the mistake the loop caught, what's the rule that would shorten the next cycle. Do not let the lesson rot in chat history.
+
+---
+
+## Loopback edges (summary)
+- Phase 3 → Phase 1 (metric was wrong)
+- Phase 7 → Phase 4 (plan didn't deliver predicted delta)
+- Any phase → Phase 0 (substrate was wrong: predecessor baseline didn't replicate, mechanism doesn't engage on this stack, or the data inverts the premise → re-anchor or honest close)
+- Phase 9 closes the loop
+
+## Why this exists
+Several recurring failures in prior work codify into individual rules — observer-first, simulate-before-flash, three-strikes-then-verify, "trust eyes not vibes," scope-strictly-to-plan, no-fake-dry-run. Those are all symptoms; this loop is the structural fix. Use it as the spine and let those rules show up as rejection patterns inside the appropriate phases.
diff --git a/docs/phase0.md b/docs/phase0.md
new file mode 100644
index 0000000..8dbc370
--- /dev/null
+++ b/docs/phase0.md
@@ -0,0 +1,239 @@
+---
+phase: 0
+status: closed 2026-05-18
+date_opened: 2026-05-17
+date_closed: 2026-05-18
+research_method: three rounds of parallel web research (Sonnet via Agent), plus hands-on hertz substrate inventory and live `vulkaninfo` capture
+target_hardware: hertz (Pi 5 8 GB) for dev; higgs (CM5) eventual user target
+---
+
+# Phase 0 — Substrate / motivation / inventory
+
+This is the consolidated Phase 0 record. Path A (custom VPU firmware)
+is **closed at the silicon-RoT step**; Path B (QPU compute via the
+existing Mesa `v3d` driver) is **open**. The remainder of the
+project lives in Path B.
+
+The earlier session produced two separate Phase 0 artifacts that
+were lost when the working tree was wiped at 2026-05-18 10:25
+(`.git-broken-2026-05-18/` retains the corrupted state if needed).
+This document supersedes both.
+
+---
+
+## 1. Research question
+
+Verbatim from `README.md`:
+
+> Community-built VP9 / AV1 software-decode back-end running on the
+> VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 /
+> Compute Module 5), via the existing Mesa `v3d` userspace driver.
+
+The load-bearing claim: *the QPU is programmable by us, on stock
+production hardware, and the codec back-end is a workload class
+where that programmability buys CPU time on the A76 cluster.*
+Phase 0's job is to test that claim before Phase 1 binds a metric.
+
+## 2. Substrate inventory — hertz
+
+Captured live 2026-05-17 via SSH. Full `vulkaninfo` in
+`vulkaninfo_v3d_7_1_7_hertz.txt`.
+
+| | |
+|---|---|
+| Host | hertz, Pi 5, 8 GB, eMMC + 1 TB SATA |
+| Role | LXD host for 11 containers (home-LAN spine — DNS / VPN / HA proxy / NCP / SMTP) |
+| OS | Debian 13 Trixie |
+| Kernel | `6.12.75+rpt-rpi-2712` (RPi Foundation kernel, 2026-03-11) |
+| CPU | 4× Cortex-A76 @ 2.8 GHz |
+| GPU clock | V3D 7.1 @ 1000 MHz (slight OC; spec 960 MHz) |
+| Mesa | `25.0.7-2+rpt4` (`libvulkan_broadcom.so` v3dv ICD) |
+| Vulkan loader | `1.4.309` |
+| Vulkan device API | 1.3.305 (conformance 1.3.8.3) |
+| DRM nodes | `card0 → v3d` (compute target), `card1 → vc4-drm` (display), `renderD128` |
+| kernel uAPI hdr | `/usr/include/drm/v3d_drm.h` present |
+| Build tools | cmake 3.31, ninja 1.12, libvulkan-dev 1.4.309, glslang-tools 15.1.0, spirv-tools 2025.1, libdrm-dev 2.4.131 (installed 2026-05-17) |
+| User groups | mfritsche ∈ `render`, `video`, `lxd`, `sudo` |
+| Memory pressure | 7.9 GiB RAM, ~3 GiB available; 6 GiB zram, ~2.8 GiB in use (cohabitation with LXD spine) |
+| Watchdog | yes — power-cut reboot via Himbeere plug if hertz crashes (acknowledged dev cost: household DNS/VPN drops during each reboot cycle) |
+
+**Inside-view V3D 7.1 compute envelope** (from
+`vulkaninfo_v3d_7_1_7_hertz.txt`):
+
+| Property | Value | Implication |
+|---|---|---|
+| `maxStorageBufferRange` | 1 GiB | Bounds single-tensor size; codec working sets (frames, planes) fit trivially |
+| `maxPerStageDescriptorStorageBuffers` | 8 | Forces ≤8 SSBO bindings per dispatch — ggml-vulkan binds more, doesn't fit |
+| `maxComputeSharedMemorySize` | 16 KiB | Small tiled kernels only; codec block work (8×8, 16×16) fits easily |
+| `maxComputeWorkGroupInvocations` | 256 | Standard |
+| `maxComputeWorkGroupSize` | 256 / 256 / ? | Standard |
+| `subgroupSize` | 16 (fixed) | Matches QPU SIMD width |
+| `subgroupSupportedOperations` | BASIC + VOTE only | No arithmetic reductions — accumulate via shared memory |
+| `shaderFloat16` | **false** | Storage only; arithmetic runs FP32 |
+| `shaderInt8` | **false** | Storage only; arithmetic on widened ints |
+| `shaderInt16` | **false** | Same |
+| `storageBuffer8/16BitAccess` | true | Can load tightly-packed quantized / packed pixel data |
+| `subgroupSizeControl`, `computeFullSubgroups`, `synchronization2` | true | Modern compute features available |
+
+**Throughput envelopes** (from prior community measurements,
+not yet re-confirmed in-session):
+
+| Metric | Value | Source |
+|---|---|---|
+| V3D 7.1 theoretical FP32 peak | ~92 GFLOPS at 960 MHz | 12 QPU × 4 ALU × 2 op/cycle |
+| Direct-DRM SGEMM sustained | 21.4 GFLOPS (~23%) | `Idein/py-videocore7` |
+| Vulkan-compute `vkpeak` fp32-vec4 | 6.9 GFLOPS (~7.5%) | RPi forum benchmark thread |
+| A76 NEON sustained for matmul | ~50 GFLOPS | Multiple benchmark sources |
+| Shared LPDDR4x bus | ~17 GB/s nominal | LPDDR4x-4267 × 32 bit / 8 |
+| GPU-measured BW share | 4–7 GB/s | py-videocore7 scopy benchmark |
+| CPU NEON BW achievable | 12–15 GB/s | Pi 5 STREAM benchmarks |
+
+## 3. Path A — closed
+
+**Custom VPU firmware loaded onto VC7 scalar cores.** This was the
+README's original framing.
+
+Blocked at the silicon-RoT step:
+
+- **BCM2712 mask ROM hardcodes RPi's public key** and unconditionally
+  verifies the second-stage bootloader (`bootsys`) on every boot
+  path (SPI flash, USB rpiboot, SD recovery). RPi holds the
+  corresponding private key.
+- `EXECUTE_CODE` mailbox tag (the only documented Pi 1–4 runtime
+  "run code on a VPU core" mechanism) **confirmed removed on Pi 5**
+  by Pi Foundation engineer (forum.raspberrypi.com).
+- Pre-CRA EEPROM downgrade is possible (no anti-rollback fuse) but
+  only yields *older RPi-signed* EEPROMs — doesn't help.
+- OTP fuse state on stock CM5 is already the most permissive
+  possible (customer key hash = zero); the RPi-key check is
+  silicon-unconditional, not gated by OTP.
+- CM5 vs retail Pi 5: same silicon, same chain, no meaningful
+  security delta.
+- One non-software escape exists: VPU JTAG via documented test
+  points (`schlae/cm5-reveng`, Dec 2025). Hardware mod only,
+  sealed-chassis higgs not the dev unit, novel research with no
+  published firmware-injection workflow. Out of scope for this
+  project.
+
+Verdict: **structurally blocked for community use without RPi
+cooperation or hardware-RE-grade work on a sacrificial CM5.**
+
+## 4. Path B — open
+
+**QPU compute kernels via the existing Mesa `v3d` driver.** Reachable
+from userspace today on a stock signed Pi 5 / CM5 via
+`/dev/dri/card0` (Vulkan compute through `v3dv`) or `renderD128`
+(direct DRM submit, py-videocore7 style). No firmware loading.
+No signing fight. mfritsche on hertz is in the `render` group and
+can hit the device without sudo.
+
+The substrate is real:
+- `Idein/py-videocore7` runs SGEMM at 21 GFLOPS sustained on stock
+  Pi 5 with no special setup — existence proof of arbitrary QPU
+  programs.
+- Mesa v3dv is Vulkan 1.3-conformant on V3D 7.1 (Mesa 24.3+;
+  hertz runs 25.0.7).
+- The kernel `v3d` DRM driver is fully upstream and open.
+
+Phase 0 does **not** assume Path B leads to a winning result. It
+asserts only that Path B is *reachable*, where Path A isn't.
+
+## 5. Why this isn't the same project as "v3d backend for llama.cpp"
+
+A llama.cpp v3d backend was investigated mid-session and rejected
+as structurally infeasible. The verdict was decisive: GPU loses
+to CPU on raw FP32 (21 vs ~50 GFLOPS), on memory bandwidth share
+(4–7 vs 12–15 GB/s), and on quantized instruction support (no
+INT8 MAC vs A76 SDOT/UDOT). For LLM matmul, the QPU is the wrong
+substrate.
+
+**Codec back-end work is a different workload class** with
+properties that fit the QPU substantively better:
+
+| Property | LLM matmul | Codec back-end (post-entropy) |
+|---|---|---|
+| Working set per dispatch | Whole weight matrices (GB) | Per-block (8×8 / 16×16, hundreds of bytes) — fits in 16 KiB shared mem |
+| Dominant op | INT8 MAC | Integer add / shift / small-constant multiply |
+| Why GPU misses | No INT8 MAC | Less impact — fewer multiplies, mostly add/shift |
+| Memory pattern | Full-tensor stream | Sequential plane reads, TMU-friendly |
+| Parallelism | One big GEMM | Thousands of independent small blocks per frame |
+| A76 advantage | NEON SDOT/UDOT crushing it | Less specialized; QPU advantage real |
+| Bandwidth-bound? | Yes (kills the GPU) | Compute-bound at block scale |
+
+This is the load-bearing reframe between the failed llama.cpp
+investigation and the daedalus-fourier scope. Codec back-end
+*might* live on the QPU. Phase 1 measures whether it actually does.
+
+## 6. Honest probability assessment
+
+A competent outside reviewer should rate the project as **hard but
+viable**, with one concrete prior precedent (MulticoreWare /
+Imagination PowerVR OpenCL VP9 decoder, 2014, achieved 1080p30 in
+a hybrid model with CPU entropy + GPU back-end on a comparable
+embedded GPU) and one concrete recent failure (FFmpeg 8.0 VP9-on-
+Vulkan-compute, 2025, produced corrupted output on a much more
+capable NVIDIA target — but the failure was in the *attempt to
+move entropy onto GPU*, not the back-end).
+
+The win condition is **not** "GPU beats CPU at the same work." The
+win condition is **"GPU work overlaps with CPU work that has to
+happen anyway"** — concurrent decode where ARM does entropy and
+the QPU finishes the block-level back-end on the previous frame,
+recovering CPU time for the rest of the system (browser, audio,
+UI, the 11 LXD containers on hertz).
+
+Phase 1 measures the building block: one kernel, bit-exact, with
+numbers. Phase 2+ only if Phase 1 numbers justify it.
+
+## 7. Open questions for Phase 1
+
+1. **What's the actual single-kernel QPU throughput on a
+   codec-shaped workload?** SGEMM at 21 GFLOPS is the only public
+   number, and SGEMM is not block-IDCT-shaped. We need an in-session
+   N=3 measurement on a real codec kernel.
+
+2. **What's the ARM NEON baseline for the same kernel on the same
+   hertz?** libavcodec ships highly-tuned NEON paths for IDCT,
+   deblocking, etc. Without measuring NEON in-session, "the QPU
+   wins" or "the QPU loses" is unverifiable.
+
+3. **Vulkan compute vs direct DRM submit — which path?** Vulkan
+   has tooling, documentation, debuggability. Direct DRM has
+   ~10–15% lower per-dispatch overhead and bypasses the
+   v3dv-imposed 16 KiB shared-mem / 8-SSBO limits, at the cost
+   of writing QPU asm against the NDA ISA. Phase 1 picks one.
+
+4. **Memory bandwidth contention with concurrent ARM decode.**
+   The shared ~17 GB/s bus is the ceiling. If the QPU and ARM NEON
+   contend for bandwidth while both run, the "concurrent work" win
+   disappears. Needs in-session measurement once any kernel exists.
+
+5. **VC7 thermal headroom under sustained mixed CPU+GPU load.**
+   Pi 5 throttles GPU at 85°C, CPU at 80°C. hertz idles at ~64°C
+   with the LXD spine; mixed compute will push higher. Whether
+   hertz will need active cooling for this is an open question.
+
+These are Phase 1's burden, not Phase 0's. Phase 0 closes here.
+
+## 8. Sources
+
+Earlier session's web research produced ~7000 words of substrate
+references across 6 parallel threads. The full source list lived
+in the deleted `phase0_findings.md` and `phase0_wall1_bypass.md`.
+The high-value pointers that should follow this project forward:
+
+- [Mesa `src/broadcom/qpu/qpu_instr.h`](https://github.com/Mesa3D/mesa/blob/main/src/broadcom/qpu/qpu_instr.h) — de-facto VC7 QPU ISA reference (no Broadcom-published doc; ISA under NDA)
+- [Mesa `src/broadcom/compiler/`](https://github.com/Mesa3D/mesa/tree/main/src/broadcom/compiler) — NIR→QPU compiler, the open ground truth for what V3D 7.1 can do
+- [`Idein/py-videocore7`](https://github.com/Idein/py-videocore7) — working QPU GPGPU runtime via DRM; SGEMM benchmark; existence proof
+- [`Towdo/py-videocore7`](https://github.com/Towdo/py-videocore7) — fork with more fixes
+- [Mesa `v3dv` driver source](https://gitlab.freedesktop.org/mesa/mesa/-/tree/main/src/broadcom/vulkan) — Vulkan compute path
+- [Pi 5 HEVC kernel driver patch series](https://patchwork.kernel.org) — closest architectural template for ARM-side V4L2 stateless wrapping a Pi-5 hardware accelerator (search "rpi-hevc-dec")
+- [raspberrypi/usbboot secure-boot.md](https://github.com/raspberrypi/usbboot/blob/master/docs/secure-boot.md) — Wall 1 silicon-RoT confirmation
+- [schlae/cm5-reveng](https://github.com/schlae/cm5-reveng) — CM5 PCB RE; VPU JTAG test points (Dec 2025; out of Path B scope, kept as escape hatch reference)
+- [MulticoreWare / Imagination PowerVR VP9 OpenCL decoder press](https://www.design-reuse.com/news/34030/vp9-decoder-imagination-powervr-series6-gpus.html) — 2014 precedent for hybrid codec back-end on embedded GPU compute
+- [FFmpeg 8.0 part-3 VP9 Vulkan failure post](https://www.rendi.dev/blog/ffmpeg-8-0-part-3-failed-attempts-to-use-vulkan-for-av1-encoding-vp9-decoding) — recent cautionary tale; failure was in entropy stage, not back-end
+- [`Halide/Halide` Vulkan Pi 5 issue #8494](https://github.com/halide/Halide/issues/8494) — known runtime edge cases on Pi 5 Vulkan
+- [Pi Forum p=2330030](https://forums.raspberrypi.com/viewtopic.php?p=2330030) — RPi engineer confirms VC7 ISA NDA + EU CRA signing rationale
+
+Future phases should add citations here as they're consumed, not
+re-derive Phase 0's substrate findings.
diff --git a/docs/phase1.md b/docs/phase1.md
new file mode 100644
index 0000000..0032fe3
--- /dev/null
+++ b/docs/phase1.md
@@ -0,0 +1,128 @@
+---
+phase: 1
+status: open
+date_opened: 2026-05-18
+parent: phase0.md
+target_kernel: VP9 / AV1 8×8 inverse DCT (integer fixed-point)
+dev_host: hertz
+---
+
+# Phase 1 — Goal formulation
+
+Per `dev_process.md`:
+
+> Define the objective in measurable terms. State what success looks
+> like *before* touching anything. The chosen metric is a **hypothesis**
+> about what to measure, not an axiom — Phase 3 may invalidate it.
+
+## Kernel under test
+
+**VP9 / AV1 8×8 inverse DCT (DCT_DCT variant), integer 16-bit
+fixed-point input, 8-bit output, with reconstructed-block add.**
+
+Mirrors the `ff_vp9_idct_idct_8x8_add_neon` shape in libavcodec
+(see `libavcodec/aarch64/vp9itxfm_neon.S`) and the equivalent
+dav1d / rav1d / libgav1 implementations for AV1's `IDTX_DCT` /
+`DCT_DCT` 8×8 path.
+
+I/O contract (per VP9 spec § 8.7 inverse transform process):
+
+```
+input:   int16_t coeffs[64]   // dequantized transform coefficients
+input:   uint8_t pred[64]     // predicted block (intra/inter)
+input:   ptrdiff_t stride     // typically 8 for an isolated test
+output:  uint8_t dst[64]      // clamp(pred + idct(coeffs)) per pixel
+```
+
+Bit-exact: integer arithmetic per spec, no rounding ambiguity.
+
+## Measurable success criteria
+
+Three numbers must come out of Phase 7, all measured in-session on
+hertz, all N≥3:
+
+| ID | Measurement | What it tells us |
+|---|---|---|
+| **M1** | **Bit-exactness rate** vs libavcodec C reference, across ≥10 000 random coefficient blocks | Correctness gate. Must be 100.000 %. Anything less and the kernel is wrong, no other number matters. |
+| **M2** | **QPU throughput** in million-blocks-per-second (MblockS), single-threaded host driver, sustained over ≥1 s | The substrate's actual delivered capacity for this kernel shape. |
+| **M3** | **NEON throughput** in MblockS on the same hertz, single-threaded, running `ff_vp9_idct_idct_8x8_add_neon` via a microbench harness | The floor any GPU offload has to beat or get close to. |
+
+Derived figure for go/no-go: **R = M2 / M3**.
+
+## Decision rules (set before measuring, per `feedback_no_motivated_reasoning`)
+
+| R | Interpretation | Next step |
+|---|---|---|
+| ≥ 1.0 | QPU beats NEON on this kernel in isolation. Strong substrate signal. | Phase 9 lessons → Phase 1 of next kernel (deblocking or CDEF). |
+| 0.5 ≤ R < 1.0 | QPU loses in isolation but is in the same order of magnitude. *Concurrent-work* hypothesis becomes viable: at R≈0.5 the QPU can roughly handle half of decode while the CPU does the other half + everything else. | Add a Phase 1' measurement: M4 = combined CPU+QPU throughput when both run concurrently (does total system delivery exceed pure-CPU?). Then decide. |
+| 0.1 ≤ R < 0.5 | QPU is materially slower. Concurrent-work win unlikely to be worth the integration cost. | Honest close. Phase 9 documents the negative result. |
+| < 0.1 | QPU is structurally wrong for this kernel shape. | Honest close. Phase 9 documents the failure, project shelves. |
+
+These thresholds are deliberately published *before* measurement so
+the result can't be retroactively reframed.
+
+## Secondary measurements (not gating, but recorded)
+
+- **M5** — per-kernel-launch overhead in µs, isolated (run with 0
+  blocks, measure submit+wait round-trip). Tells us the floor below
+  which kernel batching is required.
+- **M6** — workgroup-size sweep across {8, 16, 32, 64, 128, 256}
+  invocations to identify the v3dv-optimal launch shape for this
+  kernel. Records the Pareto curve, doesn't change R unless the
+  best-WG result invalidates M2.
+- **M7** — power draw delta at the wall (via the Himbeere Fritz!DECT
+  plug telemetry, if reachable) under idle vs CPU-only vs QPU-only
+  vs CPU+QPU concurrent. Order-of-magnitude only; informs the higgs
+  battery argument that motivates the project.
+
+## What Phase 1 does *not* lock
+
+- The dispatch path (Vulkan compute via `v3dv` vs direct DRM
+  submit via `v3d_drm.h` ioctl). Phase 4 picks. Default for
+  Phase 1 = **Vulkan compute** unless Phase 4 has reason to flip:
+  documented, debuggable, doesn't require QPU asm against the
+  NDA ISA.
+- The shader source (GLSL → glslang → SPIR-V) vs hand-written
+  SPIR-V. Default = GLSL.
+- Workgroup partitioning (one-block-per-WG vs many-blocks-per-WG).
+  Phase 4 chooses based on subgroup width and tile cost; Phase 1
+  records the sweep (M6).
+
+## Non-goals for Phase 1
+
+- No V4L2 driver work.
+- No end-to-end VP9 / AV1 decode (entropy + back-end). Just one
+  kernel, isolated, measured.
+- No optimization beyond what's needed to hit the bit-exact gate
+  and produce a single throughput number. Tuning is Phase 7's
+  feedback if R is borderline.
+- No build-system perfection. A CMakeLists that compiles the test
+  harness on hertz is enough.
+
+## Phase 1 → Phase 2 hand-off conditions
+
+Phase 1 closes when:
+- The above metrics + decision rules are reviewed. (This is not
+  the second-model review of dev_process.md Phase 5; that review
+  comes later, after the Phase 4 plan — this is *Phase 1*, not
+  Phase 5.)
+- The metrics are recorded in this file or a sibling
+  `phase1_metrics.md` artifact (TBD).
+
+The next phase (Phase 2 — situation analysis) inventories:
+- libavcodec's NEON IDCT reference (file, function, calling
+  convention, expected I/O contract).
+- VP9 spec § 8.7 transform process (which the C reference
+  implements verbatim).
+- AV1 spec § 7.13 (same 1-D transform structure, larger transform
+  set; the 8×8 DCT_DCT path is *not* bit-identical to VP9's — see phase2.md §2.3).
+- Mesa v3dv's compute-shader compilation path and any known
+  v3dv-specific shader idioms that perform better on V3D 7.1.
+- The hertz Vulkan dispatch overhead floor (M5 candidate, but
+  measured as part of Phase 3 baseline).
+
+## Open questions Phase 1 hands forward
+
+None new. Phase 0 § 7's open questions are the standing list;
+Phase 1 picks off Q1 (single-kernel throughput) and Q2 (NEON
+baseline) directly via M2 and M3.
diff --git a/docs/phase2.md b/docs/phase2.md
new file mode 100644
index 0000000..8cad881
--- /dev/null
+++ b/docs/phase2.md
@@ -0,0 +1,212 @@
+---
+phase: 2
+status: closed 2026-05-18
+date_opened: 2026-05-18
+parent: phase1.md
+target_kernel: VP9 8×8 inverse DCT (DCT_DCT variant, 8-bit pixels)
+---
+
+# Phase 2 — Situation analysis
+
+Per `dev_process.md`:
+
+> Document current state. Identify constraints, dependencies, known
+> failure modes. Reset context here — do not carry assumptions from
+> prior sessions; re-read CLAUDE.md, relevant memory files, run
+> `git status`, re-verify reachability.
+
+## 1. Context reset
+
+- Working tree state: dirty (Phase 0/1/2 docs not yet committed).
+  `.git-broken-2026-05-18/` preserved as a forensic artifact of
+  the 2026-05-18 10:25 working-tree wipe (cause undetermined).
+- CLAUDE.md re-read: no contradictions with the Path B scope set
+  in README §"Architecture (Path B)".
+- hertz reachability: confirmed via SSH; `vcgencmd`, `vulkaninfo`,
+  `apt`, sudo NOPASSWD all working as of 2026-05-17 inventory.
+  Mesa 25.0.7 / Vulkan 1.3.305 / V3D 7.1.7 stable.
+
+## 2. Reference implementations — VP9 8×8 IDCT (DCT_DCT)
+
+The Phase 1 kernel has *two* canonical reference implementations
+in FFmpeg n7.1.3 (the version installed on hertz). The harness
+will link both: the C path as the bit-exact gate (M1), the NEON
+path as the throughput baseline (M3).
+
+### 2.1 C reference
+
+- **Source**: `libavcodec/vp9dsp_template.c`, function `idct_idct_8x8_add_c`
+- **Spec basis**: VP9 specification §8.7 — Inverse transform process
+- **Signature**:
+
+  ```c
+  static void idct_idct_8x8_add_c(uint8_t *_dst, ptrdiff_t stride,
+                                  int16_t *_block, int eob);
+  ```
+
+- **Algorithm** (8-bit path):
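+  1. If `eob == 1` (DC-only): scale the DC coefficient by 11585 twice, each step with its own Q14 round-shift (`(x * 11585 + (1 << 13)) >> 14`, not a single combined `coef * 11585 * 11585` product), apply the final `(t + 16) >> 5`, then broadcast to 8×8 with `+pred, clamp[0,255]`.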
+  2. Otherwise: 8 column passes through `idct8_1d` → tmp[64]. Zero the input block. 8 row passes through `idct8_1d` → out[8]. Per-element `(out + 16) >> 5`, add to `dst`, `av_clip_pixel`.
+- **`idct8_1d`**: 1-D 8-point inverse DCT, 8 trigonometric multiply-add stages with Q14 fixed-point constants then 8-butterfly add/sub stages. All arithmetic is signed int32 (`dctint`).
+- **Q14 constants** (matched against VP9 spec §8.7.1.4):
+  | symbol | value | trig identity |
+  |---|---|---|
+  | cospi_16_64 | 11585 | cos(π/4) × 2^14 ≈ 0.70711 |
+  | cospi_24_64 |  6270 | cos(3π/8) × 2^14 ≈ 0.38268 |
+  | cospi_8_64  | 15137 | sin(3π/8) × 2^14 ≈ 0.92388 |
+  | cospi_28_64 |  3196 | cos(7π/16) × 2^14 ≈ 0.19509 |
+  | cospi_4_64  | 16069 | sin(7π/16) × 2^14 ≈ 0.98079 |
+  | cospi_20_64 |  9102 | cos(5π/16) × 2^14 ≈ 0.55557 |
+  | cospi_12_64 | 13623 | sin(5π/16) × 2^14 ≈ 0.83147 |
+
+  Rounding convention: `(product + (1 << 13)) >> 14`, i.e. round-half-up at bit 14.
+
+- **License**: LGPL-2.1-or-later (FFmpeg).
+- **Side effect**: zeroes the input `block[]` (the call is therefore *not* idempotent: callers must re-populate the coefficients before each invocation).
+
+### 2.2 NEON reference
+
+- **Source**: `libavcodec/aarch64/vp9itxfm_neon.S`, symbol `ff_vp9_idct_idct_8x8_add_neon`
+- **Signature** (same as C):
+  ```
+  void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
+                                     int16_t *block, int eob);
+  ```
+  Registers: `x0=dst, x1=stride, x2=block, w3=eob`.
+- **Internal dependencies** (must be copied alongside the .S):
+  | macro / symbol | location | role |
+  |---|---|---|
+  | `idct8` | `vp9itxfm_neon.S` | 1-D 8-pt IDCT, fully unrolled with `dmbutterfly*` |
+  | `dmbutterfly0` | `vp9itxfm_neon.S` | rotation by π/4 (the `cospi_16_64` case) |
+  | `dmbutterfly` | `vp9itxfm_neon.S` | general 2-input rotation `[a,b] → [a·c1−b·c2, a·c2+b·c1]` (`Q14`) |
+  | `dmbutterfly_l` | `vp9itxfm_neon.S` | wide-form (4×i32 acc) for `dmbutterfly` |
+  | `butterfly_8h` | `vp9itxfm_neon.S` | trivial `[a+b, a−b]` on `int16x8_t` |
+  | `transpose_8x8H` | `libavcodec/aarch64/neon.S` | in-place 8×8 i16 transpose |
+  | `idct_coeffs` | `vp9itxfm_neon.S` (`const`) | Q14 trig constants table, aligned 4 |
+  | `movrel` | `libavutil/aarch64/asm.S` | PIC-aware constant-pool relocation helper |
+- **License**: LGPL-2.1-or-later (Google, 2016).
+- **Performance shape**: full unrolled 8-pt butterfly with NEON `smull/smlsl/smlal` + `rshrn` for the Q14 round-shift; output uses `sqxtun` for saturated narrow to u8. Estimated ~80 NEON instructions for the steady state (non-DC) path.
+
+### 2.3 AV1 equivalence note
+
+AV1's 8×8 DCT_DCT transform (`av1_iidentity8_iidentity8_c` vs `av1_idct8_idct8_c` family in `libavcodec/av1dsp/...`) shares the same 1-D 8-point structure but with **different** scaling: AV1 uses 12-bit fixed-point (`>> 12`) and a slightly different rounding shift due to its different transform-stage bit growth model. Calling our VP9 IDCT shader on AV1 coefficients will produce wrong output. **AV1 support is out of scope for Phase 1.** A Phase-N variant can fork the shader with the AV1 constants once Phase 1 has proven the VP9 path.
+
+## 3. Vulkan compute dispatch path
+
+Hertz exposes V3D 7.1 via Mesa's v3dv driver as Vulkan
+`PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU`, API 1.3.305, conformance
+1.3.8.3. The compute-only dispatch path is:
+
+```
+host program
+  ├─ vkCreateInstance / vkEnumeratePhysicalDevices (picks V3D 7.1.7.0)
+  ├─ vkCreateDevice (queue family with COMPUTE_BIT, no graphics needed)
+  ├─ vkCreateBuffer x N (SSBOs for block coeffs in / dst pixels in+out)
+  │     - buffer flags: STORAGE_BUFFER_BIT | TRANSFER_SRC/DST
+  │     - memory type: HOST_VISIBLE | HOST_COHERENT (zero-copy on shared LPDDR4x)
+  ├─ vkCreateDescriptorSetLayout (≤8 SSBOs per layout — Pi 5 limit)
+  ├─ vkCreateShaderModule (SPIR-V from glslang)
+  ├─ vkCreateComputePipeline
+  ├─ vkBeginCommandBuffer
+  │     vkCmdBindPipeline / vkCmdBindDescriptorSets / vkCmdPushConstants
+  │     vkCmdDispatch(group_count_x, 1, 1)   # one WG per ~K blocks
+  ├─ vkQueueSubmit + vkQueueWaitIdle (or fence) — this is the measured op
+  └─ (read back via the HOST_VISIBLE buffer, or alias it to the same memory the CPU populated)
+```
+
+Per Phase 0 §2 inside-view limits, the relevant constraints
+for this kernel:
+
+- ≤8 SSBOs per stage → group inputs/outputs into ≤8 bindings (we
+  only need 2: `block[]` in, `dst[]` in/out).
+- Shared mem ≤16 KiB → each 8×8 block fits trivially (128 B of
+  i16 coefficients, or 256 B if widened to i32, plus 64 B of u8
+  pixels). One WG can carry dozens of blocks of shared state if useful.
+- Subgroup size = 16 (fixed). One workgroup of 64 invocations =
+  4 subgroups; one block per subgroup is a natural shape (each
+  16-lane subgroup processes 8×8 = 64 pixels in 4 cycles of
+  subgroup work).
+
+## 4. Build path on hertz
+
+Already installed (2026-05-17): cmake 3.31, ninja 1.12, gcc (Debian
+trixie default), `libvulkan-dev 1.4.309`, `glslang-tools 15.1.0`,
+`spirv-tools 2025.1`, `libdrm-dev 2.4.131`, `vulkan-tools 1.4.304`.
+
+Missing but cheap:
+- `libavcodec-dev` — only needed if the harness wants to link
+  against system libavcodec for cross-checks against the dynamic
+  dispatcher. *Not* needed for the source-copy approach (preferred,
+  see §5).
+
+## 5. Reference-copy strategy (vs system-libavcodec link)
+
+**Decision: source-copy the FFmpeg reference files listed below into `external/ffmpeg-snapshot/`.**
+
+Rationale:
+- System `libavcodec.so` on hertz is symbol-stripped (`nm` returns
+  empty for `ff_vp9_idct_*`). Internal NEON entry points are not
+  reachable via `dlsym`.
+- The two reference implementations (C, NEON) plus their macro/
+  data dependencies total ~3 files / ~600 lines. Source-copy is
+  smaller than the dlopen plumbing would be.
+- LGPL-2.1-or-later (FFmpeg license) is propagation-compatible
+  with the harness binary if the harness binary itself is GPL
+  or LGPL. The kernel shaders and dispatch library stay
+  separately-licensed (BSD-2-Clause, default for this project).
+- Pinning to `n7.1.3` matches hertz's runtime libavcodec version,
+  so any in-session sanity cross-check against the running Mesa
+  / video tooling stays consistent.
+
+Files to vendor:
+
+| Source | License | Target path under `daedalus-fourier/` |
+|---|---|---|
+| `libavcodec/vp9dsp_template.c` | LGPL-2.1+ | `external/ffmpeg-snapshot/vp9dsp_template.c` |
+| `libavcodec/aarch64/vp9itxfm_neon.S` | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/vp9itxfm_neon.S` |
+| `libavcodec/aarch64/neon.S` (for `transpose_8x8H`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/neon.S` |
+| `libavutil/aarch64/asm.S` (for `movrel`, `function`, `endfunc`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/asm.S` |
+| (whatever else `vp9dsp_template.c` transitively needs) | LGPL-2.1+ | as required |
+
+A `external/ffmpeg-snapshot/COPYING.LGPL` and `external/ffmpeg-snapshot/PROVENANCE.md` document the upstream commit (n7.1.3 tag, commit hash) and the verbatim-copy guarantee.
+
+## 6. Known constraints / failure modes carried from Phase 0
+
+Repeated here so Phase 4 (plan) can bind against them without
+re-derivation:
+
+- **C1**: shaderFloat16 = false → all shader arithmetic must be int32 (we are int anyway — no risk).
+- **C2**: maxComputeSharedMemorySize = 16 KiB → kernel must not require more (8×8 IDCT trivially fits even with many blocks per WG).
+- **C3**: maxPerStageDescriptorStorageBuffers = 8 → we need only 2 (coeffs + dst), no risk.
+- **C4**: subgroupSupportedOperations = BASIC + VOTE only → no `subgroupAdd`/etc. for accumulator reductions. Workaround: the IDCT structure is fully data-parallel without reductions; this constraint doesn't bite.
+- **C5**: VC7 has SMUL24 but no INT8 MAC. Our Q14 multiplies are i16×i16→i32 — the multiplicands fit in 17 bits, so SMUL24 covers it. No INT8/INT4 issues.
+- **C6**: shared LPDDR4x bus; GPU sees ~4–7 GB/s vs CPU ~12–15 GB/s. For 8×8 IDCT, working set is tiny (≤320 B/block), so per-block bandwidth is not the bottleneck; per-dispatch submit overhead is.
+- **C7**: VPM read-stall serialization. If we hand-write QPU asm (we won't, in Phase 1) this would matter; the Vulkan compute path lets the v3d_compiler schedule for us.
+- **C8**: VC7 thermal throttle at 85°C GPU / 80°C CPU. Phase 7 measurements should record temp before/during/after to flag throttling.
+
+## 7. What Phase 2 does *not* close
+
+- The harness architecture (single binary? Two binaries — one for
+  bit-exact, one for throughput?). Phase 4 picks.
+- Block-per-WG dispatch geometry. Phase 4 + Phase 6 sweep.
+- Random-coefficient generation strategy (uniform i16 vs
+  realistic-distribution; the latter affects DC-only path
+  frequency). Phase 4 picks; Phase 7 may re-evaluate.
+- Whether NEON measurement uses `clock_gettime(CLOCK_MONOTONIC_RAW)`
+  per-call (high overhead) or batched (more realistic for codec
+  use). Phase 3 picks during baseline collection.
+
+## 8. Hand-off to Phase 3
+
+Phase 3 measures:
+- **M3-prelim**: NEON `ff_vp9_idct_idct_8x8_add_neon` throughput
+  on hertz, batched over 10⁶ random blocks, single-threaded,
+  4-thread, sched-isolated. This is the *floor*.
+- **M5-prelim**: Vulkan dispatch overhead — pipeline create cost
+  (one-time), per-`vkCmdDispatch` cost (per-frame-equivalent),
+  per-`vkQueueSubmit + vkQueueWaitIdle` cost (per-completion).
+  Together these set the bound below which kernel batching is mandatory.
+
+Both are measurements on the *existing* substrate. Neither
+requires writing any shader code. Phase 3 closes before Phase 4
+(plan) begins.
diff --git a/docs/phase3.md b/docs/phase3.md
new file mode 100644
index 0000000..700287f
--- /dev/null
+++ b/docs/phase3.md
@@ -0,0 +1,105 @@
+---
+phase: 3
+status: closed 2026-05-18
+date_opened: 2026-05-18
+date_closed: 2026-05-18
+parent: phase2.md
+host: hertz (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75+rpt-rpi-2712, Mesa 25.0.7-2+rpt4, V3D 7.1.7 @ 1 GHz, A76 @ 2.8 GHz)
+artifacts: build/bench_neon_idct, build/bench_vulkan_dispatch, build/noop.spv
+---
+
+# Phase 3 — Baseline measurements
+
+Per `dev_process.md`:
+
+> Take concrete measurements *before* any changes. Raw before
+> derived. Real data, not theatre.
+
+These numbers anchor every Phase 4+ decision. Re-run with the
+same harness on the same hertz before drawing any new conclusions
+in later phases.
+
+## M1 — bit-exact correctness gate (Phase 1)
+
+| | |
+|---|---|
+| Method | 10 000 random VP9-plausible coefficient blocks + random `pred[64]`, compare `daedalus_vp9_idct_idct_8x8_add_ref` C output vs vendored FFmpeg `ff_vp9_idct_idct_8x8_add_neon` |
+| Run    | `./bench_neon_idct --blocks 1000000 --iters 5` (built 2026-05-18) |
+| **Result** | **10 000 / 10 000 = 100.0000 %** |
+| DC-only path frequency | 11 / 10 000 = 0.11 % |
+| Notes | Random generator: xorshift64, biased toward 1–16 non-zero coeffs per block; eob mostly ∈ [4, 63]. DC-only frequency is incidental; Phase 7 may revisit if it materially affects the throughput number. |
+
+**Gate passes. Throughput measurement was authorized to run.**
+
+## M3 — NEON throughput (single-core)
+
+| | |
+|---|---|
+| Kernel | `ff_vp9_idct_idct_8x8_add_neon` from FFmpeg n7.1.3 (vendored, see `external/ffmpeg-snapshot/PROVENANCE.md`) |
+| Method | Pre-generate 1 M random blocks + preds. Per iteration: memcpy refresh of all blocks/preds (NEON path zeroes blocks), then call NEON kernel 1 M times. Subtract setup memcpy time from the measured wall-clock. 5 iterations, single thread, no CPU pinning. |
+| Compiler flags | `-O3 -march=armv8-a+simd` |
+| Run | `./bench_neon_idct --blocks 1000000 --iters 5` |
+| **Throughput** | **8.171 Mblock/s** |
+| Per-block | 122.4 ns |
+| Equivalent 1080p frame rate | 252.2 FPS (32 400 blocks per 1080p frame, assuming pure 8×8 work) |
+| Elapsed (kernel) | 0.612 s / 5 M blocks |
+| Elapsed (setup-only) | 0.250 s / 5 M iters |
+| Cross-check | Cycle estimate at 2.8 GHz: 122.4 ns × 2.8 GHz ≈ 342 cycles/block. Plausible for a fully-unrolled NEON 8-point IDCT with butterflies + saturated narrow stores; the FFmpeg implementation interleaves loads/computes/stores aggressively. |
+
+### M3 implications
+
+- A single A76 core handles ~8 M blocks/s = **252 FPS at 1080p**. Real decode needs ~60 FPS = 4.2× headroom on one core, ~16× headroom on all four cores. **NEON is not the bottleneck for current YouTube workloads on Pi 5.**
+- The QPU offload story is not "make decode faster" — decode is already fast enough single-threaded. The story has to be "free CPU cycles for the rest of the system" (browser, audio, the 11 LXD containers on hertz).
+- For a per-kernel R = QPU / NEON measurement (per `phase1.md §"Decision rules"`), the QPU has to hit ≥4 M blocks/s to score R ≥ 0.5. That's the gate.
+
+## M5 — Vulkan compute dispatch overhead
+
+| | |
+|---|---|
+| Method | Allocate empty pipeline (no descriptors, no push constants), bind+dispatch a `void main(){}` shader on `local_size_x=64`. Time `vkQueueSubmit` + `vkQueueWaitIdle` round-trip. 50 000 iterations, warm. |
+| Device | V3D 7.1.7.0 via Mesa v3dv 25.0.7 (selected past llvmpipe by `strstr("V3D")`) |
+| Run | `./bench_vulkan_dispatch --iters 50000` |
+| **M5a — empty CB submit+wait** | **22.66 µs / op** |
+| **M5b — 1-WG noop dispatch submit+wait** | **55.60 µs / op** |
+| **M5 delta — per-vkCmdDispatch + pipeline-bind** | **32.95 µs** |
+
+### M5 implications — the load-bearing finding for Phase 4
+
+This is the single most important number from Phase 3.
+
+- Per-dispatch cost (55.6 µs) is **~455× the NEON per-block cost** (122 ns).
+- A per-block QPU dispatch is structurally impossible — overhead dominates by two-and-a-half orders of magnitude.
+- Break-even batch size for a *hypothetical* zero-cost QPU kernel: **≥ 455 blocks per dispatch** (55.6 µs ÷ 122.4 ns/block). Real kernel cost on top of that.
+- Frame-level batching is mandatory: a 1080p frame has 32 400 8×8 blocks; one dispatch per frame amortizes M5b to 1.7 ns/block — well below NEON's 122 ns.
+- Tile-level batching is borderline: a typical VP9 64×64 superblock has 64 sub-blocks; 55.6 µs / 64 ≈ 870 ns/block, ~7× NEON. Probably too fine-grained — frame-level or full-plane is the right granularity.
+
+### M5 measurement caveats
+
+- `vkQueueWaitIdle` after each submit forces a full GPU sync, modelling the "submit and need the result now" case. Real decode pipelines can submit multiple frames ahead and wait less often — the per-dispatch cost in a pipelined deployment will be lower (probably bounded below by M5a ≈ 22.66 µs as the pure submit cost).
+- Empty CB (M5a) at 22.66 µs is the *floor*. This is Mesa command-list construction + kernel `DRM_IOCTL_V3D_SUBMIT_CL` + scheduler RTT. Cannot be optimised at the userspace level without changing Mesa or kernel.
+- Both numbers include `vkQueueWaitIdle` overhead; pure submit-without-wait would be lower. For Phase 1's threshold analysis the with-wait number is the right one to use because end-to-end frame decode must wait for its output to be readable.
+
+## Phase 3 closure
+
+Two anchor measurements captured, both with verbatim raw output
+(see `bench_neon_idct` and `bench_vulkan_dispatch` source for the
+print format). No estimates, no inferences, no recall from prior
+sessions or sibling-host memory.
+
+Phase 4 (plan) opens against these numbers. Its first decision:
+**given the 55.6 µs per-dispatch submit+wait cost (M5b), what is the
+batch granularity for the first kernel?** The answer is either
+frame-level (32 400 blocks/dispatch) or row-level (240
+blocks/dispatch for one 1920-wide row of 8×8 → still ~230 ns/block
+overhead at M5b, ~1.9× NEON). Frame-level is the only granularity that
+amortises overhead enough to leave kernel compute room to win.
+
+Open thread for a later phase (not blocking Phase 4):
+- Multi-core NEON sweep (M3'): single-core NEON is the right
+  *competitor floor*, but the actual ARM headroom on hertz is
+  4× this number under load.
+- Memory-bandwidth contention measurement (M6): does NEON's
+  rate change when concurrent QPU is reading the same LPDDR4x
+  bus? Needs the QPU kernel to exist first.
+- Power-draw delta via Himbeere plug (M7): same — needs a real
+  GPU workload to differentiate from idle.
diff --git a/docs/vulkaninfo_v3d_7_1_7_hertz.txt b/docs/vulkaninfo_v3d_7_1_7_hertz.txt
new file mode 100644
index 0000000..4f24c97
--- /dev/null
+++ b/docs/vulkaninfo_v3d_7_1_7_hertz.txt
@@ -0,0 +1,2099 @@
+==========
+VULKANINFO
+==========
+
+Vulkan Instance Version: 1.4.309
+
+
+Instance Extensions: count = 24
+===============================
+	VK_EXT_acquire_drm_display             : extension revision 1
+	VK_EXT_acquire_xlib_display            : extension revision 1
+	VK_EXT_debug_report                    : extension revision 10
+	VK_EXT_debug_utils                     : extension revision 2
+	VK_EXT_direct_mode_display             : extension revision 1
+	VK_EXT_display_surface_counter         : extension revision 1
+	VK_EXT_headless_surface                : extension revision 1
+	VK_EXT_surface_maintenance1            : extension revision 1
+	VK_EXT_swapchain_colorspace            : extension revision 5
+	VK_KHR_device_group_creation           : extension revision 1
+	VK_KHR_display                         : extension revision 23
+	VK_KHR_external_fence_capabilities     : extension revision 1
+	VK_KHR_external_memory_capabilities    : extension revision 1
+	VK_KHR_external_semaphore_capabilities : extension revision 1
+	VK_KHR_get_display_properties2         : extension revision 1
+	VK_KHR_get_physical_device_properties2 : extension revision 2
+	VK_KHR_get_surface_capabilities2       : extension revision 1
+	VK_KHR_portability_enumeration         : extension revision 1
+	VK_KHR_surface                         : extension revision 25
+	VK_KHR_surface_protected_capabilities  : extension revision 1
+	VK_KHR_wayland_surface                 : extension revision 6
+	VK_KHR_xcb_surface                     : extension revision 6
+	VK_KHR_xlib_surface                    : extension revision 6
+	VK_LUNARG_direct_driver_loading        : extension revision 1
+
+Layers: count = 2
+=================
+VK_LAYER_MESA_device_select (Linux device selection layer) Vulkan version 1.4.303, layer version 1:
+	Layer Extensions: count = 0
+	Devices: count = 2
+		GPU id = 0 (V3D 7.1.7.0)
+		Layer-Device Extensions: count = 0
+
+		GPU id = 1 (llvmpipe (LLVM 19.1.7, 128 bits))
+		Layer-Device Extensions: count = 0
+
+VK_LAYER_MESA_overlay (Mesa Overlay layer) Vulkan version 1.4.303, layer version 1:
+	Layer Extensions: count = 0
+	Devices: count = 2
+		GPU id = 0 (V3D 7.1.7.0)
+		Layer-Device Extensions: count = 0
+
+		GPU id = 1 (llvmpipe (LLVM 19.1.7, 128 bits))
+		Layer-Device Extensions: count = 0
+
+Device Properties and Extensions:
+=================================
+GPU0:
+VkPhysicalDeviceProperties:
+---------------------------
+	apiVersion        = 1.3.305 (4206897)
+	driverVersion     = 25.0.7 (104857607)
+	vendorID          = 0x14e4
+	deviceID          = 0x55701c33
+	deviceType        = PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU
+	deviceName        = V3D 7.1.7.0
+	pipelineCacheUUID = a801ad89-90bc-6e4b-dbf7-6f6038afe3ab
+
+VkPhysicalDeviceLimits:
+-----------------------
+	maxImageDimension1D                             = 4096
+	maxImageDimension2D                             = 4096
+	maxImageDimension3D                             = 4096
+	maxImageDimensionCube                           = 4096
+	maxImageArrayLayers                             = 2048
+	maxTexelBufferElements                          = 268435456
+	maxUniformBufferRange                           = 1073741824
+	maxStorageBufferRange                           = 1073741824
+	maxPushConstantsSize                            = 128
+	maxMemoryAllocationCount                        = 1048576
+	maxSamplerAllocationCount                       = 65536
+	bufferImageGranularity                          = 0x00000100
+	sparseAddressSpaceSize                          = 0x00000000
+	maxBoundDescriptorSets                          = 16
+	maxPerStageDescriptorSamplers                   = 24
+	maxPerStageDescriptorUniformBuffers             = 16
+	maxPerStageDescriptorStorageBuffers             = 8
+	maxPerStageDescriptorSampledImages              = 16
+	maxPerStageDescriptorStorageImages              = 4
+	maxPerStageDescriptorInputAttachments           = 4
+	maxPerStageResources                            = 128
+	maxDescriptorSetSamplers                        = 96
+	maxDescriptorSetUniformBuffers                  = 64
+	maxDescriptorSetUniformBuffersDynamic           = 8
+	maxDescriptorSetStorageBuffers                  = 32
+	maxDescriptorSetStorageBuffersDynamic           = 4
+	maxDescriptorSetSampledImages                   = 64
+	maxDescriptorSetStorageImages                   = 16
+	maxDescriptorSetInputAttachments                = 4
+	maxVertexInputAttributes                        = 16
+	maxVertexInputBindings                          = 16
+	maxVertexInputAttributeOffset                   = 4294967295
+	maxVertexInputBindingStride                     = 65535
+	maxVertexOutputComponents                       = 64
+	maxTessellationGenerationLevel                  = 0
+	maxTessellationPatchSize                        = 0
+	maxTessellationControlPerVertexInputComponents  = 0
+	maxTessellationControlPerVertexOutputComponents = 0
+	maxTessellationControlPerPatchOutputComponents  = 0
+	maxTessellationControlTotalOutputComponents     = 0
+	maxTessellationEvaluationInputComponents        = 0
+	maxTessellationEvaluationOutputComponents       = 0
+	maxGeometryShaderInvocations                    = 32
+	maxGeometryInputComponents                      = 64
+	maxGeometryOutputComponents                     = 64
+	maxGeometryOutputVertices                       = 256
+	maxGeometryTotalOutputComponents                = 1024
+	maxFragmentInputComponents                      = 64
+	maxFragmentOutputAttachments                    = 4
+	maxFragmentDualSrcAttachments                   = 0
+	maxFragmentCombinedOutputResources              = 20
+	maxComputeSharedMemorySize                      = 16384
+	maxComputeWorkGroupCount: count = 3
+		65535
+		65535
+		65535
+	maxComputeWorkGroupInvocations                  = 256
+	maxComputeWorkGroupSize: count = 3
+		256
+		256
+		256
+	subPixelPrecisionBits                           = 6
+	subTexelPrecisionBits                           = 8
+	mipmapPrecisionBits                             = 8
+	maxDrawIndexedIndexValue                        = 4294967295
+	maxDrawIndirectCount                            = 2147483647
+	maxSamplerLodBias                               = 14
+	maxSamplerAnisotropy                            = 16
+	maxViewports                                    = 1
+	maxViewportDimensions: count = 2
+		4096
+		4096
+	viewportBoundsRange: count = 2
+		-8192
+		8191
+	viewportSubPixelBits                            = 0
+	minMemoryMapAlignment                           = 4096
+	minTexelBufferOffsetAlignment                   = 0x00000040
+	minUniformBufferOffsetAlignment                 = 0x00000020
+	minStorageBufferOffsetAlignment                 = 0x00000020
+	minTexelOffset                                  = -8
+	maxTexelOffset                                  = 7
+	minTexelGatherOffset                            = -8
+	maxTexelGatherOffset                            = 7
+	minInterpolationOffset                          = -0.5
+	maxInterpolationOffset                          = 0.5
+	subPixelInterpolationOffsetBits                 = 6
+	maxFramebufferWidth                             = 4096
+	maxFramebufferHeight                            = 4096
+	maxFramebufferLayers                            = 256
+	framebufferColorSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	framebufferDepthSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	framebufferStencilSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	framebufferNoAttachmentsSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	maxColorAttachments                             = 8
+	sampledImageColorSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	sampledImageIntegerSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	sampledImageDepthSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	sampledImageStencilSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	storageImageSampleCounts: count = 1
+		SAMPLE_COUNT_1_BIT
+	maxSampleMaskWords                              = 1
+	timestampComputeAndGraphics                     = true
+	timestampPeriod                                 = 1
+	maxClipDistances                                = 8
+	maxCullDistances                                = 0
+	maxCombinedClipAndCullDistances                 = 8
+	discreteQueuePriorities                         = 2
+	pointSizeRange: count = 2
+		0.03125
+		512
+	lineWidthRange: count = 2
+		1
+		32
+	pointSizeGranularity                            = 0.03125
+	lineWidthGranularity                            = 0.03125
+	strictLines                                     = true
+	standardSampleLocations                         = false
+	optimalBufferCopyOffsetAlignment                = 0x00000020
+	optimalBufferCopyRowPitchAlignment              = 0x00000020
+	nonCoherentAtomSize                             = 0x00000100
+
+VkPhysicalDeviceSparseProperties:
+---------------------------------
+	residencyStandard2DBlockShape            = false
+	residencyStandard2DMultisampleBlockShape = false
+	residencyStandard3DBlockShape            = false
+	residencyAlignedMipSize                  = false
+	residencyNonResidentStrict               = false
+
+VkPhysicalDeviceCustomBorderColorPropertiesEXT:
+-----------------------------------------------
+	maxCustomBorderColorSamplers = 24
+
+VkPhysicalDeviceDrmPropertiesEXT:
+---------------------------------
+	hasPrimary   = true
+	hasRender    = true
+	primaryMajor = 226
+	primaryMinor = 1
+	renderMajor  = 226
+	renderMinor  = 128
+
+VkPhysicalDeviceLineRasterizationPropertiesKHR:
+-----------------------------------------------
+	lineSubPixelPrecisionBits = 6
+
+VkPhysicalDeviceMaintenance5PropertiesKHR:
+------------------------------------------
+	earlyFragmentMultisampleCoverageAfterSampleCounting = true
+	earlyFragmentSampleMaskTestBeforeSampleCounting     = true
+	depthStencilSwizzleOneSupport                       = true
+	polygonModePointSize                                = true
+	nonStrictSinglePixelWideLinesUseParallelogram       = true
+	nonStrictWideLinesUseParallelogram                  = true
+
+VkPhysicalDeviceMultiDrawPropertiesEXT:
+---------------------------------------
+	maxMultiDrawCount = 2048
+
+VkPhysicalDevicePerformanceQueryPropertiesKHR:
+----------------------------------------------
+	allowCommandBufferQueryCopies = true
+
+VkPhysicalDevicePipelineRobustnessPropertiesEXT:
+------------------------------------------------
+	defaultRobustnessStorageBuffers = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT
+	defaultRobustnessUniformBuffers = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT
+	defaultRobustnessVertexInputs = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT
+	defaultRobustnessImages = PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DEVICE_DEFAULT
+
+VkPhysicalDeviceProvokingVertexPropertiesEXT:
+---------------------------------------------
+	provokingVertexModePerPipeline                       = true
+	transformFeedbackPreservesTriangleFanProvokingVertex = false
+
+VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT:
+----------------------------------------------------
+	shaderModuleIdentifierAlgorithmUUID     = 4d455341-2d42-4c41-4b45-330000000000
+
+VkPhysicalDeviceVertexAttributeDivisorPropertiesKHR:
+----------------------------------------------------
+	maxVertexAttribDivisor       = 65535
+	supportsNonZeroFirstInstance = true
+
+VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT:
+----------------------------------------------------
+	maxVertexAttribDivisor = 65535
+
+VkPhysicalDeviceVulkan11Properties:
+-----------------------------------
+	deviceUUID                        = 5fd8106e-741a-cafa-e080-fdb16cf11a80
+	driverUUID                        = cd58235c-2a29-4ac7-8b40-a18d30b128ba
+	deviceNodeMask                    = 0
+	deviceLUIDValid                   = false
+	subgroupSize                      = 16
+	subgroupSupportedStages: count = 2
+		SHADER_STAGE_FRAGMENT_BIT
+		SHADER_STAGE_COMPUTE_BIT
+	subgroupSupportedOperations: count = 6
+		SUBGROUP_FEATURE_BASIC_BIT
+		SUBGROUP_FEATURE_VOTE_BIT
+		SUBGROUP_FEATURE_BALLOT_BIT
+		SUBGROUP_FEATURE_SHUFFLE_BIT
+		SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT
+		SUBGROUP_FEATURE_QUAD_BIT
+	subgroupQuadOperationsInAllStages = false
+	pointClippingBehavior             = POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES
+	maxMultiviewViewCount             = 16
+	maxMultiviewInstanceIndex         = 4294967294
+	protectedNoFault                  = false
+	maxPerSetDescriptors              = 67108863
+	maxMemoryAllocationSize           = 0x40000000
+
+VkPhysicalDeviceVulkan12Properties:
+-----------------------------------
+	driverID                                             = DRIVER_ID_MESA_V3DV
+	driverName                                           = V3DV Mesa
+	driverInfo                                           = Mesa 25.0.7-2+rpt4
+	conformanceVersion:
+		major    = 1
+		minor    = 3
+		subminor = 8
+		patch    = 3
+	denormBehaviorIndependence                           = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL
+	roundingModeIndependence                             = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL
+	shaderSignedZeroInfNanPreserveFloat16                = true
+	shaderSignedZeroInfNanPreserveFloat32                = true
+	shaderSignedZeroInfNanPreserveFloat64                = false
+	shaderDenormPreserveFloat16                          = true
+	shaderDenormPreserveFloat32                          = true
+	shaderDenormPreserveFloat64                          = false
+	shaderDenormFlushToZeroFloat16                       = false
+	shaderDenormFlushToZeroFloat32                       = false
+	shaderDenormFlushToZeroFloat64                       = false
+	shaderRoundingModeRTEFloat16                         = true
+	shaderRoundingModeRTEFloat32                         = true
+	shaderRoundingModeRTEFloat64                         = false
+	shaderRoundingModeRTZFloat16                         = false
+	shaderRoundingModeRTZFloat32                         = false
+	shaderRoundingModeRTZFloat64                         = false
+	maxUpdateAfterBindDescriptorsInAllPools              = 0
+	shaderUniformBufferArrayNonUniformIndexingNative     = false
+	shaderSampledImageArrayNonUniformIndexingNative      = false
+	shaderStorageBufferArrayNonUniformIndexingNative     = false
+	shaderStorageImageArrayNonUniformIndexingNative      = false
+	shaderInputAttachmentArrayNonUniformIndexingNative   = false
+	robustBufferAccessUpdateAfterBind                    = false
+	quadDivergentImplicitLod                             = false
+	maxPerStageDescriptorUpdateAfterBindSamplers         = 24
+	maxPerStageDescriptorUpdateAfterBindUniformBuffers   = 16
+	maxPerStageDescriptorUpdateAfterBindStorageBuffers   = 8
+	maxPerStageDescriptorUpdateAfterBindSampledImages    = 16
+	maxPerStageDescriptorUpdateAfterBindStorageImages    = 4
+	maxPerStageDescriptorUpdateAfterBindInputAttachments = 4
+	maxPerStageUpdateAfterBindResources                  = 128
+	maxDescriptorSetUpdateAfterBindSamplers              = 96
+	maxDescriptorSetUpdateAfterBindUniformBuffers        = 64
+	maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = 8
+	maxDescriptorSetUpdateAfterBindStorageBuffers        = 32
+	maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = 8
+	maxDescriptorSetUpdateAfterBindSampledImages         = 64
+	maxDescriptorSetUpdateAfterBindStorageImages         = 16
+	maxDescriptorSetUpdateAfterBindInputAttachments      = 4
+	supportedDepthResolveModes: count = 1
+		RESOLVE_MODE_SAMPLE_ZERO_BIT
+	supportedStencilResolveModes: count = 1
+		RESOLVE_MODE_SAMPLE_ZERO_BIT
+	independentResolveNone                               = false
+	independentResolve                                   = false
+	filterMinmaxSingleComponentFormats                   = false
+	filterMinmaxImageComponentMapping                    = false
+	maxTimelineSemaphoreValueDifference                  = 18446744073709551615
+	framebufferIntegerColorSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+
+VkPhysicalDeviceVulkan13Properties:
+-----------------------------------
+	minSubgroupSize                                                               = 16
+	maxSubgroupSize                                                               = 16
+	maxComputeWorkgroupSubgroups                                                  = 16
+	requiredSubgroupSizeStages: count = 1
+		SHADER_STAGE_COMPUTE_BIT
+	maxInlineUniformBlockSize                                                     = 4096
+	maxPerStageDescriptorInlineUniformBlocks                                      = 4
+	maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks                       = 4
+	maxDescriptorSetInlineUniformBlocks                                           = 4
+	maxDescriptorSetUpdateAfterBindInlineUniformBlocks                            = 4
+	maxInlineUniformTotalSize                                                     = 16384
+	integerDotProduct8BitUnsignedAccelerated                                      = false
+	integerDotProduct8BitSignedAccelerated                                        = false
+	integerDotProduct8BitMixedSignednessAccelerated                               = false
+	integerDotProduct4x8BitPackedUnsignedAccelerated                              = false
+	integerDotProduct4x8BitPackedSignedAccelerated                                = false
+	integerDotProduct4x8BitPackedMixedSignednessAccelerated                       = false
+	integerDotProduct16BitUnsignedAccelerated                                     = false
+	integerDotProduct16BitSignedAccelerated                                       = false
+	integerDotProduct16BitMixedSignednessAccelerated                              = false
+	integerDotProduct32BitUnsignedAccelerated                                     = false
+	integerDotProduct32BitSignedAccelerated                                       = false
+	integerDotProduct32BitMixedSignednessAccelerated                              = false
+	integerDotProduct64BitUnsignedAccelerated                                     = false
+	integerDotProduct64BitSignedAccelerated                                       = false
+	integerDotProduct64BitMixedSignednessAccelerated                              = false
+	integerDotProductAccumulatingSaturating8BitUnsignedAccelerated                = false
+	integerDotProductAccumulatingSaturating8BitSignedAccelerated                  = false
+	integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated         = false
+	integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated        = false
+	integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated          = false
+	integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false
+	integerDotProductAccumulatingSaturating16BitUnsignedAccelerated               = false
+	integerDotProductAccumulatingSaturating16BitSignedAccelerated                 = false
+	integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated        = false
+	integerDotProductAccumulatingSaturating32BitUnsignedAccelerated               = false
+	integerDotProductAccumulatingSaturating32BitSignedAccelerated                 = false
+	integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated        = false
+	integerDotProductAccumulatingSaturating64BitUnsignedAccelerated               = false
+	integerDotProductAccumulatingSaturating64BitSignedAccelerated                 = false
+	integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated        = false
+	storageTexelBufferOffsetAlignmentBytes                                        = 0x00000040
+	storageTexelBufferOffsetSingleTexelAlignment                                  = false
+	uniformTexelBufferOffsetAlignmentBytes                                        = 0x00000040
+	uniformTexelBufferOffsetSingleTexelAlignment                                  = false
+	maxBufferSize                                                                 = 0x40000000
+
+Device Extensions: count = 92
+	VK_EXT_4444_formats                        : extension revision 1
+	VK_EXT_attachment_feedback_loop_layout     : extension revision 2
+	VK_EXT_border_color_swizzle                : extension revision 1
+	VK_EXT_color_write_enable                  : extension revision 1
+	VK_EXT_custom_border_color                 : extension revision 12
+	VK_EXT_depth_clamp_zero_one                : extension revision 1
+	VK_EXT_depth_clip_control                  : extension revision 1
+	VK_EXT_depth_clip_enable                   : extension revision 1
+	VK_EXT_extended_dynamic_state              : extension revision 1
+	VK_EXT_extended_dynamic_state2             : extension revision 1
+	VK_EXT_external_memory_dma_buf             : extension revision 1
+	VK_EXT_host_query_reset                    : extension revision 1
+	VK_EXT_image_drm_format_modifier           : extension revision 2
+	VK_EXT_image_robustness                    : extension revision 1
+	VK_EXT_index_type_uint8                    : extension revision 1
+	VK_EXT_inline_uniform_block                : extension revision 1
+	VK_EXT_line_rasterization                  : extension revision 1
+	VK_EXT_load_store_op_none                  : extension revision 1
+	VK_EXT_memory_budget                       : extension revision 1
+	VK_EXT_multi_draw                          : extension revision 1
+	VK_EXT_physical_device_drm                 : extension revision 1
+	VK_EXT_pipeline_creation_cache_control     : extension revision 3
+	VK_EXT_pipeline_creation_feedback          : extension revision 1
+	VK_EXT_pipeline_robustness                 : extension revision 1
+	VK_EXT_primitive_topology_list_restart     : extension revision 1
+	VK_EXT_private_data                        : extension revision 1
+	VK_EXT_provoking_vertex                    : extension revision 1
+	VK_EXT_queue_family_foreign                : extension revision 1
+	VK_EXT_separate_stencil_usage              : extension revision 1
+	VK_EXT_shader_demote_to_helper_invocation  : extension revision 1
+	VK_EXT_shader_module_identifier            : extension revision 1
+	VK_EXT_subgroup_size_control               : extension revision 2
+	VK_EXT_swapchain_maintenance1              : extension revision 1
+	VK_EXT_texel_buffer_alignment              : extension revision 1
+	VK_EXT_tooling_info                        : extension revision 1
+	VK_EXT_vertex_attribute_divisor            : extension revision 3
+	VK_KHR_16bit_storage                       : extension revision 1
+	VK_KHR_8bit_storage                        : extension revision 1
+	VK_KHR_bind_memory2                        : extension revision 1
+	VK_KHR_buffer_device_address               : extension revision 1
+	VK_KHR_copy_commands2                      : extension revision 1
+	VK_KHR_create_renderpass2                  : extension revision 1
+	VK_KHR_dedicated_allocation                : extension revision 3
+	VK_KHR_depth_stencil_resolve               : extension revision 1
+	VK_KHR_descriptor_update_template          : extension revision 1
+	VK_KHR_device_group                        : extension revision 4
+	VK_KHR_driver_properties                   : extension revision 1
+	VK_KHR_dynamic_rendering                   : extension revision 1
+	VK_KHR_external_fence                      : extension revision 1
+	VK_KHR_external_fence_fd                   : extension revision 1
+	VK_KHR_external_memory                     : extension revision 1
+	VK_KHR_external_memory_fd                  : extension revision 1
+	VK_KHR_external_semaphore                  : extension revision 1
+	VK_KHR_external_semaphore_fd               : extension revision 1
+	VK_KHR_format_feature_flags2               : extension revision 2
+	VK_KHR_get_memory_requirements2            : extension revision 1
+	VK_KHR_image_format_list                   : extension revision 1
+	VK_KHR_imageless_framebuffer               : extension revision 1
+	VK_KHR_incremental_present                 : extension revision 2
+	VK_KHR_index_type_uint8                    : extension revision 1
+	VK_KHR_line_rasterization                  : extension revision 1
+	VK_KHR_load_store_op_none                  : extension revision 1
+	VK_KHR_maintenance1                        : extension revision 2
+	VK_KHR_maintenance2                        : extension revision 1
+	VK_KHR_maintenance3                        : extension revision 1
+	VK_KHR_maintenance4                        : extension revision 2
+	VK_KHR_maintenance5                        : extension revision 1
+	VK_KHR_multiview                           : extension revision 1
+	VK_KHR_performance_query                   : extension revision 1
+	VK_KHR_pipeline_executable_properties      : extension revision 1
+	VK_KHR_relaxed_block_layout                : extension revision 1
+	VK_KHR_sampler_mirror_clamp_to_edge        : extension revision 3
+	VK_KHR_sampler_ycbcr_conversion            : extension revision 14
+	VK_KHR_separate_depth_stencil_layouts      : extension revision 1
+	VK_KHR_shader_expect_assume                : extension revision 1
+	VK_KHR_shader_float_controls               : extension revision 4
+	VK_KHR_shader_integer_dot_product          : extension revision 1
+	VK_KHR_shader_non_semantic_info            : extension revision 1
+	VK_KHR_shader_relaxed_extended_instruction : extension revision 1
+	VK_KHR_shader_terminate_invocation         : extension revision 1
+	VK_KHR_spirv_1_4                           : extension revision 1
+	VK_KHR_storage_buffer_storage_class        : extension revision 1
+	VK_KHR_swapchain                           : extension revision 70
+	VK_KHR_swapchain_mutable_format            : extension revision 1
+	VK_KHR_synchronization2                    : extension revision 1
+	VK_KHR_timeline_semaphore                  : extension revision 2
+	VK_KHR_uniform_buffer_standard_layout      : extension revision 1
+	VK_KHR_variable_pointers                   : extension revision 1
+	VK_KHR_vertex_attribute_divisor            : extension revision 1
+	VK_KHR_vulkan_memory_model                 : extension revision 3
+	VK_KHR_workgroup_memory_explicit_layout    : extension revision 1
+	VK_KHR_zero_initialize_workgroup_memory    : extension revision 1
+
+VkQueueFamilyProperties:
+========================
+	queueProperties[0]:
+	-------------------
+		minImageTransferGranularity = (1,1,1)
+		queueCount                  = 1
+		queueFlags                  = QUEUE_GRAPHICS_BIT | QUEUE_COMPUTE_BIT | QUEUE_TRANSFER_BIT
+		timestampValidBits          = 64
+		present support             = false
+
+VkPhysicalDeviceMemoryProperties:
+=================================
+memoryHeaps: count = 1
+	memoryHeaps[0]:
+		size   = 4294967296 (0x100000000) (4.00 GiB)
+		budget = 3292721971 (0xc442f333) (3.07 GiB)
+		usage  = 0 (0x00000000) (0.00 B)
+		flags: count = 1
+			MEMORY_HEAP_DEVICE_LOCAL_BIT
+memoryTypes: count = 1
+	memoryTypes[0]:
+		heapIndex     = 0
+		propertyFlags = 0x0007: count = 3
+			MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+			MEMORY_PROPERTY_HOST_VISIBLE_BIT
+			MEMORY_PROPERTY_HOST_COHERENT_BIT
+		usable for:
+			IMAGE_TILING_OPTIMAL:
+				color images
+				FORMAT_D16_UNORM
+				FORMAT_X8_D24_UNORM_PACK32
+				FORMAT_D32_SFLOAT
+				FORMAT_D24_UNORM_S8_UINT
+				(non-sparse)
+			IMAGE_TILING_LINEAR:
+				color images
+				(non-sparse)
+
+VkPhysicalDeviceFeatures:
+=========================
+	robustBufferAccess                      = true
+	fullDrawIndexUint32                     = true
+	imageCubeArray                          = true
+	independentBlend                        = true
+	geometryShader                          = true
+	tessellationShader                      = false
+	sampleRateShading                       = true
+	dualSrcBlend                            = false
+	logicOp                                 = true
+	multiDrawIndirect                       = false
+	drawIndirectFirstInstance               = true
+	depthClamp                              = true
+	depthBiasClamp                          = true
+	fillModeNonSolid                        = true
+	depthBounds                             = true
+	wideLines                               = true
+	largePoints                             = true
+	alphaToOne                              = true
+	multiViewport                           = false
+	samplerAnisotropy                       = true
+	textureCompressionETC2                  = true
+	textureCompressionASTC_LDR              = true
+	textureCompressionBC                    = false
+	occlusionQueryPrecise                   = true
+	pipelineStatisticsQuery                 = false
+	vertexPipelineStoresAndAtomics          = true
+	fragmentStoresAndAtomics                = true
+	shaderTessellationAndGeometryPointSize  = true
+	shaderImageGatherExtended               = true
+	shaderStorageImageExtendedFormats       = true
+	shaderStorageImageMultisample           = false
+	shaderStorageImageReadWithoutFormat     = true
+	shaderStorageImageWriteWithoutFormat    = false
+	shaderUniformBufferArrayDynamicIndexing = false
+	shaderSampledImageArrayDynamicIndexing  = false
+	shaderStorageBufferArrayDynamicIndexing = false
+	shaderStorageImageArrayDynamicIndexing  = false
+	shaderClipDistance                      = true
+	shaderCullDistance                      = false
+	shaderFloat64                           = false
+	shaderInt64                             = false
+	shaderInt16                             = false
+	shaderResourceResidency                 = false
+	shaderResourceMinLod                    = false
+	sparseBinding                           = false
+	sparseResidencyBuffer                   = false
+	sparseResidencyImage2D                  = false
+	sparseResidencyImage3D                  = false
+	sparseResidency2Samples                 = false
+	sparseResidency4Samples                 = false
+	sparseResidency8Samples                 = false
+	sparseResidency16Samples                = false
+	sparseResidencyAliased                  = false
+	variableMultisampleRate                 = false
+	inheritedQueries                        = true
+
+VkPhysicalDevice4444FormatsFeaturesEXT:
+---------------------------------------
+	formatA4R4G4B4 = true
+	formatA4B4G4R4 = true
+
+VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT:
+--------------------------------------------------------
+	attachmentFeedbackLoopLayout = true
+
+VkPhysicalDeviceBorderColorSwizzleFeaturesEXT:
+----------------------------------------------
+	borderColorSwizzle          = true
+	borderColorSwizzleFromImage = true
+
+VkPhysicalDeviceColorWriteEnableFeaturesEXT:
+--------------------------------------------
+	colorWriteEnable = true
+
+VkPhysicalDeviceCustomBorderColorFeaturesEXT:
+---------------------------------------------
+	customBorderColors             = true
+	customBorderColorWithoutFormat = false
+
+VkPhysicalDeviceDepthClampZeroOneFeaturesEXT:
+---------------------------------------------
+	depthClampZeroOne = true
+
+VkPhysicalDeviceDepthClipControlFeaturesEXT:
+--------------------------------------------
+	depthClipControl = true
+
+VkPhysicalDeviceDepthClipEnableFeaturesEXT:
+-------------------------------------------
+	depthClipEnable = true
+
+VkPhysicalDeviceExtendedDynamicState2FeaturesEXT:
+-------------------------------------------------
+	extendedDynamicState2                   = true
+	extendedDynamicState2LogicOp            = false
+	extendedDynamicState2PatchControlPoints = false
+
+VkPhysicalDeviceExtendedDynamicStateFeaturesEXT:
+------------------------------------------------
+	extendedDynamicState = true
+
+VkPhysicalDeviceIndexTypeUint8FeaturesKHR:
+------------------------------------------
+	indexTypeUint8 = true
+
+VkPhysicalDeviceLineRasterizationFeaturesKHR:
+---------------------------------------------
+	rectangularLines         = true
+	bresenhamLines           = true
+	smoothLines              = true
+	stippledRectangularLines = false
+	stippledBresenhamLines   = false
+	stippledSmoothLines      = false
+
+VkPhysicalDeviceMaintenance5FeaturesKHR:
+----------------------------------------
+	maintenance5 = true
+
+VkPhysicalDeviceMultiDrawFeaturesEXT:
+-------------------------------------
+	multiDraw = true
+
+VkPhysicalDevicePerformanceQueryFeaturesKHR:
+--------------------------------------------
+	performanceCounterQueryPools         = true
+	performanceCounterMultipleQueryPools = false
+
+VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR:
+--------------------------------------------------------
+	pipelineExecutableInfo = true
+
+VkPhysicalDevicePipelineRobustnessFeaturesEXT:
+----------------------------------------------
+	pipelineRobustness = true
+
+VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT:
+--------------------------------------------------------
+	primitiveTopologyListRestart      = true
+	primitiveTopologyPatchListRestart = false
+
+VkPhysicalDeviceProvokingVertexFeaturesEXT:
+-------------------------------------------
+	provokingVertexLast                       = true
+	transformFeedbackPreservesProvokingVertex = false
+
+VkPhysicalDeviceShaderExpectAssumeFeaturesKHR:
+----------------------------------------------
+	shaderExpectAssume = true
+
+VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT:
+--------------------------------------------------
+	shaderModuleIdentifier = true
+
+VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR:
+------------------------------------------------------------
+	shaderRelaxedExtendedInstruction = true
+
+VkPhysicalDeviceSwapchainMaintenance1FeaturesEXT:
+-------------------------------------------------
+	swapchainMaintenance1 = true
+
+VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT:
+------------------------------------------------
+	texelBufferAlignment = true
+
+VkPhysicalDeviceVertexAttributeDivisorFeaturesKHR:
+--------------------------------------------------
+	vertexAttributeInstanceRateDivisor     = true
+	vertexAttributeInstanceRateZeroDivisor = false
+
+VkPhysicalDeviceVulkan11Features:
+---------------------------------
+	storageBuffer16BitAccess           = true
+	uniformAndStorageBuffer16BitAccess = true
+	storagePushConstant16              = true
+	storageInputOutput16               = false
+	multiview                          = true
+	multiviewGeometryShader            = false
+	multiviewTessellationShader        = false
+	variablePointersStorageBuffer      = true
+	variablePointers                   = false
+	protectedMemory                    = false
+	samplerYcbcrConversion             = true
+	shaderDrawParameters               = false
+
+VkPhysicalDeviceVulkan12Features:
+---------------------------------
+	samplerMirrorClampToEdge                           = true
+	drawIndirectCount                                  = false
+	storageBuffer8BitAccess                            = true
+	uniformAndStorageBuffer8BitAccess                  = true
+	storagePushConstant8                               = true
+	shaderBufferInt64Atomics                           = false
+	shaderSharedInt64Atomics                           = false
+	shaderFloat16                                      = false
+	shaderInt8                                         = false
+	descriptorIndexing                                 = false
+	shaderInputAttachmentArrayDynamicIndexing          = false
+	shaderUniformTexelBufferArrayDynamicIndexing       = false
+	shaderStorageTexelBufferArrayDynamicIndexing       = false
+	shaderUniformBufferArrayNonUniformIndexing         = false
+	shaderSampledImageArrayNonUniformIndexing          = false
+	shaderStorageBufferArrayNonUniformIndexing         = false
+	shaderStorageImageArrayNonUniformIndexing          = false
+	shaderInputAttachmentArrayNonUniformIndexing       = false
+	shaderUniformTexelBufferArrayNonUniformIndexing    = false
+	shaderStorageTexelBufferArrayNonUniformIndexing    = false
+	descriptorBindingUniformBufferUpdateAfterBind      = false
+	descriptorBindingSampledImageUpdateAfterBind       = false
+	descriptorBindingStorageImageUpdateAfterBind       = false
+	descriptorBindingStorageBufferUpdateAfterBind      = false
+	descriptorBindingUniformTexelBufferUpdateAfterBind = false
+	descriptorBindingStorageTexelBufferUpdateAfterBind = false
+	descriptorBindingUpdateUnusedWhilePending          = false
+	descriptorBindingPartiallyBound                    = false
+	descriptorBindingVariableDescriptorCount           = false
+	runtimeDescriptorArray                             = false
+	samplerFilterMinmax                                = false
+	scalarBlockLayout                                  = true
+	imagelessFramebuffer                               = true
+	uniformBufferStandardLayout                        = true
+	shaderSubgroupExtendedTypes                        = true
+	separateDepthStencilLayouts                        = true
+	hostQueryReset                                     = true
+	timelineSemaphore                                  = true
+	bufferDeviceAddress                                = true
+	bufferDeviceAddressCaptureReplay                   = false
+	bufferDeviceAddressMultiDevice                     = false
+	vulkanMemoryModel                                  = true
+	vulkanMemoryModelDeviceScope                       = true
+	vulkanMemoryModelAvailabilityVisibilityChains      = true
+	shaderOutputViewportIndex                          = false
+	shaderOutputLayer                                  = false
+	subgroupBroadcastDynamicId                         = true
+
+VkPhysicalDeviceVulkan13Features:
+---------------------------------
+	robustImageAccess                                  = true
+	inlineUniformBlock                                 = true
+	descriptorBindingInlineUniformBlockUpdateAfterBind = false
+	pipelineCreationCacheControl                       = true
+	privateData                                        = true
+	shaderDemoteToHelperInvocation                     = true
+	shaderTerminateInvocation                          = true
+	subgroupSizeControl                                = true
+	computeFullSubgroups                               = true
+	synchronization2                                   = true
+	textureCompressionASTC_HDR                         = false
+	shaderZeroInitializeWorkgroupMemory                = true
+	dynamicRendering                                   = true
+	shaderIntegerDotProduct                            = true
+	maintenance4                                       = true
+
+VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR:
+---------------------------------------------------------
+	workgroupMemoryExplicitLayout                  = true
+	workgroupMemoryExplicitLayoutScalarBlockLayout = false
+	workgroupMemoryExplicitLayout8BitAccess        = true
+	workgroupMemoryExplicitLayout16BitAccess       = true
+
+
+GPU1:
+VkPhysicalDeviceProperties:
+---------------------------
+	apiVersion        = 1.4.305 (4210993)
+	driverVersion     = 0.0.1 (1)
+	vendorID          = 0x10005
+	deviceID          = 0x0000
+	deviceType        = PHYSICAL_DEVICE_TYPE_CPU
+	deviceName        = llvmpipe (LLVM 19.1.7, 128 bits)
+	pipelineCacheUUID = 32352e30-2e37-2d32-2b72-707434616161
+
+VkPhysicalDeviceLimits:
+-----------------------
+	maxImageDimension1D                             = 16384
+	maxImageDimension2D                             = 16384
+	maxImageDimension3D                             = 4096
+	maxImageDimensionCube                           = 32768
+	maxImageArrayLayers                             = 2048
+	maxTexelBufferElements                          = 134217728
+	maxUniformBufferRange                           = 65536
+	maxStorageBufferRange                           = 134217728
+	maxPushConstantsSize                            = 256
+	maxMemoryAllocationCount                        = 4294967295
+	maxSamplerAllocationCount                       = 32768
+	bufferImageGranularity                          = 0x00000040
+	sparseAddressSpaceSize                          = 0x80000000
+	maxBoundDescriptorSets                          = 8
+	maxPerStageDescriptorSamplers                   = 1000000
+	maxPerStageDescriptorUniformBuffers             = 1000000
+	maxPerStageDescriptorStorageBuffers             = 1000000
+	maxPerStageDescriptorSampledImages              = 1000000
+	maxPerStageDescriptorStorageImages              = 1000000
+	maxPerStageDescriptorInputAttachments           = 1000000
+	maxPerStageResources                            = 1000000
+	maxDescriptorSetSamplers                        = 1000000
+	maxDescriptorSetUniformBuffers                  = 1000000
+	maxDescriptorSetUniformBuffersDynamic           = 1000000
+	maxDescriptorSetStorageBuffers                  = 1000000
+	maxDescriptorSetStorageBuffersDynamic           = 1000000
+	maxDescriptorSetSampledImages                   = 1000000
+	maxDescriptorSetStorageImages                   = 1000000
+	maxDescriptorSetInputAttachments                = 1000000
+	maxVertexInputAttributes                        = 32
+	maxVertexInputBindings                          = 32
+	maxVertexInputAttributeOffset                   = 2047
+	maxVertexInputBindingStride                     = 2048
+	maxVertexOutputComponents                       = 128
+	maxTessellationGenerationLevel                  = 64
+	maxTessellationPatchSize                        = 32
+	maxTessellationControlPerVertexInputComponents  = 128
+	maxTessellationControlPerVertexOutputComponents = 128
+	maxTessellationControlPerPatchOutputComponents  = 128
+	maxTessellationControlTotalOutputComponents     = 4096
+	maxTessellationEvaluationInputComponents        = 128
+	maxTessellationEvaluationOutputComponents       = 128
+	maxGeometryShaderInvocations                    = 32
+	maxGeometryInputComponents                      = 64
+	maxGeometryOutputComponents                     = 128
+	maxGeometryOutputVertices                       = 1024
+	maxGeometryTotalOutputComponents                = 1024
+	maxFragmentInputComponents                      = 128
+	maxFragmentOutputAttachments                    = 8
+	maxFragmentDualSrcAttachments                   = 2
+	maxFragmentCombinedOutputResources              = 104
+	maxComputeSharedMemorySize                      = 32768
+	maxComputeWorkGroupCount: count = 3
+		65535
+		65535
+		65535
+	maxComputeWorkGroupInvocations                  = 1024
+	maxComputeWorkGroupSize: count = 3
+		1024
+		1024
+		1024
+	subPixelPrecisionBits                           = 8
+	subTexelPrecisionBits                           = 8
+	mipmapPrecisionBits                             = 6
+	maxDrawIndexedIndexValue                        = 4294967295
+	maxDrawIndirectCount                            = 4294967295
+	maxSamplerLodBias                               = 16
+	maxSamplerAnisotropy                            = 16
+	maxViewports                                    = 16
+	maxViewportDimensions: count = 2
+		16384
+		16384
+	viewportBoundsRange: count = 2
+		-32768
+		32768
+	viewportSubPixelBits                            = 0
+	minMemoryMapAlignment                           = 64
+	minTexelBufferOffsetAlignment                   = 0x00000010
+	minUniformBufferOffsetAlignment                 = 0x00000010
+	minStorageBufferOffsetAlignment                 = 0x00000010
+	minTexelOffset                                  = -32
+	maxTexelOffset                                  = 31
+	minTexelGatherOffset                            = -32
+	maxTexelGatherOffset                            = 31
+	minInterpolationOffset                          = -2
+	maxInterpolationOffset                          = 2
+	subPixelInterpolationOffsetBits                 = 8
+	maxFramebufferWidth                             = 16384
+	maxFramebufferHeight                            = 16384
+	maxFramebufferLayers                            = 2048
+	framebufferColorSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	framebufferDepthSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	framebufferStencilSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	framebufferNoAttachmentsSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	maxColorAttachments                             = 8
+	sampledImageColorSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	sampledImageIntegerSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	sampledImageDepthSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	sampledImageStencilSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	storageImageSampleCounts: count = 2
+		SAMPLE_COUNT_1_BIT
+		SAMPLE_COUNT_4_BIT
+	maxSampleMaskWords                              = 1
+	timestampComputeAndGraphics                     = true
+	timestampPeriod                                 = 1
+	maxClipDistances                                = 8
+	maxCullDistances                                = 8
+	maxCombinedClipAndCullDistances                 = 8
+	discreteQueuePriorities                         = 2
+	pointSizeRange: count = 2
+		0
+		256
+	lineWidthRange: count = 2
+		1
+		255
+	pointSizeGranularity                            = 0.125
+	lineWidthGranularity                            = 0.0078125
+	strictLines                                     = true
+	standardSampleLocations                         = true
+	optimalBufferCopyOffsetAlignment                = 0x00000080
+	optimalBufferCopyRowPitchAlignment              = 0x00000080
+	nonCoherentAtomSize                             = 0x00000040
+
+VkPhysicalDeviceSparseProperties:
+---------------------------------
+	residencyStandard2DBlockShape            = true
+	residencyStandard2DMultisampleBlockShape = true
+	residencyStandard3DBlockShape            = true
+	residencyAlignedMipSize                  = false
+	residencyNonResidentStrict               = false
+
+VkPhysicalDeviceAccelerationStructurePropertiesKHR:
+---------------------------------------------------
+	maxGeometryCount                                           = 16777215
+	maxInstanceCount                                           = 16777215
+	maxPrimitiveCount                                          = 16777215
+	maxPerStageDescriptorAccelerationStructures                = 1000000
+	maxPerStageDescriptorUpdateAfterBindAccelerationStructures = 1000000
+	maxDescriptorSetAccelerationStructures                     = 1000000
+	maxDescriptorSetUpdateAfterBindAccelerationStructures      = 1000000
+	minAccelerationStructureScratchOffsetAlignment             = 8
+
+VkPhysicalDeviceComputeShaderDerivativesPropertiesKHR:
+------------------------------------------------------
+	meshAndTaskShaderDerivatives = true
+
+VkPhysicalDeviceCustomBorderColorPropertiesEXT:
+-----------------------------------------------
+	maxCustomBorderColorSamplers = 32768
+
+VkPhysicalDeviceDescriptorBufferDensityMapPropertiesEXT:
+--------------------------------------------------------
+	combinedImageSamplerDensityMapDescriptorSize = 0
+
+VkPhysicalDeviceDescriptorBufferPropertiesEXT:
+----------------------------------------------
+	combinedImageSamplerDescriptorSingleArray            = true
+	bufferlessPushDescriptors                            = true
+	allowSamplerImageViewPostSubmitCreation              = false
+	descriptorBufferOffsetAlignment                      = 0x00000004
+	maxDescriptorBufferBindings                          = 8
+	maxResourceDescriptorBufferBindings                  = 8
+	maxSamplerDescriptorBufferBindings                   = 8
+	maxEmbeddedImmutableSamplerBindings                  = 8
+	maxEmbeddedImmutableSamplers                         = 2032
+	bufferCaptureReplayDescriptorDataSize                = 0
+	imageCaptureReplayDescriptorDataSize                 = 0
+	imageViewCaptureReplayDescriptorDataSize             = 0
+	samplerCaptureReplayDescriptorDataSize               = 0
+	accelerationStructureCaptureReplayDescriptorDataSize = 0
+	samplerDescriptorSize                                = 256
+	combinedImageSamplerDescriptorSize                   = 256
+	sampledImageDescriptorSize                           = 256
+	storageImageDescriptorSize                           = 256
+	uniformTexelBufferDescriptorSize                     = 256
+	robustUniformTexelBufferDescriptorSize               = 256
+	storageTexelBufferDescriptorSize                     = 256
+	robustStorageTexelBufferDescriptorSize               = 256
+	uniformBufferDescriptorSize                          = 256
+	robustUniformBufferDescriptorSize                    = 256
+	storageBufferDescriptorSize                          = 256
+	robustStorageBufferDescriptorSize                    = 256
+	inputAttachmentDescriptorSize                        = 256
+	accelerationStructureDescriptorSize                  = 256
+	maxSamplerDescriptorBufferRange                      = 0xffffffff
+	maxResourceDescriptorBufferRange                     = 0xffffffff
+	samplerDescriptorBufferAddressSpaceSize              = 0xffffffff
+	resourceDescriptorBufferAddressSpaceSize             = 0xffffffff
+	descriptorBufferAddressSpaceSize                     = 0xffffffff
+
+VkPhysicalDeviceDeviceGeneratedCommandsPropertiesEXT:
+-----------------------------------------------------
+	maxIndirectPipelineCount                      = 4096
+	maxIndirectShaderObjectCount                  = 4096
+	maxIndirectSequenceCount                      = 1048576
+	maxIndirectCommandsTokenCount                 = 16
+	maxIndirectCommandsTokenOffset                = 2047
+	maxIndirectCommandsIndirectStride             = 2048
+	supportedIndirectCommandsInputModes: count = 2
+		INDIRECT_COMMANDS_INPUT_MODE_VULKAN_INDEX_BUFFER_EXT
+		INDIRECT_COMMANDS_INPUT_MODE_DXGI_INDEX_BUFFER_EXT
+	supportedIndirectCommandsShaderStages: count = 16
+		SHADER_STAGE_VERTEX_BIT
+		SHADER_STAGE_TESSELLATION_CONTROL_BIT
+		SHADER_STAGE_TESSELLATION_EVALUATION_BIT
+		SHADER_STAGE_GEOMETRY_BIT
+		SHADER_STAGE_FRAGMENT_BIT
+		SHADER_STAGE_COMPUTE_BIT
+		SHADER_STAGE_RAYGEN_BIT_KHR
+		SHADER_STAGE_ANY_HIT_BIT_KHR
+		SHADER_STAGE_CLOSEST_HIT_BIT_KHR
+		SHADER_STAGE_MISS_BIT_KHR
+		SHADER_STAGE_INTERSECTION_BIT_KHR
+		SHADER_STAGE_CALLABLE_BIT_KHR
+		SHADER_STAGE_TASK_BIT_EXT
+		SHADER_STAGE_MESH_BIT_EXT
+		SHADER_STAGE_SUBPASS_SHADING_BIT_HUAWEI
+		SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI
+	supportedIndirectCommandsShaderStagesPipelineBinding: count = 16
+		SHADER_STAGE_VERTEX_BIT
+		SHADER_STAGE_TESSELLATION_CONTROL_BIT
+		SHADER_STAGE_TESSELLATION_EVALUATION_BIT
+		SHADER_STAGE_GEOMETRY_BIT
+		SHADER_STAGE_FRAGMENT_BIT
+		SHADER_STAGE_COMPUTE_BIT
+		SHADER_STAGE_RAYGEN_BIT_KHR
+		SHADER_STAGE_ANY_HIT_BIT_KHR
+		SHADER_STAGE_CLOSEST_HIT_BIT_KHR
+		SHADER_STAGE_MISS_BIT_KHR
+		SHADER_STAGE_INTERSECTION_BIT_KHR
+		SHADER_STAGE_CALLABLE_BIT_KHR
+		SHADER_STAGE_TASK_BIT_EXT
+		SHADER_STAGE_MESH_BIT_EXT
+		SHADER_STAGE_SUBPASS_SHADING_BIT_HUAWEI
+		SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI
+	supportedIndirectCommandsShaderStagesShaderBinding: count = 16
+		SHADER_STAGE_VERTEX_BIT
+		SHADER_STAGE_TESSELLATION_CONTROL_BIT
+		SHADER_STAGE_TESSELLATION_EVALUATION_BIT
+		SHADER_STAGE_GEOMETRY_BIT
+		SHADER_STAGE_FRAGMENT_BIT
+		SHADER_STAGE_COMPUTE_BIT
+		SHADER_STAGE_RAYGEN_BIT_KHR
+		SHADER_STAGE_ANY_HIT_BIT_KHR
+		SHADER_STAGE_CLOSEST_HIT_BIT_KHR
+		SHADER_STAGE_MISS_BIT_KHR
+		SHADER_STAGE_INTERSECTION_BIT_KHR
+		SHADER_STAGE_CALLABLE_BIT_KHR
+		SHADER_STAGE_TASK_BIT_EXT
+		SHADER_STAGE_MESH_BIT_EXT
+		SHADER_STAGE_SUBPASS_SHADING_BIT_HUAWEI
+		SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI
+	deviceGeneratedCommandsTransformFeedback      = true
+	deviceGeneratedCommandsMultiDrawIndirectCount = true
+
+VkPhysicalDeviceExtendedDynamicState3PropertiesEXT:
+---------------------------------------------------
+	dynamicPrimitiveTopologyUnrestricted = true
+
+VkPhysicalDeviceExternalMemoryHostPropertiesEXT:
+------------------------------------------------
+	minImportedHostPointerAlignment = 0x00001000
+
+VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT:
+-----------------------------------------------------
+	graphicsPipelineLibraryFastLinking                        = true
+	graphicsPipelineLibraryIndependentInterpolationDecoration = true
+
+VkPhysicalDeviceLayeredApiPropertiesListKHR:
+--------------------------------------------
+	layeredApiCount               = 0
+	pLayeredApis                  = NULL
+
+VkPhysicalDeviceLegacyVertexAttributesPropertiesEXT:
+----------------------------------------------------
+	nativeUnalignedPerformance = true
+
+VkPhysicalDeviceMaintenance7PropertiesKHR:
+------------------------------------------
+	robustFragmentShadingRateAttachmentAccess                 = false
+	separateDepthStencilAttachmentAccess                      = true
+	maxDescriptorSetTotalUniformBuffersDynamic                = 1000000
+	maxDescriptorSetTotalStorageBuffersDynamic                = 1000000
+	maxDescriptorSetTotalBuffersDynamic                       = 1000000
+	maxDescriptorSetUpdateAfterBindTotalUniformBuffersDynamic = 1000000
+	maxDescriptorSetUpdateAfterBindTotalStorageBuffersDynamic = 1000000
+	maxDescriptorSetUpdateAfterBindTotalBuffersDynamic        = 1000000
+
+VkPhysicalDeviceMeshShaderPropertiesEXT:
+----------------------------------------
+	maxTaskWorkGroupTotalCount            = 4194304
+	maxTaskWorkGroupCount: count = 3
+		65536
+		65536
+		65536
+	maxTaskWorkGroupInvocations           = 1024
+	maxTaskWorkGroupSize: count = 3
+		1024
+		1024
+		1024
+	maxTaskPayloadSize                    = 16384
+	maxTaskSharedMemorySize               = 32768
+	maxTaskPayloadAndSharedMemorySize     = 32768
+	maxMeshWorkGroupTotalCount            = 4194304
+	maxMeshWorkGroupCount: count = 3
+		65536
+		65536
+		65536
+	maxMeshWorkGroupInvocations           = 1024
+	maxMeshWorkGroupSize: count = 3
+		1024
+		1024
+		1024
+	maxMeshSharedMemorySize               = 28672
+	maxMeshPayloadAndSharedMemorySize     = 45056
+	maxMeshOutputMemorySize               = 32768
+	maxMeshPayloadAndOutputMemorySize     = 49152
+	maxMeshOutputComponents               = 128
+	maxMeshOutputVertices                 = 256
+	maxMeshOutputPrimitives               = 256
+	maxMeshOutputLayers                   = 8
+	maxMeshMultiviewViewCount             = 0
+	meshOutputPerVertexGranularity        = 1
+	meshOutputPerPrimitiveGranularity     = 1
+	maxPreferredTaskWorkGroupInvocations  = 64
+	maxPreferredMeshWorkGroupInvocations  = 128
+	prefersLocalInvocationVertexOutput    = true
+	prefersLocalInvocationPrimitiveOutput = true
+	prefersCompactVertexOutput            = true
+	prefersCompactPrimitiveOutput         = false
+
+VkPhysicalDeviceMultiDrawPropertiesEXT:
+---------------------------------------
+	maxMultiDrawCount = 2048
+
+VkPhysicalDeviceNestedCommandBufferPropertiesEXT:
+-------------------------------------------------
+	maxCommandBufferNestingLevel = 4294967295
+
+VkPhysicalDeviceProvokingVertexPropertiesEXT:
+---------------------------------------------
+	provokingVertexModePerPipeline                       = true
+	transformFeedbackPreservesTriangleFanProvokingVertex = true
+
+VkPhysicalDeviceRayTracingPipelinePropertiesKHR:
+------------------------------------------------
+	shaderGroupHandleSize              = 32
+	maxRayRecursionDepth               = 31
+	maxShaderGroupStride               = 16384
+	shaderGroupBaseAlignment           = 32
+	shaderGroupHandleCaptureReplaySize = 0
+	maxRayDispatchInvocationCount      = 67108864
+	shaderGroupHandleAlignment         = 16
+	maxRayHitAttributeSize             = 32
+
+VkPhysicalDeviceRobustness2PropertiesEXT:
+-----------------------------------------
+	robustStorageBufferAccessSizeAlignment = 0x00000001
+	robustUniformBufferAccessSizeAlignment = 0x00000001
+
+VkPhysicalDeviceShaderObjectPropertiesEXT:
+------------------------------------------
+	shaderBinaryUUID     = 32352e30-2e37-2d32-2b72-707434616161
+	shaderBinaryVersion  = 1
+
+VkPhysicalDeviceTransformFeedbackPropertiesEXT:
+-----------------------------------------------
+	maxTransformFeedbackStreams                = 4
+	maxTransformFeedbackBuffers                = 4
+	maxTransformFeedbackBufferSize             = 0xffffffff
+	maxTransformFeedbackStreamDataSize         = 512
+	maxTransformFeedbackBufferDataSize         = 512
+	maxTransformFeedbackBufferDataStride       = 512
+	transformFeedbackQueries                   = true
+	transformFeedbackStreamsLinesTriangles     = false
+	transformFeedbackRasterizationStreamSelect = false
+	transformFeedbackDraw                      = true
+
+VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT:
+----------------------------------------------------
+	maxVertexAttribDivisor = 4294967295
+
+VkPhysicalDeviceVulkan11Properties:
+-----------------------------------
+	deviceUUID                        = 6d657361-3235-2e30-2e37-2d322b727000
+	driverUUID                        = 6c6c766d-7069-7065-5555-494400000000
+	deviceNodeMask                    = 0
+	deviceLUIDValid                   = false
+	subgroupSize                      = 4
+	subgroupSupportedStages: count = 4
+		SHADER_STAGE_FRAGMENT_BIT
+		SHADER_STAGE_COMPUTE_BIT
+		SHADER_STAGE_TASK_BIT_EXT
+		SHADER_STAGE_MESH_BIT_EXT
+	subgroupSupportedOperations: count = 10
+		SUBGROUP_FEATURE_BASIC_BIT
+		SUBGROUP_FEATURE_VOTE_BIT
+		SUBGROUP_FEATURE_ARITHMETIC_BIT
+		SUBGROUP_FEATURE_BALLOT_BIT
+		SUBGROUP_FEATURE_SHUFFLE_BIT
+		SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT
+		SUBGROUP_FEATURE_CLUSTERED_BIT
+		SUBGROUP_FEATURE_QUAD_BIT
+		SUBGROUP_FEATURE_ROTATE_BIT
+		SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT
+	subgroupQuadOperationsInAllStages = false
+	pointClippingBehavior             = POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES
+	maxMultiviewViewCount             = 6
+	maxMultiviewInstanceIndex         = 2147483647
+	protectedNoFault                  = false
+	maxPerSetDescriptors              = 1000000
+	maxMemoryAllocationSize           = 0x80000000
+
+VkPhysicalDeviceVulkan12Properties:
+-----------------------------------
+	driverID                                             = DRIVER_ID_MESA_LLVMPIPE
+	driverName                                           = llvmpipe
+	driverInfo                                           = Mesa 25.0.7-2+rpt4 (LLVM 19.1.7)
+	conformanceVersion:
+		major    = 1
+		minor    = 3
+		subminor = 1
+		patch    = 1
+	denormBehaviorIndependence                           = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL
+	roundingModeIndependence                             = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL
+	shaderSignedZeroInfNanPreserveFloat16                = true
+	shaderSignedZeroInfNanPreserveFloat32                = true
+	shaderSignedZeroInfNanPreserveFloat64                = true
+	shaderDenormPreserveFloat16                          = false
+	shaderDenormPreserveFloat32                          = false
+	shaderDenormPreserveFloat64                          = false
+	shaderDenormFlushToZeroFloat16                       = false
+	shaderDenormFlushToZeroFloat32                       = false
+	shaderDenormFlushToZeroFloat64                       = false
+	shaderRoundingModeRTEFloat16                         = true
+	shaderRoundingModeRTEFloat32                         = true
+	shaderRoundingModeRTEFloat64                         = true
+	shaderRoundingModeRTZFloat16                         = false
+	shaderRoundingModeRTZFloat32                         = false
+	shaderRoundingModeRTZFloat64                         = false
+	maxUpdateAfterBindDescriptorsInAllPools              = 4294967295
+	shaderUniformBufferArrayNonUniformIndexingNative     = true
+	shaderSampledImageArrayNonUniformIndexingNative      = true
+	shaderStorageBufferArrayNonUniformIndexingNative     = true
+	shaderStorageImageArrayNonUniformIndexingNative      = true
+	shaderInputAttachmentArrayNonUniformIndexingNative   = true
+	robustBufferAccessUpdateAfterBind                    = true
+	quadDivergentImplicitLod                             = true
+	maxPerStageDescriptorUpdateAfterBindSamplers         = 1000000
+	maxPerStageDescriptorUpdateAfterBindUniformBuffers   = 1000000
+	maxPerStageDescriptorUpdateAfterBindStorageBuffers   = 1000000
+	maxPerStageDescriptorUpdateAfterBindSampledImages    = 1000000
+	maxPerStageDescriptorUpdateAfterBindStorageImages    = 1000000
+	maxPerStageDescriptorUpdateAfterBindInputAttachments = 1000000
+	maxPerStageUpdateAfterBindResources                  = 1000000
+	maxDescriptorSetUpdateAfterBindSamplers              = 1000000
+	maxDescriptorSetUpdateAfterBindUniformBuffers        = 1000000
+	maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = 1000000
+	maxDescriptorSetUpdateAfterBindStorageBuffers        = 1000000
+	maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = 1000000
+	maxDescriptorSetUpdateAfterBindSampledImages         = 1000000
+	maxDescriptorSetUpdateAfterBindStorageImages         = 1000000
+	maxDescriptorSetUpdateAfterBindInputAttachments      = 1000000
+	supportedDepthResolveModes: count = 2
+		RESOLVE_MODE_SAMPLE_ZERO_BIT
+		RESOLVE_MODE_AVERAGE_BIT
+	supportedStencilResolveModes: count = 1
+		RESOLVE_MODE_SAMPLE_ZERO_BIT
+	independentResolveNone                               = false
+	independentResolve                                   = false
+	filterMinmaxSingleComponentFormats                   = true
+	filterMinmaxImageComponentMapping                    = true
+	maxTimelineSemaphoreValueDifference                  = 18446744073709551615
+	framebufferIntegerColorSampleCounts: count = 1
+		SAMPLE_COUNT_1_BIT
+
+VkPhysicalDeviceVulkan13Properties:
+-----------------------------------
+	minSubgroupSize                                                               = 4
+	maxSubgroupSize                                                               = 4
+	maxComputeWorkgroupSubgroups                                                  = 32
+	requiredSubgroupSizeStages: count = 2
+		SHADER_STAGE_FRAGMENT_BIT
+		SHADER_STAGE_COMPUTE_BIT
+	maxInlineUniformBlockSize                                                     = 4096
+	maxPerStageDescriptorInlineUniformBlocks                                      = 8
+	maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks                       = 8
+	maxDescriptorSetInlineUniformBlocks                                           = 8
+	maxDescriptorSetUpdateAfterBindInlineUniformBlocks                            = 8
+	maxInlineUniformTotalSize                                                     = 262144
+	integerDotProduct8BitUnsignedAccelerated                                      = false
+	integerDotProduct8BitSignedAccelerated                                        = false
+	integerDotProduct8BitMixedSignednessAccelerated                               = false
+	integerDotProduct4x8BitPackedUnsignedAccelerated                              = false
+	integerDotProduct4x8BitPackedSignedAccelerated                                = false
+	integerDotProduct4x8BitPackedMixedSignednessAccelerated                       = false
+	integerDotProduct16BitUnsignedAccelerated                                     = false
+	integerDotProduct16BitSignedAccelerated                                       = false
+	integerDotProduct16BitMixedSignednessAccelerated                              = false
+	integerDotProduct32BitUnsignedAccelerated                                     = false
+	integerDotProduct32BitSignedAccelerated                                       = false
+	integerDotProduct32BitMixedSignednessAccelerated                              = false
+	integerDotProduct64BitUnsignedAccelerated                                     = false
+	integerDotProduct64BitSignedAccelerated                                       = false
+	integerDotProduct64BitMixedSignednessAccelerated                              = false
+	integerDotProductAccumulatingSaturating8BitUnsignedAccelerated                = false
+	integerDotProductAccumulatingSaturating8BitSignedAccelerated                  = false
+	integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated         = false
+	integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated        = false
+	integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated          = false
+	integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false
+	integerDotProductAccumulatingSaturating16BitUnsignedAccelerated               = false
+	integerDotProductAccumulatingSaturating16BitSignedAccelerated                 = false
+	integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated        = false
+	integerDotProductAccumulatingSaturating32BitUnsignedAccelerated               = false
+	integerDotProductAccumulatingSaturating32BitSignedAccelerated                 = false
+	integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated        = false
+	integerDotProductAccumulatingSaturating64BitUnsignedAccelerated               = false
+	integerDotProductAccumulatingSaturating64BitSignedAccelerated                 = false
+	integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated        = false
+	storageTexelBufferOffsetAlignmentBytes                                        = 0x00000010
+	storageTexelBufferOffsetSingleTexelAlignment                                  = true
+	uniformTexelBufferOffsetAlignmentBytes                                        = 0x00000010
+	uniformTexelBufferOffsetSingleTexelAlignment                                  = true
+	maxBufferSize                                                                 = 0xffffffff
+
+VkPhysicalDeviceVulkan14Properties:
+-----------------------------------
+	lineSubPixelPrecisionBits                           = 8
+	maxVertexAttribDivisor                              = 4294967295
+	supportsNonZeroFirstInstance                        = false
+	maxPushDescriptors                                  = 32
+	dynamicRenderingLocalReadDepthStencilAttachments    = false
+	dynamicRenderingLocalReadMultisampledAttachments    = false
+	earlyFragmentMultisampleCoverageAfterSampleCounting = true
+	earlyFragmentSampleMaskTestBeforeSampleCounting     = false
+	depthStencilSwizzleOneSupport                       = false
+	polygonModePointSize                                = true
+	nonStrictSinglePixelWideLinesUseParallelogram       = false
+	nonStrictWideLinesUseParallelogram                  = false
+	blockTexelViewCompatibleMultipleLayers              = true
+	maxCombinedImageSamplerDescriptorCount              = 3
+	fragmentShadingRateClampCombinerInputs              = false
+	defaultRobustnessStorageBuffers                     = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2
+	defaultRobustnessUniformBuffers                     = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2
+	defaultRobustnessVertexInputs                       = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2
+	defaultRobustnessImages                             = PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2
+	copySrcLayoutCount                                  = 23
+	pCopySrcLayouts: count = 23
+		IMAGE_LAYOUT_GENERAL
+		IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
+		IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
+		IMAGE_LAYOUT_PREINITIALIZED
+		IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_PRESENT_SRC_KHR
+		IMAGE_LAYOUT_VIDEO_DECODE_DST_KHR
+		IMAGE_LAYOUT_VIDEO_DECODE_SRC_KHR
+		IMAGE_LAYOUT_VIDEO_DECODE_DPB_KHR
+		IMAGE_LAYOUT_SHARED_PRESENT_KHR
+		IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT
+		IMAGE_LAYOUT_FRAGMENT_SHADING_RATE_ATTACHMENT_OPTIMAL_KHR
+	copyDstLayoutCount                                  = 23
+	pCopyDstLayouts: count = 23
+		IMAGE_LAYOUT_GENERAL
+		IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
+		IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
+		IMAGE_LAYOUT_PREINITIALIZED
+		IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_READ_ONLY_OPTIMAL
+		IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
+		IMAGE_LAYOUT_PRESENT_SRC_KHR
+		IMAGE_LAYOUT_VIDEO_DECODE_DST_KHR
+		IMAGE_LAYOUT_VIDEO_DECODE_SRC_KHR
+		IMAGE_LAYOUT_VIDEO_DECODE_DPB_KHR
+		IMAGE_LAYOUT_SHARED_PRESENT_KHR
+		IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT
+		IMAGE_LAYOUT_FRAGMENT_SHADING_RATE_ATTACHMENT_OPTIMAL_KHR
+	optimalTilingLayoutUUID                             = 32352e30-2e37-2d32-2b72-707434616161
+	identicalMemoryTypeRequirements                     = false
+
+Device Extensions: count = 156
+	VK_AMDX_shader_enqueue                             : extension revision 2
+	VK_ANDROID_external_memory_android_hardware_buffer : extension revision 5
+	VK_ARM_rasterization_order_attachment_access       : extension revision 1
+	VK_EXT_4444_formats                                : extension revision 1
+	VK_EXT_attachment_feedback_loop_dynamic_state      : extension revision 1
+	VK_EXT_attachment_feedback_loop_layout             : extension revision 2
+	VK_EXT_border_color_swizzle                        : extension revision 1
+	VK_EXT_calibrated_timestamps                       : extension revision 2
+	VK_EXT_color_write_enable                          : extension revision 1
+	VK_EXT_conditional_rendering                       : extension revision 2
+	VK_EXT_custom_border_color                         : extension revision 12
+	VK_EXT_depth_clip_control                          : extension revision 1
+	VK_EXT_depth_clip_enable                           : extension revision 1
+	VK_EXT_depth_range_unrestricted                    : extension revision 1
+	VK_EXT_descriptor_buffer                           : extension revision 1
+	VK_EXT_descriptor_indexing                         : extension revision 2
+	VK_EXT_device_generated_commands                   : extension revision 1
+	VK_EXT_dynamic_rendering_unused_attachments        : extension revision 1
+	VK_EXT_extended_dynamic_state                      : extension revision 1
+	VK_EXT_extended_dynamic_state2                     : extension revision 1
+	VK_EXT_extended_dynamic_state3                     : extension revision 2
+	VK_EXT_external_memory_dma_buf                     : extension revision 1
+	VK_EXT_external_memory_host                        : extension revision 1
+	VK_EXT_graphics_pipeline_library                   : extension revision 1
+	VK_EXT_host_image_copy                             : extension revision 1
+	VK_EXT_host_query_reset                            : extension revision 1
+	VK_EXT_image_2d_view_of_3d                         : extension revision 1
+	VK_EXT_image_drm_format_modifier                   : extension revision 2
+	VK_EXT_image_robustness                            : extension revision 1
+	VK_EXT_image_sliced_view_of_3d                     : extension revision 1
+	VK_EXT_index_type_uint8                            : extension revision 1
+	VK_EXT_inline_uniform_block                        : extension revision 1
+	VK_EXT_legacy_vertex_attributes                    : extension revision 1
+	VK_EXT_line_rasterization                          : extension revision 1
+	VK_EXT_load_store_op_none                          : extension revision 1
+	VK_EXT_memory_budget                               : extension revision 1
+	VK_EXT_memory_priority                             : extension revision 1
+	VK_EXT_mesh_shader                                 : extension revision 1
+	VK_EXT_multi_draw                                  : extension revision 1
+	VK_EXT_multisampled_render_to_single_sampled       : extension revision 1
+	VK_EXT_mutable_descriptor_type                     : extension revision 1
+	VK_EXT_nested_command_buffer                       : extension revision 1
+	VK_EXT_non_seamless_cube_map                       : extension revision 1
+	VK_EXT_pageable_device_local_memory                : extension revision 1
+	VK_EXT_pipeline_creation_cache_control             : extension revision 3
+	VK_EXT_pipeline_creation_feedback                  : extension revision 1
+	VK_EXT_pipeline_library_group_handles              : extension revision 1
+	VK_EXT_pipeline_protected_access                   : extension revision 1
+	VK_EXT_pipeline_robustness                         : extension revision 1
+	VK_EXT_post_depth_coverage                         : extension revision 1
+	VK_EXT_primitive_topology_list_restart             : extension revision 1
+	VK_EXT_primitives_generated_query                  : extension revision 1
+	VK_EXT_private_data                                : extension revision 1
+	VK_EXT_provoking_vertex                            : extension revision 1
+	VK_EXT_queue_family_foreign                        : extension revision 1
+	VK_EXT_rasterization_order_attachment_access       : extension revision 1
+	VK_EXT_robustness2                                 : extension revision 1
+	VK_EXT_sampler_filter_minmax                       : extension revision 2
+	VK_EXT_scalar_block_layout                         : extension revision 1
+	VK_EXT_separate_stencil_usage                      : extension revision 1
+	VK_EXT_shader_atomic_float                         : extension revision 1
+	VK_EXT_shader_atomic_float2                        : extension revision 1
+	VK_EXT_shader_demote_to_helper_invocation          : extension revision 1
+	VK_EXT_shader_object                               : extension revision 1
+	VK_EXT_shader_replicated_composites                : extension revision 1
+	VK_EXT_shader_stencil_export                       : extension revision 1
+	VK_EXT_shader_subgroup_ballot                      : extension revision 1
+	VK_EXT_shader_subgroup_vote                        : extension revision 1
+	VK_EXT_shader_viewport_index_layer                 : extension revision 1
+	VK_EXT_subgroup_size_control                       : extension revision 2
+	VK_EXT_swapchain_maintenance1                      : extension revision 1
+	VK_EXT_texel_buffer_alignment                      : extension revision 1
+	VK_EXT_transform_feedback                          : extension revision 1
+	VK_EXT_vertex_attribute_divisor                    : extension revision 3
+	VK_EXT_vertex_input_dynamic_state                  : extension revision 2
+	VK_EXT_ycbcr_2plane_444_formats                    : extension revision 1
+	VK_EXT_ycbcr_image_arrays                          : extension revision 1
+	VK_GOOGLE_decorate_string                          : extension revision 1
+	VK_GOOGLE_hlsl_functionality1                      : extension revision 1
+	VK_KHR_16bit_storage                               : extension revision 1
+	VK_KHR_8bit_storage                                : extension revision 1
+	VK_KHR_acceleration_structure                      : extension revision 13
+	VK_KHR_bind_memory2                                : extension revision 1
+	VK_KHR_buffer_device_address                       : extension revision 1
+	VK_KHR_compute_shader_derivatives                  : extension revision 1
+	VK_KHR_copy_commands2                              : extension revision 1
+	VK_KHR_create_renderpass2                          : extension revision 1
+	VK_KHR_dedicated_allocation                        : extension revision 3
+	VK_KHR_deferred_host_operations                    : extension revision 4
+	VK_KHR_depth_stencil_resolve                       : extension revision 1
+	VK_KHR_descriptor_update_template                  : extension revision 1
+	VK_KHR_device_group                                : extension revision 4
+	VK_KHR_draw_indirect_count                         : extension revision 1
+	VK_KHR_driver_properties                           : extension revision 1
+	VK_KHR_dynamic_rendering                           : extension revision 1
+	VK_KHR_dynamic_rendering_local_read                : extension revision 1
+	VK_KHR_external_fence                              : extension revision 1
+	VK_KHR_external_fence_fd                           : extension revision 1
+	VK_KHR_external_memory                             : extension revision 1
+	VK_KHR_external_memory_fd                          : extension revision 1
+	VK_KHR_external_semaphore                          : extension revision 1
+	VK_KHR_external_semaphore_fd                       : extension revision 1
+	VK_KHR_format_feature_flags2                       : extension revision 2
+	VK_KHR_get_memory_requirements2                    : extension revision 1
+	VK_KHR_global_priority                             : extension revision 1
+	VK_KHR_image_format_list                           : extension revision 1
+	VK_KHR_imageless_framebuffer                       : extension revision 1
+	VK_KHR_incremental_present                         : extension revision 2
+	VK_KHR_index_type_uint8                            : extension revision 1
+	VK_KHR_line_rasterization                          : extension revision 1
+	VK_KHR_load_store_op_none                          : extension revision 1
+	VK_KHR_maintenance1                                : extension revision 2
+	VK_KHR_maintenance2                                : extension revision 1
+	VK_KHR_maintenance3                                : extension revision 1
+	VK_KHR_maintenance4                                : extension revision 2
+	VK_KHR_maintenance5                                : extension revision 1
+	VK_KHR_maintenance6                                : extension revision 1
+	VK_KHR_maintenance7                                : extension revision 1
+	VK_KHR_maintenance8                                : extension revision 1
+	VK_KHR_map_memory2                                 : extension revision 1
+	VK_KHR_multiview                                   : extension revision 1
+	VK_KHR_pipeline_library                            : extension revision 1
+	VK_KHR_push_descriptor                             : extension revision 2
+	VK_KHR_ray_query                                   : extension revision 1
+	VK_KHR_ray_tracing_maintenance1                    : extension revision 1
+	VK_KHR_ray_tracing_pipeline                        : extension revision 1
+	VK_KHR_ray_tracing_position_fetch                  : extension revision 1
+	VK_KHR_relaxed_block_layout                        : extension revision 1
+	VK_KHR_sampler_mirror_clamp_to_edge                : extension revision 3
+	VK_KHR_sampler_ycbcr_conversion                    : extension revision 14
+	VK_KHR_separate_depth_stencil_layouts              : extension revision 1
+	VK_KHR_shader_atomic_int64                         : extension revision 1
+	VK_KHR_shader_clock                                : extension revision 1
+	VK_KHR_shader_draw_parameters                      : extension revision 1
+	VK_KHR_shader_expect_assume                        : extension revision 1
+	VK_KHR_shader_float16_int8                         : extension revision 1
+	VK_KHR_shader_float_controls                       : extension revision 4
+	VK_KHR_shader_float_controls2                      : extension revision 1
+	VK_KHR_shader_integer_dot_product                  : extension revision 1
+	VK_KHR_shader_maximal_reconvergence                : extension revision 1
+	VK_KHR_shader_non_semantic_info                    : extension revision 1
+	VK_KHR_shader_relaxed_extended_instruction         : extension revision 1
+	VK_KHR_shader_subgroup_extended_types              : extension revision 1
+	VK_KHR_shader_subgroup_rotate                      : extension revision 2
+	VK_KHR_shader_terminate_invocation                 : extension revision 1
+	VK_KHR_spirv_1_4                                   : extension revision 1
+	VK_KHR_storage_buffer_storage_class                : extension revision 1
+	VK_KHR_swapchain                                   : extension revision 70
+	VK_KHR_swapchain_mutable_format                    : extension revision 1
+	VK_KHR_synchronization2                            : extension revision 1
+	VK_KHR_timeline_semaphore                          : extension revision 2
+	VK_KHR_uniform_buffer_standard_layout              : extension revision 1
+	VK_KHR_variable_pointers                           : extension revision 1
+	VK_KHR_vertex_attribute_divisor                    : extension revision 1
+	VK_KHR_vulkan_memory_model                         : extension revision 3
+	VK_KHR_zero_initialize_workgroup_memory            : extension revision 1
+
+VkQueueFamilyProperties:
+========================
+	queueProperties[0]:
+	-------------------
+		minImageTransferGranularity = (1,1,1)
+		queueCount                  = 1
+		queueFlags                  = QUEUE_GRAPHICS_BIT | QUEUE_COMPUTE_BIT | QUEUE_TRANSFER_BIT | QUEUE_SPARSE_BINDING_BIT
+		timestampValidBits          = 64
+		present support             = false
+
+VkPhysicalDeviceMemoryProperties:
+=================================
+memoryHeaps: count = 1
+	memoryHeaps[0]:
+		size   = 8454619136 (0x1f7ef4000) (7.87 GiB)
+		budget = 8454619136 (0x1f7ef4000) (7.87 GiB)
+		usage  = 4796039168 (0x11dddc000) (4.47 GiB)
+		flags: count = 1
+			MEMORY_HEAP_DEVICE_LOCAL_BIT
+memoryTypes: count = 1
+	memoryTypes[0]:
+		heapIndex     = 0
+		propertyFlags = 0x000f: count = 4
+			MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+			MEMORY_PROPERTY_HOST_VISIBLE_BIT
+			MEMORY_PROPERTY_HOST_COHERENT_BIT
+			MEMORY_PROPERTY_HOST_CACHED_BIT
+		usable for:
+			IMAGE_TILING_OPTIMAL:
+				color images
+				FORMAT_D16_UNORM
+				FORMAT_X8_D24_UNORM_PACK32
+				FORMAT_D32_SFLOAT
+				FORMAT_S8_UINT
+				FORMAT_D24_UNORM_S8_UINT
+				FORMAT_D32_SFLOAT_S8_UINT
+			IMAGE_TILING_LINEAR:
+				color images
+
+VkPhysicalDeviceFeatures:
+=========================
+	robustBufferAccess                      = true
+	fullDrawIndexUint32                     = true
+	imageCubeArray                          = true
+	independentBlend                        = true
+	geometryShader                          = true
+	tessellationShader                      = true
+	sampleRateShading                       = true
+	dualSrcBlend                            = true
+	logicOp                                 = true
+	multiDrawIndirect                       = true
+	drawIndirectFirstInstance               = true
+	depthClamp                              = true
+	depthBiasClamp                          = true
+	fillModeNonSolid                        = true
+	depthBounds                             = false
+	wideLines                               = true
+	largePoints                             = true
+	alphaToOne                              = true
+	multiViewport                           = true
+	samplerAnisotropy                       = true
+	textureCompressionETC2                  = false
+	textureCompressionASTC_LDR              = false
+	textureCompressionBC                    = true
+	occlusionQueryPrecise                   = true
+	pipelineStatisticsQuery                 = true
+	vertexPipelineStoresAndAtomics          = true
+	fragmentStoresAndAtomics                = true
+	shaderTessellationAndGeometryPointSize  = true
+	shaderImageGatherExtended               = true
+	shaderStorageImageExtendedFormats       = true
+	shaderStorageImageMultisample           = true
+	shaderStorageImageReadWithoutFormat     = true
+	shaderStorageImageWriteWithoutFormat    = true
+	shaderUniformBufferArrayDynamicIndexing = true
+	shaderSampledImageArrayDynamicIndexing  = true
+	shaderStorageBufferArrayDynamicIndexing = true
+	shaderStorageImageArrayDynamicIndexing  = true
+	shaderClipDistance                      = true
+	shaderCullDistance                      = true
+	shaderFloat64                           = true
+	shaderInt64                             = true
+	shaderInt16                             = true
+	shaderResourceResidency                 = true
+	shaderResourceMinLod                    = false
+	sparseBinding                           = true
+	sparseResidencyBuffer                   = true
+	sparseResidencyImage2D                  = true
+	sparseResidencyImage3D                  = true
+	sparseResidency2Samples                 = false
+	sparseResidency4Samples                 = false
+	sparseResidency8Samples                 = false
+	sparseResidency16Samples                = false
+	sparseResidencyAliased                  = true
+	variableMultisampleRate                 = false
+	inheritedQueries                        = false
+
+VkPhysicalDevice4444FormatsFeaturesEXT:
+---------------------------------------
+	formatA4R4G4B4 = true
+	formatA4B4G4R4 = true
+
+VkPhysicalDeviceAccelerationStructureFeaturesKHR:
+-------------------------------------------------
+	accelerationStructure                                 = true
+	accelerationStructureCaptureReplay                    = false
+	accelerationStructureIndirectBuild                    = false
+	accelerationStructureHostCommands                     = false
+	descriptorBindingAccelerationStructureUpdateAfterBind = true
+
+VkPhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT:
+--------------------------------------------------------------
+	attachmentFeedbackLoopDynamicState = true
+
+VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT:
+--------------------------------------------------------
+	attachmentFeedbackLoopLayout = true
+
+VkPhysicalDeviceBorderColorSwizzleFeaturesEXT:
+----------------------------------------------
+	borderColorSwizzle          = true
+	borderColorSwizzleFromImage = true
+
+VkPhysicalDeviceColorWriteEnableFeaturesEXT:
+--------------------------------------------
+	colorWriteEnable = true
+
+VkPhysicalDeviceComputeShaderDerivativesFeaturesKHR:
+----------------------------------------------------
+	computeDerivativeGroupQuads  = true
+	computeDerivativeGroupLinear = true
+
+VkPhysicalDeviceConditionalRenderingFeaturesEXT:
+------------------------------------------------
+	conditionalRendering          = true
+	inheritedConditionalRendering = false
+
+VkPhysicalDeviceCustomBorderColorFeaturesEXT:
+---------------------------------------------
+	customBorderColors             = true
+	customBorderColorWithoutFormat = true
+
+VkPhysicalDeviceDepthClipControlFeaturesEXT:
+--------------------------------------------
+	depthClipControl = true
+
+VkPhysicalDeviceDepthClipEnableFeaturesEXT:
+-------------------------------------------
+	depthClipEnable = true
+
+VkPhysicalDeviceDescriptorBufferFeaturesEXT:
+--------------------------------------------
+	descriptorBuffer                   = true
+	descriptorBufferCaptureReplay      = false
+	descriptorBufferImageLayoutIgnored = true
+	descriptorBufferPushDescriptors    = true
+
+VkPhysicalDeviceDeviceGeneratedCommandsFeaturesEXT:
+---------------------------------------------------
+	deviceGeneratedCommands        = true
+	dynamicGeneratedPipelineLayout = true
+
+VkPhysicalDeviceDynamicRenderingUnusedAttachmentsFeaturesEXT:
+-------------------------------------------------------------
+	dynamicRenderingUnusedAttachments = true
+
+VkPhysicalDeviceExtendedDynamicState2FeaturesEXT:
+-------------------------------------------------
+	extendedDynamicState2                   = true
+	extendedDynamicState2LogicOp            = true
+	extendedDynamicState2PatchControlPoints = true
+
+VkPhysicalDeviceExtendedDynamicState3FeaturesEXT:
+-------------------------------------------------
+	extendedDynamicState3TessellationDomainOrigin         = true
+	extendedDynamicState3DepthClampEnable                 = true
+	extendedDynamicState3PolygonMode                      = true
+	extendedDynamicState3RasterizationSamples             = true
+	extendedDynamicState3SampleMask                       = true
+	extendedDynamicState3AlphaToCoverageEnable            = true
+	extendedDynamicState3AlphaToOneEnable                 = true
+	extendedDynamicState3LogicOpEnable                    = true
+	extendedDynamicState3ColorBlendEnable                 = true
+	extendedDynamicState3ColorBlendEquation               = true
+	extendedDynamicState3ColorWriteMask                   = true
+	extendedDynamicState3RasterizationStream              = false
+	extendedDynamicState3ConservativeRasterizationMode    = false
+	extendedDynamicState3ExtraPrimitiveOverestimationSize = false
+	extendedDynamicState3DepthClipEnable                  = true
+	extendedDynamicState3SampleLocationsEnable            = false
+	extendedDynamicState3ColorBlendAdvanced               = false
+	extendedDynamicState3ProvokingVertexMode              = true
+	extendedDynamicState3LineRasterizationMode            = true
+	extendedDynamicState3LineStippleEnable                = true
+	extendedDynamicState3DepthClipNegativeOneToOne        = true
+	extendedDynamicState3ViewportWScalingEnable           = false
+	extendedDynamicState3ViewportSwizzle                  = false
+	extendedDynamicState3CoverageToColorEnable            = false
+	extendedDynamicState3CoverageToColorLocation          = false
+	extendedDynamicState3CoverageModulationMode           = false
+	extendedDynamicState3CoverageModulationTableEnable    = false
+	extendedDynamicState3CoverageModulationTable          = false
+	extendedDynamicState3CoverageReductionMode            = false
+	extendedDynamicState3RepresentativeFragmentTestEnable = false
+	extendedDynamicState3ShadingRateImageEnable           = false
+
+VkPhysicalDeviceExtendedDynamicStateFeaturesEXT:
+------------------------------------------------
+	extendedDynamicState = true
+
+VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT:
+---------------------------------------------------
+	graphicsPipelineLibrary = true
+
+VkPhysicalDeviceImage2DViewOf3DFeaturesEXT:
+-------------------------------------------
+	image2DViewOf3D   = true
+	sampler2DViewOf3D = true
+
+VkPhysicalDeviceImageSlicedViewOf3DFeaturesEXT:
+-----------------------------------------------
+	imageSlicedViewOf3D = true
+
+VkPhysicalDeviceLegacyVertexAttributesFeaturesEXT:
+--------------------------------------------------
+	legacyVertexAttributes = true
+
+VkPhysicalDeviceMaintenance7FeaturesKHR:
+----------------------------------------
+	maintenance7 = true
+
+VkPhysicalDeviceMemoryPriorityFeaturesEXT:
+------------------------------------------
+	memoryPriority = true
+
+VkPhysicalDeviceMeshShaderFeaturesEXT:
+--------------------------------------
+	taskShader                             = true
+	meshShader                             = true
+	multiviewMeshShader                    = false
+	primitiveFragmentShadingRateMeshShader = false
+	meshShaderQueries                      = true
+
+VkPhysicalDeviceMultiDrawFeaturesEXT:
+-------------------------------------
+	multiDraw = true
+
+VkPhysicalDeviceMultisampledRenderToSingleSampledFeaturesEXT:
+-------------------------------------------------------------
+	multisampledRenderToSingleSampled = true
+
+VkPhysicalDeviceMutableDescriptorTypeFeaturesEXT:
+-------------------------------------------------
+	mutableDescriptorType = true
+
+VkPhysicalDeviceNestedCommandBufferFeaturesEXT:
+-----------------------------------------------
+	nestedCommandBuffer                = true
+	nestedCommandBufferRendering       = true
+	nestedCommandBufferSimultaneousUse = true
+
+VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT:
+----------------------------------------------
+	nonSeamlessCubeMap = true
+
+VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT:
+-----------------------------------------------------
+	pageableDeviceLocalMemory = true
+
+VkPhysicalDevicePipelineLibraryGroupHandlesFeaturesEXT:
+-------------------------------------------------------
+	pipelineLibraryGroupHandles = true
+
+VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT:
+--------------------------------------------------------
+	primitiveTopologyListRestart      = true
+	primitiveTopologyPatchListRestart = true
+
+VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT:
+----------------------------------------------------
+	primitivesGeneratedQuery                      = true
+	primitivesGeneratedQueryWithRasterizerDiscard = true
+	primitivesGeneratedQueryWithNonZeroStreams    = true
+
+VkPhysicalDeviceProvokingVertexFeaturesEXT:
+-------------------------------------------
+	provokingVertexLast                       = true
+	transformFeedbackPreservesProvokingVertex = true
+
+VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT:
+--------------------------------------------------------------
+	rasterizationOrderColorAttachmentAccess   = true
+	rasterizationOrderDepthAttachmentAccess   = true
+	rasterizationOrderStencilAttachmentAccess = true
+
+VkPhysicalDeviceRayQueryFeaturesKHR:
+------------------------------------
+	rayQuery = true
+
+VkPhysicalDeviceRayTracingMaintenance1FeaturesKHR:
+--------------------------------------------------
+	rayTracingMaintenance1               = true
+	rayTracingPipelineTraceRaysIndirect2 = true
+
+VkPhysicalDeviceRayTracingPipelineFeaturesKHR:
+----------------------------------------------
+	rayTracingPipeline                                    = true
+	rayTracingPipelineShaderGroupHandleCaptureReplay      = false
+	rayTracingPipelineShaderGroupHandleCaptureReplayMixed = false
+	rayTracingPipelineTraceRaysIndirect                   = true
+	rayTraversalPrimitiveCulling                          = true
+
+VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR:
+---------------------------------------------------
+	rayTracingPositionFetch = true
+
+VkPhysicalDeviceRobustness2FeaturesEXT:
+---------------------------------------
+	robustBufferAccess2 = true
+	robustImageAccess2  = true
+	nullDescriptor      = true
+
+VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT:
+----------------------------------------------
+	shaderBufferFloat16Atomics      = false
+	shaderBufferFloat16AtomicAdd    = false
+	shaderBufferFloat16AtomicMinMax = false
+	shaderBufferFloat32AtomicMinMax = true
+	shaderBufferFloat64AtomicMinMax = false
+	shaderSharedFloat16Atomics      = false
+	shaderSharedFloat16AtomicAdd    = false
+	shaderSharedFloat16AtomicMinMax = false
+	shaderSharedFloat32AtomicMinMax = true
+	shaderSharedFloat64AtomicMinMax = false
+	shaderImageFloat32AtomicMinMax  = true
+	sparseImageFloat32AtomicMinMax  = false
+
+VkPhysicalDeviceShaderAtomicFloatFeaturesEXT:
+---------------------------------------------
+	shaderBufferFloat32Atomics   = true
+	shaderBufferFloat32AtomicAdd = true
+	shaderBufferFloat64Atomics   = false
+	shaderBufferFloat64AtomicAdd = false
+	shaderSharedFloat32Atomics   = true
+	shaderSharedFloat32AtomicAdd = true
+	shaderSharedFloat64Atomics   = false
+	shaderSharedFloat64AtomicAdd = false
+	shaderImageFloat32Atomics    = true
+	shaderImageFloat32AtomicAdd  = true
+	sparseImageFloat32Atomics    = true
+	sparseImageFloat32AtomicAdd  = true
+
+VkPhysicalDeviceShaderClockFeaturesKHR:
+---------------------------------------
+	shaderSubgroupClock = true
+	shaderDeviceClock   = true
+
+VkPhysicalDeviceShaderMaximalReconvergenceFeaturesKHR:
+------------------------------------------------------
+	shaderMaximalReconvergence = true
+
+VkPhysicalDeviceShaderObjectFeaturesEXT:
+----------------------------------------
+	shaderObject = true
+
+VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR:
+------------------------------------------------------------
+	shaderRelaxedExtendedInstruction = true
+
+VkPhysicalDeviceShaderReplicatedCompositesFeaturesEXT:
+------------------------------------------------------
+	shaderReplicatedComposites = true
+
+VkPhysicalDeviceSwapchainMaintenance1FeaturesEXT:
+-------------------------------------------------
+	swapchainMaintenance1 = true
+
+VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT:
+------------------------------------------------
+	texelBufferAlignment = true
+
+VkPhysicalDeviceTransformFeedbackFeaturesEXT:
+---------------------------------------------
+	transformFeedback = true
+	geometryStreams   = true
+
+VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT:
+---------------------------------------------------
+	vertexInputDynamicState = true
+
+VkPhysicalDeviceVulkan11Features:
+---------------------------------
+	storageBuffer16BitAccess           = true
+	uniformAndStorageBuffer16BitAccess = true
+	storagePushConstant16              = true
+	storageInputOutput16               = false
+	multiview                          = true
+	multiviewGeometryShader            = true
+	multiviewTessellationShader        = true
+	variablePointersStorageBuffer      = true
+	variablePointers                   = true
+	protectedMemory                    = false
+	samplerYcbcrConversion             = true
+	shaderDrawParameters               = true
+
+VkPhysicalDeviceVulkan12Features:
+---------------------------------
+	samplerMirrorClampToEdge                           = true
+	drawIndirectCount                                  = true
+	storageBuffer8BitAccess                            = true
+	uniformAndStorageBuffer8BitAccess                  = true
+	storagePushConstant8                               = true
+	shaderBufferInt64Atomics                           = true
+	shaderSharedInt64Atomics                           = true
+	shaderFloat16                                      = true
+	shaderInt8                                         = true
+	descriptorIndexing                                 = true
+	shaderInputAttachmentArrayDynamicIndexing          = true
+	shaderUniformTexelBufferArrayDynamicIndexing       = true
+	shaderStorageTexelBufferArrayDynamicIndexing       = true
+	shaderUniformBufferArrayNonUniformIndexing         = true
+	shaderSampledImageArrayNonUniformIndexing          = true
+	shaderStorageBufferArrayNonUniformIndexing         = true
+	shaderStorageImageArrayNonUniformIndexing          = true
+	shaderInputAttachmentArrayNonUniformIndexing       = true
+	shaderUniformTexelBufferArrayNonUniformIndexing    = true
+	shaderStorageTexelBufferArrayNonUniformIndexing    = true
+	descriptorBindingUniformBufferUpdateAfterBind      = true
+	descriptorBindingSampledImageUpdateAfterBind       = true
+	descriptorBindingStorageImageUpdateAfterBind       = true
+	descriptorBindingStorageBufferUpdateAfterBind      = true
+	descriptorBindingUniformTexelBufferUpdateAfterBind = true
+	descriptorBindingStorageTexelBufferUpdateAfterBind = true
+	descriptorBindingUpdateUnusedWhilePending          = true
+	descriptorBindingPartiallyBound                    = true
+	descriptorBindingVariableDescriptorCount           = true
+	runtimeDescriptorArray                             = true
+	samplerFilterMinmax                                = true
+	scalarBlockLayout                                  = true
+	imagelessFramebuffer                               = true
+	uniformBufferStandardLayout                        = true
+	shaderSubgroupExtendedTypes                        = true
+	separateDepthStencilLayouts                        = true
+	hostQueryReset                                     = true
+	timelineSemaphore                                  = true
+	bufferDeviceAddress                                = true
+	bufferDeviceAddressCaptureReplay                   = false
+	bufferDeviceAddressMultiDevice                     = false
+	vulkanMemoryModel                                  = true
+	vulkanMemoryModelDeviceScope                       = true
+	vulkanMemoryModelAvailabilityVisibilityChains      = true
+	shaderOutputViewportIndex                          = true
+	shaderOutputLayer                                  = true
+	subgroupBroadcastDynamicId                         = true
+
+VkPhysicalDeviceVulkan13Features:
+---------------------------------
+	robustImageAccess                                  = true
+	inlineUniformBlock                                 = true
+	descriptorBindingInlineUniformBlockUpdateAfterBind = true
+	pipelineCreationCacheControl                       = true
+	privateData                                        = true
+	shaderDemoteToHelperInvocation                     = true
+	shaderTerminateInvocation                          = true
+	subgroupSizeControl                                = true
+	computeFullSubgroups                               = true
+	synchronization2                                   = true
+	textureCompressionASTC_HDR                         = false
+	shaderZeroInitializeWorkgroupMemory                = true
+	dynamicRendering                                   = true
+	shaderIntegerDotProduct                            = true
+	maintenance4                                       = true
+
+VkPhysicalDeviceVulkan14Features:
+---------------------------------
+	globalPriorityQuery                    = true
+	shaderSubgroupRotate                   = true
+	shaderSubgroupRotateClustered          = true
+	shaderFloatControls2                   = true
+	shaderExpectAssume                     = true
+	rectangularLines                       = true
+	bresenhamLines                         = true
+	smoothLines                            = true
+	stippledRectangularLines               = true
+	stippledBresenhamLines                 = true
+	stippledSmoothLines                    = true
+	vertexAttributeInstanceRateDivisor     = true
+	vertexAttributeInstanceRateZeroDivisor = true
+	indexTypeUint8                         = true
+	dynamicRenderingLocalRead              = true
+	maintenance5                           = true
+	maintenance6                           = true
+	pipelineProtectedAccess                = true
+	pipelineRobustness                     = true
+	hostImageCopy                          = true
+	pushDescriptor                         = true
+
+VkPhysicalDeviceYcbcr2Plane444FormatsFeaturesEXT:
+-------------------------------------------------
+	ycbcr2plane444Formats = true
+
+VkPhysicalDeviceYcbcrImageArraysFeaturesEXT:
+--------------------------------------------
+	ycbcrImageArrays = true
+
+
diff --git a/external/ffmpeg-snapshot/COPYING.LGPLv2.1 b/external/ffmpeg-snapshot/COPYING.LGPLv2.1
new file mode 100644
index 0000000..58af0d3
--- /dev/null
+++ b/external/ffmpeg-snapshot/COPYING.LGPLv2.1
@@ -0,0 +1,502 @@
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/external/ffmpeg-snapshot/PROVENANCE.md b/external/ffmpeg-snapshot/PROVENANCE.md
new file mode 100644
index 0000000..e04d294
--- /dev/null
+++ b/external/ffmpeg-snapshot/PROVENANCE.md
@@ -0,0 +1,92 @@
+# FFmpeg source snapshot
+
+Verbatim subset of FFmpeg source pinned for use as reference
+implementations of the VP9 8×8 inverse DCT (Phase 1 target of
+`daedalus-fourier`). See `../../docs/phase2.md §2` and `§5` for
+the rationale.
+
+## Upstream pin
+
+- **Repository**: https://github.com/FFmpeg/FFmpeg
+- **Tag**: `n7.1.3` (matches `libavcodec61 8:7.1.3-0+deb13u1+rpt1`
+  shipping in Debian Trixie on the dev host `hertz`)
+- **Annotated tag object**: `0a9a757e96fdf053697084bbd1f620edeac9d084`
+- **Commit object (tag target)**: `f46e514491172d15bd74b4abb1814cd2f05a763e`
+- **Snapshot fetched**: 2026-05-18 (UTC), via
+  `https://raw.githubusercontent.com/FFmpeg/FFmpeg/n7.1.3/<path>`
+
+## Files in this snapshot
+
+All files listed below are byte-for-byte copies of the upstream source
+at the tagged commit. `config.h` (not listed) is a local build shim.
+
+| Path | Lines | Bytes | SHA-256 |
+|---|---|---|---|
+| `libavcodec/vp9dsp_template.c` | 2578 | 89045 | `41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f` |
+| `libavcodec/aarch64/vp9itxfm_neon.S` | 1580 | 63534 | `82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6` |
+| `libavcodec/aarch64/neon.S` | 173 | 7496 | `72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538` |
+| `libavutil/aarch64/asm.S` | 260 | 8069 | `c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3` |
+| `COPYING.LGPLv2.1` | 502 | — | `b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe` |
+
+Verify with:
+
+```sh
+( cd external/ffmpeg-snapshot && sha256sum -c <<'EOF'
+41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f  libavcodec/vp9dsp_template.c
+82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6  libavcodec/aarch64/vp9itxfm_neon.S
+72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538  libavcodec/aarch64/neon.S
+c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3  libavutil/aarch64/asm.S
+b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe  COPYING.LGPLv2.1
+EOF
+)
+```
+
+## License
+
+LGPL-2.1-or-later. See `COPYING.LGPLv2.1`. Copyright holders
+include the FFmpeg authors, Google Inc. (2016, `vp9itxfm_neon.S`)
+and J. Dekker (2023, `neon.S`). The snapshot inherits FFmpeg's
+license in full.
+
+## Why each file is in this snapshot
+
+- `libavcodec/vp9dsp_template.c` — contains `idct_idct_8x8_add_c`,
+  the bit-exact C reference for the Phase 1 kernel under test (M1).
+- `libavcodec/aarch64/vp9itxfm_neon.S` — contains
+  `ff_vp9_idct_idct_8x8_add_neon`, the NEON throughput baseline
+  (M3). Also defines `idct8`, `dmbutterfly0`, `dmbutterfly`,
+  `dmbutterfly_l`, `butterfly_8h`, and the `idct_coeffs` constant
+  table.
+- `libavcodec/aarch64/neon.S` — defines `transpose_8x8H` used by
+  `vp9itxfm_neon.S`.
+- `libavutil/aarch64/asm.S` — defines `function`, `endfunc`,
+  `movrel`, `const`, `endconst`, and other assembly preamble
+  macros required to assemble the above NEON files.
+
+## Re-vendoring procedure
+
+If the upstream pin needs to change (e.g., hertz updates to a
+newer libavcodec):
+
+```sh
+TAG=nX.Y.Z
+BASE=https://raw.githubusercontent.com/FFmpeg/FFmpeg/$TAG
+cd external/ffmpeg-snapshot
+for f in libavcodec/vp9dsp_template.c \
+         libavcodec/aarch64/vp9itxfm_neon.S \
+         libavcodec/aarch64/neon.S \
+         libavutil/aarch64/asm.S \
+         COPYING.LGPLv2.1; do
+  curl -sSf -o "$f" "$BASE/$f" || { echo "FAILED: $f (snapshot is now mixed-version)" >&2; break; }
+done
+sha256sum libavcodec/vp9dsp_template.c \
+          libavcodec/aarch64/vp9itxfm_neon.S \
+          libavcodec/aarch64/neon.S \
+          libavutil/aarch64/asm.S \
+          COPYING.LGPLv2.1
+# only if no FAILED line was printed: update this PROVENANCE.md with the new tag, commit hash, and hashes
+```
+
+After re-vendoring, re-run the bit-exact gate (M1) and throughput
+baseline (M3) — both can shift across FFmpeg versions even when
+the VP9 spec doesn't change (e.g., NEON micro-optimizations).
diff --git a/external/ffmpeg-snapshot/config.h b/external/ffmpeg-snapshot/config.h
new file mode 100644
index 0000000..a5586a6
--- /dev/null
+++ b/external/ffmpeg-snapshot/config.h
@@ -0,0 +1,27 @@
+/*
+ * Minimal config.h shim for assembling the vendored FFmpeg .S files
+ * outside the FFmpeg build tree.
+ *
+ * The vendored .S files (vp9itxfm_neon.S, neon.S, asm.S) reference
+ * 7 configure-time symbols plus the EXTERN_ASM symbol prefix, all
+ * defined below. Values target aarch64 Linux with modern binutils
+ * (≥2.41), matching Debian Trixie on hertz (the project's dev host).
+ *
+ * See ../../docs/phase2.md §5 for the source-copy rationale and
+ * PROVENANCE.md for the upstream pin (FFmpeg n7.1.3).
+ */
+#pragma once
+
+#define HAVE_AS_FUNC                       1
+#define HAVE_AS_ARCH_DIRECTIVE             1
+#define AS_ARCH_LEVEL                      armv8-a
+#define HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE  1
+#define HAVE_AS_ARCHEXT_I8MM_DIRECTIVE     1
+#define HAVE_SECTION_DATA_REL_RO           1
+#define CONFIG_PIC                         1
+
+/* Symbol prefix for exported labels. On ELF/Linux this is empty
+ * (no leading underscore). FFmpeg's configure script normally
+ * defines this in the generated config.h; we replicate the
+ * Linux-target value here. */
+#define EXTERN_ASM
diff --git a/external/ffmpeg-snapshot/libavcodec/aarch64/neon.S b/external/ffmpeg-snapshot/libavcodec/aarch64/neon.S
new file mode 100644
index 0000000..f6fb13b
--- /dev/null
+++ b/external/ffmpeg-snapshot/libavcodec/aarch64/neon.S
@@ -0,0 +1,173 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+.macro clip min, max, regs:vararg
+.irp x, \regs
+        smax            \x, \x, \min
+.endr
+.irp x, \regs
+        smin            \x, \x, \max
+.endr
+.endm
+
+.macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+        trn1            \r8\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \r9\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
+        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
+        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
+        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
+        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b
+
+        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
+        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
+        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
+        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
+        trn1            \r5\().4h,  \r9\().4h,  \r3\().4h
+        trn2            \r9\().4h,  \r9\().4h,  \r3\().4h
+        trn1            \r3\().4h,  \r8\().4h,  \r1\().4h
+        trn2            \r8\().4h,  \r8\().4h,  \r1\().4h
+
+        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
+        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s
+
+        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
+        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s
+
+        trn2            \r6\().2s,  \r8\().2s,  \r2\().2s
+        trn1            \r2\().2s,  \r8\().2s,  \r2\().2s
+
+        trn1            \r3\().2s,  \r9\().2s,  \r7\().2s
+        trn2            \r7\().2s,  \r9\().2s,  \r7\().2s
+.endm
+
+.macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+        trn1            \t0\().16b, \r0\().16b, \r1\().16b
+        trn2            \t1\().16b, \r0\().16b, \r1\().16b
+        trn1            \r1\().16b, \r2\().16b, \r3\().16b
+        trn2            \r3\().16b, \r2\().16b, \r3\().16b
+        trn1            \r0\().16b, \r4\().16b, \r5\().16b
+        trn2            \r5\().16b, \r4\().16b, \r5\().16b
+        trn1            \r2\().16b, \r6\().16b, \r7\().16b
+        trn2            \r7\().16b, \r6\().16b, \r7\().16b
+
+        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
+        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
+        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
+        trn1            \r5\().8h,  \t1\().8h,  \r3\().8h
+        trn2            \t1\().8h,  \t1\().8h,  \r3\().8h
+        trn1            \r3\().8h,  \t0\().8h,  \r1\().8h
+        trn2            \t0\().8h,  \t0\().8h,  \r1\().8h
+
+        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
+        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
+
+        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
+        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
+
+        trn2            \r6\().4s,  \t0\().4s,  \r2\().4s
+        trn1            \r2\().4s,  \t0\().4s,  \r2\().4s
+
+        trn1            \r3\().4s,  \t1\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \t1\().4s,  \r7\().4s
+.endm
+
+.macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().16b, \r0\().16b,  \r1\().16b
+        trn2            \t5\().16b, \r0\().16b,  \r1\().16b
+        trn1            \t6\().16b, \r2\().16b,  \r3\().16b
+        trn2            \t7\().16b, \r2\().16b,  \r3\().16b
+
+        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
+        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
+        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
+        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
+.endm
+
+.macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \t5\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \t6\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \t7\().8b,  \r2\().8b,  \r3\().8b
+
+        trn1            \r0\().4h,  \t4\().4h,  \t6\().4h
+        trn2            \r2\().4h,  \t4\().4h,  \t6\().4h
+        trn1            \r1\().4h,  \t5\().4h,  \t7\().4h
+        trn2            \r3\().4h,  \t5\().4h,  \t7\().4h
+.endm
+
+.macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
+        trn1            \r4\().4h,  \r0\().4h,  \r1\().4h
+        trn2            \r5\().4h,  \r0\().4h,  \r1\().4h
+        trn1            \r6\().4h,  \r2\().4h,  \r3\().4h
+        trn2            \r7\().4h,  \r2\().4h,  \r3\().4h
+
+        trn1            \r0\().2s,  \r4\().2s,  \r6\().2s
+        trn2            \r2\().2s,  \r4\().2s,  \r6\().2s
+        trn1            \r1\().2s,  \r5\().2s,  \r7\().2s
+        trn2            \r3\().2s,  \r5\().2s,  \r7\().2s
+.endm
+
+.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
+
+        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
+        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
+        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
+        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
+.endm
+
+.macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+        trn1            \r8\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \r9\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
+        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
+        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
+        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h
+
+        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
+        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
+        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
+        trn1            \r5\().4s,  \r9\().4s,  \r3\().4s
+        trn2            \r9\().4s,  \r9\().4s,  \r3\().4s
+        trn1            \r3\().4s,  \r8\().4s,  \r1\().4s
+        trn2            \r8\().4s,  \r8\().4s,  \r1\().4s
+
+        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
+        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
+
+        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
+        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
+
+        trn2            \r6\().2d,  \r8\().2d,  \r2\().2d
+        trn1            \r2\().2d,  \r8\().2d,  \r2\().2d
+
+        trn1            \r3\().2d,  \r9\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \r9\().2d,  \r7\().2d
+
+.endm
diff --git a/external/ffmpeg-snapshot/libavcodec/aarch64/vp9itxfm_neon.S b/external/ffmpeg-snapshot/libavcodec/aarch64/vp9itxfm_neon.S
new file mode 100644
index 0000000..a27f7b8
--- /dev/null
+++ b/external/ffmpeg-snapshot/libavcodec/aarch64/vp9itxfm_neon.S
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+        .short  11585, 0, 6270, 15137
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
+// in/out are .8h registers; this can do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+        neg             \tmp4\().4h, v0.4h
+.endif
+        add             \tmp1\().8h, \in1\().8h,  \in2\().8h
+        sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
+.if \neg > 0
+        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
+        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
+.else
+        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
+        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
+.endif
+.ifb \tmp5
+        rshrn           \out1\().4h, \tmp3\().4s, #14
+        rshrn2          \out1\().8h, \tmp4\().4s, #14
+        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
+        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
+        rshrn           \out2\().4h, \tmp3\().4s, #14
+        rshrn2          \out2\().8h, \tmp4\().4s, #14
+.else
+        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
+        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
+        rshrn           \out1\().4h, \tmp3\().4s, #14
+        rshrn2          \out1\().8h, \tmp4\().4s, #14
+        rshrn           \out2\().4h, \tmp5\().4s, #14
+        rshrn2          \out2\().8h, \tmp6\().4s, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
+        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
+        rshrn           \out1\().4h,  \tmp1\().4s, #14
+        rshrn2          \out1\().8h,  \tmp2\().4s, #14
+        rshrn           \out2\().4h,  \tmp1\().4s, #14
+        rshrn2          \out2\().8h,  \tmp2\().4s, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .4s registers, in are 2 x .8h registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+        smull           \out1\().4s, \in1\().4h, \coef1
+        smull2          \out2\().4s, \in1\().8h, \coef1
+        smull           \out3\().4s, \in1\().4h, \coef2
+        smull2          \out4\().4s, \in1\().8h, \coef2
+        smlsl           \out1\().4s, \in2\().4h, \coef2
+        smlsl2          \out2\().4s, \in2\().8h, \coef2
+        smlal           \out3\().4s, \in2\().4h, \coef1
+        smlal2          \out4\().4s, \in2\().8h, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .8h registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+        neg             \tmp3\().4s, \tmp3\().4s
+        neg             \tmp4\().4s, \tmp4\().4s
+.endif
+        rshrn           \inout1\().4h, \tmp1\().4s,  #14
+        rshrn2          \inout1\().8h, \tmp2\().4s,  #14
+        rshrn           \inout2\().4h, \tmp3\().4s,  #14
+        rshrn2          \inout2\().8h, \tmp4\().4s,  #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout1\().4h, \coef1
+        smull2          \tmp2\().4s, \inout1\().8h, \coef1
+        smull           \tmp3\().4s, \inout1\().4h, \coef2
+        smull2          \tmp4\().4s, \inout1\().8h, \coef2
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout2\().4h, \coef2
+        smull2          \tmp2\().4s, \inout2\().8h, \coef2
+        smull           \tmp3\().4s, \inout2\().4h, \coef1
+        smull2          \tmp4\().4s, \inout2\().8h, \coef1
+        neg             \tmp1\().4s, \tmp1\().4s
+        neg             \tmp2\().4s, \tmp2\().4s
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().4s, \in\().4h, \coef
+        smull2          \out2\().4s, \in\().8h, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().4h, \in1\().4s, \shift
+        rshrn2          \out\().8h, \in2\().4s, \shift
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_8h out1, out2, in1, in2
+        add             \out1\().8h, \in1\().8h, \in2\().8h
+        sub             \out2\().8h, \in1\().8h, \in2\().8h
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_8h_r out1, out2, in1, in2
+        sub             \out1\().8h, \in1\().8h, \in2\().8h
+        add             \out2\().8h, \in1\().8h, \in2\().8h
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .8h registers, in are 4 x .4s registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        add             \tmp1\().4s, \in1\().4s, \in3\().4s
+        add             \tmp2\().4s, \in2\().4s, \in4\().4s
+        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
+        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
+        rshrn           \out1\().4h, \tmp1\().4s,  #14
+        rshrn2          \out1\().8h, \tmp2\().4s,  #14
+        rshrn           \out2\().4h, \tmp3\().4s,  #14
+        rshrn2          \out2\().8h, \tmp4\().4s,  #14
+.endm
+
+.macro iwht4 c0, c1, c2, c3
+        add             \c0\().4h, \c0\().4h, \c1\().4h
+        sub             v17.4h,    \c2\().4h, \c3\().4h
+        sub             v16.4h,    \c0\().4h, v17.4h
+        sshr            v16.4h,    v16.4h,    #1
+        sub             \c2\().4h, v16.4h,    \c1\().4h
+        sub             \c1\().4h, v16.4h,    \c3\().4h
+        add             \c3\().4h, v17.4h,    \c2\().4h
+        sub             \c0\().4h, \c0\().4h, \c1\().4h
+.endm
+
+.macro idct4 c0, c1, c2, c3
+        smull           v22.4s,    \c1\().4h, v0.h[3]
+        smull           v20.4s,    \c1\().4h, v0.h[2]
+        add             v16.4h,    \c0\().4h, \c2\().4h
+        sub             v17.4h,    \c0\().4h, \c2\().4h
+        smlal           v22.4s,    \c3\().4h, v0.h[2]
+        smull           v18.4s,    v16.4h,    v0.h[0]
+        smull           v19.4s,    v17.4h,    v0.h[0]
+        smlsl           v20.4s,    \c3\().4h, v0.h[3]
+        rshrn           v22.4h,    v22.4s,    #14
+        rshrn           v18.4h,    v18.4s,    #14
+        rshrn           v19.4h,    v19.4s,    #14
+        rshrn           v20.4h,    v20.4s,    #14
+        add             \c0\().4h, v18.4h,    v22.4h
+        sub             \c3\().4h, v18.4h,    v22.4h
+        add             \c1\().4h, v19.4h,    v20.4h
+        sub             \c2\().4h, v19.4h,    v20.4h
+.endm
+
+.macro iadst4 c0, c1, c2, c3 // In-place 4-point inverse ADST on four .4h vectors; multipliers come from v0.h[4]-v0.h[7]; clobbers v16-v21
+        smull           v16.4s,    \c0\().4h, v0.h[4] // v16 = c0 * v0.h[4]
+        smlal           v16.4s,    \c2\().4h, v0.h[5] //     + c2 * v0.h[5]
+        smlal           v16.4s,    \c3\().4h, v0.h[6] //     + c3 * v0.h[6]
+        smull           v17.4s,    \c0\().4h, v0.h[6] // v17 = c0 * v0.h[6]
+        smlsl           v17.4s,    \c2\().4h, v0.h[4] //     - c2 * v0.h[4]
+        sub             \c0\().4h, \c0\().4h, \c2\().4h // c0 is reused as scratch from here on: c0 - c2 ...
+        smlsl           v17.4s,    \c3\().4h, v0.h[5] //     - c3 * v0.h[5] (completes v17)
+        add             \c0\().4h, \c0\().4h, \c3\().4h // ... + c3
+        smull           v19.4s,    \c1\().4h, v0.h[7] // v19 = c1 * v0.h[7]
+        smull           v18.4s,    \c0\().4h, v0.h[7] // v18 = (c0 - c2 + c3) * v0.h[7]
+        add             v20.4s,    v16.4s,    v19.4s // v20 = v16 + v19
+        add             v21.4s,    v17.4s,    v19.4s // v21 = v17 + v19
+        rshrn           \c0\().4h, v20.4s,    #14 // out0 = v20, rounded, shifted right by 14 and narrowed
+        add             v16.4s,    v16.4s,    v17.4s
+        rshrn           \c1\().4h, v21.4s,    #14 // out1 = v21, same narrowing
+        sub             v16.4s,    v16.4s,    v19.4s // v16 = v16 + v17 - v19
+        rshrn           \c2\().4h, v18.4s,    #14 // out2 = v18, same narrowing
+        rshrn           \c3\().4h, v16.4s,    #14 // out3 = v16, same narrowing
+.endm
+
+// The public functions in this file have got the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+.macro itxfm_func4x4 txfm1, txfm2 // Emits ff_vp9_<txfm1>_<txfm2>_4x4_add_neon; txfm1 is applied first, txfm2 after the transpose
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 // x0 = dst, x1 = stride, x2 = block, w3 = eob
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.4h}, [x4] // idct/idct: four coefficients into v0.h[0..3], the lanes idct4 reads
+.endif
+.ifc \txfm1,iadst
+        movrel          x4,  iadst4_coeffs
+        ld1             {v0.d}[1], [x4] // iadst/iadst: four coefficients into v0.h[4..7], the lanes iadst4 reads
+.endif
+.else
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.8h}, [x4] // mixed transforms: eight coefficients into v0.h[0..7], covering the lanes of both idct4 and iadst4
+.endif
+
+        movi            v31.8h, #0 // zero vector used to clear the coefficient block once it has been read
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.ne            1f
+        // DC-only for idct/idct: with eob == 1 only block[0] is scaled and then broadcast
+        ld1             {v2.h}[0], [x2]
+        smull           v2.4s,  v2.4h, v0.h[0] // first pass: dc * v0.h[0] ...
+        rshrn           v2.4h,  v2.4s, #14 // ... rounded and shifted right by 14
+        smull           v2.4s,  v2.4h, v0.h[0] // second pass: the same scaling again
+        rshrn           v2.4h,  v2.4s, #14
+        st1             {v31.h}[0], [x2] // clear block[0]
+        dup             v4.4h,  v2.h[0] // every output coefficient equals the scaled DC
+        mov             v5.16b, v4.16b
+        mov             v6.16b, v4.16b
+        mov             v7.16b, v4.16b
+        b               2f
+.endif
+
+1:
+        ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2] // load all 16 coefficients (32 bytes) into v4-v7
+        st1             {v31.8h}, [x2], #16 // clear the first 16 bytes of the block
+
+.ifc \txfm1,iwht
+        sshr            v4.4h,  v4.4h,  #2 // iwht only: arithmetic shift of the input right by 2 before the first pass
+        sshr            v5.4h,  v5.4h,  #2
+        sshr            v6.4h,  v6.4h,  #2
+        sshr            v7.4h,  v7.4h,  #2
+.endif
+
+        \txfm1\()4      v4,  v5,  v6,  v7 // first 1-D pass
+
+        st1             {v31.8h}, [x2], #16 // clear the remaining 16 bytes of the block
+        // Transpose 4x4 with 16 bit elements
+        transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
+
+        \txfm2\()4      v4,  v5,  v6,  v7 // second 1-D pass
+2:
+        ld1             {v0.s}[0],   [x0], x1 // coefficients in v0 are no longer needed; v0-v3 now hold four 4-pixel dst rows
+        ld1             {v1.s}[0],   [x0], x1
+.ifnc \txfm1,iwht
+        srshr           v4.4h,  v4.4h,  #4 // final rounding shift right by 4 (not applied for iwht)
+        srshr           v5.4h,  v5.4h,  #4
+        srshr           v6.4h,  v6.4h,  #4
+        srshr           v7.4h,  v7.4h,  #4
+.endif
+        uaddw           v4.8h,  v4.8h,  v0.8b // residual + zero-extended dst pixels; only lanes 0-3 are stored later
+        uaddw           v5.8h,  v5.8h,  v1.8b
+        ld1             {v2.s}[0],   [x0], x1
+        ld1             {v3.s}[0],   [x0], x1
+        sqxtun          v0.8b,  v4.8h // saturate to unsigned 8 bit
+        sqxtun          v1.8b,  v5.8h
+        sub             x0,  x0,  x1, lsl #2 // rewind dst by four rows for the stores
+
+        uaddw           v6.8h,  v6.8h,  v2.8b
+        uaddw           v7.8h,  v7.8h,  v3.8b
+        st1             {v0.s}[0],  [x0], x1 // write back four pixels per row
+        sqxtun          v2.8b,  v6.8h
+        sqxtun          v3.8b,  v7.8h
+
+        st1             {v1.s}[0],  [x0], x1
+        st1             {v2.s}[0],  [x0], x1
+        st1             {v3.s}[0],  [x0], x1
+
+        ret
+endfunc
+.endm
+
+itxfm_func4x4 idct,  idct
+itxfm_func4x4 iadst, idct
+itxfm_func4x4 idct,  iadst
+itxfm_func4x4 iadst, iadst
+itxfm_func4x4 iwht,  iwht
+
+
+.macro idct8 // 8-point inverse DCT: in v16-v23, out v16-v23 = out[0..7]; multipliers from v0; scratch v2-v7 and v24-v31 (butterfly helper macros are defined outside this excerpt)
+        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
+        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
+        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
+        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
+
+        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
+        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
+        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
+        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2
+
+        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
+
+        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
+        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
+        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
+        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
+.endm
+
+.macro iadst8 // 8-point inverse ADST: in v16-v23, out v16-v23 = out[0..7]; multipliers from v1.h[0..7] plus v0.h[2], v0.h[3]; scratch v2-v7 and v24-v31 (butterfly helper macros are defined outside this excerpt)
+        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
+        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
+        dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
+        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a
+
+        dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
+        dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
+        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
+        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7
+
+        butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
+        butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
+        neg             v23.8h,   v23.8h  // v23 = out[7]
+
+        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
+        neg             v19.8h,   v19.8h  // v19 = out[3]
+
+        dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
+        dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a
+
+        dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
+        dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
+        neg             v17.8h,   v17.8h  // v17 = out[1]
+
+        dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
+        neg             v21.8h,   v21.8h  // v21 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2 // Emits ff_vp9_<txfm1>_<txfm2>_8x8_add_neon; txfm1 is applied first, txfm2 after the transpose
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 // x0 = dst, x1 = stride, x2 = block, w3 = eob
+        // The iadst also uses a few coefficients from
+        // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+        movrel          x4,  idct_coeffs
+.else
+        movrel          x4,  iadst8_coeffs
+        ld1             {v1.8h}, [x4], #16 // v1 = eight iadst8 coefficients; x4 advances by 16 bytes
+.endif
+        ld1             {v0.8h}, [x4] // v0 = idct coefficients; NOTE(review): on the non-idct_idct path this reads the 16 bytes after iadst8_coeffs, so it presumably relies on the table layout - confirm at the definition
+
+        movi            v2.8h, #0 // v2-v5 = zero, used below to clear the coefficient block
+        movi            v3.8h, #0
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.ne            1f
+        // DC-only for idct/idct: with eob == 1 only block[0] is scaled and then broadcast
+        ld1             {v2.h}[0],  [x2]
+        smull           v2.4s,  v2.4h, v0.h[0] // first pass: dc * v0.h[0] ...
+        rshrn           v2.4h,  v2.4s, #14 // ... rounded and shifted right by 14
+        smull           v2.4s,  v2.4h, v0.h[0] // second pass: the same scaling again
+        rshrn           v2.4h,  v2.4s, #14
+        st1             {v3.h}[0],  [x2] // clear block[0] (v3 is still zero)
+        dup             v16.8h,  v2.h[0] // every output coefficient equals the scaled DC
+        mov             v17.16b, v16.16b
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v16.16b
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v16.16b
+        mov             v22.16b, v16.16b
+        mov             v23.16b, v16.16b
+        b               2f
+.endif
+1:
+        ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64 // load all 64 coefficients (128 bytes) into v16-v23
+        ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
+        sub             x2,  x2,  #128 // rewind to the start of the block
+        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64 // overwrite the whole block with zeros
+        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
+
+        \txfm1\()8
+
+        // Transpose 8x8 with 16 bit elements
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+        \txfm2\()8
+2:
+        mov             x3,  x0 // x3 = store pointer; x0 keeps running ahead as the load pointer (eob in w3 is no longer needed)
+        // Add into the destination
+        ld1             {v0.8b},  [x0], x1 // coefficients in v0/v1 are dead; v0-v7 now hold eight 8-pixel dst rows
+        srshr           v16.8h, v16.8h, #5 // final rounding shift right by 5
+        ld1             {v1.8b},  [x0], x1
+        srshr           v17.8h, v17.8h, #5
+        ld1             {v2.8b},  [x0], x1
+        srshr           v18.8h, v18.8h, #5
+        uaddw           v16.8h, v16.8h, v0.8b // residual + zero-extended dst pixels
+        ld1             {v3.8b},  [x0], x1
+        srshr           v19.8h, v19.8h, #5
+        uaddw           v17.8h, v17.8h, v1.8b
+        ld1             {v4.8b},  [x0], x1
+        srshr           v20.8h, v20.8h, #5
+        uaddw           v18.8h, v18.8h, v2.8b
+        sqxtun          v0.8b,  v16.8h // saturate to unsigned 8 bit
+        ld1             {v5.8b},  [x0], x1
+        srshr           v21.8h, v21.8h, #5
+        uaddw           v19.8h, v19.8h, v3.8b
+        sqxtun          v1.8b,  v17.8h
+        ld1             {v6.8b},  [x0], x1
+        srshr           v22.8h, v22.8h, #5
+        uaddw           v20.8h, v20.8h, v4.8b
+        sqxtun          v2.8b,  v18.8h
+        ld1             {v7.8b},  [x0], x1
+        srshr           v23.8h, v23.8h, #5
+        uaddw           v21.8h, v21.8h, v5.8b
+        sqxtun          v3.8b,  v19.8h
+
+        st1             {v0.8b},  [x3], x1 // stores begin only after all eight rows have been loaded
+        uaddw           v22.8h, v22.8h, v6.8b
+        st1             {v1.8b},  [x3], x1
+        sqxtun          v4.8b,  v20.8h
+        st1             {v2.8b},  [x3], x1
+        uaddw           v23.8h, v23.8h, v7.8b
+        st1             {v3.8b},  [x3], x1
+        sqxtun          v5.8b,  v21.8h
+        st1             {v4.8b},  [x3], x1
+        sqxtun          v6.8b,  v22.8h
+        st1             {v5.8b},  [x3], x1
+        sqxtun          v7.8b,  v23.8h
+
+        st1             {v6.8b},  [x3], x1
+        st1             {v7.8b},  [x3], x1
+
+        ret
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon // DC-only 16x16 idct + add; branched to from ff_vp9_idct_idct_16x16_add_neon when eob == 1, with x0 = dst, x1 = stride, x2 = block
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4] // only v0.h[0] is used below
+
+        movi            v1.4h,  #0 // zero, used to clear block[0]
+
+        ld1             {v2.h}[0], [x2]
+        smull           v2.4s,  v2.4h,  v0.h[0] // first pass: dc * v0.h[0] ...
+        rshrn           v2.4h,  v2.4s,  #14 // ... rounded and shifted right by 14
+        smull           v2.4s,  v2.4h,  v0.h[0] // second pass: the same scaling again
+        rshrn           v2.4h,  v2.4s,  #14
+        dup             v2.8h,  v2.h[0] // broadcast the scaled DC to all eight lanes
+        st1             {v1.h}[0], [x2] // clear block[0]
+
+        srshr           v2.8h,  v2.8h,  #6 // final rounding shift right by 6
+
+        mov             x3,  x0 // x3 = store pointer, x0 = load pointer
+        mov             x4,  #16 // rows remaining; two rows are handled per iteration
+1:
+        // Loop to add the constant from v2 into all 16x16 outputs
+        subs            x4,  x4,  #2 // sets the flags consumed by b.ne at the bottom of the loop
+        ld1             {v3.16b},  [x0], x1
+        ld1             {v4.16b},  [x0], x1
+        uaddw           v16.8h, v2.8h,  v3.8b // constant + pixels 0-7 of the first row
+        uaddw2          v17.8h, v2.8h,  v3.16b // constant + pixels 8-15 of the first row
+        uaddw           v18.8h, v2.8h,  v4.8b
+        uaddw2          v19.8h, v2.8h,  v4.16b
+        sqxtun          v3.8b,  v16.8h // saturate to unsigned 8 bit
+        sqxtun2         v3.16b, v17.8h
+        sqxtun          v4.8b,  v18.8h
+        sqxtun2         v4.16b, v19.8h
+        st1             {v3.16b},  [x3], x1
+        st1             {v4.16b},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct16_end // Shared tail of idct16, idct16_half and idct16_quarter: final butterfly stages leaving out[0..15] in v16-v31; ends with ret, so it returns from the function that expands it
+        butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
+        butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
+        butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
+        butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
+        butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
+        butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
+        butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
+        butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
+
+        dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
+        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
+
+        butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
+        butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
+        butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
+        butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
+        butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
+        butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
+        butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
+        butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
+        ret
+.endm
+
+function idct16 // 16-point inverse DCT: in v16-v31, out v16-v31 = out[0..15]; expects the multipliers in v0/v1 (the visible callers load them from idct_coeffs); scratch v2-v7
+        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
+        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
+        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
+        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
+        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
+        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
+        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end // final stages; contains the ret that leaves this function
+endfunc
+
+function idct16_half // idct16 variant whose first stage uses the _h/_h1/_h2 butterfly macros (defined outside this excerpt; presumably they exploit v24-v31 being zero - confirm); the visible callers load only v16-v23; out v16-v31
+        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
+        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
+        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
+        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
+        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
+        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
+        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end // final stages; contains the ret that leaves this function
+endfunc
+
+function idct16_quarter // idct16 variant that reads only v16-v19 as input (the visible callers load just those four rows); out v16-v31; multipliers in v0/v1
+        dsmull_h        v24, v25, v19, v1.h[7] // NOTE(review): dsmull_h/drshrn_h are defined outside this excerpt; presumably a widening multiply into a register pair and its rounding narrow - confirm
+        dsmull_h        v4,  v5,  v17, v1.h[0]
+        dsmull_h        v7,  v6,  v18, v0.h[5]
+        dsmull_h        v30, v31, v18, v0.h[4]
+        neg             v24.4s,  v24.4s // negate the v19 * v1.h[7] product held in v24/v25
+        neg             v25.4s,  v25.4s
+        dsmull_h        v29, v28, v17, v1.h[1]
+        dsmull_h        v26, v27, v19, v1.h[6]
+        dsmull_h        v22, v23, v16, v0.h[0]
+        drshrn_h        v24, v24, v25, #14 // narrow every product pair back into a single register with a shift of 14
+        drshrn_h        v16, v4,  v5,  #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v6,  v30, v31, #14
+        drshrn_h        v29, v29, v28, #14
+        drshrn_h        v17, v26, v27, #14
+        drshrn_h        v28, v22, v23, #14 // v28 = narrowed v16 * v0.h[0]
+
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
+        neg             v22.4s,  v22.4s // negate the second result pair of the first dmbutterfly_l before narrowing
+        neg             v23.4s,  v23.4s
+        drshrn_h        v27, v20, v21, #14
+        drshrn_h        v21, v22, v23, #14
+        drshrn_h        v23, v18, v19, #14
+        drshrn_h        v25, v30, v31, #14
+        mov             v4.16b,  v28.16b // v4, v5, v20 and v28 all carry the same value into idct16_end
+        mov             v5.16b,  v28.16b
+        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
+        mov             v20.16b, v28.16b
+        idct16_end // final stages; contains the ret that leaves this function
+endfunc
+
+function iadst16 // 16-point inverse ADST: in v16-v31, out v16-v31 = out[0..15]; x11 and x10 must point at the coefficient tables; clobbers v0-v15, so callers must preserve d8-d15
+        ld1             {v0.8h,v1.8h}, [x11] // 16 multipliers from x11 (the visible caller sets x11 = iadst16_coeffs)
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
+        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4]   // v10,v11 = t9,   v8,v9   = t8
+        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
+        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
+        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6]   // v6,v7   = t11,  v4,v5   = t10
+        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
+        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0]   // v10,v11 = t5,   v8,v9   = t4
+        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
+
+        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
+        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2]   // v6,v7   = t7,   v4,v5   = t6
+        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
+        ld1             {v0.8h}, [x10] // v0 is reloaded from x10 (idct_coeffs in the visible caller) for the remaining stages; v1 is not used again
+        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
+        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]   // v14,v15 = t9,   v12,v13 = t8
+        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
+
+        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4]   // v4,v5   = t12,  v6,v7   = t13
+        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
+        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7]   // v10,v11 = t11,  v8,v9   = t10
+        butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
+        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
+
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]   // v12,v13 = t14,  v14,v15 = t15
+        butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
+        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
+        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
+
+        butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
+        butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3]   // v10,v11 = t13,  v8,v9   = t12
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]   // v12,v13 = t14,  v14,v15 = t15
+
+        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
+        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
+        neg             v29.8h, v29.8h                   // v29 = out[13]
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3]   // v10,v11 = t5a,  v8,v9   = t4a
+        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2]   // v12,v13 = t6a,  v14,v15 = t7a
+
+        butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
+        butterfly_8h    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
+
+        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
+        neg             v19.8h, v19.8h                   // v19 = out[3]
+        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
+
+        butterfly_8h    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
+        butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
+
+        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
+        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
+
+        neg             v31.8h,  v5.8h                    // v31 = out[15]
+        neg             v17.8h,  v3.8h                    // v17 = out[1]
+
+        mov             v16.16b, v2.16b                   // v16 = out[0], parked in v2 until v16 was free
+        mov             v30.16b, v4.16b                   // v30 = out[14], parked in v4 until v30 was free
+        ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below.
+.macro load i, src, inc // load eight halfwords into v<i> from [src], then advance src by inc
+        ld1             {v\i\().8h},  [\src], \inc
+.endm
+.macro store i, dst, inc // store the eight halfwords of v<i> to [dst], then advance dst by inc
+        st1             {v\i\().8h},  [\dst], \inc
+.endm
+.macro movi_v i, size, imm // movi into v<i> with the given arrangement suffix (e.g. .16b) and immediate
+        movi            v\i\()\size,  \imm
+.endm
+.macro load_clear i, src, inc // load eight halfwords into v<i>, then overwrite the same memory with v2 and advance src by inc
+        ld1             {v\i\().8h}, [\src]
+        st1             {v2.8h},  [\src], \inc // callers are expected to have zeroed v2 beforehand (the visible ones do with movi v2.8h, #0)
+.endm
+
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 // Round eight .8h coefficient rows by 6 bits, add them to eight 8-pixel dst rows and store them back; x0 and x3 are two interleaved row pointers stepping by x1; uses v2-v7 plus tmp1/tmp2 (.8b) as pixel registers
+        srshr           \coef0, \coef0, #6 // final rounding shift right by 6
+        ld1             {v2.8b},  [x0], x1 // rows alternate between the x0 and x3 pointers
+        srshr           \coef1, \coef1, #6
+        ld1             {v3.8b},  [x3], x1
+        srshr           \coef2, \coef2, #6
+        ld1             {v4.8b},  [x0], x1
+        srshr           \coef3, \coef3, #6
+        uaddw           \coef0, \coef0, v2.8b // residual + zero-extended dst pixels
+        ld1             {v5.8b},  [x3], x1
+        uaddw           \coef1, \coef1, v3.8b
+        srshr           \coef4, \coef4, #6
+        ld1             {v6.8b},  [x0], x1
+        srshr           \coef5, \coef5, #6
+        ld1             {v7.8b},  [x3], x1
+        sqxtun          v2.8b,  \coef0 // saturate to unsigned 8 bit; coef0 is dead after this
+        srshr           \coef6, \coef6, #6
+        sqxtun          v3.8b,  \coef1 // coef1 is dead after this
+        srshr           \coef7, \coef7, #6
+        uaddw           \coef2, \coef2, v4.8b
+        ld1             {\tmp1},  [x0], x1 // tmp1/tmp2 may alias coef0/coef1 (the visible callers pass v16/v17 for both), which are already consumed
+        uaddw           \coef3, \coef3, v5.8b
+        ld1             {\tmp2},  [x3], x1
+        sqxtun          v4.8b,  \coef2
+        sub             x0,  x0,  x1, lsl #2 // rewind both pointers by the four rows each just loaded
+        sub             x3,  x3,  x1, lsl #2
+        sqxtun          v5.8b,  \coef3
+        uaddw           \coef4, \coef4, v6.8b
+        st1             {v2.8b},  [x0], x1
+        uaddw           \coef5, \coef5, v7.8b
+        st1             {v3.8b},  [x3], x1
+        sqxtun          v6.8b,  \coef4
+        st1             {v4.8b},  [x0], x1
+        sqxtun          v7.8b,  \coef5
+        st1             {v5.8b},  [x3], x1
+        uaddw           \coef6, \coef6, \tmp1
+        st1             {v6.8b},  [x0], x1
+        uaddw           \coef7, \coef7, \tmp2
+        st1             {v7.8b},  [x3], x1
+        sqxtun          \tmp1,  \coef6
+        sqxtun          \tmp2,  \coef7
+        st1             {\tmp1},  [x0], x1 // on exit x0 and x3 have each advanced by four steps of x1
+        st1             {\tmp2},  [x3], x1
+.endm
+
+// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x8 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
+.macro itxfm16_1d_funcs txfm // Emits <txfm>16_1d_8x16_pass1_neon and <txfm>16_1d_8x16_pass2_neon, both built around a call to <txfm>16
+function \txfm\()16_1d_8x16_pass1_neon
+        mov             x14, x30 // keep the return address; the bl below overwrites x30
+
+        movi            v2.8h, #0 // zero source for load_clear
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              \txfm\()16
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        cmp             x1,  #8
+        b.eq            1f
+.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+        store           \i,  x0,  #16
+.endr
+        ret             x14
+1:
+        // Special case: For the last input column (x1 == 8),
+        // which would be stored as the last row in the temp buffer,
+        // don't store the first 8x8 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // last 8x8 block).
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        add             x0,  x0,  #16
+        store           \i,  x0,  #16
+.endr
+        mov             v24.16b, v16.16b // move the first block into v24-v31, the registers pass 2 leaves unloaded for slice 0
+        mov             v25.16b, v17.16b
+        mov             v26.16b, v18.16b
+        mov             v27.16b, v19.16b
+        mov             v28.16b, v20.16b
+        mov             v29.16b, v21.16b
+        mov             v30.16b, v22.16b
+        mov             v31.16b, v23.16b
+        ret             x14
+endfunc
+
+// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 8x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_8x16_pass2_neon
+        mov             x14, x30 // keep the return address; the bl below overwrites x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f // slice 0: v24-v31 were kept in registers by pass 1, so skip loading them
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1 // x3 = second row pointer, one row below x0
+        lsl             x1,  x1,  #1 // both pointers now step two rows at a time, as load_add_store expects
+        bl              \txfm\()16
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        ret             x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+.macro itxfm_func16x16 txfm1, txfm2 // Emits ff_vp9_<txfm1>_<txfm2>_16x16_add_neon: two pass-1 slices into a stack buffer, then two pass-2 slices into dst
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 // x0 = dst, x1 = stride, x2 = block, w3 = eob
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct16x16_dc_add_neon // DC-only shortcut; a plain branch, so that function returns straight to our caller
+.endif
+        mov             x15, x30 // keep the return address across the bl calls below
+        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
+.ifnc \txfm1\()_\txfm2,idct_idct
+        stp             d8,  d9,  [sp, #-0x40]! // reserve 0x40 bytes and save d8-d15
+        stp             d14, d15, [sp, #0x30]
+        stp             d12, d13, [sp, #0x20]
+        stp             d10, d11, [sp, #0x10]
+.endif
+
+        sub             sp,  sp,  #512 // temp buffer for the pass-1 output: 16x16 16-bit values
+
+        mov             x4,  x0 // x4/x5/x6 = dst, stride, block; x0-x2 are reused as arguments for the pass functions
+        mov             x5,  x1
+        mov             x6,  x2
+
+        movrel          x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+        movrel          x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+        ld1             {v0.8h,v1.8h}, [x10] // idct16 expects its multipliers in v0/v1; iadst16 loads its own from x11
+.endif
+        mov             x9,  #32 // byte stride of one 16-coefficient row, for both the input block and the temp buffer
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #10
+        b.le            idct16x16_quarter_add_neon // defined outside this excerpt
+        cmp             w3,  #38
+        b.le            idct16x16_half_add_neon // defined outside this excerpt
+.endif
+
+.irp i, 0, 8
+        add             x0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 8
+        cmp             w3,  #38 // NOTE(review): looks unreachable here, since eob <= 38 already branched to idct16x16_half_add_neon above - confirm
+        b.le            1f
+.endif
+.endif
+        mov             x1,  #\i
+        add             x2,  x6,  #(\i*2)
+        bl              \txfm1\()16_1d_8x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+        ld1             {v0.8h,v1.8h}, [x10] // iadst16 overwrote v0/v1 during pass 1; reload the idct multipliers for pass 2
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        // Set v24-v31 to zero, for the in-register passthrough of
+        // coefficients to pass 2. Since we only do two slices, this can
+        // only ever happen for the second slice. So we only need to store
+        // zeros to the temp buffer for the second half of the buffer.
+        // Move x0 to the second half, and use x9 == 32 as increment.
+        add             x0,  x0,  #16
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        movi_v          \i,  .16b, #0
+        st1             {v24.8h},  [x0], x9 // v24 is zero from the first iteration on, so it is the zero source for every row
+.endr
+3:
+.endif
+
+.irp i, 0, 8
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        mov             x3,  #\i
+        bl              \txfm2\()16_1d_8x16_pass2_neon
+.endr
+
+        add             sp,  sp,  #512 // release the temp buffer
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d10, d11, [sp, #0x10] // restore d8-d15 and release their 0x40 bytes
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d8,  d9,  [sp], #0x40
+.endif
+        ret             x15
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+
+function idct16_1d_8x16_pass1_quarter_neon // Pass 1 for the quarter case: x0 = temp buffer, x2 = src, x9 = row stride; reads and clears only four input rows
+        mov             x14, x30 // keep the return address; the bl below overwrites x30
+        movi            v2.8h, #0 // zero source for load_clear
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_quarter
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        // The first 8x8 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        // Since only a 4x4 part of the input was nonzero, this means that
+        // only 4 rows are nonzero after transposing, and the second pass
+        // only reads the topmost 4 rows. Therefore only store the topmost
+        // 4 rows.
+        add             x0,  x0,  #16 // skip the first eight columns (16 bytes) of each row; that block stays in v16-v23
+.irp i, 24, 25, 26, 27
+        store           \i,  x0,  x9
+.endr
+        ret             x14
+endfunc
+
+function idct16_1d_8x16_pass2_quarter_neon // pass 2 of an 8x16 slice for the 4-row partial input
+        mov             x14, x30 // keep the return address in x14; the bl below overwrites x30
+        cbz             x3,  1f // x3 == 0: skip the reload and use v16-v19 as left in registers
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1 // x3 = x0 + x1, a second pointer one stride further on
+        lsl             x1,  x1,  #1 // double the stride held in x1
+        bl              idct16_quarter
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b // macro defined outside this chunk; presumably adds these rows into the destination
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        ret             x14 // return through the address saved on entry
+endfunc
+
+function idct16_1d_8x16_pass1_half_neon // pass 1 of an 8x16 slice, loading only 8 input rows (v16-v23)
+        mov             x14, x30 // keep the return address in x14; the bl below overwrites x30
+        movi            v2.8h, #0 // zero vector in v2; presumably consumed by load_clear (defined outside this chunk)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_half
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        // The first 8x8 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        add             x0,  x0,  #16 // advance 16 bytes (8 halfwords) to where the second block goes
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        store           \i,  x0,  x9
+.endr
+        ret             x14 // return through the address saved on entry
+endfunc
+
+function idct16_1d_8x16_pass2_half_neon // pass 2 of an 8x16 slice for the 8-row partial input
+        mov             x14, x30 // keep the return address in x14; the bl below overwrites x30
+        cbz             x3,  1f // x3 == 0: skip the reload and use v16-v23 as left in registers
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1 // x3 = x0 + x1, a second pointer one stride further on
+        lsl             x1,  x1,  #1 // double the stride held in x1
+        bl              idct16_half
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b // macro defined outside this chunk; presumably adds these rows into the destination
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        ret             x14 // return through the address saved on entry
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_neon
+        add             x0,  sp,  #(0*32) // x0 = sp, start of the temp buffer; NOTE(review): x4-x6, x9, x15 and the stack area come from the code branching here, not visible in this chunk
+        add             x2,  x6,  #(0*2) // x2 = x6, input pointer for the single pass-1 slice
+        bl              idct16_1d_8x16_pass1_\size\()_neon
+.irp i, 0, 8
+        add             x0,  x4,  #(\i) // x0 = x4 + slice offset
+        mov             x1,  x5 // reload x1 from x5; pass 2 doubles x1 on every call
+        add             x2,  sp,  #(\i*2) // matching column of the temp buffer, 2 bytes per coefficient
+        mov             x3,  #\i // slice index; pass 2 skips its reload when this is 0
+        bl              idct16_1d_8x16_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #512 // drop 512 bytes of stack; presumably the temp buffer reserved by the entry code
+        ret             x15 // return address is taken from x15, not x30
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon // DC-only 32x32 case: add one constant to every pixel of the block at x0
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4] // first four 16-bit coefficients; only v0.h[0] is used below
+
+        movi            v1.4h,  #0 // zero, written back over the coefficient further down
+
+        ld1             {v2.h}[0], [x2] // the single 16-bit coefficient at x2
+        smull           v2.4s,  v2.4h,  v0.h[0] // first scaling by v0.h[0] ...
+        rshrn           v2.4h,  v2.4s,  #14 // ... with a rounding shift right by 14
+        smull           v2.4s,  v2.4h,  v0.h[0] // second scaling by the same coefficient
+        rshrn           v2.4h,  v2.4s,  #14
+        dup             v2.8h,  v2.h[0] // broadcast the result to all 8 lanes
+        st1             {v1.h}[0], [x2] // clear the coefficient in memory
+
+        srshr           v0.8h,  v2.8h,  #6 // final rounding shift right by 6; v0 is the value to add
+
+        mov             x3,  x0 // x0 is the read pointer, x3 the write pointer
+        mov             x4,  #32 // row counter
+1:
+        // Loop to add the constant v0 into all 32x32 outputs
+        subs            x4,  x4,  #2 // two rows per iteration; flags are consumed by b.ne at the bottom
+        ld1             {v1.16b,v2.16b},  [x0], x1 // 32 bytes = one row, then step by x1
+        uaddw           v16.8h, v0.8h,  v1.8b // widen the bytes (unsigned) and add v0
+        uaddw2          v17.8h, v0.8h,  v1.16b
+        ld1             {v3.16b,v4.16b},  [x0], x1 // second row of this iteration
+        uaddw           v18.8h, v0.8h,  v2.8b
+        uaddw2          v19.8h, v0.8h,  v2.16b
+        uaddw           v20.8h, v0.8h,  v3.8b
+        uaddw2          v21.8h, v0.8h,  v3.16b
+        uaddw           v22.8h, v0.8h,  v4.8b
+        uaddw2          v23.8h, v0.8h,  v4.16b
+        sqxtun          v1.8b,  v16.8h // narrow back to bytes with unsigned saturation
+        sqxtun2         v1.16b, v17.8h
+        sqxtun          v2.8b,  v18.8h
+        sqxtun2         v2.16b, v19.8h
+        sqxtun          v3.8b,  v20.8h
+        sqxtun2         v3.16b, v21.8h
+        st1             {v1.16b,v2.16b},  [x3], x1 // write the first row back
+        sqxtun          v4.8b,  v22.8h
+        sqxtun2         v4.16b, v23.8h
+        st1             {v3.16b,v4.16b},  [x3], x1 // write the second row back
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct32_end
+        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
+        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
+        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
+        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
+        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
+        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
+        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
+        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29
+
+        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
+        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
+        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
+        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
+        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
+        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
+        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
+        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
+        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20
+
+        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
+        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
+        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
+        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
+        ret // this macro is the shared tail of idct32_odd, idct32_odd_half and idct32_odd_quarter, so it returns for them
+.endm
+
+function idct32_odd // odd-input half of the 32-point IDCT with all 16 inputs in v16-v31; coefficients come from v8, v9 and v0
+        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end // shared tail macro; it contains the ret for this function
+endfunc
+
+function idct32_odd_half // same stages as idct32_odd, but the first stage uses the dmbutterfly_h1/_h2 variants (defined outside this chunk)
+        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end // shared tail macro; it contains the ret for this function
+endfunc
+
+function idct32_odd_quarter // odd-input half of the 32-point IDCT reading only v16-v19 as inputs
+        dsmull_h        v4,  v5,  v16, v8.h[0] // dsmull_h is defined outside this chunk; presumably a widening multiply of v16 by the scalar into the v4/v5 pair
+        dsmull_h        v28, v29, v19, v8.h[7]
+        dsmull_h        v30, v31, v16, v8.h[1]
+        dsmull_h        v22, v23, v17, v9.h[6]
+        dsmull_h        v7,  v6,  v17, v9.h[7]
+        dsmull_h        v26, v27, v19, v8.h[6]
+        dsmull_h        v20, v21, v18, v9.h[0]
+        dsmull_h        v24, v25, v18, v9.h[1]
+
+        neg             v28.4s, v28.4s // negate the v19 * v8.h[7] product (both halves)
+        neg             v29.4s, v29.4s
+        neg             v7.4s,  v7.4s // negate the v17 * v9.h[7] product (both halves)
+        neg             v6.4s,  v6.4s
+
+        drshrn_h        v4,  v4,  v5,  #14 // drshrn_h is defined outside this chunk; presumably a rounding narrow by 14 of a register pair
+        drshrn_h        v5,  v28, v29, #14
+        drshrn_h        v29, v30, v31, #14
+        drshrn_h        v28, v22, v23, #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v31, v26, v27, #14
+        drshrn_h        v6,  v20, v21, #14
+        drshrn_h        v30, v24, v25, #14
+
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5] // the original inputs in v16-v19 are no longer needed and serve as scratch from here
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
+        drshrn_h        v23, v16, v17, #14
+        drshrn_h        v24, v18, v19, #14
+        neg             v20.4s, v20.4s // second output pair of the butterfly above is negated before narrowing
+        neg             v21.4s, v21.4s
+        drshrn_h        v27, v27, v26, #14
+        drshrn_h        v20, v20, v21, #14
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
+        drshrn_h        v21, v16, v17, #14
+        drshrn_h        v26, v18, v19, #14
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
+        drshrn_h        v25, v16, v17, #14
+        neg             v18.4s, v18.4s // second output pair is negated before narrowing
+        neg             v19.4s, v19.4s
+        drshrn_h        v22, v18, v19, #14
+
+        idct32_end // shared tail macro; it contains the ret for this function
+endfunc
+
+.macro idct32_funcs suffix
+// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs;
+// a normal IDCT16 with every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point IDCT
+// of the odd inputs, added/subtracted onto the outputs of the first idct16.
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
+function idct32_1d_8x32_pass1\suffix\()_neon
+        mov             x14, x30 // keep the return address in x14; the bl calls below overwrite x30
+        movi            v2.8h,  #0 // zero vector in v2; presumably consumed by load_clear (defined outside this chunk)
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct16\suffix
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
+        // two transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the registers a, b horizontally, followed by the
+        // same registers b, a mirrored.
+.macro store_rev a, b
+        // There's no rev128 instruction, but we reverse each 64 bit
+        // half, and then flip them using an ext with 8 bytes offset.
+        rev64           v3.8h, \b
+        st1             {\a},  [x0], #16
+        rev64           v2.8h, \a
+        ext             v3.16b, v3.16b, v3.16b, #8
+        st1             {\b},  [x0], #16
+        ext             v2.16b, v2.16b, v2.16b, #8
+        st1             {v3.8h},  [x0], #16
+        st1             {v2.8h},  [x0], #16
+.endm
+        store_rev       v16.8h, v24.8h
+        store_rev       v17.8h, v25.8h
+        store_rev       v18.8h, v26.8h
+        store_rev       v19.8h, v27.8h
+        store_rev       v20.8h, v28.8h
+        store_rev       v21.8h, v29.8h
+        store_rev       v22.8h, v30.8h
+        store_rev       v23.8h, v31.8h
+        sub             x0,  x0,  #512 // rewind x0: 8 store_rev calls x 4 stores x 16 bytes = 512
+.purgem store_rev
+
+        // Move x2 back to the start of the input, and move
+        // to the first odd row
+.ifb \suffix
+        sub             x2,  x2,  x9, lsl #4 // 16 rows were loaded above
+.endif
+.ifc \suffix,_quarter
+        sub             x2,  x2,  x9, lsl #2 // 4 rows were loaded above
+.endif
+.ifc \suffix,_half
+        sub             x2,  x2,  x9, lsl #3 // 8 rows were loaded above
+.endif
+        add             x2,  x2,  #64 // 64 bytes further on is the first odd row
+
+        movi            v2.8h,  #0 // v2 was used as scratch by the transposes and store_rev; make it zero again
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct32_odd\suffix
+
+        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 // registers listed in descending order here, unlike after the even half
+        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
+
+        // Store the registers a, b horizontally,
+        // adding into the output first, and the mirrored,
+        // subtracted from the output.
+.macro store_rev a, b
+        ld1             {v4.8h},  [x0]
+        rev64           v3.8h, \b
+        add             v4.8h, v4.8h, \a
+        rev64           v2.8h, \a
+        st1             {v4.8h},  [x0], #16
+        ext             v3.16b, v3.16b, v3.16b, #8
+        ld1             {v5.8h},  [x0]
+        ext             v2.16b, v2.16b, v2.16b, #8
+        add             v5.8h, v5.8h, \b
+        st1             {v5.8h},  [x0], #16
+        ld1             {v6.8h},  [x0]
+        sub             v6.8h, v6.8h, v3.8h
+        st1             {v6.8h},  [x0], #16
+        ld1             {v7.8h},  [x0]
+        sub             v7.8h, v7.8h, v2.8h
+        st1             {v7.8h},  [x0], #16
+.endm
+
+        store_rev       v31.8h, v23.8h
+        store_rev       v30.8h, v22.8h
+        store_rev       v29.8h, v21.8h
+        store_rev       v28.8h, v20.8h
+        store_rev       v27.8h, v19.8h
+        store_rev       v26.8h, v18.8h
+        store_rev       v25.8h, v17.8h
+        store_rev       v24.8h, v16.8h
+.purgem store_rev
+        ret             x14 // return through the address saved on entry; x0 is left 512 bytes past its entry value
+endfunc
+
+// This is mostly the same as 8x32_pass1, but without the transpose;
+// it uses the source as temp buffer between the two idct passes, and
+// adds into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_8x32_pass2\suffix\()_neon
+        mov             x14, x30 // keep the return address in x14; the bl calls below overwrite x30
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4 // rewind the 16 rows just loaded
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2 // rewind the 4 rows just loaded
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3 // rewind the 8 rows just loaded
+.endif
+
+        bl              idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        store           \i, x2, x9
+.endr
+
+        sub             x2,  x2,  x9, lsl #4 // rewind the 16 rows of v16-v31 just stored to the temp buffer
+        add             x2,  x2,  #64 // 64 bytes further on is the first odd row
+
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        sub             x2,  x2,  #64 // back to the rows holding the values stored after the first bl
+
+        bl              idct32_odd\suffix
+
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+        ld1             {v4.8h},  [x2], x9 // walk forwards through the temp buffer (x9 is the positive stride)
+        ld1             {v5.8h},  [x2], x9
+        add             v4.8h, v4.8h, \a
+        ld1             {v6.8h},  [x2], x9
+        add             v5.8h, v5.8h, \b
+        ld1             {v7.8h},  [x2], x9
+        add             v6.8h, v6.8h, \c
+        add             v7.8h, v7.8h, \d
+.else
+        ld1             {v4.8h},  [x2], x7 // walk backwards through the temp buffer (x7 is the negative stride)
+        ld1             {v5.8h},  [x2], x7
+        sub             v4.8h, v4.8h, \a
+        ld1             {v6.8h},  [x2], x7
+        sub             v5.8h, v5.8h, \b
+        ld1             {v7.8h},  [x2], x7
+        sub             v6.8h, v6.8h, \c
+        sub             v7.8h, v7.8h, \d
+.endif
+        ld1             {v10.8b}, [x0], x1 // 8 destination bytes per row, four rows
+        ld1             {v11.8b}, [x0], x1
+        srshr           v4.8h, v4.8h, #6 // rounding shift right by 6
+        ld1             {v2.8b}, [x0], x1
+        srshr           v5.8h, v5.8h, #6
+        uaddw           v4.8h, v4.8h, v10.8b // add the widened destination bytes
+        ld1             {v3.8b}, [x0], x1
+        srshr           v6.8h, v6.8h, #6
+        uaddw           v5.8h, v5.8h, v11.8b
+        srshr           v7.8h, v7.8h, #6
+        sub             x0,  x0,  x1, lsl #2 // rewind x0 by the four rows just read, for the stores below
+        uaddw           v6.8h, v6.8h, v2.8b
+        sqxtun          v4.8b, v4.8h // narrow to bytes with unsigned saturation
+        uaddw           v7.8h, v7.8h, v3.8b
+        sqxtun          v5.8b, v5.8h
+        st1             {v4.8b}, [x0], x1
+        sqxtun          v6.8b, v6.8h
+        st1             {v5.8b}, [x0], x1
+        sqxtun          v7.8b, v7.8h
+        st1             {v6.8b}, [x0], x1
+        st1             {v7.8b}, [x0], x1
+.endm
+        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
+        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
+        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
+        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
+        sub             x2,  x2,  x9 // x2 is one step past the last row read; step back before walking backwards
+        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
+        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
+        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
+        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
+.purgem load_acc_store
+        ret             x14 // return through the address saved on entry
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+        .short  0, 34, 135, 336 // 16-bit thresholds that ff_vp9_idct_idct_32x32_add_neon reads with ldrh and compares against w3
+endconst
+
+function ff_vp9_idct_idct_32x32_add_neon, export=1 // 32x32 entry point; w3 selects the dc, quarter, half or full path
+        cmp             w3,  #1
+        b.eq            idct32x32_dc_add_neon // plain branch, x30 untouched: the dc routine returns straight to our caller
+
+        movrel          x10, idct_coeffs
+
+        mov             x15, x30 // keep the return address in x15; the bl calls below overwrite x30
+
+        stp             d8,  d9,  [sp, #-0x20]! // save d8-d11: v8/v9 are loaded below and pass 2 uses v10/v11
+        stp             d10, d11, [sp, #0x10]
+
+        sub             sp,  sp,  #2048 // 2048-byte temp buffer on the stack (32 x 32 16-bit values)
+
+        mov             x4,  x0 // keep copies of x0-x2; they are reused as per-slice arguments below
+        mov             x5,  x1
+        mov             x6,  x2
+
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #128
+        neg             x7,  x9 // x7 = -128, used by pass 2 to walk backwards
+
+        ld1             {v0.8h,v1.8h}, [x10], #32 // first 32 bytes of idct_coeffs
+        ld1             {v8.8h,v9.8h}, [x10] // next 32 bytes, read by the idct32_odd routines
+
+        cmp             w3,  #34
+        b.le            idct32x32_quarter_add_neon // plain branch; that routine frees the stack, restores d8-d11 and returns via x15
+        cmp             w3,  #135
+        b.le            idct32x32_half_add_neon // same hand-off as the quarter case
+
+        movrel          x12, min_eob_idct_idct_32, 2 // presumably table address + 2, i.e. its second entry (movrel is defined outside this chunk)
+
+.irp i, 0, 8, 16, 24
+        add             x0,  sp,  #(\i*64)
+.if \i > 0
+        ldrh            w1,  [x12], #2 // next threshold from the table
+        cmp             w3,  w1
+        mov             x1,  #(32 - \i)/4 // zero-fill loop count for the bail-out; mov leaves the cmp flags intact
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*2)
+        bl              idct32_1d_8x32_pass1_neon
+.endr
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+2:
+        subs            x1,  x1,  #1 // each iteration below writes 4 x 64 = 256 bytes
+.rept 4
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
+.endr
+        b.ne            2b
+3:
+.irp i, 0, 8, 16, 24
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        bl              idct32_1d_8x32_pass2_neon
+.endr
+
+        add             sp,  sp,  #2048 // release the temp buffer
+
+        ldp             d10, d11, [sp, #0x10] // restore d8-d11 saved on entry
+        ldp             d8,  d9,  [sp], #0x20
+
+        ret             x15 // return through the address saved on entry
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_neon
+        add             x0,  sp,  #(0*64) // reached by branch from ff_vp9_idct_idct_32x32_add_neon, which set up sp, x4-x7, x9, x15 and the coefficient registers
+        add             x2,  x6,  #(0*2)
+        bl              idct32_1d_8x32_pass1_\size\()_neon
+.ifc \size,half
+        add             x0,  sp,  #(8*64) // the half variant runs a second pass-1 slice, 8 rows further into the temp buffer
+        add             x2,  x6,  #(8*2)
+        bl              idct32_1d_8x32_pass1_\size\()_neon
+.endif
+.irp i, 0, 8, 16, 24
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        bl              idct32_1d_8x32_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #2048 // release the temp buffer reserved by the entry point
+
+        ldp             d10, d11, [sp, #0x10] // restore d8-d11 saved by the entry point
+        ldp             d8,  d9,  [sp], #0x20
+
+        ret             x15 // return address saved in x15 by the entry point
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
diff --git a/external/ffmpeg-snapshot/libavcodec/vp9dsp_template.c b/external/ffmpeg-snapshot/libavcodec/vp9dsp_template.c
new file mode 100644
index 0000000..9e5b251
--- /dev/null
+++ b/external/ffmpeg-snapshot/libavcodec/vp9dsp_template.c
@@ -0,0 +1,2578 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "bit_depth_template.c"
+#include "vp9dsp.h"
+
+#if BIT_DEPTH != 12
+
+// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
+// back with h264pred.[ch]
+
+static void vert_4x4_c(uint8_t *restrict _dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{ /* Vertical prediction, 4x4: every row of dst gets the same 4 pixels read from top; left is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4 = AV_RN4PA(top); /* top[0..3] read as one pixel4 (AV_RN4PA is defined outside this chunk) */
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    AV_WN4PA(dst + stride * 0, p4);
+    AV_WN4PA(dst + stride * 1, p4);
+    AV_WN4PA(dst + stride * 2, p4);
+    AV_WN4PA(dst + stride * 3, p4);
+}
+
+static void vert_8x8_c(uint8_t *restrict _dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{ /* Vertical prediction, 8x8: every row of dst gets the same 8 pixels read from top; left is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+    uint64_t p8 = AV_RN64A(top); /* one 64-bit read; presumably covers top[0..7] with one-byte pixels */
+#else
+    pixel4 p4a = AV_RN4PA(top + 0); /* otherwise two groups of 4 pixels */
+    pixel4 p4b = AV_RN4PA(top + 4);
+#endif
+    int y;
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 8; y++) {
+#if BIT_DEPTH == 8
+        AV_WN64A(dst, p8);
+#else
+        AV_WN4PA(dst + 0, p4a);
+        AV_WN4PA(dst + 4, p4b);
+#endif
+        dst += stride; /* next row */
+    }
+}
+
+static void vert_16x16_c(uint8_t *restrict _dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{ /* Vertical prediction, 16x16: every row of dst gets the same 16 pixels read from top; left is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+    uint64_t p8a = AV_RN64A(top); /* two 64-bit reads; presumably 8 one-byte pixels each */
+    uint64_t p8b = AV_RN64A(top + 8);
+#else
+    pixel4 p4a = AV_RN4PA(top +  0); /* otherwise four groups of 4 pixels */
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+#endif
+    int y;
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 16; y++) {
+#if BIT_DEPTH == 8
+        AV_WN64A(dst +  0, p8a);
+        AV_WN64A(dst +  8, p8b);
+#else
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+#endif
+        dst += stride; /* next row */
+    }
+}
+
+static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{ /* Vertical prediction, 32x32: every row of dst gets the same 32 pixels read from top; left is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+    uint64_t p8a = AV_RN64A(top); /* four 64-bit reads; presumably 8 one-byte pixels each */
+    uint64_t p8b = AV_RN64A(top + 8);
+    uint64_t p8c = AV_RN64A(top + 16);
+    uint64_t p8d = AV_RN64A(top + 24);
+#else
+    pixel4 p4a = AV_RN4PA(top +  0); /* otherwise eight groups of 4 pixels */
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+    pixel4 p4e = AV_RN4PA(top + 16);
+    pixel4 p4f = AV_RN4PA(top + 20);
+    pixel4 p4g = AV_RN4PA(top + 24);
+    pixel4 p4h = AV_RN4PA(top + 28);
+#endif
+    int y;
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 32; y++) {
+#if BIT_DEPTH == 8
+        AV_WN64A(dst +  0, p8a);
+        AV_WN64A(dst +  8, p8b);
+        AV_WN64A(dst + 16, p8c);
+        AV_WN64A(dst + 24, p8d);
+#else
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+        AV_WN4PA(dst + 16, p4e);
+        AV_WN4PA(dst + 20, p4f);
+        AV_WN4PA(dst + 24, p4g);
+        AV_WN4PA(dst + 28, p4h);
+#endif
+        dst += stride; /* next row */
+    }
+}
+
+static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{ /* Horizontal prediction, 4x4: each row is filled with one pixel from left; top is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3])); /* left is indexed from its end: row 0 uses left[3] */
+    AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
+    AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
+    AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0])); /* ... and the last row uses left[0] */
+}
+
+static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{ /* Horizontal prediction, 8x8: each row is filled with one pixel from left; top is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 8; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]); /* left is indexed from its end: row y uses left[7 - y] */
+
+        AV_WN4PA(dst + 0, p4);
+        AV_WN4PA(dst + 4, p4);
+        dst += stride; /* next row */
+    }
+}
+
+static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{ /* Horizontal prediction, 16x16: each row is filled with one pixel from left; top is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 16; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]); /* left is indexed from its end: row y uses left[15 - y] */
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        dst += stride; /* next row */
+    }
+}
+
+static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{ /* Horizontal prediction, 32x32: each row is filled with one pixel from left; top is not used. */
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 32; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]); /* left is indexed from its end: row y uses left[31 - y] */
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        AV_WN4PA(dst + 16, p4);
+        AV_WN4PA(dst + 20, p4);
+        AV_WN4PA(dst + 24, p4);
+        AV_WN4PA(dst + 28, p4);
+        dst += stride; /* next row */
+    }
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{ /* 4x4 predictor: dst[x] in row y = av_clip_pixel(top[x] + left[3 - y] - top[-1]). */
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1]; /* tl is the pixel just before top[0], so top must be readable at index -1 */
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 4; y++) {
+        int l_m_tl = left[3 - y] - tl; /* per-row offset; left is indexed from its end */
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst += stride; /* next row */
+    }
+}
+
+static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{ /* 8x8 predictor: dst[x] in row y = av_clip_pixel(top[x] + left[7 - y] - top[-1]). */
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1]; /* tl is the pixel just before top[0], so top must be readable at index -1 */
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 8; y++) {
+        int l_m_tl = left[7 - y] - tl; /* per-row offset; left is indexed from its end */
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst[4] = av_clip_pixel(top[4] + l_m_tl);
+        dst[5] = av_clip_pixel(top[5] + l_m_tl);
+        dst[6] = av_clip_pixel(top[6] + l_m_tl);
+        dst[7] = av_clip_pixel(top[7] + l_m_tl);
+        dst += stride; /* next row */
+    }
+}
+
+static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{ /* 16x16 predictor: dst[x] in row y = av_clip_pixel(top[x] + left[15 - y] - top[-1]). */
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1]; /* tl is the pixel just before top[0], so top must be readable at index -1 */
+
+    stride /= sizeof(pixel); /* rescale so stride can index the pixel pointer; presumably passed in bytes */
+    for (y = 0; y < 16; y++) {
+        int l_m_tl = left[15 - y] - tl; /* per-row offset; left is indexed from its end */
+
+        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
+        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
+        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
+        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
+        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
+        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
+        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
+        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
+        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
+        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
+        dst[10] = av_clip_pixel(top[10] + l_m_tl);
+        dst[11] = av_clip_pixel(top[11] + l_m_tl);
+        dst[12] = av_clip_pixel(top[12] + l_m_tl);
+        dst[13] = av_clip_pixel(top[13] + l_m_tl);
+        dst[14] = av_clip_pixel(top[14] + l_m_tl);
+        dst[15] = av_clip_pixel(top[15] + l_m_tl);
+        dst += stride; /* next row */
+    }
+}
+
+/**
+ * TM predictor, 32x32 (installed as TM_VP8_PRED by the init function).
+ * Output row y, column x is av_clip_pixel(top[x] + left[31 - y] - top[-1]).
+ * stride is divided by sizeof(pixel) so rows are stepped in pixels.
+ */
+static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    pixel *row = (pixel *) _dst;
+    const int topleft = top[-1];
+    int r, c;
+
+    stride /= sizeof(pixel);
+    for (r = 0; r < 32; r++, row += stride) {
+        /* per-row offset: left neighbour minus the top-left corner */
+        const int delta = left[31 - r] - topleft;
+
+        for (c = 0; c < 32; c++)
+            row[c] = av_clip_pixel(top[c] + delta);
+    }
+}
+
+#if BIT_DEPTH != 12
+
+/**
+ * DC predictor, 4x4: every pixel becomes
+ * (left[0..3] + top[0..3] + 4) >> 3, written four pixels at a time.
+ */
+static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    pixel *dst = (pixel *) _dst;
+    int acc = 4; /* rounding term for the >> 3 below */
+    int i;
+    pixel4 dc;
+
+    for (i = 0; i < 4; i++)
+        acc += left[i] + top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 3);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(dst + stride * i, dc);
+}
+
+/**
+ * DC predictor, 8x8: every pixel becomes
+ * (left[0..7] + top[0..7] + 8) >> 4, written four pixels at a time.
+ */
+static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    pixel *row = (pixel *) _dst;
+    int acc = 8; /* rounding term for the >> 4 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 8; i++)
+        acc += left[i] + top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 4);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++, row += stride)
+        for (x = 0; x < 8; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * DC predictor, 16x16: every pixel becomes
+ * (left[0..15] + top[0..15] + 16) >> 5, written four pixels at a time.
+ */
+static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    pixel *row = (pixel *) _dst;
+    int acc = 16; /* rounding term for the >> 5 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 16; i++)
+        acc += left[i] + top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 5);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, row += stride)
+        for (x = 0; x < 16; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * DC predictor, 32x32: every pixel becomes
+ * (left[0..31] + top[0..31] + 32) >> 6, written four pixels at a time.
+ */
+static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    pixel *row = (pixel *) _dst;
+    int acc = 32; /* rounding term for the >> 6 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 32; i++)
+        acc += left[i] + top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 6);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++, row += stride)
+        for (x = 0; x < 32; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * Left-only DC predictor, 4x4: every pixel becomes (left[0..3] + 2) >> 2.
+ * The top pointer is not read.
+ */
+static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    const pixel *left = (const pixel *) _left;
+    pixel *dst = (pixel *) _dst;
+    int acc = 2; /* rounding term for the >> 2 below */
+    int i;
+    pixel4 dc;
+
+    for (i = 0; i < 4; i++)
+        acc += left[i];
+    dc = PIXEL_SPLAT_X4(acc >> 2);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(dst + stride * i, dc);
+}
+
+/**
+ * Left-only DC predictor, 8x8: every pixel becomes (left[0..7] + 4) >> 3.
+ * The top pointer is not read.
+ */
+static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    const pixel *left = (const pixel *) _left;
+    pixel *row = (pixel *) _dst;
+    int acc = 4; /* rounding term for the >> 3 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 8; i++)
+        acc += left[i];
+    dc = PIXEL_SPLAT_X4(acc >> 3);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++, row += stride)
+        for (x = 0; x < 8; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * Left-only DC predictor, 16x16: every pixel becomes (left[0..15] + 8) >> 4.
+ * The top pointer is not read.
+ */
+static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    const pixel *left = (const pixel *) _left;
+    pixel *row = (pixel *) _dst;
+    int acc = 8; /* rounding term for the >> 4 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 16; i++)
+        acc += left[i];
+    dc = PIXEL_SPLAT_X4(acc >> 4);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, row += stride)
+        for (x = 0; x < 16; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * Left-only DC predictor, 32x32: every pixel becomes (left[0..31] + 16) >> 5.
+ * The top pointer is not read.
+ */
+static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    const pixel *left = (const pixel *) _left;
+    pixel *row = (pixel *) _dst;
+    int acc = 16; /* rounding term for the >> 5 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 32; i++)
+        acc += left[i];
+    dc = PIXEL_SPLAT_X4(acc >> 5);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++, row += stride)
+        for (x = 0; x < 32; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * Top-only DC predictor, 4x4: every pixel becomes (top[0..3] + 2) >> 2.
+ * The left pointer is not read.
+ */
+static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    pixel *dst = (pixel *) _dst;
+    int acc = 2; /* rounding term for the >> 2 below */
+    int i;
+    pixel4 dc;
+
+    for (i = 0; i < 4; i++)
+        acc += top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 2);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(dst + stride * i, dc);
+}
+
+/**
+ * Top-only DC predictor, 8x8: every pixel becomes (top[0..7] + 4) >> 3.
+ * The left pointer is not read.
+ */
+static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    pixel *row = (pixel *) _dst;
+    int acc = 4; /* rounding term for the >> 3 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 8; i++)
+        acc += top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 3);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++, row += stride)
+        for (x = 0; x < 8; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * Top-only DC predictor, 16x16: every pixel becomes (top[0..15] + 8) >> 4.
+ * The left pointer is not read.
+ */
+static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    pixel *row = (pixel *) _dst;
+    int acc = 8; /* rounding term for the >> 4 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 16; i++)
+        acc += top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 4);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, row += stride)
+        for (x = 0; x < 16; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+/**
+ * Top-only DC predictor, 32x32: every pixel becomes (top[0..31] + 16) >> 5.
+ * The left pointer is not read.
+ */
+static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    const pixel *top = (const pixel *) _top;
+    pixel *row = (pixel *) _dst;
+    int acc = 16; /* rounding term for the >> 5 below */
+    int i, x;
+    pixel4 dc;
+
+    for (i = 0; i < 32; i++)
+        acc += top[i];
+    dc = PIXEL_SPLAT_X4(acc >> 5);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++, row += stride)
+        for (x = 0; x < 32; x += 4)
+            AV_WN4PA(row + x, dc);
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+/**
+ * Constant predictor, 4x4: fills the block with 128 << (BIT_DEPTH - 8).
+ * Neither left nor top is read.
+ */
+static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    pixel *dst = (pixel *) _dst;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++)
+        AV_WN4PA(dst + stride * y, fill);
+}
+
+/**
+ * Constant predictor, 8x8: fills the block with 128 << (BIT_DEPTH - 8).
+ * Neither left nor top is read.
+ */
+static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 8; rows_left > 0; rows_left--) {
+        for (x = 0; x < 8; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 16x16: fills the block with 128 << (BIT_DEPTH - 8).
+ * Neither left nor top is read.
+ */
+static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 16; rows_left > 0; rows_left--) {
+        for (x = 0; x < 16; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 32x32: fills the block with 128 << (BIT_DEPTH - 8).
+ * Neither left nor top is read.
+ */
+static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 32; rows_left > 0; rows_left--) {
+        for (x = 0; x < 32; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 4x4: fills the block with
+ * (128 << (BIT_DEPTH - 8)) - 1. Neither left nor top is read.
+ */
+static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    pixel *dst = (pixel *) _dst;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++)
+        AV_WN4PA(dst + stride * y, fill);
+}
+
+/**
+ * Constant predictor, 8x8: fills the block with
+ * (128 << (BIT_DEPTH - 8)) - 1. Neither left nor top is read.
+ */
+static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 8; rows_left > 0; rows_left--) {
+        for (x = 0; x < 8; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 16x16: fills the block with
+ * (128 << (BIT_DEPTH - 8)) - 1. Neither left nor top is read.
+ */
+static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 16; rows_left > 0; rows_left--) {
+        for (x = 0; x < 16; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 32x32: fills the block with
+ * (128 << (BIT_DEPTH - 8)) - 1. Neither left nor top is read.
+ */
+static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 32; rows_left > 0; rows_left--) {
+        for (x = 0; x < 32; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 4x4: fills the block with
+ * (128 << (BIT_DEPTH - 8)) + 1. Neither left nor top is read.
+ */
+static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    pixel *dst = (pixel *) _dst;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++)
+        AV_WN4PA(dst + stride * y, fill);
+}
+
+/**
+ * Constant predictor, 8x8: fills the block with
+ * (128 << (BIT_DEPTH - 8)) + 1. Neither left nor top is read.
+ */
+static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 8; rows_left > 0; rows_left--) {
+        for (x = 0; x < 8; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 16x16: fills the block with
+ * (128 << (BIT_DEPTH - 8)) + 1. Neither left nor top is read.
+ */
+static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 16; rows_left > 0; rows_left--) {
+        for (x = 0; x < 16; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+/**
+ * Constant predictor, 32x32: fills the block with
+ * (128 << (BIT_DEPTH - 8)) + 1. Neither left nor top is read.
+ */
+static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    pixel *row = (pixel *) _dst;
+    int rows_left, x;
+
+    stride /= sizeof(pixel);
+    for (rows_left = 32; rows_left > 0; rows_left--) {
+        for (x = 0; x < 32; x += 4)
+            AV_WN4PA(row + x, fill);
+        row += stride;
+    }
+}
+
+#if BIT_DEPTH != 12
+
+/*
+ * memset_bpc(dst, val, len): store `val` into `len` consecutive pixels.
+ * With 8-bit pixels this is plain memset(); otherwise a uint16_t fill loop.
+ */
+#if BIT_DEPTH == 8
+#define memset_bpc memset
+#else
+static inline void memset_bpc(uint16_t *dst, int val, int len)
+{
+    /* ascending fill; a non-positive len stores nothing */
+    while (len-- > 0)
+        *dst++ = val;
+}
+#endif
+
+/*
+ * DST(x, y): lvalue for column x, row y of the block being predicted.
+ * Relies on `dst` and `stride` from the enclosing function; the users below
+ * divide stride by sizeof(pixel) before the first DST() access.
+ */
+#define DST(x, y) dst[(x) + (y) * stride]
+
+/**
+ * Diagonal down-left predictor, 4x4. Reads top[0..7]; left is not read.
+ *
+ * Pixel (x, y) takes the value of anti-diagonal x + y: a (1, 2, 1) / 4
+ * rounded filter over top[k], top[k + 1], top[k + 2] for k = x + y < 6.
+ * The bottom-right pixel is top[7] unfiltered, which the original note
+ * flags as different from VP8 and similar codecs.
+ */
+static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel diag[7];
+    int x, y;
+
+    /* all edge reads happen before the first store to dst */
+    for (x = 0; x < 6; x++)
+        diag[x] = (top[x] + top[x + 1] * 2 + top[x + 2] + 2) >> 2;
+    diag[6] = top[7];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++)
+        for (x = 0; x < 4; x++)
+            DST(x, y) = diag[x + y];
+}
+
+/*
+ * Generates diag_downleft_<size>x<size>_c(). The top edge is filtered once
+ * with (1, 2, 1) / 4 into filt[]; the last entry weights top[size - 1] by 3.
+ * Row y copies filt[y ..] into the first size - 1 - y pixels and fills the
+ * remaining y + 1 pixels with top[size - 1]. left is not read.
+ */
+#define def_diag_downleft(size) \
+static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                              const uint8_t *left, const uint8_t *_top) \
+{ \
+    const pixel *top = (const pixel *) _top; \
+    pixel *row = (pixel *) _dst; \
+    pixel filt[size - 1]; \
+    int n, y; \
+\
+    stride /= sizeof(pixel); \
+    for (n = 0; n < size - 2; n++) \
+        filt[n] = (top[n] + top[n + 1] * 2 + top[n + 2] + 2) >> 2; \
+    filt[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+\
+    for (y = 0; y < size; y++, row += stride) { \
+        const int kept = size - 1 - y; \
+\
+        memcpy(row, filt + y, kept * sizeof(pixel)); \
+        memset_bpc(row + kept, top[size - 1], y + 1); \
+    } \
+}
+
+def_diag_downleft(8)
+def_diag_downleft(16)
+def_diag_downleft(32)
+
+/**
+ * Diagonal down-right predictor, 4x4. Reads left[0..3], top[-1] and
+ * top[0..3].
+ *
+ * The nine edge samples are laid out as left[0..3], top[-1], top[0..3] and
+ * filtered with a rounded (1, 2, 1) / 4 kernel into seven values; pixel
+ * (x, y) takes value number 3 + x - y.
+ */
+static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                 const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    /* edge samples in one run; all reads precede the first store to dst */
+    const int edge[9] = { left[0], left[1], left[2], left[3], top[-1],
+                          top[0],  top[1],  top[2],  top[3] };
+    pixel diag[7];
+    int x, y;
+
+    for (x = 0; x < 7; x++)
+        diag[x] = (edge[x] + edge[x + 1] * 2 + edge[x + 2] + 2) >> 2;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++)
+        for (x = 0; x < 4; x++)
+            DST(x, y) = diag[3 + x - y];
+}
+
+/*
+ * Generates diag_downright_<size>x<size>_c(). A (1, 2, 1) / 4 filter over
+ * the left edge, top[-1] and the top edge is stored in edge[]
+ * (2 * size - 1 entries); row y is the size-pixel window starting at
+ * edge[size - 1 - y].
+ */
+#define def_diag_downright(size) \
+static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                               const uint8_t *_left, const uint8_t *_top) \
+{ \
+    const pixel *left = (const pixel *) _left; \
+    const pixel *top = (const pixel *) _top; \
+    pixel *row = (pixel *) _dst; \
+    pixel edge[size + size - 1]; \
+    int n, y; \
+\
+    stride /= sizeof(pixel); \
+    /* the three taps that involve top[-1] */ \
+    edge[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
+    edge[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \
+    edge[size    ] = (top[-1] + top[0]  * 2 + top[ 1] + 2) >> 2; \
+    /* taps that lie entirely on the left or entirely on the top edge */ \
+    for (n = 0; n < size - 2; n++) { \
+        edge[n           ] = (left[n] + left[n + 1] * 2 + left[n + 2] + 2) >> 2; \
+        edge[size + 1 + n] = (top[n]  + top[n + 1]  * 2 + top[n + 2]  + 2) >> 2; \
+    } \
+\
+    for (y = 0; y < size; y++, row += stride) \
+        memcpy(row, edge + size - 1 - y, size * sizeof(pixel)); \
+}
+
+def_diag_downright(8)
+def_diag_downright(16)
+def_diag_downright(32)
+
+/**
+ * Vertical-right predictor, 4x4. Reads top[-1], top[0..3] and left[1..3].
+ *
+ * Row 0 holds rounded two-tap averages along the top edge and row 1 holds
+ * rounded (1, 2, 1) / 4 values. Rows 2 and 3 repeat rows 0 and 1 shifted
+ * right by one pixel, with a new left-edge value in column 0.
+ */
+static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                             const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    const int tl = top[-1];
+    const int t0 = top[0], t1 = top[1], t2 = top[2], t3 = top[3];
+    const int lf3 = left[3], lf2 = left[2], lf1 = left[1];
+    /* two-tap averages (row 0) */
+    const int avg0 = (tl + t0 + 1) >> 1;
+    const int avg1 = (t0 + t1 + 1) >> 1;
+    const int avg2 = (t1 + t2 + 1) >> 1;
+    const int avg3 = (t2 + t3 + 1) >> 1;
+    /* three-tap values (row 1) */
+    const int flt0 = (lf3 + tl * 2 + t0 + 2) >> 2;
+    const int flt1 = (tl + t0 * 2 + t1 + 2) >> 2;
+    const int flt2 = (t0 + t1 * 2 + t2 + 2) >> 2;
+    const int flt3 = (t1 + t2 * 2 + t3 + 2) >> 2;
+
+    stride /= sizeof(pixel);
+
+    DST(0,0) = avg0;
+    DST(1,0) = avg1;
+    DST(2,0) = avg2;
+    DST(3,0) = avg3;
+
+    DST(0,1) = flt0;
+    DST(1,1) = flt1;
+    DST(2,1) = flt2;
+    DST(3,1) = flt3;
+
+    DST(0,2) = (tl + lf3 * 2 + lf2 + 2) >> 2;
+    DST(1,2) = avg0;
+    DST(2,2) = avg1;
+    DST(3,2) = avg2;
+
+    DST(0,3) = (lf3 + lf2 * 2 + lf1 + 2) >> 2;
+    DST(1,3) = flt0;
+    DST(2,3) = flt1;
+    DST(3,3) = flt2;
+}
+
+/*
+ * Generates vert_right_<size>x<size>_c().
+ *
+ * Two tables of size + size/2 - 1 entries are built: ve[] feeds the even
+ * output rows and vo[] the odd ones.
+ *   - entries below size/2 - 1 are (1, 2, 1) / 4 filtered left-edge samples
+ *     (the entry at size/2 - 2 of ve[] also uses top[-1]);
+ *   - entry size/2 - 1 mixes top[-1] with top[0] (ve: two-tap average) or
+ *     with left[size - 1] and top[0] (vo: three-tap);
+ *   - entries from size/2 on come from the top edge: ve[] is a two-tap
+ *     average of top[i], top[i + 1]; vo[] a three-tap over top[i - 1 .. i + 1].
+ * Row pair j (rows 2j and 2j + 1) copies `size` pixels starting at index
+ * size/2 - 1 - j, so each pair starts one table entry earlier than the last.
+ */
+#define def_vert_right(size) \
+static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                           const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size/2 - 2; i++) { \
+        vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \
+        ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \
+    } \
+    vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
+    ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+\
+    ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \
+    vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
+    for (i = 0; i < size - 1; i++) { \
+        ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+\
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2     *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \
+        memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \
+    } \
+}
+
+def_vert_right(8)
+def_vert_right(16)
+def_vert_right(32)
+
+/**
+ * Horizontal-down predictor, 4x4. Reads left[0..3], top[-1] and top[0..2].
+ *
+ * Columns 0 and 1 of each row hold a two-tap average and a (1, 2, 1) / 4
+ * value taken along the left edge. Columns 2 and 3 repeat columns 0 and 1
+ * of the row above; in row 0 they are filtered from the top edge instead.
+ */
+static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    const int tl = top[-1], t0 = top[0], t1 = top[1], t2 = top[2];
+    const int lf3 = left[3], lf2 = left[2], lf1 = left[1], lf0 = left[0];
+    /* two-tap averages for column 0 of rows 0..2 (also reused in column 2) */
+    const int avg0 = (tl + lf3 + 1) >> 1;
+    const int avg1 = (lf3 + lf2 + 1) >> 1;
+    const int avg2 = (lf2 + lf1 + 1) >> 1;
+    /* three-tap values for column 1 of rows 0..2 (also reused in column 3) */
+    const int flt0 = (t0 + tl * 2 + lf3 + 2) >> 2;
+    const int flt1 = (tl + lf3 * 2 + lf2 + 2) >> 2;
+    const int flt2 = (lf3 + lf2 * 2 + lf1 + 2) >> 2;
+
+    stride /= sizeof(pixel);
+
+    DST(0,0) = avg0;
+    DST(1,0) = flt0;
+    DST(2,0) = (tl + t0 * 2 + t1 + 2) >> 2;
+    DST(3,0) = (t0 + t1 * 2 + t2 + 2) >> 2;
+
+    DST(0,1) = avg1;
+    DST(1,1) = flt1;
+    DST(2,1) = avg0;
+    DST(3,1) = flt0;
+
+    DST(0,2) = avg2;
+    DST(1,2) = flt2;
+    DST(2,2) = avg1;
+    DST(3,2) = flt1;
+
+    DST(0,3) = (lf1 + lf0 + 1) >> 1;
+    DST(1,3) = (lf2 + lf1 * 2 + lf0 + 2) >> 2;
+    DST(2,3) = avg2;
+    DST(3,3) = flt2;
+}
+
+/*
+ * Generates hor_down_<size>x<size>_c().
+ *
+ * One table v[] of size * 3 - 2 entries is built:
+ *   - v[0 .. size*2 - 1] interleaves, along the left edge (and top[-1] /
+ *     top[0] for the last pairs), a two-tap average at even indices and a
+ *     (1, 2, 1) / 4 value at odd indices;
+ *   - v[size*2 ..] holds (1, 2, 1) / 4 values over top[i - 1 .. i + 1].
+ * Row j copies `size` pixels starting at v[size*2 - 2 - j*2], i.e. each row
+ * starts two table entries earlier than the previous one.
+ */
+#define def_hor_down(size) \
+static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                         const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size * 3 - 2]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2       ] = (left[i + 1] + left[i + 0] + 1) >> 1; \
+        v[i*2    + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \
+        v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+    v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
+    v[size*2 - 1] = (top[0]  + top[-1] * 2 + left[size - 1] + 2) >> 2; \
+    v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+\
+    for (j = 0; j < size; j++) \
+        memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \
+}
+
+def_hor_down(8)
+def_hor_down(16)
+def_hor_down(32)
+
+/**
+ * Vertical-left predictor, 4x4. Reads top[0..6]; left is not read.
+ *
+ * even[k] is the rounded two-tap average of top[k], top[k + 1] and odd[k]
+ * the rounded (1, 2, 1) / 4 value over top[k .. k + 2]. Rows 0 and 2 read
+ * even[], rows 1 and 3 read odd[]; the lower row of each kind starts one
+ * entry later.
+ */
+static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel even[5], odd[5];
+    int k, x, pair;
+
+    /* all edge reads happen before the first store to dst */
+    for (k = 0; k < 5; k++) {
+        even[k] = (top[k] + top[k + 1] + 1) >> 1;
+        odd[k]  = (top[k] + top[k + 1] * 2 + top[k + 2] + 2) >> 2;
+    }
+
+    stride /= sizeof(pixel);
+    for (pair = 0; pair < 2; pair++) {
+        for (x = 0; x < 4; x++) {
+            DST(x, pair * 2)     = even[x + pair];
+            DST(x, pair * 2 + 1) = odd[x + pair];
+        }
+    }
+}
+
+/*
+ * Generates vert_left_<size>x<size>_c(). left is not read.
+ *
+ * ve[i] is the rounded two-tap average of top[i], top[i + 1]; vo[i] the
+ * rounded (1, 2, 1) / 4 value over top[i .. i + 2]. The last vo[] entry
+ * weights top[size - 1] by 3, i.e. it counts that sample for both the centre
+ * and the right tap.
+ * Row pair j (rows 2j from ve[], 2j + 1 from vo[]) copies size - j - 1
+ * entries starting at index j and fills the remaining j + 1 pixels with
+ * top[size - 1].
+ */
+#define def_vert_left(size) \
+static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                          const uint8_t *left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    int i, j; \
+    pixel ve[size - 1], vo[size - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
+    } \
+    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
+    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2      * stride, ve + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst +  j*2      * stride + size - j - 1, top[size - 1], j + 1); \
+        memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \
+    } \
+}
+
+def_vert_left(8)
+def_vert_left(16)
+def_vert_left(32)
+
+/**
+ * Horizontal-up predictor, 4x4. Reads left[0..3]; top is not read.
+ *
+ * A ten-entry table interleaves rounded two-tap averages (even indices) and
+ * rounded three-tap values (odd indices) along the left edge; the last
+ * three-tap entry weights left[3] by 3, and entries 6..9 are left[3] itself.
+ * Pixel (x, y) takes table entry x + 2 * y.
+ */
+static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const int e0 = left[0], e1 = left[1], e2 = left[2], e3 = left[3];
+    /* all edge reads happen before the first store to dst */
+    const pixel tab[10] = {
+        (e0 + e1 + 1) >> 1, (e0 + e1 * 2 + e2 + 2) >> 2,
+        (e1 + e2 + 1) >> 1, (e1 + e2 * 2 + e3 + 2) >> 2,
+        (e2 + e3 + 1) >> 1, (e2 + e3 * 3 + 2) >> 2,
+        e3, e3, e3, e3
+    };
+    int x, y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++)
+        for (x = 0; x < 4; x++)
+            DST(x, y) = tab[x + y * 2];
+}
+
+/*
+ * Generates hor_up_<size>x<size>_c(). top is not read.
+ *
+ * v[] (size*2 - 2 entries) interleaves, along the left edge, a rounded
+ * two-tap average at even indices and a rounded (1, 2, 1) / 4 value at odd
+ * indices; the final odd entry weights left[size - 1] by 3.
+ * Rows j < size/2 copy `size` pixels starting at v[j*2]. Rows j >= size/2
+ * copy the size*2 - 2 - j*2 entries that remain from v[j*2] and fill the
+ * other 2 + j*2 - size pixels with left[size - 1].
+ */
+#define def_hor_up(size) \
+static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                       const uint8_t *_left, const uint8_t *top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size*2 - 2]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2    ] = (left[i] + left[i + 1] + 1) >> 1; \
+        v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
+    } \
+    v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size / 2; j++) \
+        memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \
+    for (j = size / 2; j < size; j++) { \
+        memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \
+        memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \
+                   2 + j*2 - size); \
+    } \
+}
+
+def_hor_up(8)
+def_hor_up(16)
+def_hor_up(32)
+
+#undef DST
+
+#endif /* BIT_DEPTH != 12 */
+
+/*
+ * Intra-prediction table setup: fills dsp->intra_pred[tx][mode] for
+ * TX_4X4, TX_8X8, TX_16X16 and TX_32X32 with the C predictors above.
+ *
+ * Bit-depth handling, as expressed by the preprocessor logic below:
+ *   - BIT_DEPTH != 8 : ff_vp9dsp_intrapred_init_10() is declared here so the
+ *     BIT_DEPTH == 12 body can call it.
+ *   - BIT_DEPTH != 10: the function is given internal linkage (static).
+ *   - BIT_DEPTH == 12: ff_vp9dsp_intrapred_init_10(dsp) runs first, then only
+ *     the TM and DC_128/127/129 entries are overwritten. The other
+ *     predictors are compiled out of this file for 12-bit (see the
+ *     "#if BIT_DEPTH != 12" guards), so their table entries are whatever
+ *     the 10-bit init installed.
+ */
+#if BIT_DEPTH != 8
+void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
+{
+/* Entries whose implementation depends on BIT_DEPTH; set for every depth. */
+#define init_intra_pred_bd_aware(tx, sz) \
+    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_##sz##_c; \
+    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_##sz##_c; \
+    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_##sz##_c; \
+    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_##sz##_c
+
+#if BIT_DEPTH == 12
+    ff_vp9dsp_intrapred_init_10(dsp);
+#define init_intra_pred(tx, sz) \
+    init_intra_pred_bd_aware(tx, sz)
+#else
+    /* Full table: all directional and DC modes plus the bd-aware entries. */
+    #define init_intra_pred(tx, sz) \
+    dsp->intra_pred[tx][VERT_PRED]            = vert_##sz##_c; \
+    dsp->intra_pred[tx][HOR_PRED]             = hor_##sz##_c; \
+    dsp->intra_pred[tx][DC_PRED]              = dc_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
+    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_##sz##_c; \
+    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_##sz##_c; \
+    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_##sz##_c; \
+    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_##sz##_c; \
+    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_##sz##_c; \
+    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_##sz##_c; \
+    init_intra_pred_bd_aware(tx, sz)
+#endif
+
+    init_intra_pred(TX_4X4,   4x4);
+    init_intra_pred(TX_8X8,   8x8);
+    init_intra_pred(TX_16X16, 16x16);
+    init_intra_pred(TX_32X32, 32x32);
+
+#undef init_intra_pred
+#undef init_intra_pred_bd_aware
+}
+
+/*
+ * Generates <type_a>_<type_b>_<sz>x<sz>_add_c(): a two-pass inverse
+ * transform whose result is added to the pixels at dst.
+ *
+ *   type_a, type_b  prefixes of the 1-D kernels used in the first and second
+ *                   pass (type_a##sz##_1d, type_b##sz##_1d)
+ *   sz              transform size; tmp[] holds sz * sz intermediates
+ *   bits            final right shift; when non-zero the value is first
+ *                   rounded with 1 << (bits - 1)
+ *   has_dconly      enables the eob == 1 shortcut
+ *
+ * Shortcut (has_dconly && eob == 1): block[0] is scaled twice by
+ * 11585 / 2^14 with rounding, block[0] is cleared, and the same shifted
+ * value is added to all sz * sz pixels.
+ *
+ * General path: pass 1 calls the type_a kernel sz times, each reading from
+ * block + i with a step of sz and writing its outputs at tmp + i * sz; the
+ * whole coefficient block is then zeroed. Pass 2 calls the type_b kernel on
+ * tmp + i with a step of sz and adds the shifted outputs to
+ * dst[i + j * stride] for j = 0 .. sz - 1, clipping with av_clip_pixel().
+ * stride is divided by sizeof(pixel) before use.
+ */
+#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
+static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
+                                                    ptrdiff_t stride, \
+                                                    int16_t *_block, int eob) \
+{ \
+    int i, j; \
+    pixel *dst = (pixel *) _dst; \
+    dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \
+\
+    stride /= sizeof(pixel); \
+    if (has_dconly && eob == 1) { \
+        const int t  = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
+                                            * 11585 + (1 << 13)) >> 14; \
+        block[0] = 0; \
+        for (i = 0; i < sz; i++) { \
+            for (j = 0; j < sz; j++) \
+                dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                                (bits ? \
+                                                 (int)(t + (1U << (bits - 1))) >> bits : \
+                                                 t)); \
+            dst++; \
+        } \
+        return; \
+    } \
+\
+    for (i = 0; i < sz; i++) \
+        type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \
+    memset(block, 0, sz * sz * sizeof(*block)); \
+    for (i = 0; i < sz; i++) { \
+        type_b##sz##_1d(tmp + i, sz, out, 1); \
+        for (j = 0; j < sz; j++) \
+            dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                            (bits ? \
+                                             (int)(out[j] + (1U << (bits - 1))) >> bits : \
+                                             out[j])); \
+        dst++; \
+    } \
+}
+
+#define itxfm_wrap(sz, bits) /* instantiates the four idct/iadst pairings for one block size */ \
+itxfm_wrapper(idct,  idct,  sz, bits, 1) /* only the idct/idct pairing enables the eob == 1 shortcut */ \
+itxfm_wrapper(iadst, idct,  sz, bits, 0) \
+itxfm_wrapper(idct,  iadst, sz, bits, 0) \
+itxfm_wrapper(iadst, iadst, sz, bits, 0)
+
+#define IN(x) ((dctint) in[(x) * stride]) /* strided read of in[], cast to dctint before any arithmetic; expects in and stride in scope */
+
+static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass) /* pass is not used by this function */
+{ /* 4-point 1-D inverse DCT: in[] is read through IN() with the given stride, out[0..3] is written contiguously */
+    dctint t0, t1, t2, t3;
+
+    t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14; /* each line: multiply, add 1 << 13 for rounding, shift right by 14 */
+    t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14; /* t0/t1 use inputs 0 and 2 */
+    t2 = (IN(1) *  6270 - IN(3) * 15137 + (1 << 13)) >> 14; /* t2/t3 use inputs 1 and 3 */
+    t3 = (IN(1) * 15137 + IN(3) *  6270 + (1 << 13)) >> 14;
+
+    out[0] = t0 + t3; /* outputs k and 3 - k are the sum and difference of the same pair */
+    out[1] = t1 + t2;
+    out[2] = t1 - t2;
+    out[3] = t0 - t3;
+}
+
+static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass) /* pass is not used by this function */
+{ /* 4-point 1-D inverse ADST: in[] is read through IN() with the given stride, out[0..3] is written contiguously */
+    dctint t0, t1, t2, t3;
+
+    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3); /* t0..t3 are kept unshifted; rounding happens once, at the outputs */
+    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
+    t2 = 13377 * (IN(0) - IN(2) + IN(3));
+    t3 = 13377 * IN(1); /* input 1 contributes only through t3 */
+
+    out[0] = (t0 + t3      + (1 << 13)) >> 14; /* add 1 << 13 for rounding, then shift right by 14 */
+    out[1] = (t1 + t3      + (1 << 13)) >> 14;
+    out[2] = (t2           + (1 << 13)) >> 14;
+    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(4, 4) /* defines the four 4x4 *_add_c functions; final rounding shift is 4 bits */
+
+static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass) /* pass is not used by this function */
+{ /* 8-point 1-D inverse DCT; NOTE(review): constants here are signed, whereas idct16_1d/idct32_1d use U-suffixed ones */
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+
+    t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14; /* t0a..t3a: inputs 0, 4, 2, 6 (idct4_1d expressions with doubled indices) */
+    t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
+    t2a = (IN(2) *  6270 - IN(6) * 15137 + (1 << 13)) >> 14;
+    t3a = (IN(2) * 15137 + IN(6) *  6270 + (1 << 13)) >> 14;
+    t4a = (IN(1) *  3196 - IN(7) * 16069 + (1 << 13)) >> 14; /* t4a..t7a: inputs 1, 7, 5, 3 */
+    t5a = (IN(5) * 13623 - IN(3) *  9102 + (1 << 13)) >> 14;
+    t6a = (IN(5) *  9102 + IN(3) * 13623 + (1 << 13)) >> 14;
+    t7a = (IN(1) * 16069 + IN(7) *  3196 + (1 << 13)) >> 14;
+
+    t0  = t0a + t3a; /* add/subtract stage, no multiplications */
+    t1  = t1a + t2a;
+    t2  = t1a - t2a;
+    t3  = t0a - t3a;
+    t4  = t4a + t5a;
+    t5a = t4a - t5a; /* t5a and t6a are overwritten in place and feed the rotation below */
+    t7  = t7a + t6a;
+    t6a = t7a - t6a;
+
+    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14; /* difference and sum of t6a/t5a scaled by 11585 / 2^14 */
+    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
+
+    out[0] = t0 + t7; /* outputs k and 7 - k are the sum and difference of the same pair */
+    out[1] = t1 + t6;
+    out[2] = t2 + t5;
+    out[3] = t3 + t4;
+    out[4] = t3 - t4;
+    out[5] = t2 - t5;
+    out[6] = t1 - t6;
+    out[7] = t0 - t7;
+}
+
+static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass) /* pass is not used by this function */
+{ /* 8-point 1-D inverse ADST: in[] is read through IN() with the given stride, out[0..7] is written contiguously */
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+
+    t0a = 16305 * IN(7) +  1606 * IN(0); /* first stage pairs inputs (7,0), (5,2), (3,4), (1,6); products stay unshifted */
+    t1a =  1606 * IN(7) - 16305 * IN(0);
+    t2a = 14449 * IN(5) +  7723 * IN(2);
+    t3a =  7723 * IN(5) - 14449 * IN(2);
+    t4a = 10394 * IN(3) + 12665 * IN(4);
+    t5a = 12665 * IN(3) - 10394 * IN(4);
+    t6a =  4756 * IN(1) + 15679 * IN(6);
+    t7a = 15679 * IN(1) -  4756 * IN(6);
+
+    t0 = (t0a + t4a + (1 << 13)) >> 14; /* sums (t0..t3) and differences (t4..t7) of the stage above, rounded and shifted by 14 */
+    t1 = (t1a + t5a + (1 << 13)) >> 14;
+    t2 = (t2a + t6a + (1 << 13)) >> 14;
+    t3 = (t3a + t7a + (1 << 13)) >> 14;
+    t4 = (t0a - t4a + (1 << 13)) >> 14;
+    t5 = (t1a - t5a + (1 << 13)) >> 14;
+    t6 = (t2a - t6a + (1 << 13)) >> 14;
+    t7 = (t3a - t7a + (1 << 13)) >> 14;
+
+    t4a = 15137U * t4 +  6270U * t5; /* U-suffixed constants: presumably to get wrapping instead of signed overflow - confirm against the dctint typedef */
+    t5a =  6270U * t4 - 15137U * t5;
+    t6a = 15137U * t7 -  6270U * t6;
+    t7a =  6270U * t7 + 15137U * t6;
+
+    out[0] =   t0 + t2; /* out[0] and out[7] need no further multiplication */
+    out[7] = -(t1 + t3);
+    t2     =   t0 - t2; /* t2/t3 are reused as the inputs of out[3]/out[4] below */
+    t3     =   t1 - t3;
+
+    out[1] = -((dctint)((1U << 13) + t4a + t6a) >> 14); /* sum is cast back to dctint before the right shift */
+    out[6] =   (dctint)((1U << 13) + t5a + t7a) >> 14;
+    t6     =   (dctint)((1U << 13) + t4a - t6a) >> 14; /* t6/t7 are reused as the inputs of out[2]/out[5] below */
+    t7     =   (dctint)((1U << 13) + t5a - t7a) >> 14;
+
+    out[3] = -((dctint)((t2 + t3) * 11585U + (1 << 13)) >> 14); /* last stage: sums/differences scaled by 11585 / 2^14 */
+    out[4] =   (dctint)((t2 - t3) * 11585U + (1 << 13)) >> 14;
+    out[2] =   (dctint)((t6 + t7) * 11585U + (1 << 13)) >> 14;
+    out[5] = -((dctint)((t6 - t7) * 11585U + (1 << 13)) >> 14);
+}
+
+itxfm_wrap(8, 5) /* defines the four 8x8 *_add_c functions; final rounding shift is 5 bits */
+
+static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass) /* pass is not used by this function */
+{ /* 16-point 1-D inverse DCT: in[] is read through IN() with the given stride, out[0..15] is written contiguously */
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+
+    t0a  = (dctint)((IN(0) + IN(8)) * 11585U + (1 << 13)) >> 14; /* t0a..t7a: even inputs (idct8_1d expressions with doubled indices, U-suffixed constants) */
+    t1a  = (dctint)((IN(0) - IN(8)) * 11585U + (1 << 13)) >> 14;
+    t2a  = (dctint)(IN(4)  *  6270U - IN(12) * 15137U + (1 << 13)) >> 14;
+    t3a  = (dctint)(IN(4)  * 15137U + IN(12) *  6270U + (1 << 13)) >> 14;
+    t4a  = (dctint)(IN(2)  *  3196U - IN(14) * 16069U + (1 << 13)) >> 14;
+    t7a  = (dctint)(IN(2)  * 16069U + IN(14) *  3196U + (1 << 13)) >> 14;
+    t5a  = (dctint)(IN(10) * 13623U - IN(6)  *  9102U + (1 << 13)) >> 14;
+    t6a  = (dctint)(IN(10) *  9102U + IN(6)  * 13623U + (1 << 13)) >> 14;
+    t8a  = (dctint)(IN(1)  *  1606U - IN(15) * 16305U + (1 << 13)) >> 14; /* t8a..t15a: odd inputs */
+    t15a = (dctint)(IN(1)  * 16305U + IN(15) *  1606U + (1 << 13)) >> 14;
+    t9a  = (dctint)(IN(9)  * 12665U - IN(7)  * 10394U + (1 << 13)) >> 14;
+    t14a = (dctint)(IN(9)  * 10394U + IN(7)  * 12665U + (1 << 13)) >> 14;
+    t10a = (dctint)(IN(5)  *  7723U - IN(11) * 14449U + (1 << 13)) >> 14;
+    t13a = (dctint)(IN(5)  * 14449U + IN(11) *  7723U + (1 << 13)) >> 14;
+    t11a = (dctint)(IN(13) * 15679U - IN(3)  *  4756U + (1 << 13)) >> 14;
+    t12a = (dctint)(IN(13) *  4756U + IN(3)  * 15679U + (1 << 13)) >> 14;
+
+    t0  = t0a  + t3a; /* add/subtract stage, no multiplications */
+    t1  = t1a  + t2a;
+    t2  = t1a  - t2a;
+    t3  = t0a  - t3a;
+    t4  = t4a  + t5a;
+    t5  = t4a  - t5a;
+    t6  = t7a  - t6a;
+    t7  = t7a  + t6a;
+    t8  = t8a  + t9a;
+    t9  = t8a  - t9a;
+    t10 = t11a - t10a;
+    t11 = t11a + t10a;
+    t12 = t12a + t13a;
+    t13 = t12a - t13a;
+    t14 = t15a - t14a;
+    t15 = t15a + t14a;
+
+    t5a  = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14; /* rotation stage on t5/t6, t9/t14 and t10/t13 */
+    t6a  = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14;
+    t9a  = (dctint)(  t14 *  6270U - t9  * 15137U  + (1 << 13)) >> 14;
+    t14a = (dctint)(  t14 * 15137U + t9  *  6270U  + (1 << 13)) >> 14;
+    t10a = (dctint)(-(t13 * 15137U + t10 *  6270U) + (1 << 13)) >> 14; /* negation is applied before the rounding term is added */
+    t13a = (dctint)(  t13 *  6270U - t10 * 15137U  + (1 << 13)) >> 14;
+
+    t0a  = t0   + t7; /* second add/subtract stage */
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4   = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7   = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+
+    t10a = (dctint)((t13  - t10)  * 11585U + (1 << 13)) >> 14; /* last multiplications: t10/t13 and t11a/t12a scaled by 11585 / 2^14 */
+    t13a = (dctint)((t13  + t10)  * 11585U + (1 << 13)) >> 14;
+    t11  = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14;
+    t12  = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14;
+
+    out[ 0] = t0a + t15a; /* outputs k and 15 - k are the sum and difference of the same pair */
+    out[ 1] = t1a + t14;
+    out[ 2] = t2a + t13a;
+    out[ 3] = t3a + t12;
+    out[ 4] = t4  + t11;
+    out[ 5] = t5  + t10a;
+    out[ 6] = t6  + t9;
+    out[ 7] = t7  + t8a;
+    out[ 8] = t7  - t8a;
+    out[ 9] = t6  - t9;
+    out[10] = t5  - t10a;
+    out[11] = t4  - t11;
+    out[12] = t3a - t12;
+    out[13] = t2a - t13a;
+    out[14] = t1a - t14;
+    out[15] = t0a - t15a;
+}
+
+static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
+                                        dctcoef *out, int pass) /* pass is not used by this function */
+{ /* 16-point 1-D inverse ADST: in[] is read through IN() with the given stride, out[0..15] is written contiguously */
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+
+    t0  = IN(15) * 16364U + IN(0)  *   804U; /* first stage pairs input 15 - k with input k (k even); products stay unshifted */
+    t1  = IN(15) *   804U - IN(0)  * 16364U;
+    t2  = IN(13) * 15893U + IN(2)  *  3981U;
+    t3  = IN(13) *  3981U - IN(2)  * 15893U;
+    t4  = IN(11) * 14811U + IN(4)  *  7005U;
+    t5  = IN(11) *  7005U - IN(4)  * 14811U;
+    t6  = IN(9)  * 13160U + IN(6)  *  9760U;
+    t7  = IN(9)  *  9760U - IN(6)  * 13160U;
+    t8  = IN(7)  * 11003U + IN(8)  * 12140U;
+    t9  = IN(7)  * 12140U - IN(8)  * 11003U;
+    t10 = IN(5)  *  8423U + IN(10) * 14053U;
+    t11 = IN(5)  * 14053U - IN(10) *  8423U;
+    t12 = IN(3)  *  5520U + IN(12) * 15426U;
+    t13 = IN(3)  * 15426U - IN(12) *  5520U;
+    t14 = IN(1)  *  2404U + IN(14) * 16207U;
+    t15 = IN(1)  * 16207U - IN(14) *  2404U;
+
+    t0a  = (dctint)((1U << 13) + t0 + t8 ) >> 14; /* sums (t0a..t7a) and differences (t8a..t15a) of tk and t(k+8), rounded and shifted by 14 */
+    t1a  = (dctint)((1U << 13) + t1 + t9 ) >> 14;
+    t2a  = (dctint)((1U << 13) + t2 + t10) >> 14;
+    t3a  = (dctint)((1U << 13) + t3 + t11) >> 14;
+    t4a  = (dctint)((1U << 13) + t4 + t12) >> 14;
+    t5a  = (dctint)((1U << 13) + t5 + t13) >> 14;
+    t6a  = (dctint)((1U << 13) + t6 + t14) >> 14;
+    t7a  = (dctint)((1U << 13) + t7 + t15) >> 14;
+    t8a  = (dctint)((1U << 13) + t0 - t8 ) >> 14;
+    t9a  = (dctint)((1U << 13) + t1 - t9 ) >> 14;
+    t10a = (dctint)((1U << 13) + t2 - t10) >> 14;
+    t11a = (dctint)((1U << 13) + t3 - t11) >> 14;
+    t12a = (dctint)((1U << 13) + t4 - t12) >> 14;
+    t13a = (dctint)((1U << 13) + t5 - t13) >> 14;
+    t14a = (dctint)((1U << 13) + t6 - t14) >> 14;
+    t15a = (dctint)((1U << 13) + t7 - t15) >> 14;
+
+    t8   = t8a  * 16069U + t9a  *  3196U; /* second multiplication stage, applied to t8a..t15a only; products stay unshifted */
+    t9   = t8a  *  3196U - t9a  * 16069U;
+    t10  = t10a *  9102U + t11a * 13623U;
+    t11  = t10a * 13623U - t11a *  9102U;
+    t12  = t13a * 16069U - t12a *  3196U;
+    t13  = t13a *  3196U + t12a * 16069U;
+    t14  = t15a *  9102U - t14a * 13623U;
+    t15  = t15a * 13623U + t14a *  9102U;
+
+    t0   = t0a + t4a; /* t0..t7: plain sums/differences of t0a..t7a */
+    t1   = t1a + t5a;
+    t2   = t2a + t6a;
+    t3   = t3a + t7a;
+    t4   = t0a - t4a;
+    t5   = t1a - t5a;
+    t6   = t2a - t6a;
+    t7   = t3a - t7a;
+    t8a  = (dctint)((1U << 13) + t8  + t12) >> 14; /* t8a..t15a: rounded sums/differences of the products above */
+    t9a  = (dctint)((1U << 13) + t9  + t13) >> 14;
+    t10a = (dctint)((1U << 13) + t10 + t14) >> 14;
+    t11a = (dctint)((1U << 13) + t11 + t15) >> 14;
+    t12a = (dctint)((1U << 13) + t8  - t12) >> 14;
+    t13a = (dctint)((1U << 13) + t9  - t13) >> 14;
+    t14a = (dctint)((1U << 13) + t10 - t14) >> 14;
+    t15a = (dctint)((1U << 13) + t11 - t15) >> 14;
+
+    t4a  = t4 * 15137U + t5 *  6270U; /* third multiplication stage, on t4..t7 and t12a..t15a; products stay unshifted */
+    t5a  = t4 *  6270U - t5 * 15137U;
+    t6a  = t7 * 15137U - t6 *  6270U;
+    t7a  = t7 *  6270U + t6 * 15137U;
+    t12  = t12a * 15137U + t13a *  6270U;
+    t13  = t12a *  6270U - t13a * 15137U;
+    t14  = t15a * 15137U - t14a *  6270U;
+    t15  = t15a *  6270U + t14a * 15137U;
+
+    out[ 0] =   t0 + t2; /* outputs 0, 15, 3, 12, 1, 14, 2, 13 are final here; the differences feed the last stage */
+    out[15] = -(t1 + t3);
+    t2a     =   t0 - t2;
+    t3a     =   t1 - t3;
+    out[ 3] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
+    out[12] =   (dctint)((1U << 13) + t5a + t7a) >> 14;
+    t6      =   (dctint)((1U << 13) + t4a - t6a) >> 14;
+    t7      =   (dctint)((1U << 13) + t5a - t7a) >> 14;
+    out[ 1] = -(t8a + t10a);
+    out[14] =   t9a + t11a;
+    t10     =   t8a - t10a;
+    t11     =   t9a - t11a;
+    out[ 2] =   (dctint)((1U << 13) + t12 + t14) >> 14;
+    out[13] = -((dctint)((1U << 13) + t13 + t15) >> 14);
+    t14a    =   (dctint)((1U << 13) + t12 - t14) >> 14;
+    t15a    =   (dctint)((1U << 13) + t13 - t15) >> 14;
+
+    out[ 7] = (dctint)(-(t2a  + t3a)  * 11585U  + (1 << 13)) >> 14; /* last stage: sums/differences scaled by 11585 / 2^14; here the sign is applied before rounding */
+    out[ 8] = (dctint)( (t2a  - t3a)  * 11585U  + (1 << 13)) >> 14;
+    out[ 4] = (dctint)( (t7   + t6)   * 11585U  + (1 << 13)) >> 14;
+    out[11] = (dctint)( (t7   - t6)   * 11585U  + (1 << 13)) >> 14;
+    out[ 6] = (dctint)( (t11  + t10)  * 11585U  + (1 << 13)) >> 14;
+    out[ 9] = (dctint)( (t11  - t10)  * 11585U  + (1 << 13)) >> 14;
+    out[ 5] = (dctint)(-(t14a + t15a) * 11585U  + (1 << 13)) >> 14;
+    out[10] = (dctint)( (t14a - t15a) * 11585U  + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(16, 6) /* defines the four 16x16 *_add_c functions; final rounding shift is 6 bits */
+
+static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass) /* pass is not used by this function */
+{ /* 32-point 1-D inverse DCT: in[] is read through IN() with the given stride, out[0..31] is written contiguously */
+    dctint t0a  = (dctint)((IN(0) + IN(16)) * 11585U         + (1 << 13)) >> 14; /* t0a..t15a: even inputs (idct16_1d expressions with doubled indices) */
+    dctint t1a  = (dctint)((IN(0) - IN(16)) * 11585U         + (1 << 13)) >> 14;
+    dctint t2a  = (dctint)(IN( 8) *  6270U - IN(24) * 15137U + (1 << 13)) >> 14;
+    dctint t3a  = (dctint)(IN( 8) * 15137U + IN(24) *  6270U + (1 << 13)) >> 14;
+    dctint t4a  = (dctint)(IN( 4) *  3196U - IN(28) * 16069U + (1 << 13)) >> 14;
+    dctint t7a  = (dctint)(IN( 4) * 16069U + IN(28) *  3196U + (1 << 13)) >> 14;
+    dctint t5a  = (dctint)(IN(20) * 13623U - IN(12) *  9102U + (1 << 13)) >> 14;
+    dctint t6a  = (dctint)(IN(20) *  9102U + IN(12) * 13623U + (1 << 13)) >> 14;
+    dctint t8a  = (dctint)(IN( 2) *  1606U - IN(30) * 16305U + (1 << 13)) >> 14;
+    dctint t15a = (dctint)(IN( 2) * 16305U + IN(30) *  1606U + (1 << 13)) >> 14;
+    dctint t9a  = (dctint)(IN(18) * 12665U - IN(14) * 10394U + (1 << 13)) >> 14;
+    dctint t14a = (dctint)(IN(18) * 10394U + IN(14) * 12665U + (1 << 13)) >> 14;
+    dctint t10a = (dctint)(IN(10) *  7723U - IN(22) * 14449U + (1 << 13)) >> 14;
+    dctint t13a = (dctint)(IN(10) * 14449U + IN(22) *  7723U + (1 << 13)) >> 14;
+    dctint t11a = (dctint)(IN(26) * 15679U - IN( 6) *  4756U + (1 << 13)) >> 14;
+    dctint t12a = (dctint)(IN(26) *  4756U + IN( 6) * 15679U + (1 << 13)) >> 14;
+    dctint t16a = (dctint)(IN( 1) *   804U - IN(31) * 16364U + (1 << 13)) >> 14; /* t16a..t31a: odd inputs */
+    dctint t31a = (dctint)(IN( 1) * 16364U + IN(31) *   804U + (1 << 13)) >> 14;
+    dctint t17a = (dctint)(IN(17) * 12140U - IN(15) * 11003U + (1 << 13)) >> 14;
+    dctint t30a = (dctint)(IN(17) * 11003U + IN(15) * 12140U + (1 << 13)) >> 14;
+    dctint t18a = (dctint)(IN( 9) *  7005U - IN(23) * 14811U + (1 << 13)) >> 14;
+    dctint t29a = (dctint)(IN( 9) * 14811U + IN(23) *  7005U + (1 << 13)) >> 14;
+    dctint t19a = (dctint)(IN(25) * 15426U - IN( 7) *  5520U + (1 << 13)) >> 14;
+    dctint t28a = (dctint)(IN(25) *  5520U + IN( 7) * 15426U + (1 << 13)) >> 14;
+    dctint t20a = (dctint)(IN( 5) *  3981U - IN(27) * 15893U + (1 << 13)) >> 14;
+    dctint t27a = (dctint)(IN( 5) * 15893U + IN(27) *  3981U + (1 << 13)) >> 14;
+    dctint t21a = (dctint)(IN(21) * 14053U - IN(11) *  8423U + (1 << 13)) >> 14;
+    dctint t26a = (dctint)(IN(21) *  8423U + IN(11) * 14053U + (1 << 13)) >> 14;
+    dctint t22a = (dctint)(IN(13) *  9760U - IN(19) * 13160U + (1 << 13)) >> 14;
+    dctint t25a = (dctint)(IN(13) * 13160U + IN(19) *  9760U + (1 << 13)) >> 14;
+    dctint t23a = (dctint)(IN(29) * 16207U - IN( 3) *  2404U + (1 << 13)) >> 14;
+    dctint t24a = (dctint)(IN(29) *  2404U + IN( 3) * 16207U + (1 << 13)) >> 14;
+
+    dctint t0  = t0a  + t3a; /* first add/subtract stage, no multiplications */
+    dctint t1  = t1a  + t2a;
+    dctint t2  = t1a  - t2a;
+    dctint t3  = t0a  - t3a;
+    dctint t4  = t4a  + t5a;
+    dctint t5  = t4a  - t5a;
+    dctint t6  = t7a  - t6a;
+    dctint t7  = t7a  + t6a;
+    dctint t8  = t8a  + t9a;
+    dctint t9  = t8a  - t9a;
+    dctint t10 = t11a - t10a;
+    dctint t11 = t11a + t10a;
+    dctint t12 = t12a + t13a;
+    dctint t13 = t12a - t13a;
+    dctint t14 = t15a - t14a;
+    dctint t15 = t15a + t14a;
+    dctint t16 = t16a + t17a;
+    dctint t17 = t16a - t17a;
+    dctint t18 = t19a - t18a;
+    dctint t19 = t19a + t18a;
+    dctint t20 = t20a + t21a;
+    dctint t21 = t20a - t21a;
+    dctint t22 = t23a - t22a;
+    dctint t23 = t23a + t22a;
+    dctint t24 = t24a + t25a;
+    dctint t25 = t24a - t25a;
+    dctint t26 = t27a - t26a;
+    dctint t27 = t27a + t26a;
+    dctint t28 = t28a + t29a;
+    dctint t29 = t28a - t29a;
+    dctint t30 = t31a - t30a;
+    dctint t31 = t31a + t30a;
+
+    t5a  = (dctint)((t6 - t5) * 11585U             + (1 << 13)) >> 14; /* rotation stage; the first six lines match idct16_1d */
+    t6a  = (dctint)((t6 + t5) * 11585U             + (1 << 13)) >> 14;
+    t9a  = (dctint)(  t14 *  6270U - t9  * 15137U  + (1 << 13)) >> 14;
+    t14a = (dctint)(  t14 * 15137U + t9  *  6270U  + (1 << 13)) >> 14;
+    t10a = (dctint)(-(t13 * 15137U + t10 *  6270U) + (1 << 13)) >> 14;
+    t13a = (dctint)(  t13 *  6270U - t10 * 15137U  + (1 << 13)) >> 14;
+    t17a = (dctint)(  t30 *  3196U - t17 * 16069U  + (1 << 13)) >> 14; /* rotations that exist only in the 32-point transform start here */
+    t30a = (dctint)(  t30 * 16069U + t17 *  3196U  + (1 << 13)) >> 14;
+    t18a = (dctint)(-(t29 * 16069U + t18 *  3196U) + (1 << 13)) >> 14;
+    t29a = (dctint)(  t29 *  3196U - t18 * 16069U  + (1 << 13)) >> 14;
+    t21a = (dctint)(  t26 * 13623U - t21 *  9102U  + (1 << 13)) >> 14;
+    t26a = (dctint)(  t26 *  9102U + t21 * 13623U  + (1 << 13)) >> 14;
+    t22a = (dctint)(-(t25 *  9102U + t22 * 13623U) + (1 << 13)) >> 14;
+    t25a = (dctint)(  t25 * 13623U - t22 *  9102U  + (1 << 13)) >> 14;
+
+    t0a  = t0   + t7; /* second add/subtract stage */
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4a  = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7a  = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+    t16a = t16  + t19;
+    t17  = t17a + t18a;
+    t18  = t17a - t18a;
+    t19a = t16  - t19;
+    t20a = t23  - t20;
+    t21  = t22a - t21a;
+    t22  = t22a + t21a;
+    t23a = t23  + t20;
+    t24a = t24  + t27;
+    t25  = t25a + t26a;
+    t26  = t25a - t26a;
+    t27a = t24  - t27;
+    t28a = t31  - t28;
+    t29  = t30a - t29a;
+    t30  = t30a + t29a;
+    t31a = t31  + t28;
+
+    t10a = (dctint)((t13  - t10)  * 11585U           + (1 << 13)) >> 14; /* second rotation stage */
+    t13a = (dctint)((t13  + t10)  * 11585U           + (1 << 13)) >> 14;
+    t11  = (dctint)((t12a - t11a) * 11585U           + (1 << 13)) >> 14;
+    t12  = (dctint)((t12a + t11a) * 11585U           + (1 << 13)) >> 14;
+    t18a = (dctint)(  t29  *  6270U - t18  * 15137U  + (1 << 13)) >> 14;
+    t29a = (dctint)(  t29  * 15137U + t18  *  6270U  + (1 << 13)) >> 14;
+    t19  = (dctint)(  t28a *  6270U - t19a * 15137U  + (1 << 13)) >> 14;
+    t28  = (dctint)(  t28a * 15137U + t19a *  6270U  + (1 << 13)) >> 14;
+    t20  = (dctint)(-(t27a * 15137U + t20a *  6270U) + (1 << 13)) >> 14;
+    t27  = (dctint)(  t27a *  6270U - t20a * 15137U  + (1 << 13)) >> 14;
+    t21a = (dctint)(-(t26  * 15137U + t21  *  6270U) + (1 << 13)) >> 14;
+    t26a = (dctint)(  t26  *  6270U - t21  * 15137U  + (1 << 13)) >> 14;
+
+    t0   = t0a + t15a; /* third add/subtract stage; t0..t15 (with their a-variants) now match the idct16_1d outputs */
+    t1   = t1a + t14;
+    t2   = t2a + t13a;
+    t3   = t3a + t12;
+    t4   = t4a + t11;
+    t5a  = t5  + t10a;
+    t6a  = t6  + t9;
+    t7   = t7a + t8a;
+    t8   = t7a - t8a;
+    t9a  = t6  - t9;
+    t10  = t5  - t10a;
+    t11a = t4a - t11;
+    t12a = t3a - t12;
+    t13  = t2a - t13a;
+    t14a = t1a - t14;
+    t15  = t0a - t15a;
+    t16  = t16a + t23a;
+    t17a = t17  + t22;
+    t18  = t18a + t21a;
+    t19a = t19  + t20;
+    t20a = t19  - t20;
+    t21  = t18a - t21a;
+    t22a = t17  - t22;
+    t23  = t16a - t23a;
+    t24  = t31a - t24a;
+    t25a = t30  - t25;
+    t26  = t29a - t26a;
+    t27a = t28  - t27;
+    t28a = t28  + t27;
+    t29  = t29a + t26a;
+    t30a = t30  + t25;
+    t31  = t31a + t24a;
+
+    t20  = (dctint)((t27a - t20a) * 11585U + (1 << 13)) >> 14; /* last multiplications: four pairs scaled by 11585 / 2^14 */
+    t27  = (dctint)((t27a + t20a) * 11585U + (1 << 13)) >> 14;
+    t21a = (dctint)((t26  - t21 ) * 11585U + (1 << 13)) >> 14;
+    t26a = (dctint)((t26  + t21 ) * 11585U + (1 << 13)) >> 14;
+    t22  = (dctint)((t25a - t22a) * 11585U + (1 << 13)) >> 14;
+    t25  = (dctint)((t25a + t22a) * 11585U + (1 << 13)) >> 14;
+    t23a = (dctint)((t24  - t23 ) * 11585U + (1 << 13)) >> 14;
+    t24a = (dctint)((t24  + t23 ) * 11585U + (1 << 13)) >> 14;
+
+    out[ 0] = t0   + t31; /* outputs k and 31 - k are the sum and difference of the same pair */
+    out[ 1] = t1   + t30a;
+    out[ 2] = t2   + t29;
+    out[ 3] = t3   + t28a;
+    out[ 4] = t4   + t27;
+    out[ 5] = t5a  + t26a;
+    out[ 6] = t6a  + t25;
+    out[ 7] = t7   + t24a;
+    out[ 8] = t8   + t23a;
+    out[ 9] = t9a  + t22;
+    out[10] = t10  + t21a;
+    out[11] = t11a + t20;
+    out[12] = t12a + t19a;
+    out[13] = t13  + t18;
+    out[14] = t14a + t17a;
+    out[15] = t15  + t16;
+    out[16] = t15  - t16;
+    out[17] = t14a - t17a;
+    out[18] = t13  - t18;
+    out[19] = t12a - t19a;
+    out[20] = t11a - t20;
+    out[21] = t10  - t21a;
+    out[22] = t9a  - t22;
+    out[23] = t8   - t23a;
+    out[24] = t7   - t24a;
+    out[25] = t6a  - t25;
+    out[26] = t5a  - t26a;
+    out[27] = t4   - t27;
+    out[28] = t3   - t28a;
+    out[29] = t2   - t29;
+    out[30] = t1   - t30a;
+    out[31] = t0   - t31;
+}
+
+itxfm_wrapper(idct, idct, 32, 6, 1) /* 32x32 gets only the idct/idct pairing (no iadst32_1d is defined above) */
+
+static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass) /* pass selects whether the inputs are pre-shifted */
+{ /* 4-point 1-D transform using only adds, subtracts and shifts; registered below for the "lossless" slot */
+    int t0, t1, t2, t3, t4; /* plain int here, unlike the dctint temporaries of the other transforms */
+
+    if (pass == 0) { /* pass 0: every input is shifted right by 2 first */
+        t0 = IN(0) >> 2;
+        t1 = IN(3) >> 2; /* note the load order: t1 <- input 3, t2 <- input 1, t3 <- input 2 */
+        t2 = IN(1) >> 2;
+        t3 = IN(2) >> 2;
+    } else { /* any other pass value: inputs are used as they are */
+        t0 = IN(0);
+        t1 = IN(3);
+        t2 = IN(1);
+        t3 = IN(2);
+    }
+
+    t0 += t2; /* the following steps update values in place, so their order matters */
+    t3 -= t1;
+    t4 = (t0 - t3) >> 1;
+    t1 = t4 - t1;
+    t2 = t4 - t2;
+    t0 -= t1;
+    t3 += t2;
+
+    out[0] = t0;
+    out[1] = t1;
+    out[2] = t2;
+    out[3] = t3;
+}
+
+itxfm_wrapper(iwht, iwht, 4, 0, 0) /* defines iwht_iwht_4x4_add_c; bits == 0 means no final rounding shift */
+
+#undef IN /* the helper macros are local to the transform section */
+#undef itxfm_wrapper
+#undef itxfm_wrap
+
+static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp) /* fills dsp->itxfm_add[][] with the C functions defined above */
+{
+#define init_itxfm(tx, sz) /* one distinct function per transform-type pairing */ \
+    dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_##sz##_add_c; \
+    dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_##sz##_add_c; \
+    dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_##sz##_add_c; \
+    dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c
+
+#define init_idct(tx, nm) /* the same function nm##_add_c is stored in all four slots */ \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c
+
+    init_itxfm(TX_4X4,   4x4);
+    init_itxfm(TX_8X8,   8x8);
+    init_itxfm(TX_16X16, 16x16);
+    init_idct(TX_32X32,  idct_idct_32x32); /* 32x32 has only the idct/idct implementation */
+    init_idct(4 /* lossless */, iwht_iwht_4x4); /* literal index 4 is used instead of a TX_* name */
+
+#undef init_itxfm
+#undef init_idct
+}
+
+static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
+                                         ptrdiff_t stridea, ptrdiff_t strideb,
+                                         int wd) /* wd picks the widest filter allowed: >= 16, >= 8, otherwise the narrow one */
+{ /* filters 8 positions in place; stridea steps between positions, strideb steps between the taps of one position */
+    int i, F = 1 << (BIT_DEPTH - 8); /* flatness threshold, scaled with bit depth */
+
+    E <<= (BIT_DEPTH - 8); /* the three thresholds are scaled to the pixel bit depth */
+    I <<= (BIT_DEPTH - 8);
+    H <<= (BIT_DEPTH - 8);
+    for (i = 0; i < 8; i++, dst += stridea) {
+        int p7, p6, p5, p4; /* assigned only when wd >= 16 */
+        int p3 = dst[strideb * -4], p2 = dst[strideb * -3]; /* p* are the taps at negative offsets, q* at offsets >= 0 */
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
+        int q4, q5, q6, q7; /* assigned only when wd >= 16 */
+        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I && /* filter mask: neighbour steps within I and the edge step within E */
+                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
+                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
+                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
+        int flat8out, flat8in; /* set only under the wd checks below and read only behind the same checks */
+
+        if (!fm) /* mask not satisfied: this position is left untouched */
+            continue;
+
+        if (wd >= 16) {
+            p7 = dst[strideb * -8];
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+            q7 = dst[strideb * +7];
+
+            flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F && /* outer taps all within F of p0 / q0 */
+                       FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
+                       FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
+                       FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
+        }
+
+        if (wd >= 8)
+            flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F && /* inner taps all within F of p0 / q0 */
+                      FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
+                      FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;
+
+        if (wd >= 16 && flat8out && flat8in) { /* widest filter: rewrites offsets -7..+6, each a 16-weight sum, + 8, >> 4 */
+            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
+                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
+                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
+                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
+                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
+                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
+            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
+            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
+                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
+                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
+                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
+                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) { /* medium filter: rewrites offsets -3..+2, each an 8-weight sum, + 4, >> 3 */
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else { /* narrow filter: adjusts p0/q0, and p1/q1 as well when hev is 0 */
+            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H; /* set when either inner step exceeds H */
+
+            if (hev) { /* only p0 and q0 are modified; p1 - q1 enters the filter value */
+                int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
+                f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3; /* f1 is subtracted from q0, f2 is added to p0 */
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+            } else { /* p1 - q1 is not used; p1 and q1 receive a halved correction */
+                int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1; /* f is reused for the rounded half of f1 */
+                dst[strideb * -2] = av_clip_pixel(p1 + f);
+                dst[strideb * +1] = av_clip_pixel(q1 - f);
+            }
+        }
+    }
+}
+
+#define lf_8_fn(dir, wd, stridea, strideb) /* defines loop_filter_<dir>_<wd>_8_c(), a fixed-wd wrapper around loop_filter() */ \
+static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
+                                           ptrdiff_t stride, \
+                                           int E, int I, int H) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    stride /= sizeof(pixel); /* stride is divided down to pixel units before being passed on */ \
+    loop_filter(dst, E, I, H, stridea, strideb, wd); \
+}
+
+#define lf_8_fns(wd) /* h: positions step by stride, taps by 1; v: positions step by 1, taps by stride */ \
+lf_8_fn(h, wd, stride, 1) \
+lf_8_fn(v, wd, 1, stride)
+
+lf_8_fns(4) /* instantiated for wd = 4, 8 and 16 */
+lf_8_fns(8)
+lf_8_fns(16)
+
+#undef lf_8_fn
+#undef lf_8_fns
+
+#define lf_16_fn(dir, stridea) /* defines loop_filter_<dir>_16_16_c(): the wd = 16 filter applied to two adjacent 8-position runs */ \
+static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
+                                        ptrdiff_t stride, \
+                                        int E, int I, int H) \
+{ \
+    loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
+    loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); /* dst is uint8_t *, so this offset is in bytes */ \
+}
+
+lf_16_fn(h, stride) /* h: the second run starts 8 * stride bytes further on */
+lf_16_fn(v, sizeof(pixel)) /* v: the second run starts 8 pixels further on */
+
+#undef lf_16_fn
+
+#define lf_mix_fn(dir, wd1, wd2, stridea) /* defines loop_filter_<dir>_<wd1><wd2>_16_c(): two 8-position runs with separate widths and thresholds */ \
+static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  int E, int I, int H) \
+{ \
+    loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); /* first run: low 8 bits of E, I, H */ \
+    loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); /* second run: the bits above the low 8 */ \
+}
+
+#define lf_mix_fns(wd1, wd2) /* same byte offsets between the two runs as in lf_16_fn */ \
+lf_mix_fn(h, wd1, wd2, stride) \
+lf_mix_fn(v, wd1, wd2, sizeof(pixel))
+
+lf_mix_fns(4, 4) /* all four combinations of widths 4 and 8 */
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
+
+#undef lf_mix_fn
+#undef lf_mix_fns
+
+static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp) /* fills the loop-filter function tables with the C versions above */
+{
+    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c; /* loop_filter_8[w][d]: w = 0, 1, 2 for wd 4, 8, 16; d = 0 for h, 1 for v */
+    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
+    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
+    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
+    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
+    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
+
+    dsp->loop_filter_16[0] = loop_filter_h_16_16_c; /* loop_filter_16[d]: same h/v order */
+    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
+
+    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c; /* loop_filter_mix2[a][b][d]: a, b = 0 for width 4, 1 for width 8 */
+    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
+    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
+    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
+    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
+    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
+    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
+    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
+}
+
+#if BIT_DEPTH != 12
+
+static av_always_inline void copy_c(uint8_t *restrict dst, ptrdiff_t dst_stride,
+                                    const uint8_t *restrict src,
+                                    ptrdiff_t src_stride, int w, int h) /* w is in pixels; both strides are applied to uint8_t pointers, i.e. in bytes */
+{ /* copies h rows of w pixels from src to dst */
+    do { /* do/while: the body runs before h is tested, so h is expected to be >= 1 */
+        memcpy(dst, src, w * sizeof(pixel));
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+static av_always_inline void avg_c(uint8_t *restrict _dst, ptrdiff_t dst_stride,
+                                   const uint8_t *restrict _src,
+                                   ptrdiff_t src_stride, int w, int h) /* w is in pixels; strides arrive in bytes */
+{ /* replaces each dst pixel with rnd_avg_pixel4() of dst and src, h rows of w pixels */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel); /* strides are divided down to pixel units */
+    src_stride /= sizeof(pixel);
+    do { /* do/while: the body runs before h is tested, so h is expected to be >= 1 */
+        int x;
+
+        for (x = 0; x < w; x += 4) /* four pixels per step; presumably w is a multiple of 4 - the instantiations below use 4..64 */
+            AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x]))); /* dst uses the *PA accessors, src the plain one; NOTE(review): presumably aligned vs. unaligned - confirm */
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+#define fpel_fn(type, sz) /* defines static type##sz##_c, forwarding to type##_c */ \
+static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                         const uint8_t *src, ptrdiff_t src_stride, \
+                         int h, int mx, int my) /* mx and my are not read by the body */ \
+{ \
+    type##_c(dst, dst_stride, src, src_stride, sz, h); /* sz becomes the width argument */ \
+}
+/* copy_avg_fn(sz): one copy wrapper and one avg wrapper for width sz. */
+#define copy_avg_fn(sz) \
+fpel_fn(copy, sz) \
+fpel_fn(avg,  sz)
+/* Instantiate copy64_c/avg64_c down to copy4_c/avg4_c. */
+copy_avg_fn(64)
+copy_avg_fn(32)
+copy_avg_fn(16)
+copy_avg_fn(8)
+copy_avg_fn(4)
+/* The generator macros are not needed past this point. */
+#undef fpel_fn
+#undef copy_avg_fn
+
+#endif /* BIT_DEPTH != 12 */
+
+#define FILTER_8TAP(src, x, F, stride) /* 8-tap sum over src[x - 3*stride] .. src[x + 4*stride] */ \
+    av_clip_pixel((F[0] * src[x + -3 * stride] + \
+                   F[1] * src[x + -2 * stride] + \
+                   F[2] * src[x + -1 * stride] + \
+                   F[3] * src[x + +0 * stride] + /* F[3] weights the sample at x itself */ \
+                   F[4] * src[x + +1 * stride] + \
+                   F[5] * src[x + +2 * stride] + \
+                   F[6] * src[x + +3 * stride] + \
+                   F[7] * src[x + +4 * stride] + 64) >> 7) /* +64 then >>7: rounded divide by 128 */
+
+static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, ptrdiff_t ds,
+                                          const int16_t *filter, int avg)
+{
+    /* Work in pixel units: typed row pointers plus strides divided by sizeof(pixel). */
+    pixel *out = (pixel *) _dst;
+    const pixel *in = (const pixel *) _src;
+    const ptrdiff_t out_step = dst_stride / sizeof(pixel);
+    const ptrdiff_t in_step  = src_stride / sizeof(pixel);
+
+    do {
+        int col;
+
+        if (avg) { /* blend: (old output + filtered sample + 1) >> 1 */
+            for (col = 0; col < w; col++)
+                out[col] = (out[col] + FILTER_8TAP(in, col, filter, ds) + 1) >> 1;
+        } else {   /* overwrite the output with the filtered sample */
+            for (col = 0; col < w; col++)
+                out[col] = FILTER_8TAP(in, col, filter, ds);
+        }
+        out += out_step, in += in_step;
+    } while (--h);
+}
+
+#define filter_8tap_1d_fn(opn, opa, dir, ds) /* defines the out-of-line opn##_8tap_1d_##dir##_c */ \
+static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src, ptrdiff_t src_stride, \
+                                                int w, int h, const int16_t *filter) \
+{ \
+    do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); /* opa: 0 = store, 1 = average into dst */ \
+}
+/* ds is the tap spacing in pixels: one source row for the v variants, 1 for the h variants. */
+filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(put, 0, h, 1)
+filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(avg, 1, h, 1)
+
+#undef filter_8tap_1d_fn
+
+static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, const int16_t *filterx,
+                                          const int16_t *filtery, int avg)
+{
+    int tmp_h = h + 7; /* number of rows the first pass produces */
+    pixel tmp[64 * 71], *tmp_ptr = tmp; /* 71 rows at a pitch of 64; assumes w <= 64 and h <= 64 (not checked here) */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Byte strides -> pixel strides, then back src up by three rows. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+        /* Pass 1: filter along the row (tap spacing 1) with filterx into tmp. */
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
+        /* Advance one tmp row and one source row. */
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Pass 2 starts at tmp row 3, matching the three rows src was backed up by. */
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+        /* Filter down the tmp columns (tap spacing 64) with filtery; avg blends with dst. */
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define filter_8tap_2d_fn(opn, opa) /* defines the out-of-line opn##_8tap_2d_hv_c */ \
+static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int w, int h, const int16_t *filterx, \
+                                           const int16_t *filtery) \
+{ \
+    do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); /* opa is forwarded as the avg flag */ \
+}
+/* put stores the filtered result (opa = 0); avg averages it into dst (opa = 1). */
+filter_8tap_2d_fn(put, 0)
+filter_8tap_2d_fn(avg, 1)
+
+#undef filter_8tap_2d_fn
+
+#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) /* avg is a name prefix here (put or avg), not a flag */ \
+static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                              const uint8_t *src, ptrdiff_t src_stride, \
+                                              int h, int mx, int my) \
+{ \
+    avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
+                            ff_vp9_subpel_filters[type_idx][dir_m]); /* dir_m names mx or my; it selects the coefficient row */ \
+}
+/* 2-D counterpart: table row mx is passed as filterx and row my as filtery. */
+#define filter_fn_2d(sz, type, type_idx, avg) \
+static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my) \
+{ \
+    avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
+                       ff_vp9_subpel_filters[type_idx][mx], \
+                       ff_vp9_subpel_filters[type_idx][my]); \
+}
+
+#if BIT_DEPTH != 12
+
+#define FILTER_BILIN(src, x, mxy, stride) /* src[x] moved mxy/16 of the way towards src[x + stride]; +8 rounds the >>4 */ \
+    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
+
+static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, ptrdiff_t ds, int mxy, int avg)
+{
+    /* Work in pixel units: typed row pointers plus strides divided by sizeof(pixel). */
+    pixel *out = (pixel *) _dst;
+    const pixel *in = (const pixel *) _src;
+    const ptrdiff_t out_step = dst_stride / sizeof(pixel);
+    const ptrdiff_t in_step  = src_stride / sizeof(pixel);
+
+    do {
+        int col;
+
+        if (avg) { /* blend the interpolated sample with the existing output */
+            for (col = 0; col < w; col++)
+                out[col] = (out[col] + FILTER_BILIN(in, col, mxy, ds) + 1) >> 1;
+        } else {   /* overwrite the output with the interpolated sample */
+            for (col = 0; col < w; col++)
+                out[col] = FILTER_BILIN(in, col, mxy, ds);
+        }
+        out += out_step, in += in_step;
+    } while (--h);
+}
+
+#define bilin_1d_fn(opn, opa, dir, ds) /* defines the out-of-line opn##_bilin_1d_##dir##_c */ \
+static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                 const uint8_t *src, ptrdiff_t src_stride, \
+                                                 int w, int h, int mxy) \
+{ \
+    do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); /* opa: 0 = store, 1 = average into dst */ \
+}
+/* ds is the neighbour distance in pixels: one source row for v, 1 for h. */
+bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+bilin_1d_fn(put, 0, h, 1)
+bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+bilin_1d_fn(avg, 1, h, 1)
+
+#undef bilin_1d_fn
+
+static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, int mx, int my, int avg)
+{
+    pixel tmp[64 * 65], *tmp_ptr = tmp; /* 65 rows at a pitch of 64; assumes w <= 64 and h <= 64 (not checked here) */
+    int tmp_h = h + 1; /* pass 2 reads row y and row y + 1, hence one extra row */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Byte strides -> pixel strides. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+        /* Pass 1: interpolate between src[x] and src[x + 1] with weight mx. */
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Pass 2 restarts at the first tmp row. */
+    tmp_ptr = tmp;
+    do {
+        int x;
+        /* Interpolate between tmp rows y and y + 1 (distance 64) with weight my. */
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define bilin_2d_fn(opn, opa) /* defines the out-of-line opn##_bilin_2d_hv_c */ \
+static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my) \
+{ \
+    do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); /* opa is forwarded as the avg flag */ \
+}
+/* put stores the result (opa = 0); avg averages it into dst (opa = 1). */
+bilin_2d_fn(put, 0)
+bilin_2d_fn(avg, 1)
+
+#undef bilin_2d_fn
+
+#define bilinf_fn_1d(sz, dir, dir_m, avg) /* avg is a name prefix (put or avg); dir_m names mx or my */ \
+static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my) \
+{ \
+    avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); /* sz is the width */ \
+}
+/* 2-D wrapper: forwards both mx and my. */
+#define bilinf_fn_2d(sz, avg) \
+static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my) \
+{ \
+    avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \
+}
+
+#else
+/* BIT_DEPTH == 12: both generators expand to nothing, so no bilinear wrappers are emitted. */
+#define bilinf_fn_1d(a, b, c, d)
+#define bilinf_fn_2d(a, b)
+
+#endif
+
+#define filter_fn(sz, avg) /* all wrappers for one width: 3 filter types x {h, v, hv}, plus bilinear */ \
+filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_2d(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, h, mx, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, v, my, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_2d(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, h, mx, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_1d(sz, v, my, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_2d(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+bilinf_fn_1d(sz, h, mx,                               avg) \
+bilinf_fn_1d(sz, v, my,                               avg) \
+bilinf_fn_2d(sz,                                      avg)
+/* filter_fn_set(avg): repeat filter_fn for the five widths 64, 32, 16, 8 and 4. */
+#define filter_fn_set(avg) \
+filter_fn(64, avg) \
+filter_fn(32, avg) \
+filter_fn(16, avg) \
+filter_fn(8,  avg) \
+filter_fn(4,  avg)
+/* Emit the put_* family, then the avg_* family. */
+filter_fn_set(put)
+filter_fn_set(avg)
+
+#undef filter_fn
+#undef filter_fn_set
+#undef filter_fn_1d
+#undef filter_fn_2d
+#undef bilinf_fn_1d
+#undef bilinf_fn_2d
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp); /* prototype for the call made below when BIT_DEPTH == 12 */
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp) /* left non-static only when BIT_DEPTH == 10 */
+{
+#if BIT_DEPTH == 12
+    ff_vp9dsp_mc_init_10(dsp); /* start from the 10-bit table; only the 8-tap entries are overwritten below */
+#else /* BIT_DEPTH == 12 */
+    /* Full-pel slots ([0][0]): the same copy/avg function is stored for every filter type. */
+#define init_fpel(idx1, idx2, sz, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = type##sz##_c
+    /* idx2 = 0 takes the copy function, idx2 = 1 the avg function. */
+#define init_copy_avg(idx, sz) \
+    init_fpel(idx, 0, sz, copy); \
+    init_fpel(idx, 1, sz, avg)
+    /* First index 0..4 corresponds to widths 64, 32, 16, 8, 4. */
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_copy_avg(4,  4);
+
+#undef init_copy_avg
+#undef init_fpel
+
+#endif /* BIT_DEPTH == 12 */
+    /* Sub-pel slots: the three 8-tap filter types, emitted at every bit depth. */
+#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c
+    /* The bilinear slot is added only when BIT_DEPTH != 12. */
+#if BIT_DEPTH == 12
+#define init_subpel1 init_subpel1_bd_aware
+#else
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
+    init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
+#endif
+    /* init_subpel2: apply init_subpel1 to all five widths. */
+#define init_subpel2(idx, idxh, idxv, dir, type) \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+    /* init_subpel3: slots [1][1] = hv, [0][1] = v, [1][0] = h. */
+#define init_subpel3(idx, type) \
+    init_subpel2(idx, 1, 1, hv, type); \
+    init_subpel2(idx, 0, 1, v, type); \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+#undef init_subpel1_bd_aware
+}
+
+static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                              const uint8_t *_src, ptrdiff_t src_stride,
+                                              int w, int h, int mx, int my,
+                                              int dx, int dy, int avg,
+                                              const int16_t (*filters)[8])
+{
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 8; /* whole source rows spanned by h output rows, plus 8 */
+    pixel tmp[64 * 135], *tmp_ptr = tmp; /* pitch 64; assumes w <= 64 and tmp_h <= 135 (not checked here) */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Byte strides -> pixel strides, then back src up by three rows. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+        int imx = mx, ioff = 0; /* imx: 4-bit fractional position; ioff: integer source offset */
+        /* Pass 1: each output column steps the source position by dx sixteenths. */
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
+            imx += dx;
+            ioff += imx >> 4; /* carry whole pixels into the offset */
+            imx &= 0xf; /* keep only the fraction, which indexes filters[] */
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Pass 2 starts at tmp row 3, matching the three rows src was backed up by. */
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+        const int16_t *filter = filters[my]; /* my is the row's fractional position here */
+        /* Filter down the tmp columns (tap spacing 64); avg blends with dst. */
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
+            }
+        /* Step the vertical position by dy sixteenths; whole rows advance tmp_ptr. */
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_filter_8tap_fn(opn, opa) /* defines the out-of-line opn##_scaled_8tap_c */ \
+static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my, int dx, int dy, \
+                                            const int16_t (*filters)[8]) \
+{ \
+    do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+                     opa, filters); /* opa is forwarded as the avg flag */ \
+}
+/* put stores the result (opa = 0); avg averages it into dst (opa = 1). */
+scaled_filter_8tap_fn(put, 0)
+scaled_filter_8tap_fn(avg, 1)
+
+#undef scaled_filter_8tap_fn
+/* Nothing after this point expands FILTER_8TAP. */
+#undef FILTER_8TAP
+/* scaled_filter_fn: fixed-width wrapper; avg is a name prefix (put or avg). */
+#define scaled_filter_fn(sz, type, type_idx, avg) \
+static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \
+                        ff_vp9_subpel_filters[type_idx]); /* whole table for this filter type */ \
+}
+
+#if BIT_DEPTH != 12
+
+static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                               const uint8_t *_src, ptrdiff_t src_stride,
+                                               int w, int h, int mx, int my,
+                                               int dx, int dy, int avg)
+{
+    pixel tmp[64 * 129], *tmp_ptr = tmp; /* pitch 64; assumes w <= 64 and tmp_h <= 129 (not checked here) */
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 2; /* whole source rows spanned by h output rows, plus 2 */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Byte strides -> pixel strides. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+        int imx = mx, ioff = 0; /* imx: 4-bit fractional position; ioff: integer source offset */
+        /* Pass 1: each output column steps the source position by dx sixteenths. */
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
+            imx += dx;
+            ioff += imx >> 4; /* carry whole pixels into the offset */
+            imx &= 0xf; /* keep only the fraction, used as the blend weight */
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Pass 2 restarts at the first tmp row. */
+    tmp_ptr = tmp;
+    do {
+        int x;
+        /* Interpolate between tmp rows (distance 64) with weight my; avg blends with dst. */
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+        /* Step the vertical position by dy sixteenths; whole rows advance tmp_ptr. */
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_bilin_fn(opn, opa) /* defines the out-of-line opn##_scaled_bilin_c */ \
+static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                             const uint8_t *src, ptrdiff_t src_stride, \
+                                             int w, int h, int mx, int my, int dx, int dy) \
+{ \
+    do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); /* opa is forwarded as the avg flag */ \
+}
+/* put stores the result (opa = 0); avg averages it into dst (opa = 1). */
+scaled_bilin_fn(put, 0)
+scaled_bilin_fn(avg, 1)
+
+#undef scaled_bilin_fn
+/* Nothing after this point expands FILTER_BILIN. */
+#undef FILTER_BILIN
+/* scaled_bilinf_fn: fixed-width wrapper; avg is a name prefix (put or avg). */
+#define scaled_bilinf_fn(sz, avg) \
+static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                        const uint8_t *src, ptrdiff_t src_stride, \
+                                        int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \
+}
+
+#else
+/* BIT_DEPTH == 12: the generator expands to nothing, so no scaled bilinear wrappers are emitted. */
+#define scaled_bilinf_fn(a, b)
+
+#endif
+
+#define scaled_filter_fns(sz, avg) /* scaled wrappers for one width: three 8-tap types plus bilinear */ \
+scaled_filter_fn(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+scaled_filter_fn(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+scaled_filter_fn(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+scaled_bilinf_fn(sz,                                      avg)
+/* scaled_filter_fn_set(avg): repeat for the five widths 64, 32, 16, 8 and 4. */
+#define scaled_filter_fn_set(avg) \
+scaled_filter_fns(64, avg) \
+scaled_filter_fns(32, avg) \
+scaled_filter_fns(16, avg) \
+scaled_filter_fns(8,  avg) \
+scaled_filter_fns(4,  avg)
+/* Emit the put_scaled_* family, then the avg_scaled_* family. */
+scaled_filter_fn_set(put)
+scaled_filter_fn_set(avg)
+
+#undef scaled_filter_fns
+#undef scaled_filter_fn_set
+#undef scaled_filter_fn
+#undef scaled_bilinf_fn
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp); /* prototype for the call made below when BIT_DEPTH == 12 */
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp) /* left non-static only when BIT_DEPTH == 10 */
+{
+#define init_scaled_bd_aware(idx1, idx2, sz, type) /* the three 8-tap slots, emitted at every bit depth */ \
+    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_SHARP  ][idx2] = type##_scaled_sharp_##sz##_c
+    /* 12-bit: start from the 10-bit table and overwrite only the 8-tap slots. */
+#if BIT_DEPTH == 12
+    ff_vp9dsp_scaled_mc_init_10(dsp);
+#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
+#else
+#define init_scaled(idx1, idx2, sz, type) \
+    init_scaled_bd_aware(idx1, idx2, sz, type); \
+    dsp->smc[idx1][FILTER_BILINEAR    ][idx2] = type##_scaled_bilin_##sz##_c
+#endif
+    /* Last index 0 takes the put function, last index 1 the avg function. */
+#define init_scaled_put_avg(idx, sz) \
+    init_scaled(idx, 0, sz, put); \
+    init_scaled(idx, 1, sz, avg)
+    /* First index 0..4 corresponds to widths 64, 32, 16, 8, 4. */
+    init_scaled_put_avg(0, 64);
+    init_scaled_put_avg(1, 32);
+    init_scaled_put_avg(2, 16);
+    init_scaled_put_avg(3,  8);
+    init_scaled_put_avg(4,  4);
+
+#undef init_scaled_put_avg
+#undef init_scaled
+#undef init_scaled_bd_aware
+}
+
+av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp) /* per-bit-depth entry point: runs the five init steps on dsp */
+{
+    FUNC(ff_vp9dsp_intrapred_init)(dsp); /* defined outside this part of the file */
+    vp9dsp_itxfm_init(dsp);              /* defined outside this part of the file */
+    vp9dsp_loopfilter_init(dsp);         /* loop_filter_8 / loop_filter_16 / loop_filter_mix2 */
+    FUNC(ff_vp9dsp_mc_init)(dsp);        /* dsp->mc */
+    FUNC(ff_vp9dsp_scaled_mc_init)(dsp); /* dsp->smc */
+}
diff --git a/external/ffmpeg-snapshot/libavutil/aarch64/asm.S b/external/ffmpeg-snapshot/libavutil/aarch64/asm.S
new file mode 100644
index 0000000..1840f9f
--- /dev/null
+++ b/external/ffmpeg-snapshot/libavutil/aarch64/asm.S
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __ELF__ /* ELF: line prefix for directives that are only wanted on ELF targets */
+#   define ELF
+#else /* expands to '#', presumably so the assembler treats the rest of the line as a comment */
+#   define ELF #
+#endif
+/* FUNC: the same kind of line prefix for .func/.endfunc, keyed on HAVE_AS_FUNC. */
+#if HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC #
+#endif
+/* Fallback so __has_feature() can appear in #if when the preprocessor lacks it. */
+#ifndef __has_feature
+#   define __has_feature(x) 0
+#endif
+
+#if HAVE_AS_ARCH_DIRECTIVE
+        .arch           AS_ARCH_LEVEL
+#endif
+/* ENABLE_x / DISABLE_x expand to .arch_extension toggles, or to nothing when unsupported. */
+#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
+#define ENABLE_DOTPROD  .arch_extension dotprod
+#define DISABLE_DOTPROD .arch_extension nodotprod
+#else
+#define ENABLE_DOTPROD
+#define DISABLE_DOTPROD
+#endif
+
+#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
+#define ENABLE_I8MM  .arch_extension i8mm
+#define DISABLE_I8MM .arch_extension noi8mm
+#else
+#define ENABLE_I8MM
+#define DISABLE_I8MM
+#endif
+/* Both extensions start switched off in every file that includes this header. */
+DISABLE_DOTPROD
+DISABLE_I8MM
+
+
+/* Support macros for
+ *   - Armv8.3-A Pointer Authentication and
+ *   - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ *   .global f
+ *   f:
+ *     AARCH64_SIGN_LINK_REGISTER
+ *     stp x29, x30, [sp, #-96]!
+ *     mov x29, sp
+ *     ...
+ *     ldp x29, x30, [sp], #96
+ *     AARCH64_VALIDATE_LINK_REGISTER
+ *     ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ *   .globl return_zero
+ *   return_zero:
+ *     AARCH64_VALID_CALL_TARGET
+ *     mov x0, #0
+ *     ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ *   .globl with_early_jump
+ *   with_early_jump:
+ *     AARCH64_VALID_CALL_TARGET
+ *     cmp x0, #128
+ *     b.lt .Lwith_early_jump_128
+ *     AARCH64_SIGN_LINK_REGISTER
+ *     stp x29, x30, [sp, #-96]!
+ *     mov x29, sp
+ *     ...
+ *     ldp x29, x30, [sp], #96
+ *     AARCH64_VALIDATE_LINK_REGISTER
+ *     ret
+ *
+ *  .Lwith_early_jump_128:
+ *     ...
+ *     ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
+ * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ *   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ *   https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+ */
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
+#   define GNU_PROPERTY_AARCH64_BTI (1 << 0)   // Has BTI
+#   define AARCH64_VALID_CALL_TARGET hint #34  // BTI 'c'
+#   define AARCH64_VALID_JUMP_TARGET hint #38  // BTI 'j'
+#else /* without BTI the landing-pad macros expand to nothing */
+#   define GNU_PROPERTY_AARCH64_BTI 0          // No BTI
+#   define AARCH64_VALID_CALL_TARGET
+#   define AARCH64_VALID_JUMP_TARGET
+#endif
+/* Pointer authentication: choose sign/validate instructions from the key bits of __ARM_FEATURE_PAC_DEFAULT. */
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+#   if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
+#       define AARCH64_SIGN_LINK_REGISTER      paciasp
+#       define AARCH64_VALIDATE_LINK_REGISTER  autiasp
+#   elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#       define AARCH64_SIGN_LINK_REGISTER      pacibsp
+#       define AARCH64_VALIDATE_LINK_REGISTER  autibsp
+#   else
+#       error Pointer authentication defines no valid key!
+#   endif
+#   if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) /* bit 2 is rejected: leaf-function signing */
+#       error Authentication of leaf functions is enabled but not supported in FFmpeg!
+#   endif
+#   define GNU_PROPERTY_AARCH64_PAC (1 << 1) /* OR-ed with the BTI bit into the note emitted below */
+#else /* without PAC the sign/validate macros expand to nothing */
+#   define GNU_PROPERTY_AARCH64_PAC 0
+#   define AARCH64_SIGN_LINK_REGISTER
+#   define AARCH64_VALIDATE_LINK_REGISTER
+#endif
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
+        .pushsection .note.gnu.property, "a" /* emitted only when BTI or PAC is on and the target is ELF */
+        .balign 8
+        .long 4 /* size of the name field: "GNU" plus its NUL */
+        .long 0x10 /* size of the descriptor: the four .long words after the name */
+        .long 0x5 /* note type; presumably NT_GNU_PROPERTY_TYPE_0, confirm against the ELF ABI */
+        .asciz "GNU"
+        .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+        .long 4 /* size of the property value that follows */
+        .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
+        .long 0 /* fills the descriptor up to the 0x10 bytes declared above */
+        .popsection
+#endif
+
+.macro  function name, export=0, align=2 /* opens a function in .text and defines a matching one-shot endfunc */
+    .macro endfunc /* closes the function opened above */
+ELF     .size   \name, . - \name
+FUNC    .endfunc
+        .purgem endfunc /* removes endfunc again so the next function can redefine it */
+    .endm
+        .text
+        .align          \align
+    .if \export /* exported: global symbol named EXTERN_ASM\name */
+        .global EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+        AARCH64_VALID_CALL_TARGET /* only exported entry points get the call landing pad */
+    .else /* local: plain label with no prefix and no landing pad */
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+    .endif
+.endm
+
+.macro  const   name, align=2, relocate=0 /* opens a labelled constant block and defines a one-shot endconst */
+    .macro endconst /* closes the constant block opened above */
+ELF     .size   \name, . - \name
+        .purgem endconst /* removes endconst again so the next block can redefine it */
+    .endm
+#if HAVE_SECTION_DATA_REL_RO /* the section depends on the build configuration and on relocate */
+.if \relocate
+        .section        .data.rel.ro
+.else
+        .section        .rodata
+.endif
+#elif defined(_WIN32)
+        .section        .rdata
+#elif !defined(__MACH__)
+        .section        .rodata
+#else /* __MACH__ is defined */
+        .const_data
+#endif
+        .align          \align
+\name:
+.endm
+
+.macro  movrel rd, val, offset=0 /* loads the address \val + \offset into \rd */
+#if CONFIG_PIC && defined(__APPLE__) /* @PAGE / @PAGEOFF relocation syntax */
+    .if \offset < 0 /* negative offset: form the address of \val, then subtract separately */
+        adrp            \rd, \val@PAGE
+        add             \rd, \rd, \val@PAGEOFF
+        sub             \rd, \rd, -(\offset)
+    .else
+        adrp            \rd, \val+(\offset)@PAGE
+        add             \rd, \rd, \val+(\offset)@PAGEOFF
+    .endif
+#elif CONFIG_PIC && defined(_WIN32) /* :lo12: syntax, with the same split for negative offsets */
+    .if \offset < 0
+        adrp            \rd, \val
+        add             \rd, \rd, :lo12:\val
+        sub             \rd, \rd, -(\offset)
+    .else
+        adrp            \rd, \val+(\offset)
+        add             \rd, \rd, :lo12:\val+(\offset)
+    .endif
+#elif CONFIG_PIC /* all other PIC targets: the offset is always folded into the relocation */
+#   if __has_feature(hwaddress_sanitizer)
+        adrp            \rd, :pg_hi21_nc:\val+(\offset)
+#   else
+        adrp            \rd, \val+(\offset)
+#   endif
+        add             \rd, \rd, :lo12:\val+(\offset)
+#else /* non-PIC: ldr with the =expression form */
+        ldr             \rd, =\val+\offset
+#endif
+.endm
+
+#define GLUE(a, b) a ## b /* raw token paste */
+#define JOIN(a, b) GLUE(a, b) /* extra level so a and b are macro-expanded before pasting */
+#define X(s) JOIN(EXTERN_ASM, s) /* s with the EXTERN_ASM prefix applied */
+
+#define x18 do_not_use_x18 /* any use of x18 now names a non-register; presumably x18 is reserved on some platforms */
+#define w18 do_not_use_w18 /* same for the 32-bit view of that register */
diff --git a/src/.gitkeep b/src/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/.gitkeep b/tests/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/bench_neon_idct.c b/tests/bench_neon_idct.c
new file mode 100644
index 0000000..0ea5556
--- /dev/null
+++ b/tests/bench_neon_idct.c
@@ -0,0 +1,248 @@
+/*
+ * Phase 3 — NEON baseline microbench for VP9 8×8 DCT_DCT IDCT add.
+ *
+ * Reports two numbers:
+ *   M1 (correctness):  bit-exact match rate, our C reference vs
+ *                      FFmpeg's NEON, across N random blocks.
+ *   M3 (throughput):   NEON sustained Mblock/s on this host.
+ *
+ * Both are gating measurements for Phase 1 (see docs/phase1.md).
+ * NO QPU work happens here — that's later phases.
+ *
+ * Build: see CMakeLists.txt at project root.
+ * Run:   ./bench_neon_idct [--blocks N] [--iters K] [--seed S]
+ *
+ * License: BSD-2-Clause (daedalus-fourier), but this binary
+ *          statically links the LGPL-2.1+ FFmpeg NEON snapshot
+ *          — distribute the binary under LGPL-2.1+ in that case.
+ */
+#define _POSIX_C_SOURCE 200809L
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include <time.h>
+#include <getopt.h>
+
+/* Our C reference (tests/vp9_idct8_ref.c). */
+extern void daedalus_vp9_idct_idct_8x8_add_ref(
+    uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+/* FFmpeg NEON entry point (vendored vp9itxfm_neon.S). */
+extern void ff_vp9_idct_idct_8x8_add_neon(
+    uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+/* ---- Random-block generation ----------------------------------- */
+
+/* xorshift64 (shift triple 13, 7, 17): the whole stream is determined by
+ * the seed stored in xs64_state. A zero state only ever yields zero. */
+static uint64_t xs64_state;
+static inline uint64_t xs64(void)
+{
+    uint64_t s = xs64_state ^ (xs64_state << 13);
+    s ^= s >> 7; s ^= s << 17;
+    return xs64_state = s;
+}
+
+/* Fill block[] with a sparse random coefficient set and return its eob.
+ * 1..16 draws land on uniformly random positions in [0, 63] (collisions
+ * overwrite), so eob = highest drawn index + 1 and tends to be large; a
+ * DC-only block (eob == 1) needs every draw to hit position 0.
+ * NOTE(review): no low-frequency bias is applied — Phase 7 may revisit. */
+static int gen_block(int16_t block[64])
+{
+    memset(block, 0, 64 * sizeof(*block));
+    int eob = 0;
+    int n_nonzero = 1 + (int)(xs64() % 16);
+    for (int i = 0; i < n_nonzero; i++) {
+        /* Uniform position; it indexes block[] directly (no scan-order map). */
+        int pos = (int)(xs64() % 64);
+        /* Coefficient range: [-4096, 4095], i.e. signed 13-bit; may be 0. */
+        int16_t coef = (int16_t)((int)(xs64() % 8192) - 4096);
+        block[pos] = coef;
+        if (pos + 1 > eob) eob = pos + 1;
+    }
+    if (eob == 0) eob = 1;   /* unreachable: n_nonzero >= 1 forces eob >= 1 */
+    return eob;
+}
+
+static void gen_pred(uint8_t pred[64])
+{
+    for (uint8_t *px = pred; px != pred + 64; px++)   /* 64 pixels, one draw each */
+        *px = (uint8_t)(xs64() & 0xff);
+}
+
+/* ---- Wall-clock timing (CLOCK_MONOTONIC_RAW) ------------------- */
+
+static double now_seconds(void)
+{
+    struct timespec now;   /* CLOCK_MONOTONIC_RAW reading, returned in seconds */
+    clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+    return now.tv_sec + now.tv_nsec * 1e-9;
+}
+
+/* ---- Phase 1 M1: bit-exact gate -------------------------------- */
+
+static int correctness_check(uint64_t seed, int n_blocks)
+{
+    /* Feed identical random (coefficients, prediction) pairs to the C
+     * reference and to FFmpeg's NEON kernel; count blocks whose 8x8
+     * output differs, dumping the first four side by side. */
+    int16_t coef_ref[64], coef_neon[64];
+    uint8_t pred[64], out_ref[64], out_neon[64];
+    int n_bad = 0, n_dc_only = 0;
+
+    xs64_state = seed ? seed : 0xdeadbeefcafebabeULL;
+    for (int blk = 0; blk < n_blocks; blk++) {
+        const int eob = gen_block(coef_ref);
+        memcpy(coef_neon, coef_ref, sizeof(coef_ref));
+        gen_pred(pred);
+        memcpy(out_ref, pred, 64);
+        memcpy(out_neon, pred, 64);
+
+        daedalus_vp9_idct_idct_8x8_add_ref(out_ref, 8, coef_ref, eob);
+        ff_vp9_idct_idct_8x8_add_neon(out_neon, 8, coef_neon, eob);
+
+        n_dc_only += (eob == 1);
+        if (memcmp(out_ref, out_neon, 64) == 0)
+            continue;
+        if (n_bad++ >= 4)
+            continue;                       /* only the first four are dumped */
+        fprintf(stderr, "MISMATCH block %d eob=%d:\n", blk, eob);
+        for (int row = 0; row < 8; row++) {
+            const uint8_t *a = out_ref + row * 8, *b = out_neon + row * 8;
+            fprintf(stderr, "  row %d  ref ", row);
+            for (int k = 0; k < 8; k++) fprintf(stderr, "%3u ", a[k]);
+            fprintf(stderr, " neon ");
+            for (int k = 0; k < 8; k++) fprintf(stderr, "%3u ", b[k]);
+            fprintf(stderr, "\n");
+        }
+    }
+
+    const int n_ok = n_blocks - n_bad;
+    printf("M1 correctness: %d / %d blocks bit-exact match (%.4f%%)\n",
+           n_ok, n_blocks, 100.0 * n_ok / n_blocks);
+    printf("  dc-only path frequency: %d / %d (%.2f%%)\n",
+           n_dc_only, n_blocks, 100.0 * n_dc_only / n_blocks);
+    return n_bad;
+}
+
+/* ---- Phase 1 M3: NEON throughput ------------------------------- */
+
+static void throughput_neon(uint64_t seed, int n_blocks, int iters)
+{
+    /* Time the NEON kernel over n_blocks blocks x iters passes and print M3.
+     * Sizes use size_t: n_blocks * 64 in int overflows past 33554431 blocks. */
+    if (n_blocks < 1 || iters < 1) {   /* zero work would report 0/0 = NaN */
+        fprintf(stderr, "throughput_neon: blocks and iters must be >= 1\n");
+        exit(1);
+    }
+    const size_t nb = (size_t) n_blocks;
+    const size_t coef_bytes = nb * 64 * sizeof(int16_t), pix_bytes = nb * 64;
+    xs64_state = seed ? seed : 0xfeedfacecafebeefULL;
+    /* Generation stays outside the timed region. The NEON path zeroes its
+     * block, so every pass restores the working copy from the master. */
+    int16_t *blocks_master = malloc(coef_bytes);
+    int16_t *blocks_work   = malloc(coef_bytes);
+    uint8_t *preds         = malloc(pix_bytes);
+    uint8_t *dsts          = malloc(pix_bytes);
+    int      *eobs         = malloc(nb * sizeof(int));
+    if (!blocks_master || !blocks_work || !preds || !dsts || !eobs) {
+        fprintf(stderr, "alloc failed\n");
+        exit(1);
+    }
+    for (size_t i = 0; i < nb; i++) {
+        eobs[i] = gen_block(blocks_master + i * 64);
+        gen_pred(preds + i * 64);
+    }
+    /* Warm-up. */
+    memcpy(blocks_work, blocks_master, coef_bytes);
+    memcpy(dsts, preds, pix_bytes);
+    for (size_t i = 0; i < nb; i++)
+        ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8,
+                                       blocks_work + i * 64, eobs[i]);
+
+    /* Timed region. */
+    double t0 = now_seconds();
+    for (int it = 0; it < iters; it++) {
+        memcpy(blocks_work, blocks_master, coef_bytes);
+        memcpy(dsts, preds, pix_bytes);
+        for (size_t i = 0; i < nb; i++)
+            ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8,
+                                           blocks_work + i * 64, eobs[i]);
+    }
+    double t1 = now_seconds();
+
+    /* memcpy cost-only run, to subtract setup overhead. */
+    double s0 = now_seconds();
+    for (int it = 0; it < iters; it++) {
+        memcpy(blocks_work, blocks_master, coef_bytes);
+        memcpy(dsts, preds, pix_bytes);
+    }
+    double s1 = now_seconds();
+
+    double total_seconds = (t1 - t0) - (s1 - s0);
+    double total_blocks  = (double) n_blocks * iters;
+    double mblocks_s     = total_blocks / total_seconds / 1e6;
+    printf("M3 NEON throughput:\n");
+    printf("  blocks=%d  iters=%d  total=%.0f\n", n_blocks, iters, total_blocks);
+    printf("  elapsed (kernel)=%.6f s  (setup-subtracted)\n", total_seconds);
+    printf("  elapsed (setup) =%.6f s\n", s1 - s0);
+    printf("  throughput      = %.3f Mblock/s\n", mblocks_s);
+    printf("  per-block       = %.1f ns\n", total_seconds / total_blocks * 1e9);
+    /* Equivalent at 1920x1080: 32 400 blocks/frame -> FPS. */
+    printf("  equiv 1080p     = %.1f FPS  (32400 blocks/frame)\n",
+           mblocks_s * 1e6 / 32400.0);
+    free(blocks_master); free(blocks_work); free(preds); free(dsts); free(eobs);
+}
+
+/* ---- CLI ------------------------------------------------------- */
+
+static void usage(const char *p)   /* p: program name; text goes to stderr */
+{
+    fprintf(stderr, "Usage: %s", p);
+    fputs(" [--blocks N] [--iters K] [--seed S] [--no-correctness]\n"
+          "Defaults: N=1000000, K=10, S=0 (uses fixed default).\n", stderr);
+}
+
+int main(int argc, char **argv)
+{
+    /* CLI driver: M1 gate (unless --no-correctness), then M3. Exit 0/1/2. */
+    int n_blocks = 1000000, iters = 10, do_correctness = 1;
+    uint64_t seed = 0;
+    static struct option opts[] = {
+        {"blocks",         required_argument, 0, 'b'},
+        {"iters",          required_argument, 0, 'i'},
+        {"seed",           required_argument, 0, 's'},
+        {"no-correctness", no_argument,       0, 'C'},
+        {"help",           no_argument,       0, 'h'},
+        {0,0,0,0}
+    };
+    for (int c; (c = getopt_long(argc, argv, "b:i:s:Ch", opts, 0)) != -1;) {
+        switch (c) {
+        case 'b': n_blocks = atoi(optarg); break;
+        case 'i': iters    = atoi(optarg); break;
+        case 's': seed     = strtoull(optarg, 0, 0); break;
+        case 'C': do_correctness = 0; break;
+        case 'h': usage(argv[0]); return 0;
+        default:  usage(argv[0]); return 2;
+        }
+    }
+    /* atoi() gives 0 on garbage; also keeps n_blocks * 64 within int range. */
+    if (n_blocks < 1 || n_blocks > 0x7fffffff / 64 || iters < 1) {
+        fprintf(stderr, "--blocks must be in [1, %d], --iters >= 1\n", 0x7fffffff / 64);
+        return 2;
+    }
+    if (do_correctness) {
+        printf("=== M1: bit-exact correctness (10000 random blocks) ===\n");
+        if (correctness_check(seed, 10000) != 0) {
+            fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
+            return 1;
+        }
+        printf("\n");
+    }
+    printf("=== M3: NEON throughput ===\n");
+    throughput_neon(seed, n_blocks, iters);
+    return 0;
+}
diff --git a/tests/bench_vulkan_dispatch.c b/tests/bench_vulkan_dispatch.c
new file mode 100644
index 0000000..657ca6b
--- /dev/null
+++ b/tests/bench_vulkan_dispatch.c
@@ -0,0 +1,279 @@
+/*
+ * Phase 3 — Vulkan compute dispatch-overhead microbench (M5).
+ *
+ * Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa
+ * v3dv: vkQueueSubmit + vkQueueWaitIdle round-trip cost for a
+ * noop compute shader. Establishes the floor below which kernel
+ * batching is mandatory.
+ *
+ * Two measurements:
+ *   M5a: empty command-buffer submit (no dispatch at all)
+ *   M5b: 1-workgroup dispatch of an empty shader
+ *
+ * The delta M5b - M5a isolates the per-vkCmdDispatch cost from
+ * the per-vkQueueSubmit cost.
+ *
+ * Build: cmake -DDAEDALUS_BUILD_VULKAN=ON ..
+ * Run:   ./bench_vulkan_dispatch [--iters N] [--spv PATH]
+ *
+ * License: BSD-2-Clause (daedalus-fourier).
+ */
+#define _POSIX_C_SOURCE 200809L
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <getopt.h>
+#include <vulkan/vulkan.h>
+/* Run a Vulkan call; on any result other than VK_SUCCESS, report it and exit(1). */
+#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
+    fprintf(stderr, "vulkan error %d at %s:%d (%s)\n", r__, __FILE__, __LINE__, #call); \
+    exit(1); } } while (0)
+
+static double now_seconds(void)
+{
+    struct timespec now;   /* CLOCK_MONOTONIC_RAW reading, returned in seconds */
+    clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+    return now.tv_sec + now.tv_nsec * 1e-9;
+}
+
+static uint32_t *read_spv(const char *path, size_t *out_size)
+{
+    /* Read a whole SPIR-V file into a malloc'd buffer (caller frees); exits on error. */
+    FILE *f = fopen(path, "rb");
+    if (!f) { perror(path); exit(1); }
+    long sz = fseek(f, 0, SEEK_END) == 0 ? ftell(f) : -1;
+    if (sz <= 0 || (sz & 3) || fseek(f, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz);
+        exit(1);
+    }
+    uint32_t *buf = malloc((size_t) sz);
+    if (!buf || fread(buf, 1, (size_t) sz, f) != (size_t) sz) {
+        fprintf(stderr, "%s: %s\n", path, buf ? "short read" : "out of memory");
+        exit(1);   /* no perror(): a short fread does not reliably set errno */
+    }
+    fclose(f);
+    *out_size = (size_t) sz;   /* byte length; a multiple of 4 per the check above */
+    return buf;
+}
+
+int main(int argc, char **argv)
+{
+    /* M5 driver: times submit+wait of an empty and a 1-dispatch command buffer. */
+    int iters = 100000;
+    const char *spv_path = "noop.spv";
+    static struct option opts[] = {
+        {"iters", required_argument, 0, 'i'},
+        {"spv",   required_argument, 0, 's'},
+        {"help",  no_argument,       0, 'h'},
+        {0,0,0,0}
+    };
+    for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) {
+        switch (c) {
+        case 'i': iters    = atoi(optarg); break;
+        case 's': spv_path = optarg; break;
+        default:   /* --help exits 0; an unknown option gets the same text, exit 2 */
+            fprintf(stderr,
+                "Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]);
+            return c == 'h' ? 0 : 2;
+        }
+    }
+    if (iters < 1) {   /* atoi() gives 0 on garbage; 0 would divide by zero below */
+        fprintf(stderr, "--iters must be a positive integer\n");
+        return 2;
+    }
+    /* ---- Instance ---- */
+    VkApplicationInfo app = {
+        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+        .pApplicationName = "daedalus-fourier-bench",
+        .apiVersion = VK_API_VERSION_1_3,
+    };
+    VkInstanceCreateInfo ici = {
+        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+        .pApplicationInfo = &app,
+    };
+    VkInstance instance;
+    CHK(vkCreateInstance(&ici, NULL, &instance));
+
+    /* ---- Pick V3D physical device (skip llvmpipe) ---- */
+    uint32_t pd_count = 0;
+    CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL));
+    VkPhysicalDevice *pds = malloc(pd_count * sizeof(*pds));
+    CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds));
+    VkPhysicalDevice phys = VK_NULL_HANDLE;
+    VkPhysicalDeviceProperties props = {0};
+    for (uint32_t i = 0; i < pd_count; i++) {
+        vkGetPhysicalDeviceProperties(pds[i], &props);
+        printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n",
+               i, props.deviceName,
+               VK_VERSION_MAJOR(props.apiVersion),
+               VK_VERSION_MINOR(props.apiVersion),
+               VK_VERSION_PATCH(props.apiVersion),
+               props.vendorID);
+        if (strstr(props.deviceName, "V3D") != NULL) {   /* no break: last match wins */
+            phys = pds[i];
+        }
+    }
+    if (phys == VK_NULL_HANDLE) {
+        fprintf(stderr, "no V3D device found; bailing.\n");
+        return 1;
+    }
+    vkGetPhysicalDeviceProperties(phys, &props);   /* props held the last enumerated device */
+    printf("selected: %s\n", props.deviceName);
+    free(pds);
+
+    /* ---- Compute queue family ---- */
+    uint32_t qfc = 0;
+    vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL);
+    VkQueueFamilyProperties *qfp = malloc(qfc * sizeof(*qfp));
+    vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp);
+    uint32_t qfi = (uint32_t) -1;
+    for (uint32_t i = 0; i < qfc; i++) {
+        if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
+            qfi = i; break;
+        }
+    }
+    if (qfi == (uint32_t) -1) {
+        fprintf(stderr, "no compute queue family\n");
+        return 1;
+    }
+    free(qfp);
+
+    /* ---- Logical device ---- */
+    float qprio = 1.0f;
+    VkDeviceQueueCreateInfo dqci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+        .queueFamilyIndex = qfi,
+        .queueCount = 1,
+        .pQueuePriorities = &qprio,
+    };
+    VkDeviceCreateInfo dci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+        .queueCreateInfoCount = 1,
+        .pQueueCreateInfos = &dqci,
+    };
+    VkDevice dev;
+    CHK(vkCreateDevice(phys, &dci, NULL, &dev));
+    VkQueue queue;
+    vkGetDeviceQueue(dev, qfi, 0, &queue);
+
+    /* ---- Command pool + buffers ---- */
+    VkCommandPoolCreateInfo cpci = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+        .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+        .queueFamilyIndex = qfi,
+    };
+    VkCommandPool pool;
+    CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool));
+
+    VkCommandBuffer cb_empty, cb_dispatch;
+    VkCommandBufferAllocateInfo cbai = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .commandPool = pool,
+        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1,
+    };
+    CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty));
+    CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch));
+
+    /* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */
+    VkPipelineLayoutCreateInfo plci = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+    };
+    VkPipelineLayout playout;
+    CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout));
+
+    /* ---- Compute pipeline from noop SPIR-V ---- */
+    size_t spv_size = 0;
+    uint32_t *spv = read_spv(spv_path, &spv_size);
+    VkShaderModuleCreateInfo smci = {
+        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .codeSize = spv_size,
+        .pCode = spv,
+    };
+    VkShaderModule shader;
+    CHK(vkCreateShaderModule(dev, &smci, NULL, &shader));
+    free(spv);
+
+    VkComputePipelineCreateInfo cpci2 = {
+        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+        .stage = {
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+            .module = shader,
+            .pName = "main",
+        },
+        .layout = playout,
+    };
+    VkPipeline pipe;
+    CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe));
+
+    /* ---- Record both command buffers once, reuse for every iteration ---- */
+    VkCommandBufferBeginInfo cbbi = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+    };
+
+    CHK(vkBeginCommandBuffer(cb_empty, &cbbi));
+    CHK(vkEndCommandBuffer(cb_empty));
+
+    CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi));
+    vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe);
+    vkCmdDispatch(cb_dispatch, 1, 1, 1);
+    CHK(vkEndCommandBuffer(cb_dispatch));
+
+    VkSubmitInfo si_empty = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .commandBufferCount = 1, .pCommandBuffers = &cb_empty,
+    };
+    VkSubmitInfo si_disp = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .commandBufferCount = 1, .pCommandBuffers = &cb_dispatch,
+    };
+
+    /* ---- Warm-up ---- */
+    for (int i = 0; i < 100; i++) {
+        CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
+        CHK(vkQueueWaitIdle(queue));
+    }
+
+    /* ---- M5a: empty CB submit+wait ---- */
+    double t0 = now_seconds();
+    for (int i = 0; i < iters; i++) {
+        CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE));
+        CHK(vkQueueWaitIdle(queue));
+    }
+    double t1 = now_seconds();
+    double m5a_per = (t1 - t0) / iters * 1e6;  /* µs */
+
+    /* ---- M5b: 1-WG noop dispatch submit+wait ---- */
+    double t2 = now_seconds();
+    for (int i = 0; i < iters; i++) {
+        CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
+        CHK(vkQueueWaitIdle(queue));
+    }
+    double t3 = now_seconds();
+    double m5b_per = (t3 - t2) / iters * 1e6;  /* µs */
+
+    printf("\n=== M5: Vulkan compute dispatch overhead ===\n");
+    printf("  iters per measurement: %d\n", iters);
+    printf("  M5a empty CB submit+wait:           %.2f µs/op\n", m5a_per);
+    printf("  M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per);
+    printf("  delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n",
+           m5b_per - m5a_per);
+    printf("\n  Implication for kernel batching:\n");
+    printf("    if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n");
+    printf("    a single-block dispatch costs %.0fx more in overhead\n",
+           m5b_per * 1e3 / 100.0);
+    printf("    -> batch at least %.0f blocks per dispatch to break even.\n",
+           m5b_per * 1e3 / 100.0);
+
+    /* ---- Tear down (minimal — process exit handles the rest) ---- */
+    vkDestroyPipeline(dev, pipe, NULL);
+    vkDestroyShaderModule(dev, shader, NULL);
+    vkDestroyPipelineLayout(dev, playout, NULL);
+    vkDestroyCommandPool(dev, pool, NULL);
+    vkDestroyDevice(dev, NULL);
+    vkDestroyInstance(instance, NULL);
+    return 0;
+}
diff --git a/tests/shaders/noop.comp b/tests/shaders/noop.comp
new file mode 100644
index 0000000..c2bc9fa
--- /dev/null
+++ b/tests/shaders/noop.comp
@@ -0,0 +1,5 @@
+#version 450
+// Empty compute shader for measuring Vulkan dispatch overhead (M5).
+// Reads nothing, writes nothing — pure dispatch round-trip floor.
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;  // 64 invocations per workgroup
+void main() {}  // intentionally empty: no loads, stores or barriers
diff --git a/tests/vp9_idct8_ref.c b/tests/vp9_idct8_ref.c
new file mode 100644
index 0000000..f8219df
--- /dev/null
+++ b/tests/vp9_idct8_ref.c
@@ -0,0 +1,114 @@
+/*
+ * Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
+ * transform + add (8-bit pixels), transcribed from the spec
+ * structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
+ * (vendored under external/ffmpeg-snapshot/ at commit f46e514).
+ *
+ * Provided as a self-contained translation unit so the harness
+ * doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
+ * expansion. Cross-checked at runtime against FFmpeg's NEON kernel
+ * (see correctness_check() in tests/bench_neon_idct.c).
+ *
+ * License: LGPL-2.1-or-later (matches the upstream reference).
+ *
+ * Spec source: VP9 specification §8.7 — Inverse transform process.
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+/* Q14 trig constants — VP9 spec table 8.7.1.4. */
+#define COSPI_16_64 11585  /* cos(pi/4)  * 2^14 */
+#define COSPI_24_64  6270  /* cos(3pi/8) * 2^14 */
+#define COSPI_8_64  15137  /* sin(3pi/8) * 2^14 */
+#define COSPI_28_64  3196  /* cos(7pi/16)* 2^14 */
+#define COSPI_4_64  16069  /* sin(7pi/16)* 2^14 */
+#define COSPI_20_64  9102  /* cos(5pi/16)* 2^14 */
+#define COSPI_12_64 13623  /* sin(5pi/16)* 2^14 */
+
+/* Q14 -> integer, rounding to nearest: add 0.5 (8192 = 1 << 13), shift by 14. */
+static inline int32_t qround14(int64_t x)
+{
+    return (int32_t) ((x + INT64_C(8192)) >> 14);
+}
+
+static inline uint8_t clip_u8(int x)   /* saturate to the 8-bit pixel range */
+{
+    return (x & ~0xff) ? (uint8_t) (x < 0 ? 0 : 255) : (uint8_t) x;
+}
+
+/* 1-D 8-point inverse DCT, signed int32 throughout. Matches
+ * idct8_1d in libavcodec/vp9dsp_template.c (with the stride
+ * collapsed to indexed access; identical arithmetic). */
+static void idct8_1d(const int32_t in[8], int32_t out[8])
+{
+    /* Stage 1: rotations. Even inputs (0,4,2,6) feed e0..e3, odd inputs
+     * (1,7,5,3) feed o4..o7; every product is widened to int64 first. */
+    const int32_t e0 = qround14((int64_t)(in[0] + in[4]) * COSPI_16_64);
+    const int32_t e1 = qround14((int64_t)(in[0] - in[4]) * COSPI_16_64);
+    const int32_t e2 = qround14((int64_t)in[2] * COSPI_24_64 - (int64_t)in[6] * COSPI_8_64);
+    const int32_t e3 = qround14((int64_t)in[2] * COSPI_8_64  + (int64_t)in[6] * COSPI_24_64);
+    const int32_t o4 = qround14((int64_t)in[1] * COSPI_28_64 - (int64_t)in[7] * COSPI_4_64);
+    const int32_t o5 = qround14((int64_t)in[5] * COSPI_12_64 - (int64_t)in[3] * COSPI_20_64);
+    const int32_t o6 = qround14((int64_t)in[5] * COSPI_20_64 + (int64_t)in[3] * COSPI_12_64);
+    const int32_t o7 = qround14((int64_t)in[1] * COSPI_4_64  + (int64_t)in[7] * COSPI_28_64);
+    /* Stage 2: even butterflies; odd sums and differences. */
+    const int32_t ev[4] = { e0 + e3, e1 + e2, e1 - e2, e0 - e3 };
+    const int32_t s4 = o4 + o5, d5 = o4 - o5;
+    const int32_t s7 = o7 + o6, d6 = o7 - o6;
+    /* Stage 3: scale the two odd differences by COSPI_16_64. */
+    const int32_t r5 = qround14((int64_t)(d6 - d5) * COSPI_16_64);
+    const int32_t r6 = qround14((int64_t)(d6 + d5) * COSPI_16_64);
+    /* Stage 4: out[k] = even[k] + odd[k], out[7-k] = even[k] - odd[k]. */
+    const int32_t od[4] = { s7, r6, r5, s4 };
+    for (int k = 0; k < 4; k++) {
+        out[k]     = ev[k] + od[k];
+        out[7 - k] = ev[k] - od[k];
+    }
+}
+
+/* Public reference entry point. Signature matches
+ * ff_vp9_idct_idct_8x8_add_neon. After the call, *block is
+ * zeroed (matches FFmpeg behaviour). */
+void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
+                                        int16_t *block, int eob)
+{
+    int32_t inter[64];      /* pass-1 output, stored transposed */
+    int32_t vec_in[8], vec_out[8];
+
+    if (eob == 1) {
+        /* DC-only shortcut: scale block[0] by COSPI_16_64 twice (one Q14
+         * rounding per pass), then add the same (dc + 16) >> 5 to all
+         * 64 pixels. Only block[0] is cleared on this path. */
+        const int32_t pass1 = qround14((int64_t)block[0] * COSPI_16_64);
+        const int32_t dc    = qround14(pass1 * (int64_t) COSPI_16_64);
+        const int32_t delta = (dc + 16) >> 5;
+        block[0] = 0;
+        for (int y = 0; y < 8; y++) {
+            uint8_t *row = dst + y * stride;
+            for (int x = 0; x < 8; x++) row[x] = clip_u8(row[x] + delta);
+        }
+        return;
+    }
+
+    /* Pass 1: 1-D IDCT down each block column c (elements block[r*8 + c]);
+     * the result is stored at inter[c*8 .. c*8+7], i.e. transposed, the
+     * same `tmp + i*8` output base FFmpeg's idct_idct_8x8_add_c uses. */
+    for (int c = 0; c < 8; c++) {
+        for (int r = 0; r < 8; r++) vec_in[r] = block[r * 8 + c];
+        idct8_1d(vec_in, vec_out);
+        memcpy(&inter[c * 8], vec_out, sizeof(vec_out));
+    }
+    memset(block, 0, 64 * sizeof(*block));   /* coefficients are consumed */
+
+    /* Pass 2: transform inter[r*8 + c] over r for each c, round with
+     * (v + 16) >> 5, add to dst column c and saturate to 8 bits. */
+    for (int c = 0; c < 8; c++) {
+        for (int r = 0; r < 8; r++) vec_in[r] = inter[r * 8 + c];
+        idct8_1d(vec_in, vec_out);
+        for (int r = 0; r < 8; r++) {
+            uint8_t *px = &dst[r * stride + c];
+            *px = clip_u8(*px + ((vec_out[r] + 16) >> 5));
+        }
+    }
+}