commit dcbbc77038f3489e8fc7221de5d9c96b62fa2f4a Author: Markus Fritsche Date: Mon May 18 11:30:12 2026 +0000 Path B pivot + Phase 0-3 closed with first baseline numbers This is a from-scratch initial commit on a fresh .git. The original scaffold commit (7510b56) and the earlier session's working-tree docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted .git is preserved at .git-broken-2026-05-18/ (gitignored) for forensic inspection. Scope re-anchored from Path A (custom VPU firmware on VC7 scalar cores; blocked by BCM2712 silicon-RoT mask-ROM signature check) to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or direct DRM, on stock signed Pi 5 / CM5). See README.md and docs/phase0.md for the substrate audit that closed Path A. Phases closed: Phase 0 — substrate audit; Path A blocked, Path B open; codec-back-end-fits-QPU finding (docs/phase0.md) Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with publish-before-measure R = M2/M3 decision rules (docs/phase1.md) Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored under external/ffmpeg-snapshot/ (PROVENANCE.md pins commit f46e514 + per-file SHA-256s) (docs/phase2.md) Phase 3 — real baseline measurements on hertz (docs/phase3.md): M1 bit-exact 100.0000 % (10000/10000) M3 NEON IDCT8 single 8.171 Mblock/s (122.4 ns/block) M5a empty Vulkan submit 22.66 us M5b 1-WG noop dispatch 55.60 us M5 delta 32.95 us/dispatch => per-dispatch overhead is ~455x per-NEON-block cost; Phase 4 must batch at frame level or close to it. Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c, vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} + external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM). Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12, libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S assembles via the config.h shim. 
Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching constraint) -> Phase 5 second-model review -> Phase 6 implement. Co-Authored-By: Claude Opus 4.7 (1M context) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7a6eee8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +build/ +build-*/ +*.o +*.spv +.cache/ +.vscode/ +.idea/ +*.swp +*~ + +# Forensic snapshot of the corrupted .git from 2026-05-18 10:25 +# working-tree wipe. Retained on disk for inspection; not tracked. +.git-broken-2026-05-18/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..a6b5125 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,111 @@ +# daedalus-fourier — Phase 3 baseline + (later) Phase 6 implementation. +# +# Builds: +# bench_neon_idct — NEON throughput baseline (Phase 3 M3) + +# bit-exact correctness gate (Phase 1 M1). +# bench_vulkan_dispatch — Vulkan compute dispatch-overhead baseline (M5). +# +# Linkage note: bench_neon_idct statically links the vendored +# FFmpeg n7.1.3 NEON snapshot (LGPL-2.1+); see +# external/ffmpeg-snapshot/PROVENANCE.md. + +cmake_minimum_required(VERSION 3.20) +project(daedalus-fourier C ASM) + +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED ON) + +# Default to Release, but only for single-config generators: multi-config +# generators (Ninja Multi-Config, Xcode, Visual Studio) ignore +# CMAKE_BUILD_TYPE and pick the configuration at build time. +get_property(DAEDALUS_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if (NOT DAEDALUS_IS_MULTI_CONFIG AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + message(FATAL_ERROR + "daedalus-fourier targets aarch64 (Pi 5 / BCM2712). " + "Cross-compile not yet wired.") +endif() + +add_compile_options(-Wall -Wextra -Wno-unused-parameter) + +# ---- Vendored FFmpeg snapshot (LGPL-2.1+) ----------------------------------- + +set(FFSNAP ${PROJECT_SOURCE_DIR}/external/ffmpeg-snapshot) + +# Assembly preamble (config.h shim + FFmpeg's asm helpers) used by the +# vendored .S file. 
-I flags expose: +# - FFSNAP/ so `#include "config.h"` finds our shim and +# `#include "libavutil/aarch64/asm.S"` resolves +# against the vendored copy +# - FFSNAP/libavcodec/aarch64/ so `#include "neon.S"` finds the helper +set(FFASM_FLAGS + -I${FFSNAP} + -I${FFSNAP}/libavcodec/aarch64 +) + +set(FFASM_SOURCES + ${FFSNAP}/libavcodec/aarch64/vp9itxfm_neon.S +) + +# Tell CMake/gas to preprocess .S sources. +set_source_files_properties(${FFASM_SOURCES} PROPERTIES + COMPILE_OPTIONS "${FFASM_FLAGS}" + LANGUAGE ASM) + +# ---- NEON baseline microbench ---------------------------------------------- + +add_executable(bench_neon_idct + tests/bench_neon_idct.c + tests/vp9_idct8_ref.c + ${FFASM_SOURCES} +) +target_compile_options(bench_neon_idct PRIVATE -O3 -march=armv8-a+simd) +# bench_neon_idct doesn't need vulkan/drm — pure CPU baseline. + +# ---- Vulkan dispatch-overhead microbench ----------------------------------- +# Built by default; requires the Vulkan SDK (find_package) and glslang. +# Pass -DDAEDALUS_BUILD_VULKAN=OFF to build only the NEON CPU baseline. + +option(DAEDALUS_BUILD_VULKAN "Build Vulkan compute-dispatch microbench" ON) + +if (DAEDALUS_BUILD_VULKAN) + find_package(Vulkan REQUIRED) + + # Compile GLSL compute shaders to SPIR-V via glslangValidator. + # The binary loads them at runtime from the build dir (cwd-relative). 
+ find_program(GLSLANG_VALIDATOR + NAMES glslangValidator glslang + REQUIRED) + + set(NOOP_SPV ${CMAKE_CURRENT_BINARY_DIR}/noop.spv) + add_custom_command( + OUTPUT ${NOOP_SPV} + COMMAND ${GLSLANG_VALIDATOR} -V -o ${NOOP_SPV} + ${PROJECT_SOURCE_DIR}/tests/shaders/noop.comp + DEPENDS ${PROJECT_SOURCE_DIR}/tests/shaders/noop.comp + COMMENT "glslang: noop.comp -> noop.spv" + VERBATIM + ) + add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV}) + + add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c) + add_dependencies(bench_vulkan_dispatch daedalus_shaders) + target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan) + target_compile_options(bench_vulkan_dispatch PRIVATE -O2) +endif() + +# ---- Summary ---------------------------------------------------------------- + +message(STATUS "daedalus-fourier build configured for ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS " FFmpeg snapshot: ${FFSNAP}") +message(STATUS " Build type: ${CMAKE_BUILD_TYPE}") +# message() runs at configure time and does not evaluate generator +# expressions, so build the target list with ordinary if() logic. +set(DAEDALUS_TARGET_SUMMARY "bench_neon_idct") +if (DAEDALUS_BUILD_VULKAN) + string(APPEND DAEDALUS_TARGET_SUMMARY "; bench_vulkan_dispatch") +endif() +message(STATUS " Targets: ${DAEDALUS_TARGET_SUMMARY}") diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d9d223 --- /dev/null +++ b/README.md @@ -0,0 +1,177 @@ +# daedalus-fourier + +Community-built VP9 / AV1 software-decode back-end running on the +VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 / +Compute Module 5), via the existing Mesa `v3d` userspace driver. +ARM keeps the serial entropy front-end; the QPU takes the parallel +back-end (inverse transforms, deblocking, CDEF, loop restoration, +MC residual add). + +> Daedalus built the Labyrinth for King Minos, then escaped from it +> by hand-forging flight firmware out of feathers and wax when no +> sanctioned exit existed. 
+ +That's the project shape. The Broadcom-locked VideoCore VII is the +Labyrinth; the Pi Foundation's "use the HEVC block and live with +software decode for everything else" is the official non-exit; +the QPU sits unused inside the labyrinth's walls. 
+ +**Status: Phases 0–3 closed (substrate audit through first baseline +numbers on hertz). Phase 4 (planning the first QPU IDCT kernel) is +next.** This is research-track work that may take months or may yield +a single proof-of-concept kernel that loses to ARM NEON, in which +case the negative result ships and the project closes. + +## Why this exists + +higgs is a Raspberry Pi Compute Module 5 in a small portable +chassis with a battery. Watching nerds review *Star Wars* on YouTube +while putting Mac Studios into virtual shopping baskets is a +core workload for the higgs class of device. + +YouTube serves H.264 (legacy), VP9 (typical 4K), and AV1 (newer +high-bitrate / high-resolution content). It does not serve HEVC. +Pi 5's BCM2712 has one HW decoder block: HEVC. The intersection +of {what YouTube serves} ∩ {what BCM2712 decodes in HW} = ∅. + +Every YouTube frame on higgs today is software-decoded on Cortex-A76 +cores at ~50–90% CPU per video stream. Offloading the parallel +back-end of that decode to the otherwise-idle QPU complex *might* +recover meaningful CPU time and battery on higgs. The honest +prior — collected in Phase 0 from community measurements — is that the +QPU has roughly equal raw compute to the A76 cluster but a smaller +slice of the shared LPDDR4x bandwidth, so the win, if any, comes +from offloading *concurrent* work the CPU would have done anyway. + +The Pi Foundation isn't going to do this work (per their own +statement: chromium-patch sustainment was too much; codec +sustainment would be more so). The kernel `rpi-hevc-dec` series has +been 17 months in review for one decoder block they DID write +themselves. Whatever ships here ships through the community. + +## Architecture (Path B) + +Phase 0 evaluated two paths: + +- **Path A — custom VPU firmware on the VC7 scalar cores.** + Blocked. BCM2712 has a silicon root of trust: the mask ROM + hardcodes RPi's public key and unconditionally verifies the + second-stage bootloader. `EXECUTE_CODE` mailbox removed on Pi 5. + No software-only bypass exists. 
See `docs/phase0.md §3`. + +- **Path B — QPU compute kernels via the existing Mesa `v3d` / + DRM / Vulkan-compute path.** This is the path. The QPU is + reachable from userspace today on a stock signed Pi 5 / CM5 + via `/dev/dri/card0`. No firmware loading. No signing fight. + `Idein/py-videocore7` (SGEMM 21 GFLOPS sustained) is the + existence proof. + +The build: + +``` +┌───────────────────────────────┐ +│ userspace VP9 / AV1 decoder │ +│ (fork of dav1d / libvpx) │ +├───────────────────────────────┤ +│ ARM: entropy decode │ ← Cortex-A76 + NEON +│ (Bool coder / ANS) │ structurally serial +├───────────────────────────────┤ +│ QPU: parallel back-end │ ← V3D 7.1 via Mesa v3dv +│ (IDCT, CDEF, │ Vulkan compute shaders +│ deblock, LR, MC) │ or direct DRM submit +├───────────────────────────────┤ +│ V4L2 stateless wrapper │ ← out-of-tree kernel module +│ (eventual, kernel-agent) │ exposing /dev/videoN +└───────────────────────────────┘ +``` + +The first deliverable is *not* the V4L2 wrapper. The first +deliverable is one back-end kernel running on the QPU, bit-exact +against a libavcodec reference, with measured throughput. If that +single kernel can't beat NEON or get within 50% of it, the project +closes here with a documented negative result. + +## In scope + +- A small set of codec back-end kernels (IDCT 8×8, CDEF, deblocking, + loop restoration filter, MC interpolation) compiled as SPIR-V + compute shaders for Mesa `v3dv`, dispatched via Vulkan compute + from userspace. +- A test harness on hertz that runs each kernel against libavcodec + reference outputs and measures throughput (megapixels/sec or + blocks/sec) against the equivalent NEON path. +- Phase 1 = one kernel, bit-exact, with numbers. Phase 2+ = more + kernels only if Phase 1 numbers justify it. + +## Out of scope (for now) + +- HEVC (Pi 5 has dedicated silicon; `rpi-hevc-dec` covers it). +- Pi 4 / BCM2711 / VideoCore VI. Different ISA, smaller compute + budget. 
Path B *could* extend but isn't the priority. +- Encode. Pi Foundation removed all HW encode in Pi 5; encode on + VC7 is a separate, larger project. +- Custom VPU firmware (Path A — blocked by silicon RoT, see + `docs/phase0.md`). +- V4L2 stateless driver wrapping the userspace decoder. Eventual + consumption point, but Phase 1 lives entirely in userspace. +- Beating ARM NEON unconditionally. The honest target is + *concurrent* work: QPU runs while CPU does something else. + +## Dev substrate + +- **hertz** (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75+rpt-rpi-2712, + Mesa 25.0.7 with v3dv, V3D 7.1.7) — the dev / test / measurement + host. Watchdog-protected for crash recovery. See + `docs/vulkaninfo_v3d_7_1_7_hertz.txt` for the inside-view device + profile. +- **higgs** (CM5 in portable battery chassis) — the eventual user + target. Not a dev unit; sealed chassis. + +## Conventions + +This project follows the 9(+1)-phase dev process. See +`docs/dev_process.md`. Phase 0 is closed (`docs/phase0.md`); +Phase 1 is `docs/phase1.md`. + +Gitea identity: `claude-noether` (per +`feedback_gitea_as_claude_noether.md`). No `marfrit` pushes from +Claude sessions. + +## Layout + +``` +daedalus-fourier/ +├── README.md ← this file +├── docs/ +│ ├── dev_process.md ← reference copy of the 9(+1)-phase loop +│ ├── phase0.md ← substrate audit (closes Path A, opens Path B) +│ ├── phase1.md ← first-kernel goal + measurement plan +│ └── vulkaninfo_v3d_7_1_7_hertz.txt +│ ← inside-view device profile from hertz +├── src/ ← kernels + Vulkan dispatch harness +└── tests/ ← bit-exact vs libavcodec, throughput +``` + +Build system: CMake — see `CMakeLists.txt` for the benchmark targets. + +## Sibling projects in the same orbit + +- `libva-v4l2-request-fourier` — VA-API consumer-side backend. + Eventual consumer if daedalus produces a V4L2 stateless node. +- `firefox-fourier` — Firefox fork that routes stateless V4L2 + through libavcodec's `v4l2_request` hwaccel. Same pickup point. 
+- `chromium-fourier` — sibling for Chromium. +- `kernel-agent` — would house the V4L2 driver wrapping the + userspace decoder, once one exists. +- `ampere-av1-enablement` — software-side AV1 bring-up on RK3588 + (rkvdec / vpu981). Provides the userspace conformance harness + daedalus reuses for VC7-AV1 verification. + +## Source attribution + +Daedalus-the-myth is public domain. The wax-and-feathers +metaphor is older than software engineering. + +Anyone wanting to fail at this project: please file your failures +under `branches/icarus/`. Built-in self-deprecation slot, with +honor. diff --git a/docs/dev_process.md b/docs/dev_process.md new file mode 100644 index 0000000..ae9715c --- /dev/null +++ b/docs/dev_process.md @@ -0,0 +1,96 @@ +--- +name: Claude-Assisted Development Process (9(+1)-phase loop) +description: Default workflow for any non-trivial implementation — substrate/motivation/inventory, formulate, analyze, baseline, plan, second-model review, implement, verify, closing (package+ship), memory-update; with explicit loopback edges +type: feedback +originSessionId: 83898ac9-e61f-4c44-8429-0154cb12d124 +--- +Markus's standardized loop for our implementation work. Apply by default whenever a task is bigger than a one-liner. Skipping phases is a deliberate choice that should be flagged, not a default. + +## Phase 0 — Substrate / Motivation / Inventory + +Pre-formulation. Lock the research question and assemble the substrate *before* Phase 1 commits to a measurable goal. Output: a `phase0_findings.md` artifact that future phases can refer back to without re-deriving. + +- **Research question + mechanism captured.** State the question in one sentence. Capture any operator-supplied mechanism (the "why this question, how does it work" insight) verbatim — it's the load-bearing claim Phase 1 binds against. +- **Predecessor carry-over: state vs data.** When a campaign succeeds another, categorize what transfers. 
*State* (installed packages, governor settings, system tweaks, source-read file:line pointers, protocol designs, parser scripts) carries forward. *Data* (drop counts, perf percentages, threshold values, baseline floors) does not — it is reference history only. Binding cells in this campaign anchor to in-session-acquired numbers, even if the predecessor measured an identical condition. +- **Tooling and measurement-instrument inventory.** What's installed, what would need installing, what extensions/protocols the live system actually supports. Live verification, not paper compatibility. +- **In-session baseline anchor.** Re-run the reference rep — N=3 minimum if the baseline is load-bearing for the campaign's premise — *before* any instrument changes. **If the predecessor's reference floor doesn't replicate at N=3 in the same session, that is the campaign result.** Don't build multi-phase infrastructure on an N=1 historical floor. See `feedback_replicate_baseline_first.md`. +- **Open questions tabled.** What's not known going into Phase 1. Phase 1 locks against the knowns; Phase 0 surfaces the unknowns explicitly so they don't slip into binding cells unverified. + +## Phase 1 — Goal Formulation +Define the objective in measurable terms. State what success looks like *before* touching anything. The chosen metric is a **hypothesis** about what to measure, not an axiom — Phase 3 may invalidate it. + +## Phase 2 — Situation Analysis +Document current state. Identify constraints, dependencies, known failure modes. **Reset context here** — do not carry assumptions from prior sessions; re-read CLAUDE.md, relevant memory files, run `git status`, re-verify reachability. + +## Phase 3 — Baseline Measurements +Take concrete measurements *before* any changes. Paste raw output into DokuWiki at capture time — verbatim, not paraphrased. The Phase 5 artifact is the raw data, not Claude's summary. 
+ +**Real data, not theatre.** Phase 3 exists to use AI capacity for absorbing wide, low-level instrumentation a human reader would skim past. Attaching strace / perf / ftrace / eBPF / custom tripwires to the process under test is real Phase 3; scraping mpv's stdout dropped-frame counter is not. Discriminator: if a human with bash and grep could produce the same baseline, it isn't Phase 3 yet — go down to the syscall / call-path / MMIO / register layer. See `feedback_phase3_no_theatre.md`. + +**Anti-fabrication:** +- Every cited value traces to a visible tool invocation or verbatim paste-in. If a measurement wasn't taken, write "not measured" — never an estimate, inference, or recall from training / prior sessions / sibling-host memory. +- Raw before derived. A derived number (FPS, p99, error rate) appears alongside the raw stream it came from, never alone. +- Rig failure is the finding. Empty strace, dead UART, perf counter that didn't increment → that *is* the Phase 3 result. Loop back to Phase 2 to fix the rig; do not synthesize plausible-looking baseline data to keep momentum. + +- **If baseline reveals the Phase 1 metric was tracking the wrong thing → loop back to Phase 1** with the corrected target. (Example: "max H.264 FPS" Phase 1 metric, but baseline shows DMA-setup + sync overhead dwarfs decode → real metric is bytes-copied-per-second / EGL surface-import time, not FPS.) + +**Measurements describe what the system *does*, not what it *should do*.** Baseline data is evidence, not a specification. Do NOT derive API call sequences, struct layouts, or parameter values from observed behaviour (strace, perf, example output). Observable behaviour may reflect bugs, workarounds, or implementation accidents — anything you copy from it inherits those. + +## Phase 4 — Plan +Formulate the approach. Identify what will and will not be touched. State expected outcome of implementation in the *same* measurable terms used in Phase 1/3. 
+ +## Phase 5 — Second Model Review +Goal, situation, measurements, plan get pasted into **DokuWiki**. Markus reviews and redacts, then initiates the handover to a fresh model instance. **Claude does not curate the artifact going to the reviewer** — that would re-introduce the blind-spot accumulation the review is meant to escape. Do not summarize when handing over; paste the actual artifacts. + +## Phase 6 — Implementation +Execute the plan. Scope strictly to what was planned — resist feature creep, refactor-creep, "while I'm here" cleanups, and over-eager scope expansion. If a plan revision is needed mid-implementation, surface it explicitly and re-enter Phase 4. + +**Contract before code.** Before writing or modifying any call site: +- Read the API contract — kernel docs, header comments, and upstream source for every call touched. +- State the contract explicitly before implementing against it (in the plan, the commit message, or a comment — somewhere reviewable). +- If the contract cannot be found: stop and surface the gap. Don't infer it from baseline behaviour or sibling code. + +**Copying from baseline measurements is not implementation. It is transcription of potentially broken behaviour.** A deliverable that matches baseline bytes but violates the API contract is not a deliverable — it is a deferred bug. + +### What "state the contract explicitly" looks like + +Worked example: `0012-h264-omit-scaling-matrix-frame-based.patch` in `~/src/ohm_gl_fix/phase6/step1/`. The commit message opens with the contract before any code: + +> VAAPI signals "explicit scaling lists are present in the bitstream" implicitly: the consumer (ffmpeg-vaapi, mpv, etc.) sends a `VAIQMatrixBufferH264` alongside `RenderPicture` iff `sps_scaling_matrix_present_flag || pps_scaling_matrix_present_flag`. When the bitstream uses default (flat) scaling, no IQMatrixBuffer arrives […] +> +> Earlier draft of this patch unconditionally omitted SCALING_MATRIX in FRAME_BASED. 
That's **corpus-correct** (bbb has no explicit scaling lists) but the **wrong predicate**: the kernel-side gating is by "matrix-supplied vs. not," not by decode mode. […] +> +> Contract verification (audit_0008_decode_params_2026-05-01.md + hantro_h264.c::assemble_scaling_list): the kernel uses the supplied matrix when SCALING_MATRIX is in the control batch and falls back to spec-defined defaults when absent. Mode-independent. + +What this gets right: +- **Contract first**: per-control rules cited from kernel doc (`ext-ctrls-codec-stateless.rst:752`), kernel driver (`hantro_h264.c::assemble_scaling_list`), and sibling implementation (gst-plugins-bad commit 9e3e775) — *before* any patch hunks. +- **Corpus-correct ≠ spec-correct, called out by name**: the rejected predicate ("omit SCALING_MATRIX in FRAME_BASED") *did* match the BBB baseline. It still got rejected, because the contract said the gate is "matrix-supplied vs. not," not "decode mode." This is exactly the Phase 3-derived-implementation trap. +- **Then** the diff implements one branch per contract clause: SPS/PPS/DECODE_PARAMS always, SCALING_MATRIX iff `matrix_set`, SLICE_PARAMS iff SLICE_BASED, PRED_WEIGHTS iff SLICE_BASED + `V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED`. + +Mirror format anywhere reviewable: PR description, commit message body, plan section, or a header comment block. The shape is "contract clauses with citations → code that maps 1:1 to those clauses." + +## Phase 7 — Verification Measurements +Repeat measurements from Phase 3. Compare explicitly against baseline. +- **If the delta does not match Phase 4's prediction → loop back to Phase 4** (re-plan). Do not declare success when the numbers say otherwise; an unexplained delta is a finding, not a footnote. + +## Phase 8 — Closing (Package & Ship) +Ship the deliverable to its consumption point. 
Working code that lives only in a checkout is half a deliverable — the next session has to re-discover it, the fleet doesn't get the fix, and the loop's value evaporates. + +- **Kernel patch → kernel-agent package.** Route through the kernel-agent flow (`fleet/<host>.yaml` + scope-tagged patches) so the kernel package gets properly built, signed, and published. Don't leave loose `.patch` files in a working tree. See `project_kernel_agent.md` for the manifest shape; `linux-ampere-fourier` and `linux-fresnel-fourier` are the canonical examples. +- **Program / library change → marfrit-packages.** Add or update a PKGBUILD (Arch/ALARM) or debian/ tree (deb), push to `git.reauktion.de/marfrit/marfrit-packages`, and let `.gitea/workflows/build.yml` produce + sign + publish to `packages.reauktion.de`. See `project_marfrit_packages.md`. Local-only fixes go upstream as PR-quality diffs into the same overlay. +- **Skipping is a deliberate choice.** If the change is one-shot scratch work (debugging tripwire, throw-away script), say so explicitly in the closing note. The default is: it gets packaged. +- **Re-verify on the deploy host with the packaged artifact.** A clean Phase 7 result from a hand-rolled dev build (e.g. `meson -Dbuildtype=release && ninja`) is **not** the same as the `.pkg.tar.zst` / `.deb` that the deploy host installs. Distro packaging flags (Arch makepkg's `-O2 + FORTIFY + stack-protector-strong + stack-clash-protection` vs meson's `-O3 -DNDEBUG`, debhelper's hardening defaults, lto toggles) vectorise / unroll loops differently and routinely unmask latent UB the dev build folded away. Pull the published package down via the package manager and re-run the Phase 7 success criterion against it before closing — until that PASSes, the loop is not done. See `feedback_package_build_flags_unmask_bugs.md` for the iter39 incident that codified this. + +## Phase 9 — Memory Update +Loop terminates here. 
Distill the lesson into a memory entry — what was the mistake the loop caught, what's the rule that would shorten the next cycle. Do not let the lesson rot in chat history. + +--- + +## Loopback edges (summary) +- Phase 3 → Phase 1 (metric was wrong) +- Phase 7 → Phase 4 (plan didn't deliver predicted delta) +- Any phase → Phase 0 (substrate was wrong: predecessor baseline didn't replicate, mechanism doesn't engage on this stack, or the data inverts the premise → re-anchor or honest close) +- Phase 9 closes the loop + +## Why this exists +Several recurring failures in prior work codify into individual rules — observer-first, simulate-before-flash, three-strikes-then-verify, "trust eyes not vibes," scope-strictly-to-plan, no-fake-dry-run. Those are all symptoms; this loop is the structural fix. Use it as the spine and let those rules show up as rejection patterns inside the appropriate phases. diff --git a/docs/phase0.md b/docs/phase0.md new file mode 100644 index 0000000..8dbc370 --- /dev/null +++ b/docs/phase0.md @@ -0,0 +1,239 @@ +--- +phase: 0 +status: closed 2026-05-18 +date_opened: 2026-05-17 +date_closed: 2026-05-18 +research_method: three rounds of parallel web research (Sonnet via Agent), plus hands-on hertz substrate inventory and live `vulkaninfo` capture +target_hardware: hertz (Pi 5 8 GB) for dev; higgs (CM5) eventual user target +--- + +# Phase 0 — Substrate / motivation / inventory + +This is the consolidated Phase 0 record. Path A (custom VPU firmware) +is **closed at the silicon-RoT step**; Path B (QPU compute via the +existing Mesa `v3d` driver) is **open**. The remainder of the +project lives in Path B. + +The earlier session produced two separate Phase 0 artifacts that +were lost when the working tree was wiped at 2026-05-18 10:25 +(`.git-broken-2026-05-18/` retains the corrupted state if needed). +This document supersedes both. + +--- + +## 1. 
Research question + +Verbatim from `README.md`: + +> Community-built VP9 / AV1 software-decode back-end running on the +> VideoCore VII (V3D 7.1) QPUs on Broadcom BCM2712 (Raspberry Pi 5 / +> Compute Module 5), via the existing Mesa `v3d` userspace driver. + +The load-bearing claim: *the QPU is programmable by us, on stock +production hardware, and the codec back-end is a workload class +where that programmability buys CPU time on the A76 cluster.* +Phase 0's job is to test that claim before Phase 1 binds a metric. + +## 2. Substrate inventory — hertz + +Captured live 2026-05-17 via SSH. Full `vulkaninfo` in +`vulkaninfo_v3d_7_1_7_hertz.txt`. + +| | | +|---|---| +| Host | hertz, Pi 5, 8 GB, eMMC + 1 TB SATA | +| Role | LXD host for 11 containers (home-LAN spine — DNS / VPN / HA proxy / NCP / SMTP) | +| OS | Debian 13 Trixie | +| Kernel | `6.12.75+rpt-rpi-2712` (RPi Foundation kernel, 2026-03-11) | +| CPU | 4× Cortex-A76 @ 2.8 GHz | +| GPU clock | V3D 7.1 @ 1000 MHz (slight OC; spec 960 MHz) | +| Mesa | `25.0.7-2+rpt4` (`libvulkan_broadcom.so` v3dv ICD) | +| Vulkan loader | `1.4.309` | +| Vulkan device API | 1.3.305 (conformance 1.3.8.3) | +| DRM nodes | `card0 → v3d` (compute target), `card1 → vc4-drm` (display), `renderD128` | +| kernel uAPI hdr | `/usr/include/drm/v3d_drm.h` present | +| Build tools | cmake 3.31, ninja 1.12, libvulkan-dev 1.4.309, glslang-tools 15.1.0, spirv-tools 2025.1, libdrm-dev 2.4.131 (installed 2026-05-17) | +| User groups | mfritsche ∈ `render`, `video`, `lxd`, `sudo` | +| Memory pressure | 7.9 GiB RAM, ~3 GiB available; 6 GiB zram, ~2.8 GiB in use (cohabitation with LXD spine) | +| Watchdog | yes — power-cut reboot via Himbeere plug if hertz crashes (acknowledged dev cost: household DNS/VPN drops during each reboot cycle) | + +**Inside-view V3D 7.1 compute envelope** (from +`vulkaninfo_v3d_7_1_7_hertz.txt`): + +| Property | Value | Implication | +|---|---|---| +| `maxStorageBufferRange` | 1 GiB | Bounds single-tensor size; codec working 
sets (frames, planes) fit trivially | +| `maxPerStageDescriptorStorageBuffers` | 8 | Forces ≤8 SSBO bindings per dispatch — ggml-vulkan binds more, doesn't fit | +| `maxComputeSharedMemorySize` | 16 KiB | Small tiled kernels only; codec block work (8×8, 16×16) fits easily | +| `maxComputeWorkGroupInvocations` | 256 | Standard | +| `maxComputeWorkGroupSize` | 256 / 256 / ? | Standard | +| `subgroupSize` | 16 (fixed) | Matches QPU SIMD width | +| `subgroupSupportedOperations` | BASIC + VOTE only | No arithmetic reductions — accumulate via shared memory | +| `shaderFloat16` | **false** | Storage only; arithmetic runs FP32 | +| `shaderInt8` | **false** | Storage only; arithmetic on widened ints | +| `shaderInt16` | **false** | Same | +| `storageBuffer8/16BitAccess` | true | Can load tightly-packed quantized / packed pixel data | +| `subgroupSizeControl`, `computeFullSubgroups`, `synchronization2` | true | Modern compute features available | + +**Throughput envelopes** (from prior community measurements, +not yet re-confirmed in-session): + +| Metric | Value | Source | +|---|---|---| +| V3D 7.1 theoretical FP32 peak | ~92 GFLOPS at 960 MHz | 12 QPU × 4 ALU × 2 op/cycle | +| Direct-DRM SGEMM sustained | 21.4 GFLOPS (~23%) | `Idein/py-videocore7` | +| Vulkan-compute `vkpeak` fp32-vec4 | 6.9 GFLOPS (~7.5%) | RPi forum benchmark thread | +| A76 NEON sustained for matmul | ~50 GFLOPS | Multiple benchmark sources | +| Shared LPDDR4x bus | ~17 GB/s nominal | LPDDR4x-4267 × 32 bit / 8 | +| GPU-measured BW share | 4–7 GB/s | py-videocore7 scopy benchmark | +| CPU NEON BW achievable | 12–15 GB/s | Pi 5 STREAM benchmarks | + +## 3. Path A — closed + +**Custom VPU firmware loaded onto VC7 scalar cores.** This was the +README's original framing. + +Blocked at the silicon-RoT step: + +- **BCM2712 mask ROM hardcodes RPi's public key** and unconditionally + verifies the second-stage bootloader (`bootsys`) on every boot + path (SPI flash, USB rpiboot, SD recovery). 
RPi holds the + corresponding private key. +- `EXECUTE_CODE` mailbox tag (the only documented Pi 1–4 runtime + "run code on a VPU core" mechanism) **confirmed removed on Pi 5** + by Pi Foundation engineer (forum.raspberrypi.com). +- Pre-CRA EEPROM downgrade is possible (no anti-rollback fuse) but + only yields *older RPi-signed* EEPROMs — doesn't help. +- OTP fuse state on stock CM5 is already the most permissive + possible (customer key hash = zero); the RPi-key check is + silicon-unconditional, not gated by OTP. +- CM5 vs retail Pi 5: same silicon, same chain, no meaningful + security delta. +- One non-software escape exists: VPU JTAG via documented test + points (`schlae/cm5-reveng`, Dec 2025). Hardware mod only, + sealed-chassis higgs not the dev unit, novel research with no + published firmware-injection workflow. Out of scope for this + project. + +Verdict: **structurally blocked for community use without RPi +cooperation or hardware-RE-grade work on a sacrificial CM5.** + +## 4. Path B — open + +**QPU compute kernels via the existing Mesa `v3d` driver.** Reachable +from userspace today on a stock signed Pi 5 / CM5 via +`/dev/dri/card0` (Vulkan compute through `v3dv`) or `renderD128` +(direct DRM submit, py-videocore7 style). No firmware loading. +No signing fight. mfritsche on hertz is in the `render` group and +can hit the device without sudo. + +The substrate is real: +- `Idein/py-videocore7` runs SGEMM at 21 GFLOPS sustained on stock + Pi 5 with no special setup — existence proof of arbitrary QPU + programs. +- Mesa v3dv is Vulkan 1.3-conformant on V3D 7.1 (Mesa 24.3+; + hertz runs 25.0.7). +- The kernel `v3d` DRM driver is fully upstream and open. + +Phase 0 does **not** assume Path B leads to a winning result. It +asserts only that Path B is *reachable*, where Path A isn't. + +## 5. Why this isn't the same project as "v3d backend for llama.cpp" + +A llama.cpp v3d backend was investigated mid-session and rejected +as structurally infeasible. 
The verdict was decisive: GPU loses +to CPU on raw FP32 (21 vs ~50 GFLOPS), on memory bandwidth share +(4–7 vs 12–15 GB/s), and on quantized instruction support (no +INT8 MAC vs A76 SDOT/UDOT). For LLM matmul, the QPU is the wrong +substrate. + +**Codec back-end work is a different workload class** with +properties that fit the QPU substantively better: + +| Property | LLM matmul | Codec back-end (post-entropy) | +|---|---|---| +| Working set per dispatch | Whole weight matrices (GB) | Per-block (8×8 / 16×16, hundreds of bytes) — fits in 16 KiB shared mem | +| Dominant op | INT8 MAC | Integer add / shift / small-constant multiply | +| Why GPU misses | No INT8 MAC | Less impact — fewer multiplies, mostly add/shift | +| Memory pattern | Full-tensor stream | Sequential plane reads, TMU-friendly | +| Parallelism | One big GEMM | Thousands of independent small blocks per frame | +| A76 advantage | NEON SDOT/UDOT crushing it | Less specialized; QPU advantage real | +| Bandwidth-bound? | Yes (kills the GPU) | Compute-bound at block scale | + +This is the load-bearing reframe between the failed llama.cpp +investigation and the daedalus-fourier scope. Codec back-end +*might* live on the QPU. Phase 1 measures whether it actually does. + +## 6. Honest probability assessment + +A competent outside reviewer should rate the project as **hard but +viable**, with one concrete prior precedent (MulticoreWare / +Imagination PowerVR OpenCL VP9 decoder, 2014, achieved 1080p30 in +a hybrid model with CPU entropy + GPU back-end on a comparable +embedded GPU) and one concrete recent failure (FFmpeg 8.0 VP9-on- +Vulkan-compute, 2025, produced corrupted output on a much more +capable NVIDIA target — but the failure was in the *attempt to +move entropy onto GPU*, not the back-end). + +The win condition is **not** "GPU beats CPU at the same work." 
The +win condition is **"GPU work overlaps with CPU work that has to +happen anyway"** — concurrent decode where ARM does entropy and +the QPU finishes the block-level back-end on the previous frame, +recovering CPU time for the rest of the system (browser, audio, +UI, the 11 LXD containers on hertz). + +Phase 1 measures the building block: one kernel, bit-exact, with +numbers. Phase 2+ only if Phase 1 numbers justify it. + +## 7. Open questions for Phase 1 + +1. **What's the actual single-kernel QPU throughput on a + codec-shaped workload?** SGEMM at 21 GFLOPS is the only public + number, and SGEMM is not block-IDCT-shaped. We need an in-session + N=3 measurement on a real codec kernel. + +2. **What's the ARM NEON baseline for the same kernel on the same + hertz?** libavcodec ships highly-tuned NEON paths for IDCT, + deblocking, etc. Without measuring NEON in-session, "the QPU + wins" or "the QPU loses" is unverifiable. + +3. **Vulkan compute vs direct DRM submit — which path?** Vulkan + has tooling, documentation, debuggability. Direct DRM has + ~10–15% lower per-dispatch overhead and bypasses the + v3dv-imposed 16 KiB shared-mem / 8-SSBO limits, at the cost + of writing QPU asm against the NDA ISA. Phase 1 picks one. + +4. **Memory bandwidth contention with concurrent ARM decode.** + The shared 17 GB/s bus is the ceiling. If the QPU and ARM NEON + collide for bandwidth while running concurrently, the "concurrent + work" win disappears. Needs in-session measurement once any kernel exists. + +5. **VC7 thermal headroom under sustained mixed CPU+GPU load.** + Pi 5 throttles GPU at 85°C, CPU at 80°C. hertz idles at ~64°C + with the LXD spine; mixed compute will push higher. With or + without active cooling on hertz is an open question. + +These are Phase 1's burden, not Phase 0's. Phase 0 closes here. + +## 8. Sources + +Earlier session's web research produced ~7000 words of substrate +references across 6 parallel threads.
The full source list lived +in the deleted `phase0_findings.md` and `phase0_wall1_bypass.md`. +The high-value pointers that should follow this project forward: + +- [Mesa `src/broadcom/qpu/qpu_instr.h`](https://github.com/Mesa3D/mesa/blob/main/src/broadcom/qpu/qpu_instr.h) — de-facto VC7 QPU ISA reference (no Broadcom-published doc; ISA under NDA) +- [Mesa `src/broadcom/compiler/`](https://github.com/Mesa3D/mesa/tree/main/src/broadcom/compiler) — NIR→QPU compiler, the open ground truth for what V3D 7.1 can do +- [`Idein/py-videocore7`](https://github.com/Idein/py-videocore7) — working QPU GPGPU runtime via DRM; SGEMM benchmark; existence proof +- [`Towdo/py-videocore7`](https://github.com/Towdo/py-videocore7) — fork with more fixes +- [Mesa `v3dv` driver source](https://gitlab.freedesktop.org/mesa/mesa/-/tree/main/src/broadcom/vulkan) — Vulkan compute path +- [Pi 5 HEVC kernel driver patch series](https://patchwork.kernel.org) — closest architectural template for ARM-side V4L2 stateless wrapping a Pi-5 hardware accelerator (search "rpi-hevc-dec") +- [raspberrypi/usbboot secure-boot.md](https://github.com/raspberrypi/usbboot/blob/master/docs/secure-boot.md) — Wall 1 silicon-RoT confirmation +- [schlae/cm5-reveng](https://github.com/schlae/cm5-reveng) — CM5 PCB RE; VPU JTAG test points (Dec 2025; out of Path B scope, kept as escape hatch reference) +- [MulticoreWare / Imagination PowerVR VP9 OpenCL decoder press](https://www.design-reuse.com/news/34030/vp9-decoder-imagination-powervr-series6-gpus.html) — 2014 precedent for hybrid codec back-end on embedded GPU compute +- [FFmpeg 8.0 part-3 VP9 Vulkan failure post](https://www.rendi.dev/blog/ffmpeg-8-0-part-3-failed-attempts-to-use-vulkan-for-av1-encoding-vp9-decoding) — recent cautionary tale; failure was in entropy stage, not back-end +- [`Halide/Halide` Vulkan Pi 5 issue #8494](https://github.com/halide/Halide/issues/8494) — known runtime edge cases on Pi 5 Vulkan +- [Pi Forum 
p=2330030](https://forums.raspberrypi.com/viewtopic.php?p=2330030) — RPi engineer confirms VC7 ISA NDA + EU CRA signing rationale + +Future phases should add citations here as they're consumed, not +re-derive Phase 0's substrate findings. diff --git a/docs/phase1.md b/docs/phase1.md new file mode 100644 index 0000000..0032fe3 --- /dev/null +++ b/docs/phase1.md @@ -0,0 +1,128 @@ +--- +phase: 1 +status: open +date_opened: 2026-05-18 +parent: phase0.md +target_kernel: VP9 / AV1 8×8 inverse DCT (integer fixed-point) +dev_host: hertz +--- + +# Phase 1 — Goal formulation + +Per `dev_process.md`: + +> Define the objective in measurable terms. State what success looks +> like *before* touching anything. The chosen metric is a **hypothesis** +> about what to measure, not an axiom — Phase 3 may invalidate it. + +## Kernel under test + +**VP9 / AV1 8×8 inverse DCT (DCT_DCT variant), integer 16-bit +fixed-point input, 8-bit output, with reconstructed-block add.** + +Mirrors the `ff_vp9_idct_idct_8x8_add_neon` shape in libavcodec +(see `libavcodec/aarch64/vp9itxfm_neon.S`) and the equivalent +dav1d / rav1d / libgav1 implementations for AV1's `IDTX_DCT` / +`DCT_DCT` 8×8 path. + +I/O contract (per VP9 spec § 8.7 inverse transform process): + +``` +input: int16_t coeffs[64] // dequantized transform coefficients +input: uint8_t pred[64] // predicted block (intra/inter) +input: ptrdiff_t stride // typically 8 for an isolated test +output: uint8_t dst[64] // clamp(pred + idct(coeffs)) per pixel +``` + +Bit-exact: integer arithmetic per spec, no rounding ambiguity. + +## Measurable success criteria + +Three numbers must come out of Phase 7, all measured in-session on +hertz, all N≥3: + +| ID | Measurement | What it tells us | +|---|---|---| +| **M1** | **Bit-exactness rate** vs libavcodec C reference, across ≥10 000 random coefficient blocks | Correctness gate. Must be 100.000 %. Anything less and the kernel is wrong, no other number matters. 
| +| **M2** | **QPU throughput** in million-blocks-per-second (MblockS), single-threaded host driver, sustained over ≥1 s | The substrate's actual delivered capacity for this kernel shape. | +| **M3** | **NEON throughput** in MblockS on the same hertz, single-threaded, running `ff_vp9_idct_idct_8x8_add_neon` via a microbench harness | The floor any GPU offload has to beat or get close to. | + +Derived figure for go/no-go: **R = M2 / M3**. + +## Decision rules (set before measuring, per `feedback_no_motivated_reasoning`) + +| R | Interpretation | Next step | +|---|---|---| +| ≥ 1.0 | QPU beats NEON on this kernel in isolation. Strong substrate signal. | Phase 9 lessons → Phase 1 of next kernel (deblocking or CDEF). | +| 0.5 ≤ R < 1.0 | QPU loses in isolation but is in the same order of magnitude. *Concurrent-work* hypothesis becomes viable: at R≈0.5 the QPU can roughly handle half of decode while the CPU does the other half + everything else. | Add a Phase 1' measurement: M4 = combined CPU+QPU throughput when both run concurrently (does total system delivery exceed pure-CPU?). Then decide. | +| 0.1 ≤ R < 0.5 | QPU is materially slower. Concurrent-work win unlikely to be worth the integration cost. | Honest close. Phase 9 documents the negative result. | +| < 0.1 | QPU is structurally wrong for this kernel shape. | Honest close. Phase 9 documents the failure, project shelves. | + +These thresholds are deliberately published *before* measurement so +the result can't be retroactively reframed. + +## Secondary measurements (not gating, but recorded) + +- **M5** — per-kernel-launch overhead in µs, isolated (run with 0 + blocks, measure submit+wait round-trip). Tells us the floor below + which kernel batching is required. +- **M6** — workgroup-size sweep across {8, 16, 32, 64, 128, 256} + invocations to identify the v3dv-optimal launch shape for this + kernel. Records the Pareto curve, doesn't change R unless the + best-WG result invalidates M2. 
+- **M7** — power draw delta at the wall (via the Himbeere Fritz!DECT + plug telemetry, if reachable) under idle vs CPU-only vs QPU-only + vs CPU+QPU concurrent. Order-of-magnitude only; informs the higgs + battery argument that motivates the project. + +## What Phase 1 does *not* lock + +- The dispatch path (Vulkan compute via `v3dv` vs direct DRM + submit via `v3d_drm.h` ioctl). Phase 4 picks. Default for + Phase 1 = **Vulkan compute** unless Phase 4 has reason to flip: + documented, debuggable, doesn't require QPU asm against the + NDA ISA. +- The shader source (GLSL → glslang → SPIR-V) vs hand-written + SPIR-V. Default = GLSL. +- Workgroup partitioning (one-block-per-WG vs many-blocks-per-WG). + Phase 4 chooses based on subgroup width and tile cost; Phase 1 + records the sweep (M6). + +## Non-goals for Phase 1 + +- No V4L2 driver work. +- No end-to-end VP9 / AV1 decode (entropy + back-end). Just one + kernel, isolated, measured. +- No optimization beyond what's needed to hit the bit-exact gate + and produce a single throughput number. Tuning is Phase 7's + feedback if R is borderline. +- No build-system perfection. A CMakeLists that compiles the test + harness on hertz is enough. + +## Phase 1 → Phase 2 hand-off conditions + +Phase 1 closes when: +- The above metrics + decision rules are reviewed. (This is not + the second-model review of dev_process.md Phase 5 — this is + *Phase 1*; the Phase 5 second-model review comes after the + Phase 4 plan.) +- The metrics are recorded in this file or a sibling + `phase1_metrics.md` artifact (TBD). + +The next phase (Phase 2 — situation analysis) inventories: +- libavcodec's NEON IDCT reference (file, function, calling + convention, expected I/O contract). +- VP9 spec § 8.7 transform process (which the C reference + implements verbatim). +- AV1 spec § 7.7 (same transform structure, larger transform set; + 8×8 DCT_DCT path is identical to VP9's at this size).
+- Mesa v3dv's compute-shader compilation path and any known + v3dv-specific shader idioms that perform better on V3D 7.1. +- The hertz Vulkan dispatch overhead floor (M5 candidate, but + measured as part of Phase 3 baseline). + +## Open questions Phase 1 hands forward + +None new. Phase 0 § 7's open questions are the standing list; +Phase 1 picks off Q1 (single-kernel throughput) and Q2 (NEON +baseline) directly via M2 and M3. diff --git a/docs/phase2.md b/docs/phase2.md new file mode 100644 index 0000000..8cad881 --- /dev/null +++ b/docs/phase2.md @@ -0,0 +1,212 @@ +--- +phase: 2 +status: closed 2026-05-18 +date_opened: 2026-05-18 +parent: phase1.md +target_kernel: VP9 8×8 inverse DCT (DCT_DCT variant, 8-bit pixels) +--- + +# Phase 2 — Situation analysis + +Per `dev_process.md`: + +> Document current state. Identify constraints, dependencies, known +> failure modes. Reset context here — do not carry assumptions from +> prior sessions; re-read CLAUDE.md, relevant memory files, run +> `git status`, re-verify reachability. + +## 1. Context reset + +- Working tree state: dirty (Phase 0/1/2 docs not yet committed). + `.git-broken-2026-05-18/` preserved as a forensic artifact of + the 2026-05-18 10:25 working-tree wipe (cause undetermined). +- CLAUDE.md re-read: no contradictions with the Path B scope set + in README §"Architecture (Path B)". +- hertz reachability: confirmed via SSH; `vcgencmd`, `vulkaninfo`, + `apt`, sudo NOPASSWD all working as of 2026-05-17 inventory. + Mesa 25.0.7 / Vulkan 1.3.305 / V3D 7.1.7 stable. + +## 2. Reference implementations — VP9 8×8 IDCT (DCT_DCT) + +The Phase 1 kernel has *two* canonical reference implementations +in FFmpeg n7.1.3 (the version installed on hertz). The harness +will link both: the C path as the bit-exact gate (M1), the NEON +path as the throughput baseline (M3). 
+ +### 2.1 C reference + +- **Source**: `libavcodec/vp9dsp_template.c`, function `idct_idct_8x8_add_c` +- **Spec basis**: VP9 specification §8.7 — Inverse transform process +- **Signature**: + + ```c + static void idct_idct_8x8_add_c(uint8_t *_dst, ptrdiff_t stride, + int16_t *_block, int eob); + ``` + +- **Algorithm** (8-bit path): + 1. If `eob == 1` (DC-only): single `(coef * 11585 * 11585)` round, broadcast to 8×8 with `+pred, clamp[0,255]`. + 2. Otherwise: 8 column passes through `idct8_1d` → tmp[64]. Zero the input block. 8 row passes through `idct8_1d` → out[8]. Per-element `(out + 16) >> 5`, add to `dst`, `av_clip_pixel`. +- **`idct8_1d`**: 1-D 8-point inverse DCT, 8 trigonometric multiply-add stages with Q14 fixed-point constants then 8-butterfly add/sub stages. All arithmetic is signed int32 (`dctint`). +- **Q14 constants** (matched against VP9 spec §8.7.1.4): + | symbol | value | trig identity | + |---|---|---| + | cospi_16_64 | 11585 | cos(π/4) × 2^14 ≈ 0.70711 | + | cospi_24_64 | 6270 | cos(3π/8) × 2^14 ≈ 0.38268 | + | cospi_8_64 | 15137 | sin(3π/8) × 2^14 ≈ 0.92388 | + | cospi_28_64 | 3196 | cos(7π/16) × 2^14 ≈ 0.19509 | + | cospi_4_64 | 16069 | sin(7π/16) × 2^14 ≈ 0.98079 | + | cospi_20_64 | 9102 | cos(5π/16) × 2^14 ≈ 0.55557 | + | cospi_12_64 | 13623 | sin(5π/16) × 2^14 ≈ 0.83147 | + + Rounding convention: `(product + (1 << 13)) >> 14`, i.e. round-half-up at bit 14. + +- **License**: LGPL-2.1-or-later (FFmpeg). +- **Side effect**: zeroes the input `block[]` (idempotency requirement; matches spec). + +### 2.2 NEON reference + +- **Source**: `libavcodec/aarch64/vp9itxfm_neon.S`, symbol `ff_vp9_idct_idct_8x8_add_neon` +- **Signature** (same as C): + ``` + void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); + ``` + Registers: `x0=dst, x1=stride, x2=block, w3=eob`. 
+- **Internal dependencies** (must be copied alongside the .S): + | macro / symbol | location | role | + |---|---|---| + | `idct8` | `vp9itxfm_neon.S` | 1-D 8-pt IDCT, fully unrolled with `dmbutterfly*` | + | `dmbutterfly0` | `vp9itxfm_neon.S` | rotation by π/4 (the `cospi_16_64` case) | + | `dmbutterfly` | `vp9itxfm_neon.S` | general 2-input rotation `[a,b] → [a·c1−b·c2, a·c2+b·c1]` (`Q14`) | + | `dmbutterfly_l` | `vp9itxfm_neon.S` | wide-form (4×i32 acc) for `dmbutterfly` | + | `butterfly_8h` | `vp9itxfm_neon.S` | trivial `[a+b, a−b]` on `int16x8_t` | + | `transpose_8x8H` | `libavcodec/aarch64/neon.S` | in-place 8×8 i16 transpose | + | `idct_coeffs` | `vp9itxfm_neon.S` (`const`) | Q14 trig constants table, aligned 4 | + | `movrel` | `libavutil/aarch64/asm.S` | PIC-aware constant-pool relocation helper | +- **License**: LGPL-2.1-or-later (Google, 2016). +- **Performance shape**: full unrolled 8-pt butterfly with NEON `smull/smlsl/smlal` + `rshrn` for the Q14 round-shift; output uses `sqxtun` for saturated narrow to u8. Estimated ~80 NEON instructions for the steady state (non-DC) path. + +### 2.3 AV1 equivalence note + +AV1's 8×8 DCT_DCT transform (`av1_iidentity8_iidentity8_c` vs `av1_idct8_idct8_c` family in `libavcodec/av1dsp/...`) shares the same 1-D 8-point structure but with **different** scaling: AV1 uses 12-bit fixed-point (`>> 12`) and a slightly different rounding shift due to its different transform-stage bit growth model. Calling our VP9 IDCT shader on AV1 coefficients will produce wrong output. **AV1 support is out of scope for Phase 1.** A Phase-N variant can fork the shader with the AV1 constants once Phase 1 has proven the VP9 path. + +## 3. Vulkan compute dispatch path + +Hertz exposes V3D 7.1 via Mesa's v3dv driver as Vulkan +`PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU`, API 1.3.305, conformance +1.3.8.3. 
The compute-only dispatch path is: + +``` +host program + ├─ vkCreateInstance / vkEnumeratePhysicalDevices (picks V3D 7.1.7.0) + ├─ vkCreateDevice (queue family with COMPUTE_BIT, no graphics needed) + ├─ vkCreateBuffer x N (SSBOs for block coeffs in / dst pixels in+out) + │ - buffer flags: STORAGE_BUFFER_BIT | TRANSFER_SRC/DST + │ - memory type: HOST_VISIBLE | HOST_COHERENT (zero-copy on shared LPDDR4x) + ├─ vkCreateDescriptorSetLayout (≤8 SSBOs per layout — Pi 5 limit) + ├─ vkCreateShaderModule (SPIR-V from glslang) + ├─ vkCreateComputePipeline + ├─ vkBeginCommandBuffer + │ vkCmdBindPipeline / vkCmdBindDescriptorSets / vkCmdPushConstants + │ vkCmdDispatch(group_count_x, 1, 1) # one WG per ~K blocks + ├─ vkQueueSubmit + vkQueueWaitIdle (or fence) — this is the measured op + └─ (read back via the HOST_VISIBLE buffer, or alias it to the same memory the CPU populated) +``` + +Per Phase 0 §2 inside-view limits, the relevant constraints +for this kernel: + +- ≤8 SSBOs per stage → group inputs/outputs into ≤8 bindings (we + only need 2: `block[]` in, `dst[]` in/out). +- Shared mem ≤16 KiB → each 8×8 block fits trivially (128 B as + i16 coefficients, or 256 B widened to i32, plus 64 B in u8). One WG can carry dozens of blocks of + shared state if useful. +- Subgroup size = 16 (fixed). One workgroup of 64 invocations = + 4 subgroups; one block per subgroup is a natural shape (each + 16-lane subgroup processes 8×8 = 64 pixels in 4 cycles of + subgroup work). + +## 4. Build path on hertz + +Already installed (2026-05-17): cmake 3.31, ninja 1.12, gcc (Debian +trixie default), `libvulkan-dev 1.4.309`, `glslang-tools 15.1.0`, +`spirv-tools 2025.1`, `libdrm-dev 2.4.131`, `vulkan-tools 1.4.304`. + +Missing but cheap: +- `libavcodec-dev` — only needed if the harness wants to link + against system libavcodec for cross-checks against the dynamic + dispatcher. *Not* needed for the source-copy approach (preferred, + see §5). + +## 5.
Reference-copy strategy (vs system-libavcodec link) + +**Decision: source-copy the 3 FFmpeg files into `external/ffmpeg-snapshot/`.** + +Rationale: +- System `libavcodec.so` on hertz is symbol-stripped (`nm` returns + empty for `ff_vp9_idct_*`). Internal NEON entry points are not + reachable via `dlsym`. +- The two reference implementations (C, NEON) plus their macro/ + data dependencies total ~3 files / ~600 lines. Source-copy is + smaller than the dlopen plumbing would be. +- LGPL-2.1-or-later (FFmpeg license) is propagation-compatible + with the harness binary if the harness binary itself is GPL + or LGPL. The kernel shaders and dispatch library stay + separately-licensed (BSD-2-Clause, default for this project). +- Pinning to `n7.1.3` matches hertz's runtime libavcodec version, + so any in-session sanity cross-check against the running Mesa + / video tooling stays consistent. + +Files to vendor: + +| Source | License | Target path under `daedalus-fourier/` | +|---|---|---| +| `libavcodec/vp9dsp_template.c` | LGPL-2.1+ | `external/ffmpeg-snapshot/vp9dsp_template.c` | +| `libavcodec/aarch64/vp9itxfm_neon.S` | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/vp9itxfm_neon.S` | +| `libavcodec/aarch64/neon.S` (for `transpose_8x8H`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/neon.S` | +| `libavutil/aarch64/asm.S` (for `movrel`, `function`, `endfunc`) | LGPL-2.1+ | `external/ffmpeg-snapshot/aarch64/asm.S` | +| (whatever else `vp9dsp_template.c` transitively needs) | LGPL-2.1+ | as required | + +A `external/ffmpeg-snapshot/COPYING.LGPL` and `external/ffmpeg-snapshot/PROVENANCE.md` document the upstream commit (n7.1.3 tag, commit hash) and the verbatim-copy guarantee. + +## 6. Known constraints / failure modes carried from Phase 0 + +Repeated here so Phase 4 (plan) can bind against them without +re-derivation: + +- **C1**: shaderFloat16 = false → all shader arithmetic must be int32 (we are int anyway — no risk). 
+- **C2**: maxComputeSharedMemorySize = 16 KiB → kernel must not require more (8×8 IDCT trivially fits even with many blocks per WG). +- **C3**: maxPerStageDescriptorStorageBuffers = 8 → we need only 2 (coeffs + dst), no risk. +- **C4**: subgroupSupportedOperations = BASIC + VOTE only → no `subgroupAdd`/etc. for accumulator reductions. Workaround: the IDCT structure is fully data-parallel without reductions; this constraint doesn't bite. +- **C5**: VC7 has SMUL24 but no INT8 MAC. Our Q14 multiplies are i16×i16→i32 — the multiplicands fit in 17 bits, so SMUL24 covers it. No INT8/INT4 issues. +- **C6**: shared LPDDR4x bus; GPU sees ~4–7 GB/s vs CPU ~12–15 GB/s. For 8×8 IDCT, working set is tiny (≤320 B/block), so per-block bandwidth is not the bottleneck; per-dispatch submit overhead is. +- **C7**: VPM read-stall serialization. If we hand-write QPU asm (we won't, in Phase 1) this would matter; the Vulkan compute path lets the v3d_compiler schedule for us. +- **C8**: VC7 thermal throttle at 85°C GPU / 80°C CPU. Phase 7 measurements should record temp before/during/after to flag throttling. + +## 7. What Phase 2 does *not* close + +- The harness architecture (single binary? Two binaries — one for + bit-exact, one for throughput?). Phase 4 picks. +- Block-per-WG dispatch geometry. Phase 4 + Phase 6 sweep. +- Random-coefficient generation strategy (uniform i16 vs + realistic-distribution; the latter affects DC-only path + frequency). Phase 4 picks; Phase 7 may re-evaluate. +- Whether NEON measurement uses `clock_gettime(CLOCK_MONOTONIC_RAW)` + per-call (high overhead) or batched (more realistic for codec + use). Phase 3 picks during baseline collection. + +## 8. Hand-off to Phase 3 + +Phase 3 measures: +- **M3-prelim**: NEON `ff_vp9_idct_idct_8x8_add_neon` throughput + on hertz, batched over 10⁶ random blocks, single-threaded, + 4-thread, sched-isolated. This is the *floor*. 
+- **M5-prelim**: Vulkan dispatch overhead — pipeline create cost + (one-time), per-`vkCmdDispatch` cost (per-frame-equivalent), + per-`vkQueueSubmit + vkQueueWaitIdle` cost (per-completion). + Bound below which kernel batching is mandatory. + +Both are measurements on the *existing* substrate. Neither +requires writing any shader code. Phase 3 closes before Phase 4 +(plan) begins. diff --git a/docs/phase3.md b/docs/phase3.md new file mode 100644 index 0000000..700287f --- /dev/null +++ b/docs/phase3.md @@ -0,0 +1,105 @@ +--- +phase: 3 +status: closed 2026-05-18 +date_opened: 2026-05-18 +date_closed: 2026-05-18 +parent: phase2.md +host: hertz (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75+rpt-rpi-2712, Mesa 25.0.7-2+rpt4, V3D 7.1.7 @ 1 GHz, A76 @ 2.8 GHz) +artifacts: build/bench_neon_idct, build/bench_vulkan_dispatch, build/noop.spv +--- + +# Phase 3 — Baseline measurements + +Per `dev_process.md`: + +> Take concrete measurements *before* any changes. Raw before +> derived. Real data, not theatre. + +These numbers anchor every Phase 4+ decision. Re-run with the +same harness on the same hertz before drawing any new conclusions +in later phases. + +## M1 — bit-exact correctness gate (Phase 1) + +| | | +|---|---| +| Method | 10 000 random VP9-plausible coefficient blocks + random `pred[64]`, compare `daedalus_vp9_idct_idct_8x8_add_ref` C output vs vendored FFmpeg `ff_vp9_idct_idct_8x8_add_neon` | +| Run | `./bench_neon_idct --blocks 1000000 --iters 5` (built 2026-05-18) | +| **Result** | **10 000 / 10 000 = 100.0000 %** | +| DC-only path frequency | 11 / 10 000 = 0.11 % | +| Notes | Random generator: xorshift64, biased toward 1–16 non-zero coeffs per block; eob mostly ∈ [4, 63]. DC-only frequency is incidental; Phase 7 may revisit if it materially affects the throughput number. | + +**Gate passes. 
Throughput measurement was authorized to run.** + +## M3 — NEON throughput (single-core) + +| | | +|---|---| +| Kernel | `ff_vp9_idct_idct_8x8_add_neon` from FFmpeg n7.1.3 (vendored, see `external/ffmpeg-snapshot/PROVENANCE.md`) | +| Method | Pre-generate 1 M random blocks + preds. Per iteration: memcpy refresh of all blocks/preds (NEON path zeroes blocks), then call NEON kernel 1 M times. Subtract setup memcpy time from the measured wall-clock. 5 iterations, single thread, no CPU pinning. | +| Compiler flags | `-O3 -march=armv8-a+simd` | +| Run | `./bench_neon_idct --blocks 1000000 --iters 5` | +| **Throughput** | **8.171 Mblock/s** | +| Per-block | 122.4 ns | +| Equivalent 1080p frame rate | 252.2 FPS (32 400 blocks per 1080p frame, assuming pure 8×8 work) | +| Elapsed (kernel) | 0.612 s / 5 M blocks | +| Elapsed (setup-only) | 0.250 s / 5 M iters | +| Cross-check | Cycle estimate at 2.8 GHz: 122.4 ns × 2.8 GHz ≈ 342 cycles/block. Plausible for a fully-unrolled NEON 8-point IDCT with butterflies + saturated narrow stores; the FFmpeg implementation interleaves loads/computes/stores aggressively. | + +### M3 implications + +- A single A76 core handles ~8 M blocks/s = **252 FPS at 1080p**. Real decode needs ~60 FPS = 4.2× headroom on one core, ~16× headroom on all four cores. **NEON is not the bottleneck for current YouTube workloads on Pi 5.** +- The QPU offload story is not "make decode faster" — decode is already fast enough single-threaded. The story has to be "free CPU cycles for the rest of the system" (browser, audio, the 11 LXD containers on hertz). +- For a per-kernel R = QPU / NEON measurement (per `phase1.md §"Decision rules"`), the QPU has to hit ≥4 M blocks/s to score R ≥ 0.5. That's the gate. + +## M5 — Vulkan compute dispatch overhead + +| | | +|---|---| +| Method | Allocate empty pipeline (no descriptors, no push constants), bind+dispatch a `void main(){}` shader on `local_size_x=64`. Time `vkQueueSubmit` + `vkQueueWaitIdle` round-trip. 
50 000 iterations, warm. | +| Device | V3D 7.1.7.0 via Mesa v3dv 25.0.7 (selected past llvmpipe by `strstr("V3D")`) | +| Run | `./bench_vulkan_dispatch --iters 50000` | +| **M5a — empty CB submit+wait** | **22.66 µs / op** | +| **M5b — 1-WG noop dispatch submit+wait** | **55.60 µs / op** | +| **M5 delta — per-vkCmdDispatch + pipeline-bind** | **32.95 µs** | + +### M5 implications — the load-bearing finding for Phase 4 + +This is the single most important number from Phase 3. + +- Per-dispatch cost (55.6 µs) is **~455× the NEON per-block cost** (122 ns). +- A per-block QPU dispatch is structurally impossible — overhead dominates by two-and-a-half orders of magnitude. +- Break-even batch size for a *hypothetical* zero-cost QPU kernel: **≥ 455 blocks per dispatch** (55.6 µs ÷ 122.4 ns). Real kernel cost on top of that. +- Frame-level batching is mandatory: a 1080p frame has 32 400 8×8 blocks; one dispatch per frame amortizes M5b to 1.7 ns/block — well below NEON's 122 ns. +- Tile-level batching is borderline: a typical VP9 64×64 superblock has 64 sub-blocks; 55.6 µs / 64 ≈ 870 ns/block, ~7× NEON. Probably too fine-grained — frame-level or full-plane is the right granularity. + +### M5 measurement caveats + +- `vkQueueWaitIdle` after each submit forces a full GPU sync, modelling the "submit and need the result now" case. Real decode pipelines can submit multiple frames ahead and wait less often — the per-dispatch cost in a pipelined deployment will be lower (probably bounded below by M5a ≈ 22.66 µs as the pure submit cost). +- Empty CB (M5a) at 22.66 µs is the *floor*. This is Mesa command-list construction + kernel `DRM_IOCTL_V3D_SUBMIT_CL` + scheduler RTT. Cannot be optimised at the userspace level without changing Mesa or kernel. +- Both numbers include `vkQueueWaitIdle` overhead; pure submit-without-wait would be lower. For Phase 1's threshold analysis the with-wait number is the right one to use because end-to-end frame decode must wait for its output to be readable.
+ +## Phase 3 closure + +Two anchor measurements captured, both with verbatim raw output +(see `bench_neon_idct` and `bench_vulkan_dispatch` source for the +print format). No estimates, no inferences, no recall from prior +sessions or sibling-host memory. + +Phase 4 (plan) opens against these numbers. Its first decision: +**given the 55.60 µs per-dispatch submit+wait cost (M5b; 32.95 µs above the empty-submit floor), what is the +batch granularity for the first kernel?** The answer is either +frame-level (32 400 blocks/dispatch) or row-level (~240 +blocks/dispatch for one 1920-wide row of 8×8 → still ~230 ns/block +overhead, ~1.9× NEON). Frame-level is the only granularity that +amortises overhead enough to leave kernel compute room to win. + +Open threads for a later phase (not blocking Phase 4): +- Multi-core NEON sweep (M3'): single-core NEON is the right + *competitor floor*, but the actual ARM headroom on hertz is + 4× this number under load. +- Memory-bandwidth contention measurement (M6): does NEON's + rate change when concurrent QPU is reading the same LPDDR4x + bus? Needs the QPU kernel to exist first. +- Power-draw delta via Himbeere plug (M7): same — needs a real + GPU workload to differentiate from idle.
diff --git a/docs/vulkaninfo_v3d_7_1_7_hertz.txt b/docs/vulkaninfo_v3d_7_1_7_hertz.txt new file mode 100644 index 0000000..4f24c97 --- /dev/null +++ b/docs/vulkaninfo_v3d_7_1_7_hertz.txt @@ -0,0 +1,2099 @@ +========== +VULKANINFO +========== + +Vulkan Instance Version: 1.4.309 + + +Instance Extensions: count = 24 +=============================== + VK_EXT_acquire_drm_display : extension revision 1 + VK_EXT_acquire_xlib_display : extension revision 1 + VK_EXT_debug_report : extension revision 10 + VK_EXT_debug_utils : extension revision 2 + VK_EXT_direct_mode_display : extension revision 1 + VK_EXT_display_surface_counter : extension revision 1 + VK_EXT_headless_surface : extension revision 1 + VK_EXT_surface_maintenance1 : extension revision 1 + VK_EXT_swapchain_colorspace : extension revision 5 + VK_KHR_device_group_creation : extension revision 1 + VK_KHR_display : extension revision 23 + VK_KHR_external_fence_capabilities : extension revision 1 + VK_KHR_external_memory_capabilities : extension revision 1 + VK_KHR_external_semaphore_capabilities : extension revision 1 + VK_KHR_get_display_properties2 : extension revision 1 + VK_KHR_get_physical_device_properties2 : extension revision 2 + VK_KHR_get_surface_capabilities2 : extension revision 1 + VK_KHR_portability_enumeration : extension revision 1 + VK_KHR_surface : extension revision 25 + VK_KHR_surface_protected_capabilities : extension revision 1 + VK_KHR_wayland_surface : extension revision 6 + VK_KHR_xcb_surface : extension revision 6 + VK_KHR_xlib_surface : extension revision 6 + VK_LUNARG_direct_driver_loading : extension revision 1 + +Layers: count = 2 +================= +VK_LAYER_MESA_device_select (Linux device selection layer) Vulkan version 1.4.303, layer version 1: + Layer Extensions: count = 0 + Devices: count = 2 + GPU id = 0 (V3D 7.1.7.0) + Layer-Device Extensions: count = 0 + + GPU id = 1 (llvmpipe (LLVM 19.1.7, 128 bits)) + Layer-Device Extensions: count = 0 + +VK_LAYER_MESA_overlay (Mesa Overlay 
layer) Vulkan version 1.4.303, layer version 1: + Layer Extensions: count = 0 + Devices: count = 2 + GPU id = 0 (V3D 7.1.7.0) + Layer-Device Extensions: count = 0 + + GPU id = 1 (llvmpipe (LLVM 19.1.7, 128 bits)) + Layer-Device Extensions: count = 0 + +Device Properties and Extensions: +================================= +GPU0: +VkPhysicalDeviceProperties: +--------------------------- + apiVersion = 1.3.305 (4206897) + driverVersion = 25.0.7 (104857607) + vendorID = 0x14e4 + deviceID = 0x55701c33 + deviceType = PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU + deviceName = V3D 7.1.7.0 + pipelineCacheUUID = a801ad89-90bc-6e4b-dbf7-6f6038afe3ab + +VkPhysicalDeviceLimits: +----------------------- + maxImageDimension1D = 4096 + maxImageDimension2D = 4096 + maxImageDimension3D = 4096 + maxImageDimensionCube = 4096 + maxImageArrayLayers = 2048 + maxTexelBufferElements = 268435456 + maxUniformBufferRange = 1073741824 + maxStorageBufferRange = 1073741824 + maxPushConstantsSize = 128 + maxMemoryAllocationCount = 1048576 + maxSamplerAllocationCount = 65536 + bufferImageGranularity = 0x00000100 + sparseAddressSpaceSize = 0x00000000 + maxBoundDescriptorSets = 16 + maxPerStageDescriptorSamplers = 24 + maxPerStageDescriptorUniformBuffers = 16 + maxPerStageDescriptorStorageBuffers = 8 + maxPerStageDescriptorSampledImages = 16 + maxPerStageDescriptorStorageImages = 4 + maxPerStageDescriptorInputAttachments = 4 + maxPerStageResources = 128 + maxDescriptorSetSamplers = 96 + maxDescriptorSetUniformBuffers = 64 + maxDescriptorSetUniformBuffersDynamic = 8 + maxDescriptorSetStorageBuffers = 32 + maxDescriptorSetStorageBuffersDynamic = 4 + maxDescriptorSetSampledImages = 64 + maxDescriptorSetStorageImages = 16 + maxDescriptorSetInputAttachments = 4 + maxVertexInputAttributes = 16 + maxVertexInputBindings = 16 + maxVertexInputAttributeOffset = 4294967295 + maxVertexInputBindingStride = 65535 + maxVertexOutputComponents = 64 + maxTessellationGenerationLevel = 0 + maxTessellationPatchSize = 0 + 
maxTessellationControlPerVertexInputComponents = 0 + maxTessellationControlPerVertexOutputComponents = 0 + maxTessellationControlPerPatchOutputComponents = 0 + maxTessellationControlTotalOutputComponents = 0 + maxTessellationEvaluationInputComponents = 0 + maxTessellationEvaluationOutputComponents = 0 + maxGeometryShaderInvocations = 32 + maxGeometryInputComponents = 64 + maxGeometryOutputComponents = 64 + maxGeometryOutputVertices = 256 + maxGeometryTotalOutputComponents = 1024 + maxFragmentInputComponents = 64 + maxFragmentOutputAttachments = 4 + maxFragmentDualSrcAttachments = 0 + maxFragmentCombinedOutputResources = 20 + maxComputeSharedMemorySize = 16384 + maxComputeWorkGroupCount: count = 3 + 65535 + 65535 + 65535 + maxComputeWorkGroupInvocations = 256 + maxComputeWorkGroupSize: count = 3 + 256 + 256 + 256 + subPixelPrecisionBits = 6 + subTexelPrecisionBits = 8 + mipmapPrecisionBits = 8 + maxDrawIndexedIndexValue = 4294967295 + maxDrawIndirectCount = 2147483647 + maxSamplerLodBias = 14 + maxSamplerAnisotropy = 16 + maxViewports = 1 + maxViewportDimensions: count = 2 + 4096 + 4096 + viewportBoundsRange: count = 2 + -8192 + 8191 + viewportSubPixelBits = 0 + minMemoryMapAlignment = 4096 + minTexelBufferOffsetAlignment = 0x00000040 + minUniformBufferOffsetAlignment = 0x00000020 + minStorageBufferOffsetAlignment = 0x00000020 + minTexelOffset = -8 + maxTexelOffset = 7 + minTexelGatherOffset = -8 + maxTexelGatherOffset = 7 + minInterpolationOffset = -0.5 + maxInterpolationOffset = 0.5 + subPixelInterpolationOffsetBits = 6 + maxFramebufferWidth = 4096 + maxFramebufferHeight = 4096 + maxFramebufferLayers = 256 + framebufferColorSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + framebufferDepthSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + framebufferStencilSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + framebufferNoAttachmentsSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + 
maxColorAttachments = 8 + sampledImageColorSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + sampledImageIntegerSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + sampledImageDepthSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + sampledImageStencilSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + storageImageSampleCounts: count = 1 + SAMPLE_COUNT_1_BIT + maxSampleMaskWords = 1 + timestampComputeAndGraphics = true + timestampPeriod = 1 + maxClipDistances = 8 + maxCullDistances = 0 + maxCombinedClipAndCullDistances = 8 + discreteQueuePriorities = 2 + pointSizeRange: count = 2 + 0.03125 + 512 + lineWidthRange: count = 2 + 1 + 32 + pointSizeGranularity = 0.03125 + lineWidthGranularity = 0.03125 + strictLines = true + standardSampleLocations = false + optimalBufferCopyOffsetAlignment = 0x00000020 + optimalBufferCopyRowPitchAlignment = 0x00000020 + nonCoherentAtomSize = 0x00000100 + +VkPhysicalDeviceSparseProperties: +--------------------------------- + residencyStandard2DBlockShape = false + residencyStandard2DMultisampleBlockShape = false + residencyStandard3DBlockShape = false + residencyAlignedMipSize = false + residencyNonResidentStrict = false + +VkPhysicalDeviceCustomBorderColorPropertiesEXT: +----------------------------------------------- + maxCustomBorderColorSamplers = 24 + +VkPhysicalDeviceDrmPropertiesEXT: +--------------------------------- + hasPrimary = true + hasRender = true + primaryMajor = 226 + primaryMinor = 1 + renderMajor = 226 + renderMinor = 128 + +VkPhysicalDeviceLineRasterizationPropertiesKHR: +----------------------------------------------- + lineSubPixelPrecisionBits = 6 + +VkPhysicalDeviceMaintenance5PropertiesKHR: +------------------------------------------ + earlyFragmentMultisampleCoverageAfterSampleCounting = true + earlyFragmentSampleMaskTestBeforeSampleCounting = true + depthStencilSwizzleOneSupport = true + polygonModePointSize = true + 
nonStrictSinglePixelWideLinesUseParallelogram = true + nonStrictWideLinesUseParallelogram = true + +VkPhysicalDeviceMultiDrawPropertiesEXT: +--------------------------------------- + maxMultiDrawCount = 2048 + +VkPhysicalDevicePerformanceQueryPropertiesKHR: +---------------------------------------------- + allowCommandBufferQueryCopies = true + +VkPhysicalDevicePipelineRobustnessPropertiesEXT: +------------------------------------------------ + defaultRobustnessStorageBuffers = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT + defaultRobustnessUniformBuffers = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT + defaultRobustnessVertexInputs = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT + defaultRobustnessImages = PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DEVICE_DEFAULT + +VkPhysicalDeviceProvokingVertexPropertiesEXT: +--------------------------------------------- + provokingVertexModePerPipeline = true + transformFeedbackPreservesTriangleFanProvokingVertex = false + +VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT: +---------------------------------------------------- + shaderModuleIdentifierAlgorithmUUID = 4d455341-2d42-4c41-4b45-330000000000 + +VkPhysicalDeviceVertexAttributeDivisorPropertiesKHR: +---------------------------------------------------- + maxVertexAttribDivisor = 65535 + supportsNonZeroFirstInstance = true + +VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT: +---------------------------------------------------- + maxVertexAttribDivisor = 65535 + +VkPhysicalDeviceVulkan11Properties: +----------------------------------- + deviceUUID = 5fd8106e-741a-cafa-e080-fdb16cf11a80 + driverUUID = cd58235c-2a29-4ac7-8b40-a18d30b128ba + deviceNodeMask = 0 + deviceLUIDValid = false + subgroupSize = 16 + subgroupSupportedStages: count = 2 + SHADER_STAGE_FRAGMENT_BIT + SHADER_STAGE_COMPUTE_BIT + subgroupSupportedOperations: count = 6 + SUBGROUP_FEATURE_BASIC_BIT + SUBGROUP_FEATURE_VOTE_BIT + SUBGROUP_FEATURE_BALLOT_BIT + SUBGROUP_FEATURE_SHUFFLE_BIT + 
SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT + SUBGROUP_FEATURE_QUAD_BIT + subgroupQuadOperationsInAllStages = false + pointClippingBehavior = POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES + maxMultiviewViewCount = 16 + maxMultiviewInstanceIndex = 4294967294 + protectedNoFault = false + maxPerSetDescriptors = 67108863 + maxMemoryAllocationSize = 0x40000000 + +VkPhysicalDeviceVulkan12Properties: +----------------------------------- + driverID = DRIVER_ID_MESA_V3DV + driverName = V3DV Mesa + driverInfo = Mesa 25.0.7-2+rpt4 + conformanceVersion: + major = 1 + minor = 3 + subminor = 8 + patch = 3 + denormBehaviorIndependence = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL + roundingModeIndependence = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL + shaderSignedZeroInfNanPreserveFloat16 = true + shaderSignedZeroInfNanPreserveFloat32 = true + shaderSignedZeroInfNanPreserveFloat64 = false + shaderDenormPreserveFloat16 = true + shaderDenormPreserveFloat32 = true + shaderDenormPreserveFloat64 = false + shaderDenormFlushToZeroFloat16 = false + shaderDenormFlushToZeroFloat32 = false + shaderDenormFlushToZeroFloat64 = false + shaderRoundingModeRTEFloat16 = true + shaderRoundingModeRTEFloat32 = true + shaderRoundingModeRTEFloat64 = false + shaderRoundingModeRTZFloat16 = false + shaderRoundingModeRTZFloat32 = false + shaderRoundingModeRTZFloat64 = false + maxUpdateAfterBindDescriptorsInAllPools = 0 + shaderUniformBufferArrayNonUniformIndexingNative = false + shaderSampledImageArrayNonUniformIndexingNative = false + shaderStorageBufferArrayNonUniformIndexingNative = false + shaderStorageImageArrayNonUniformIndexingNative = false + shaderInputAttachmentArrayNonUniformIndexingNative = false + robustBufferAccessUpdateAfterBind = false + quadDivergentImplicitLod = false + maxPerStageDescriptorUpdateAfterBindSamplers = 24 + maxPerStageDescriptorUpdateAfterBindUniformBuffers = 16 + maxPerStageDescriptorUpdateAfterBindStorageBuffers = 8 + maxPerStageDescriptorUpdateAfterBindSampledImages = 16 + 
maxPerStageDescriptorUpdateAfterBindStorageImages = 4 + maxPerStageDescriptorUpdateAfterBindInputAttachments = 4 + maxPerStageUpdateAfterBindResources = 128 + maxDescriptorSetUpdateAfterBindSamplers = 96 + maxDescriptorSetUpdateAfterBindUniformBuffers = 64 + maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = 8 + maxDescriptorSetUpdateAfterBindStorageBuffers = 32 + maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = 8 + maxDescriptorSetUpdateAfterBindSampledImages = 64 + maxDescriptorSetUpdateAfterBindStorageImages = 16 + maxDescriptorSetUpdateAfterBindInputAttachments = 4 + supportedDepthResolveModes: count = 1 + RESOLVE_MODE_SAMPLE_ZERO_BIT + supportedStencilResolveModes: count = 1 + RESOLVE_MODE_SAMPLE_ZERO_BIT + independentResolveNone = false + independentResolve = false + filterMinmaxSingleComponentFormats = false + filterMinmaxImageComponentMapping = false + maxTimelineSemaphoreValueDifference = 18446744073709551615 + framebufferIntegerColorSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + +VkPhysicalDeviceVulkan13Properties: +----------------------------------- + minSubgroupSize = 16 + maxSubgroupSize = 16 + maxComputeWorkgroupSubgroups = 16 + requiredSubgroupSizeStages: count = 1 + SHADER_STAGE_COMPUTE_BIT + maxInlineUniformBlockSize = 4096 + maxPerStageDescriptorInlineUniformBlocks = 4 + maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 4 + maxDescriptorSetInlineUniformBlocks = 4 + maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 4 + maxInlineUniformTotalSize = 16384 + integerDotProduct8BitUnsignedAccelerated = false + integerDotProduct8BitSignedAccelerated = false + integerDotProduct8BitMixedSignednessAccelerated = false + integerDotProduct4x8BitPackedUnsignedAccelerated = false + integerDotProduct4x8BitPackedSignedAccelerated = false + integerDotProduct4x8BitPackedMixedSignednessAccelerated = false + integerDotProduct16BitUnsignedAccelerated = false + integerDotProduct16BitSignedAccelerated = false + 
integerDotProduct16BitMixedSignednessAccelerated = false + integerDotProduct32BitUnsignedAccelerated = false + integerDotProduct32BitSignedAccelerated = false + integerDotProduct32BitMixedSignednessAccelerated = false + integerDotProduct64BitUnsignedAccelerated = false + integerDotProduct64BitSignedAccelerated = false + integerDotProduct64BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating8BitSignedAccelerated = false + integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = false + integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false + integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating16BitSignedAccelerated = false + integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating32BitSignedAccelerated = false + integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating64BitSignedAccelerated = false + integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false + storageTexelBufferOffsetAlignmentBytes = 0x00000040 + storageTexelBufferOffsetSingleTexelAlignment = false + uniformTexelBufferOffsetAlignmentBytes = 0x00000040 + uniformTexelBufferOffsetSingleTexelAlignment = false + maxBufferSize = 0x40000000 + +Device Extensions: count = 92 + VK_EXT_4444_formats : extension revision 1 + VK_EXT_attachment_feedback_loop_layout : extension revision 2 + VK_EXT_border_color_swizzle : extension revision 1 + 
VK_EXT_color_write_enable : extension revision 1 + VK_EXT_custom_border_color : extension revision 12 + VK_EXT_depth_clamp_zero_one : extension revision 1 + VK_EXT_depth_clip_control : extension revision 1 + VK_EXT_depth_clip_enable : extension revision 1 + VK_EXT_extended_dynamic_state : extension revision 1 + VK_EXT_extended_dynamic_state2 : extension revision 1 + VK_EXT_external_memory_dma_buf : extension revision 1 + VK_EXT_host_query_reset : extension revision 1 + VK_EXT_image_drm_format_modifier : extension revision 2 + VK_EXT_image_robustness : extension revision 1 + VK_EXT_index_type_uint8 : extension revision 1 + VK_EXT_inline_uniform_block : extension revision 1 + VK_EXT_line_rasterization : extension revision 1 + VK_EXT_load_store_op_none : extension revision 1 + VK_EXT_memory_budget : extension revision 1 + VK_EXT_multi_draw : extension revision 1 + VK_EXT_physical_device_drm : extension revision 1 + VK_EXT_pipeline_creation_cache_control : extension revision 3 + VK_EXT_pipeline_creation_feedback : extension revision 1 + VK_EXT_pipeline_robustness : extension revision 1 + VK_EXT_primitive_topology_list_restart : extension revision 1 + VK_EXT_private_data : extension revision 1 + VK_EXT_provoking_vertex : extension revision 1 + VK_EXT_queue_family_foreign : extension revision 1 + VK_EXT_separate_stencil_usage : extension revision 1 + VK_EXT_shader_demote_to_helper_invocation : extension revision 1 + VK_EXT_shader_module_identifier : extension revision 1 + VK_EXT_subgroup_size_control : extension revision 2 + VK_EXT_swapchain_maintenance1 : extension revision 1 + VK_EXT_texel_buffer_alignment : extension revision 1 + VK_EXT_tooling_info : extension revision 1 + VK_EXT_vertex_attribute_divisor : extension revision 3 + VK_KHR_16bit_storage : extension revision 1 + VK_KHR_8bit_storage : extension revision 1 + VK_KHR_bind_memory2 : extension revision 1 + VK_KHR_buffer_device_address : extension revision 1 + VK_KHR_copy_commands2 : extension revision 1 + 
VK_KHR_create_renderpass2 : extension revision 1 + VK_KHR_dedicated_allocation : extension revision 3 + VK_KHR_depth_stencil_resolve : extension revision 1 + VK_KHR_descriptor_update_template : extension revision 1 + VK_KHR_device_group : extension revision 4 + VK_KHR_driver_properties : extension revision 1 + VK_KHR_dynamic_rendering : extension revision 1 + VK_KHR_external_fence : extension revision 1 + VK_KHR_external_fence_fd : extension revision 1 + VK_KHR_external_memory : extension revision 1 + VK_KHR_external_memory_fd : extension revision 1 + VK_KHR_external_semaphore : extension revision 1 + VK_KHR_external_semaphore_fd : extension revision 1 + VK_KHR_format_feature_flags2 : extension revision 2 + VK_KHR_get_memory_requirements2 : extension revision 1 + VK_KHR_image_format_list : extension revision 1 + VK_KHR_imageless_framebuffer : extension revision 1 + VK_KHR_incremental_present : extension revision 2 + VK_KHR_index_type_uint8 : extension revision 1 + VK_KHR_line_rasterization : extension revision 1 + VK_KHR_load_store_op_none : extension revision 1 + VK_KHR_maintenance1 : extension revision 2 + VK_KHR_maintenance2 : extension revision 1 + VK_KHR_maintenance3 : extension revision 1 + VK_KHR_maintenance4 : extension revision 2 + VK_KHR_maintenance5 : extension revision 1 + VK_KHR_multiview : extension revision 1 + VK_KHR_performance_query : extension revision 1 + VK_KHR_pipeline_executable_properties : extension revision 1 + VK_KHR_relaxed_block_layout : extension revision 1 + VK_KHR_sampler_mirror_clamp_to_edge : extension revision 3 + VK_KHR_sampler_ycbcr_conversion : extension revision 14 + VK_KHR_separate_depth_stencil_layouts : extension revision 1 + VK_KHR_shader_expect_assume : extension revision 1 + VK_KHR_shader_float_controls : extension revision 4 + VK_KHR_shader_integer_dot_product : extension revision 1 + VK_KHR_shader_non_semantic_info : extension revision 1 + VK_KHR_shader_relaxed_extended_instruction : extension revision 1 + 
VK_KHR_shader_terminate_invocation : extension revision 1 + VK_KHR_spirv_1_4 : extension revision 1 + VK_KHR_storage_buffer_storage_class : extension revision 1 + VK_KHR_swapchain : extension revision 70 + VK_KHR_swapchain_mutable_format : extension revision 1 + VK_KHR_synchronization2 : extension revision 1 + VK_KHR_timeline_semaphore : extension revision 2 + VK_KHR_uniform_buffer_standard_layout : extension revision 1 + VK_KHR_variable_pointers : extension revision 1 + VK_KHR_vertex_attribute_divisor : extension revision 1 + VK_KHR_vulkan_memory_model : extension revision 3 + VK_KHR_workgroup_memory_explicit_layout : extension revision 1 + VK_KHR_zero_initialize_workgroup_memory : extension revision 1 + +VkQueueFamilyProperties: +======================== + queueProperties[0]: + ------------------- + minImageTransferGranularity = (1,1,1) + queueCount = 1 + queueFlags = QUEUE_GRAPHICS_BIT | QUEUE_COMPUTE_BIT | QUEUE_TRANSFER_BIT + timestampValidBits = 64 + present support = false + +VkPhysicalDeviceMemoryProperties: +================================= +memoryHeaps: count = 1 + memoryHeaps[0]: + size = 4294967296 (0x100000000) (4.00 GiB) + budget = 3292721971 (0xc442f333) (3.07 GiB) + usage = 0 (0x00000000) (0.00 B) + flags: count = 1 + MEMORY_HEAP_DEVICE_LOCAL_BIT +memoryTypes: count = 1 + memoryTypes[0]: + heapIndex = 0 + propertyFlags = 0x0007: count = 3 + MEMORY_PROPERTY_DEVICE_LOCAL_BIT + MEMORY_PROPERTY_HOST_VISIBLE_BIT + MEMORY_PROPERTY_HOST_COHERENT_BIT + usable for: + IMAGE_TILING_OPTIMAL: + color images + FORMAT_D16_UNORM + FORMAT_X8_D24_UNORM_PACK32 + FORMAT_D32_SFLOAT + FORMAT_D24_UNORM_S8_UINT + (non-sparse) + IMAGE_TILING_LINEAR: + color images + (non-sparse) + +VkPhysicalDeviceFeatures: +========================= + robustBufferAccess = true + fullDrawIndexUint32 = true + imageCubeArray = true + independentBlend = true + geometryShader = true + tessellationShader = false + sampleRateShading = true + dualSrcBlend = false + logicOp = true + 
multiDrawIndirect = false + drawIndirectFirstInstance = true + depthClamp = true + depthBiasClamp = true + fillModeNonSolid = true + depthBounds = true + wideLines = true + largePoints = true + alphaToOne = true + multiViewport = false + samplerAnisotropy = true + textureCompressionETC2 = true + textureCompressionASTC_LDR = true + textureCompressionBC = false + occlusionQueryPrecise = true + pipelineStatisticsQuery = false + vertexPipelineStoresAndAtomics = true + fragmentStoresAndAtomics = true + shaderTessellationAndGeometryPointSize = true + shaderImageGatherExtended = true + shaderStorageImageExtendedFormats = true + shaderStorageImageMultisample = false + shaderStorageImageReadWithoutFormat = true + shaderStorageImageWriteWithoutFormat = false + shaderUniformBufferArrayDynamicIndexing = false + shaderSampledImageArrayDynamicIndexing = false + shaderStorageBufferArrayDynamicIndexing = false + shaderStorageImageArrayDynamicIndexing = false + shaderClipDistance = true + shaderCullDistance = false + shaderFloat64 = false + shaderInt64 = false + shaderInt16 = false + shaderResourceResidency = false + shaderResourceMinLod = false + sparseBinding = false + sparseResidencyBuffer = false + sparseResidencyImage2D = false + sparseResidencyImage3D = false + sparseResidency2Samples = false + sparseResidency4Samples = false + sparseResidency8Samples = false + sparseResidency16Samples = false + sparseResidencyAliased = false + variableMultisampleRate = false + inheritedQueries = true + +VkPhysicalDevice4444FormatsFeaturesEXT: +--------------------------------------- + formatA4R4G4B4 = true + formatA4B4G4R4 = true + +VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT: +-------------------------------------------------------- + attachmentFeedbackLoopLayout = true + +VkPhysicalDeviceBorderColorSwizzleFeaturesEXT: +---------------------------------------------- + borderColorSwizzle = true + borderColorSwizzleFromImage = true + +VkPhysicalDeviceColorWriteEnableFeaturesEXT: 
+-------------------------------------------- + colorWriteEnable = true + +VkPhysicalDeviceCustomBorderColorFeaturesEXT: +--------------------------------------------- + customBorderColors = true + customBorderColorWithoutFormat = false + +VkPhysicalDeviceDepthClampZeroOneFeaturesEXT: +--------------------------------------------- + depthClampZeroOne = true + +VkPhysicalDeviceDepthClipControlFeaturesEXT: +-------------------------------------------- + depthClipControl = true + +VkPhysicalDeviceDepthClipEnableFeaturesEXT: +------------------------------------------- + depthClipEnable = true + +VkPhysicalDeviceExtendedDynamicState2FeaturesEXT: +------------------------------------------------- + extendedDynamicState2 = true + extendedDynamicState2LogicOp = false + extendedDynamicState2PatchControlPoints = false + +VkPhysicalDeviceExtendedDynamicStateFeaturesEXT: +------------------------------------------------ + extendedDynamicState = true + +VkPhysicalDeviceIndexTypeUint8FeaturesKHR: +------------------------------------------ + indexTypeUint8 = true + +VkPhysicalDeviceLineRasterizationFeaturesKHR: +--------------------------------------------- + rectangularLines = true + bresenhamLines = true + smoothLines = true + stippledRectangularLines = false + stippledBresenhamLines = false + stippledSmoothLines = false + +VkPhysicalDeviceMaintenance5FeaturesKHR: +---------------------------------------- + maintenance5 = true + +VkPhysicalDeviceMultiDrawFeaturesEXT: +------------------------------------- + multiDraw = true + +VkPhysicalDevicePerformanceQueryFeaturesKHR: +-------------------------------------------- + performanceCounterQueryPools = true + performanceCounterMultipleQueryPools = false + +VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR: +-------------------------------------------------------- + pipelineExecutableInfo = true + +VkPhysicalDevicePipelineRobustnessFeaturesEXT: +---------------------------------------------- + pipelineRobustness = true + 
+VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT: +-------------------------------------------------------- + primitiveTopologyListRestart = true + primitiveTopologyPatchListRestart = false + +VkPhysicalDeviceProvokingVertexFeaturesEXT: +------------------------------------------- + provokingVertexLast = true + transformFeedbackPreservesProvokingVertex = false + +VkPhysicalDeviceShaderExpectAssumeFeaturesKHR: +---------------------------------------------- + shaderExpectAssume = true + +VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT: +-------------------------------------------------- + shaderModuleIdentifier = true + +VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR: +------------------------------------------------------------ + shaderRelaxedExtendedInstruction = true + +VkPhysicalDeviceSwapchainMaintenance1FeaturesEXT: +------------------------------------------------- + swapchainMaintenance1 = true + +VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT: +------------------------------------------------ + texelBufferAlignment = true + +VkPhysicalDeviceVertexAttributeDivisorFeaturesKHR: +-------------------------------------------------- + vertexAttributeInstanceRateDivisor = true + vertexAttributeInstanceRateZeroDivisor = false + +VkPhysicalDeviceVulkan11Features: +--------------------------------- + storageBuffer16BitAccess = true + uniformAndStorageBuffer16BitAccess = true + storagePushConstant16 = true + storageInputOutput16 = false + multiview = true + multiviewGeometryShader = false + multiviewTessellationShader = false + variablePointersStorageBuffer = true + variablePointers = false + protectedMemory = false + samplerYcbcrConversion = true + shaderDrawParameters = false + +VkPhysicalDeviceVulkan12Features: +--------------------------------- + samplerMirrorClampToEdge = true + drawIndirectCount = false + storageBuffer8BitAccess = true + uniformAndStorageBuffer8BitAccess = true + storagePushConstant8 = true + shaderBufferInt64Atomics = 
false + shaderSharedInt64Atomics = false + shaderFloat16 = false + shaderInt8 = false + descriptorIndexing = false + shaderInputAttachmentArrayDynamicIndexing = false + shaderUniformTexelBufferArrayDynamicIndexing = false + shaderStorageTexelBufferArrayDynamicIndexing = false + shaderUniformBufferArrayNonUniformIndexing = false + shaderSampledImageArrayNonUniformIndexing = false + shaderStorageBufferArrayNonUniformIndexing = false + shaderStorageImageArrayNonUniformIndexing = false + shaderInputAttachmentArrayNonUniformIndexing = false + shaderUniformTexelBufferArrayNonUniformIndexing = false + shaderStorageTexelBufferArrayNonUniformIndexing = false + descriptorBindingUniformBufferUpdateAfterBind = false + descriptorBindingSampledImageUpdateAfterBind = false + descriptorBindingStorageImageUpdateAfterBind = false + descriptorBindingStorageBufferUpdateAfterBind = false + descriptorBindingUniformTexelBufferUpdateAfterBind = false + descriptorBindingStorageTexelBufferUpdateAfterBind = false + descriptorBindingUpdateUnusedWhilePending = false + descriptorBindingPartiallyBound = false + descriptorBindingVariableDescriptorCount = false + runtimeDescriptorArray = false + samplerFilterMinmax = false + scalarBlockLayout = true + imagelessFramebuffer = true + uniformBufferStandardLayout = true + shaderSubgroupExtendedTypes = true + separateDepthStencilLayouts = true + hostQueryReset = true + timelineSemaphore = true + bufferDeviceAddress = true + bufferDeviceAddressCaptureReplay = false + bufferDeviceAddressMultiDevice = false + vulkanMemoryModel = true + vulkanMemoryModelDeviceScope = true + vulkanMemoryModelAvailabilityVisibilityChains = true + shaderOutputViewportIndex = false + shaderOutputLayer = false + subgroupBroadcastDynamicId = true + +VkPhysicalDeviceVulkan13Features: +--------------------------------- + robustImageAccess = true + inlineUniformBlock = true + descriptorBindingInlineUniformBlockUpdateAfterBind = false + pipelineCreationCacheControl = true + 
privateData = true + shaderDemoteToHelperInvocation = true + shaderTerminateInvocation = true + subgroupSizeControl = true + computeFullSubgroups = true + synchronization2 = true + textureCompressionASTC_HDR = false + shaderZeroInitializeWorkgroupMemory = true + dynamicRendering = true + shaderIntegerDotProduct = true + maintenance4 = true + +VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR: +--------------------------------------------------------- + workgroupMemoryExplicitLayout = true + workgroupMemoryExplicitLayoutScalarBlockLayout = false + workgroupMemoryExplicitLayout8BitAccess = true + workgroupMemoryExplicitLayout16BitAccess = true + + +GPU1: +VkPhysicalDeviceProperties: +--------------------------- + apiVersion = 1.4.305 (4210993) + driverVersion = 0.0.1 (1) + vendorID = 0x10005 + deviceID = 0x0000 + deviceType = PHYSICAL_DEVICE_TYPE_CPU + deviceName = llvmpipe (LLVM 19.1.7, 128 bits) + pipelineCacheUUID = 32352e30-2e37-2d32-2b72-707434616161 + +VkPhysicalDeviceLimits: +----------------------- + maxImageDimension1D = 16384 + maxImageDimension2D = 16384 + maxImageDimension3D = 4096 + maxImageDimensionCube = 32768 + maxImageArrayLayers = 2048 + maxTexelBufferElements = 134217728 + maxUniformBufferRange = 65536 + maxStorageBufferRange = 134217728 + maxPushConstantsSize = 256 + maxMemoryAllocationCount = 4294967295 + maxSamplerAllocationCount = 32768 + bufferImageGranularity = 0x00000040 + sparseAddressSpaceSize = 0x80000000 + maxBoundDescriptorSets = 8 + maxPerStageDescriptorSamplers = 1000000 + maxPerStageDescriptorUniformBuffers = 1000000 + maxPerStageDescriptorStorageBuffers = 1000000 + maxPerStageDescriptorSampledImages = 1000000 + maxPerStageDescriptorStorageImages = 1000000 + maxPerStageDescriptorInputAttachments = 1000000 + maxPerStageResources = 1000000 + maxDescriptorSetSamplers = 1000000 + maxDescriptorSetUniformBuffers = 1000000 + maxDescriptorSetUniformBuffersDynamic = 1000000 + maxDescriptorSetStorageBuffers = 1000000 + 
maxDescriptorSetStorageBuffersDynamic = 1000000 + maxDescriptorSetSampledImages = 1000000 + maxDescriptorSetStorageImages = 1000000 + maxDescriptorSetInputAttachments = 1000000 + maxVertexInputAttributes = 32 + maxVertexInputBindings = 32 + maxVertexInputAttributeOffset = 2047 + maxVertexInputBindingStride = 2048 + maxVertexOutputComponents = 128 + maxTessellationGenerationLevel = 64 + maxTessellationPatchSize = 32 + maxTessellationControlPerVertexInputComponents = 128 + maxTessellationControlPerVertexOutputComponents = 128 + maxTessellationControlPerPatchOutputComponents = 128 + maxTessellationControlTotalOutputComponents = 4096 + maxTessellationEvaluationInputComponents = 128 + maxTessellationEvaluationOutputComponents = 128 + maxGeometryShaderInvocations = 32 + maxGeometryInputComponents = 64 + maxGeometryOutputComponents = 128 + maxGeometryOutputVertices = 1024 + maxGeometryTotalOutputComponents = 1024 + maxFragmentInputComponents = 128 + maxFragmentOutputAttachments = 8 + maxFragmentDualSrcAttachments = 2 + maxFragmentCombinedOutputResources = 104 + maxComputeSharedMemorySize = 32768 + maxComputeWorkGroupCount: count = 3 + 65535 + 65535 + 65535 + maxComputeWorkGroupInvocations = 1024 + maxComputeWorkGroupSize: count = 3 + 1024 + 1024 + 1024 + subPixelPrecisionBits = 8 + subTexelPrecisionBits = 8 + mipmapPrecisionBits = 6 + maxDrawIndexedIndexValue = 4294967295 + maxDrawIndirectCount = 4294967295 + maxSamplerLodBias = 16 + maxSamplerAnisotropy = 16 + maxViewports = 16 + maxViewportDimensions: count = 2 + 16384 + 16384 + viewportBoundsRange: count = 2 + -32768 + 32768 + viewportSubPixelBits = 0 + minMemoryMapAlignment = 64 + minTexelBufferOffsetAlignment = 0x00000010 + minUniformBufferOffsetAlignment = 0x00000010 + minStorageBufferOffsetAlignment = 0x00000010 + minTexelOffset = -32 + maxTexelOffset = 31 + minTexelGatherOffset = -32 + maxTexelGatherOffset = 31 + minInterpolationOffset = -2 + maxInterpolationOffset = 2 + subPixelInterpolationOffsetBits = 8 + 
maxFramebufferWidth = 16384 + maxFramebufferHeight = 16384 + maxFramebufferLayers = 2048 + framebufferColorSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + framebufferDepthSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + framebufferStencilSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + framebufferNoAttachmentsSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + maxColorAttachments = 8 + sampledImageColorSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + sampledImageIntegerSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + sampledImageDepthSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + sampledImageStencilSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + storageImageSampleCounts: count = 2 + SAMPLE_COUNT_1_BIT + SAMPLE_COUNT_4_BIT + maxSampleMaskWords = 1 + timestampComputeAndGraphics = true + timestampPeriod = 1 + maxClipDistances = 8 + maxCullDistances = 8 + maxCombinedClipAndCullDistances = 8 + discreteQueuePriorities = 2 + pointSizeRange: count = 2 + 0 + 256 + lineWidthRange: count = 2 + 1 + 255 + pointSizeGranularity = 0.125 + lineWidthGranularity = 0.0078125 + strictLines = true + standardSampleLocations = true + optimalBufferCopyOffsetAlignment = 0x00000080 + optimalBufferCopyRowPitchAlignment = 0x00000080 + nonCoherentAtomSize = 0x00000040 + +VkPhysicalDeviceSparseProperties: +--------------------------------- + residencyStandard2DBlockShape = true + residencyStandard2DMultisampleBlockShape = true + residencyStandard3DBlockShape = true + residencyAlignedMipSize = false + residencyNonResidentStrict = false + +VkPhysicalDeviceAccelerationStructurePropertiesKHR: +--------------------------------------------------- + maxGeometryCount = 16777215 + maxInstanceCount = 16777215 + maxPrimitiveCount = 16777215 + maxPerStageDescriptorAccelerationStructures = 1000000 + maxPerStageDescriptorUpdateAfterBindAccelerationStructures 
= 1000000 + maxDescriptorSetAccelerationStructures = 1000000 + maxDescriptorSetUpdateAfterBindAccelerationStructures = 1000000 + minAccelerationStructureScratchOffsetAlignment = 8 + +VkPhysicalDeviceComputeShaderDerivativesPropertiesKHR: +------------------------------------------------------ + meshAndTaskShaderDerivatives = true + +VkPhysicalDeviceCustomBorderColorPropertiesEXT: +----------------------------------------------- + maxCustomBorderColorSamplers = 32768 + +VkPhysicalDeviceDescriptorBufferDensityMapPropertiesEXT: +-------------------------------------------------------- + combinedImageSamplerDensityMapDescriptorSize = 0 + +VkPhysicalDeviceDescriptorBufferPropertiesEXT: +---------------------------------------------- + combinedImageSamplerDescriptorSingleArray = true + bufferlessPushDescriptors = true + allowSamplerImageViewPostSubmitCreation = false + descriptorBufferOffsetAlignment = 0x00000004 + maxDescriptorBufferBindings = 8 + maxResourceDescriptorBufferBindings = 8 + maxSamplerDescriptorBufferBindings = 8 + maxEmbeddedImmutableSamplerBindings = 8 + maxEmbeddedImmutableSamplers = 2032 + bufferCaptureReplayDescriptorDataSize = 0 + imageCaptureReplayDescriptorDataSize = 0 + imageViewCaptureReplayDescriptorDataSize = 0 + samplerCaptureReplayDescriptorDataSize = 0 + accelerationStructureCaptureReplayDescriptorDataSize = 0 + samplerDescriptorSize = 256 + combinedImageSamplerDescriptorSize = 256 + sampledImageDescriptorSize = 256 + storageImageDescriptorSize = 256 + uniformTexelBufferDescriptorSize = 256 + robustUniformTexelBufferDescriptorSize = 256 + storageTexelBufferDescriptorSize = 256 + robustStorageTexelBufferDescriptorSize = 256 + uniformBufferDescriptorSize = 256 + robustUniformBufferDescriptorSize = 256 + storageBufferDescriptorSize = 256 + robustStorageBufferDescriptorSize = 256 + inputAttachmentDescriptorSize = 256 + accelerationStructureDescriptorSize = 256 + maxSamplerDescriptorBufferRange = 0xffffffff + maxResourceDescriptorBufferRange = 
0xffffffff + samplerDescriptorBufferAddressSpaceSize = 0xffffffff + resourceDescriptorBufferAddressSpaceSize = 0xffffffff + descriptorBufferAddressSpaceSize = 0xffffffff + +VkPhysicalDeviceDeviceGeneratedCommandsPropertiesEXT: +----------------------------------------------------- + maxIndirectPipelineCount = 4096 + maxIndirectShaderObjectCount = 4096 + maxIndirectSequenceCount = 1048576 + maxIndirectCommandsTokenCount = 16 + maxIndirectCommandsTokenOffset = 2047 + maxIndirectCommandsIndirectStride = 2048 + supportedIndirectCommandsInputModes: count = 2 + INDIRECT_COMMANDS_INPUT_MODE_VULKAN_INDEX_BUFFER_EXT + INDIRECT_COMMANDS_INPUT_MODE_DXGI_INDEX_BUFFER_EXT + supportedIndirectCommandsShaderStages: count = 16 + SHADER_STAGE_VERTEX_BIT + SHADER_STAGE_TESSELLATION_CONTROL_BIT + SHADER_STAGE_TESSELLATION_EVALUATION_BIT + SHADER_STAGE_GEOMETRY_BIT + SHADER_STAGE_FRAGMENT_BIT + SHADER_STAGE_COMPUTE_BIT + SHADER_STAGE_RAYGEN_BIT_KHR + SHADER_STAGE_ANY_HIT_BIT_KHR + SHADER_STAGE_CLOSEST_HIT_BIT_KHR + SHADER_STAGE_MISS_BIT_KHR + SHADER_STAGE_INTERSECTION_BIT_KHR + SHADER_STAGE_CALLABLE_BIT_KHR + SHADER_STAGE_TASK_BIT_EXT + SHADER_STAGE_MESH_BIT_EXT + SHADER_STAGE_SUBPASS_SHADING_BIT_HUAWEI + SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI + supportedIndirectCommandsShaderStagesPipelineBinding: count = 16 + SHADER_STAGE_VERTEX_BIT + SHADER_STAGE_TESSELLATION_CONTROL_BIT + SHADER_STAGE_TESSELLATION_EVALUATION_BIT + SHADER_STAGE_GEOMETRY_BIT + SHADER_STAGE_FRAGMENT_BIT + SHADER_STAGE_COMPUTE_BIT + SHADER_STAGE_RAYGEN_BIT_KHR + SHADER_STAGE_ANY_HIT_BIT_KHR + SHADER_STAGE_CLOSEST_HIT_BIT_KHR + SHADER_STAGE_MISS_BIT_KHR + SHADER_STAGE_INTERSECTION_BIT_KHR + SHADER_STAGE_CALLABLE_BIT_KHR + SHADER_STAGE_TASK_BIT_EXT + SHADER_STAGE_MESH_BIT_EXT + SHADER_STAGE_SUBPASS_SHADING_BIT_HUAWEI + SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI + supportedIndirectCommandsShaderStagesShaderBinding: count = 16 + SHADER_STAGE_VERTEX_BIT + SHADER_STAGE_TESSELLATION_CONTROL_BIT + 
SHADER_STAGE_TESSELLATION_EVALUATION_BIT + SHADER_STAGE_GEOMETRY_BIT + SHADER_STAGE_FRAGMENT_BIT + SHADER_STAGE_COMPUTE_BIT + SHADER_STAGE_RAYGEN_BIT_KHR + SHADER_STAGE_ANY_HIT_BIT_KHR + SHADER_STAGE_CLOSEST_HIT_BIT_KHR + SHADER_STAGE_MISS_BIT_KHR + SHADER_STAGE_INTERSECTION_BIT_KHR + SHADER_STAGE_CALLABLE_BIT_KHR + SHADER_STAGE_TASK_BIT_EXT + SHADER_STAGE_MESH_BIT_EXT + SHADER_STAGE_SUBPASS_SHADING_BIT_HUAWEI + SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI + deviceGeneratedCommandsTransformFeedback = true + deviceGeneratedCommandsMultiDrawIndirectCount = true + +VkPhysicalDeviceExtendedDynamicState3PropertiesEXT: +--------------------------------------------------- + dynamicPrimitiveTopologyUnrestricted = true + +VkPhysicalDeviceExternalMemoryHostPropertiesEXT: +------------------------------------------------ + minImportedHostPointerAlignment = 0x00001000 + +VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT: +----------------------------------------------------- + graphicsPipelineLibraryFastLinking = true + graphicsPipelineLibraryIndependentInterpolationDecoration = true + +VkPhysicalDeviceLayeredApiPropertiesListKHR: +-------------------------------------------- + layeredApiCount = 0 + pLayeredApis = NULL + +VkPhysicalDeviceLegacyVertexAttributesPropertiesEXT: +---------------------------------------------------- + nativeUnalignedPerformance = true + +VkPhysicalDeviceMaintenance7PropertiesKHR: +------------------------------------------ + robustFragmentShadingRateAttachmentAccess = false + separateDepthStencilAttachmentAccess = true + maxDescriptorSetTotalUniformBuffersDynamic = 1000000 + maxDescriptorSetTotalStorageBuffersDynamic = 1000000 + maxDescriptorSetTotalBuffersDynamic = 1000000 + maxDescriptorSetUpdateAfterBindTotalUniformBuffersDynamic = 1000000 + maxDescriptorSetUpdateAfterBindTotalStorageBuffersDynamic = 1000000 + maxDescriptorSetUpdateAfterBindTotalBuffersDynamic = 1000000 + +VkPhysicalDeviceMeshShaderPropertiesEXT: 
+---------------------------------------- + maxTaskWorkGroupTotalCount = 4194304 + maxTaskWorkGroupCount: count = 3 + 65536 + 65536 + 65536 + maxTaskWorkGroupInvocations = 1024 + maxTaskWorkGroupSize: count = 3 + 1024 + 1024 + 1024 + maxTaskPayloadSize = 16384 + maxTaskSharedMemorySize = 32768 + maxTaskPayloadAndSharedMemorySize = 32768 + maxMeshWorkGroupTotalCount = 4194304 + maxMeshWorkGroupCount: count = 3 + 65536 + 65536 + 65536 + maxMeshWorkGroupInvocations = 1024 + maxMeshWorkGroupSize: count = 3 + 1024 + 1024 + 1024 + maxMeshSharedMemorySize = 28672 + maxMeshPayloadAndSharedMemorySize = 45056 + maxMeshOutputMemorySize = 32768 + maxMeshPayloadAndOutputMemorySize = 49152 + maxMeshOutputComponents = 128 + maxMeshOutputVertices = 256 + maxMeshOutputPrimitives = 256 + maxMeshOutputLayers = 8 + maxMeshMultiviewViewCount = 0 + meshOutputPerVertexGranularity = 1 + meshOutputPerPrimitiveGranularity = 1 + maxPreferredTaskWorkGroupInvocations = 64 + maxPreferredMeshWorkGroupInvocations = 128 + prefersLocalInvocationVertexOutput = true + prefersLocalInvocationPrimitiveOutput = true + prefersCompactVertexOutput = true + prefersCompactPrimitiveOutput = false + +VkPhysicalDeviceMultiDrawPropertiesEXT: +--------------------------------------- + maxMultiDrawCount = 2048 + +VkPhysicalDeviceNestedCommandBufferPropertiesEXT: +------------------------------------------------- + maxCommandBufferNestingLevel = 4294967295 + +VkPhysicalDeviceProvokingVertexPropertiesEXT: +--------------------------------------------- + provokingVertexModePerPipeline = true + transformFeedbackPreservesTriangleFanProvokingVertex = true + +VkPhysicalDeviceRayTracingPipelinePropertiesKHR: +------------------------------------------------ + shaderGroupHandleSize = 32 + maxRayRecursionDepth = 31 + maxShaderGroupStride = 16384 + shaderGroupBaseAlignment = 32 + shaderGroupHandleCaptureReplaySize = 0 + maxRayDispatchInvocationCount = 67108864 + shaderGroupHandleAlignment = 16 + maxRayHitAttributeSize = 32 + 
+VkPhysicalDeviceRobustness2PropertiesEXT: +----------------------------------------- + robustStorageBufferAccessSizeAlignment = 0x00000001 + robustUniformBufferAccessSizeAlignment = 0x00000001 + +VkPhysicalDeviceShaderObjectPropertiesEXT: +------------------------------------------ + shaderBinaryUUID = 32352e30-2e37-2d32-2b72-707434616161 + shaderBinaryVersion = 1 + +VkPhysicalDeviceTransformFeedbackPropertiesEXT: +----------------------------------------------- + maxTransformFeedbackStreams = 4 + maxTransformFeedbackBuffers = 4 + maxTransformFeedbackBufferSize = 0xffffffff + maxTransformFeedbackStreamDataSize = 512 + maxTransformFeedbackBufferDataSize = 512 + maxTransformFeedbackBufferDataStride = 512 + transformFeedbackQueries = true + transformFeedbackStreamsLinesTriangles = false + transformFeedbackRasterizationStreamSelect = false + transformFeedbackDraw = true + +VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT: +---------------------------------------------------- + maxVertexAttribDivisor = 4294967295 + +VkPhysicalDeviceVulkan11Properties: +----------------------------------- + deviceUUID = 6d657361-3235-2e30-2e37-2d322b727000 + driverUUID = 6c6c766d-7069-7065-5555-494400000000 + deviceNodeMask = 0 + deviceLUIDValid = false + subgroupSize = 4 + subgroupSupportedStages: count = 4 + SHADER_STAGE_FRAGMENT_BIT + SHADER_STAGE_COMPUTE_BIT + SHADER_STAGE_TASK_BIT_EXT + SHADER_STAGE_MESH_BIT_EXT + subgroupSupportedOperations: count = 10 + SUBGROUP_FEATURE_BASIC_BIT + SUBGROUP_FEATURE_VOTE_BIT + SUBGROUP_FEATURE_ARITHMETIC_BIT + SUBGROUP_FEATURE_BALLOT_BIT + SUBGROUP_FEATURE_SHUFFLE_BIT + SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT + SUBGROUP_FEATURE_CLUSTERED_BIT + SUBGROUP_FEATURE_QUAD_BIT + SUBGROUP_FEATURE_ROTATE_BIT + SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT + subgroupQuadOperationsInAllStages = false + pointClippingBehavior = POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES + maxMultiviewViewCount = 6 + maxMultiviewInstanceIndex = 2147483647 + protectedNoFault = false + 
maxPerSetDescriptors = 1000000 + maxMemoryAllocationSize = 0x80000000 + +VkPhysicalDeviceVulkan12Properties: +----------------------------------- + driverID = DRIVER_ID_MESA_LLVMPIPE + driverName = llvmpipe + driverInfo = Mesa 25.0.7-2+rpt4 (LLVM 19.1.7) + conformanceVersion: + major = 1 + minor = 3 + subminor = 1 + patch = 1 + denormBehaviorIndependence = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL + roundingModeIndependence = SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL + shaderSignedZeroInfNanPreserveFloat16 = true + shaderSignedZeroInfNanPreserveFloat32 = true + shaderSignedZeroInfNanPreserveFloat64 = true + shaderDenormPreserveFloat16 = false + shaderDenormPreserveFloat32 = false + shaderDenormPreserveFloat64 = false + shaderDenormFlushToZeroFloat16 = false + shaderDenormFlushToZeroFloat32 = false + shaderDenormFlushToZeroFloat64 = false + shaderRoundingModeRTEFloat16 = true + shaderRoundingModeRTEFloat32 = true + shaderRoundingModeRTEFloat64 = true + shaderRoundingModeRTZFloat16 = false + shaderRoundingModeRTZFloat32 = false + shaderRoundingModeRTZFloat64 = false + maxUpdateAfterBindDescriptorsInAllPools = 4294967295 + shaderUniformBufferArrayNonUniformIndexingNative = true + shaderSampledImageArrayNonUniformIndexingNative = true + shaderStorageBufferArrayNonUniformIndexingNative = true + shaderStorageImageArrayNonUniformIndexingNative = true + shaderInputAttachmentArrayNonUniformIndexingNative = true + robustBufferAccessUpdateAfterBind = true + quadDivergentImplicitLod = true + maxPerStageDescriptorUpdateAfterBindSamplers = 1000000 + maxPerStageDescriptorUpdateAfterBindUniformBuffers = 1000000 + maxPerStageDescriptorUpdateAfterBindStorageBuffers = 1000000 + maxPerStageDescriptorUpdateAfterBindSampledImages = 1000000 + maxPerStageDescriptorUpdateAfterBindStorageImages = 1000000 + maxPerStageDescriptorUpdateAfterBindInputAttachments = 1000000 + maxPerStageUpdateAfterBindResources = 1000000 + maxDescriptorSetUpdateAfterBindSamplers = 1000000 + 
maxDescriptorSetUpdateAfterBindUniformBuffers = 1000000 + maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = 1000000 + maxDescriptorSetUpdateAfterBindStorageBuffers = 1000000 + maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = 1000000 + maxDescriptorSetUpdateAfterBindSampledImages = 1000000 + maxDescriptorSetUpdateAfterBindStorageImages = 1000000 + maxDescriptorSetUpdateAfterBindInputAttachments = 1000000 + supportedDepthResolveModes: count = 2 + RESOLVE_MODE_SAMPLE_ZERO_BIT + RESOLVE_MODE_AVERAGE_BIT + supportedStencilResolveModes: count = 1 + RESOLVE_MODE_SAMPLE_ZERO_BIT + independentResolveNone = false + independentResolve = false + filterMinmaxSingleComponentFormats = true + filterMinmaxImageComponentMapping = true + maxTimelineSemaphoreValueDifference = 18446744073709551615 + framebufferIntegerColorSampleCounts: count = 1 + SAMPLE_COUNT_1_BIT + +VkPhysicalDeviceVulkan13Properties: +----------------------------------- + minSubgroupSize = 4 + maxSubgroupSize = 4 + maxComputeWorkgroupSubgroups = 32 + requiredSubgroupSizeStages: count = 2 + SHADER_STAGE_FRAGMENT_BIT + SHADER_STAGE_COMPUTE_BIT + maxInlineUniformBlockSize = 4096 + maxPerStageDescriptorInlineUniformBlocks = 8 + maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 8 + maxDescriptorSetInlineUniformBlocks = 8 + maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 8 + maxInlineUniformTotalSize = 262144 + integerDotProduct8BitUnsignedAccelerated = false + integerDotProduct8BitSignedAccelerated = false + integerDotProduct8BitMixedSignednessAccelerated = false + integerDotProduct4x8BitPackedUnsignedAccelerated = false + integerDotProduct4x8BitPackedSignedAccelerated = false + integerDotProduct4x8BitPackedMixedSignednessAccelerated = false + integerDotProduct16BitUnsignedAccelerated = false + integerDotProduct16BitSignedAccelerated = false + integerDotProduct16BitMixedSignednessAccelerated = false + integerDotProduct32BitUnsignedAccelerated = false + integerDotProduct32BitSignedAccelerated = 
false + integerDotProduct32BitMixedSignednessAccelerated = false + integerDotProduct64BitUnsignedAccelerated = false + integerDotProduct64BitSignedAccelerated = false + integerDotProduct64BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating8BitSignedAccelerated = false + integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = false + integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false + integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating16BitSignedAccelerated = false + integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating32BitSignedAccelerated = false + integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false + integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false + integerDotProductAccumulatingSaturating64BitSignedAccelerated = false + integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false + storageTexelBufferOffsetAlignmentBytes = 0x00000010 + storageTexelBufferOffsetSingleTexelAlignment = true + uniformTexelBufferOffsetAlignmentBytes = 0x00000010 + uniformTexelBufferOffsetSingleTexelAlignment = true + maxBufferSize = 0xffffffff + +VkPhysicalDeviceVulkan14Properties: +----------------------------------- + lineSubPixelPrecisionBits = 8 + maxVertexAttribDivisor = 4294967295 + supportsNonZeroFirstInstance = false + maxPushDescriptors = 32 + dynamicRenderingLocalReadDepthStencilAttachments = false + dynamicRenderingLocalReadMultisampledAttachments = false + 
earlyFragmentMultisampleCoverageAfterSampleCounting = true + earlyFragmentSampleMaskTestBeforeSampleCounting = false + depthStencilSwizzleOneSupport = false + polygonModePointSize = true + nonStrictSinglePixelWideLinesUseParallelogram = false + nonStrictWideLinesUseParallelogram = false + blockTexelViewCompatibleMultipleLayers = true + maxCombinedImageSamplerDescriptorCount = 3 + fragmentShadingRateClampCombinerInputs = false + defaultRobustnessStorageBuffers = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2 + defaultRobustnessUniformBuffers = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2 + defaultRobustnessVertexInputs = PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2 + defaultRobustnessImages = PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2 + copySrcLayoutCount = 23 + pCopySrcLayouts: count = 23 + IMAGE_LAYOUT_GENERAL + IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL + IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + IMAGE_LAYOUT_PREINITIALIZED + IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_PRESENT_SRC_KHR + IMAGE_LAYOUT_VIDEO_DECODE_DST_KHR + IMAGE_LAYOUT_VIDEO_DECODE_SRC_KHR + IMAGE_LAYOUT_VIDEO_DECODE_DPB_KHR + IMAGE_LAYOUT_SHARED_PRESENT_KHR + IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT + IMAGE_LAYOUT_FRAGMENT_SHADING_RATE_ATTACHMENT_OPTIMAL_KHR + copyDstLayoutCount = 23 + pCopyDstLayouts: count = 23 + IMAGE_LAYOUT_GENERAL + IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL + 
IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL + IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + IMAGE_LAYOUT_PREINITIALIZED + IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_READ_ONLY_OPTIMAL + IMAGE_LAYOUT_ATTACHMENT_OPTIMAL + IMAGE_LAYOUT_PRESENT_SRC_KHR + IMAGE_LAYOUT_VIDEO_DECODE_DST_KHR + IMAGE_LAYOUT_VIDEO_DECODE_SRC_KHR + IMAGE_LAYOUT_VIDEO_DECODE_DPB_KHR + IMAGE_LAYOUT_SHARED_PRESENT_KHR + IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT + IMAGE_LAYOUT_FRAGMENT_SHADING_RATE_ATTACHMENT_OPTIMAL_KHR + optimalTilingLayoutUUID = 32352e30-2e37-2d32-2b72-707434616161 + identicalMemoryTypeRequirements = false + +Device Extensions: count = 156 + VK_AMDX_shader_enqueue : extension revision 2 + VK_ANDROID_external_memory_android_hardware_buffer : extension revision 5 + VK_ARM_rasterization_order_attachment_access : extension revision 1 + VK_EXT_4444_formats : extension revision 1 + VK_EXT_attachment_feedback_loop_dynamic_state : extension revision 1 + VK_EXT_attachment_feedback_loop_layout : extension revision 2 + VK_EXT_border_color_swizzle : extension revision 1 + VK_EXT_calibrated_timestamps : extension revision 2 + VK_EXT_color_write_enable : extension revision 1 + VK_EXT_conditional_rendering : extension revision 2 + VK_EXT_custom_border_color : extension revision 12 + VK_EXT_depth_clip_control : extension revision 1 + VK_EXT_depth_clip_enable : extension revision 1 + VK_EXT_depth_range_unrestricted : extension revision 1 + VK_EXT_descriptor_buffer : extension revision 1 + VK_EXT_descriptor_indexing : extension revision 2 + VK_EXT_device_generated_commands : extension revision 1 + VK_EXT_dynamic_rendering_unused_attachments : extension revision 1 + VK_EXT_extended_dynamic_state : extension revision 1 + 
VK_EXT_extended_dynamic_state2 : extension revision 1 + VK_EXT_extended_dynamic_state3 : extension revision 2 + VK_EXT_external_memory_dma_buf : extension revision 1 + VK_EXT_external_memory_host : extension revision 1 + VK_EXT_graphics_pipeline_library : extension revision 1 + VK_EXT_host_image_copy : extension revision 1 + VK_EXT_host_query_reset : extension revision 1 + VK_EXT_image_2d_view_of_3d : extension revision 1 + VK_EXT_image_drm_format_modifier : extension revision 2 + VK_EXT_image_robustness : extension revision 1 + VK_EXT_image_sliced_view_of_3d : extension revision 1 + VK_EXT_index_type_uint8 : extension revision 1 + VK_EXT_inline_uniform_block : extension revision 1 + VK_EXT_legacy_vertex_attributes : extension revision 1 + VK_EXT_line_rasterization : extension revision 1 + VK_EXT_load_store_op_none : extension revision 1 + VK_EXT_memory_budget : extension revision 1 + VK_EXT_memory_priority : extension revision 1 + VK_EXT_mesh_shader : extension revision 1 + VK_EXT_multi_draw : extension revision 1 + VK_EXT_multisampled_render_to_single_sampled : extension revision 1 + VK_EXT_mutable_descriptor_type : extension revision 1 + VK_EXT_nested_command_buffer : extension revision 1 + VK_EXT_non_seamless_cube_map : extension revision 1 + VK_EXT_pageable_device_local_memory : extension revision 1 + VK_EXT_pipeline_creation_cache_control : extension revision 3 + VK_EXT_pipeline_creation_feedback : extension revision 1 + VK_EXT_pipeline_library_group_handles : extension revision 1 + VK_EXT_pipeline_protected_access : extension revision 1 + VK_EXT_pipeline_robustness : extension revision 1 + VK_EXT_post_depth_coverage : extension revision 1 + VK_EXT_primitive_topology_list_restart : extension revision 1 + VK_EXT_primitives_generated_query : extension revision 1 + VK_EXT_private_data : extension revision 1 + VK_EXT_provoking_vertex : extension revision 1 + VK_EXT_queue_family_foreign : extension revision 1 + VK_EXT_rasterization_order_attachment_access : 
extension revision 1 + VK_EXT_robustness2 : extension revision 1 + VK_EXT_sampler_filter_minmax : extension revision 2 + VK_EXT_scalar_block_layout : extension revision 1 + VK_EXT_separate_stencil_usage : extension revision 1 + VK_EXT_shader_atomic_float : extension revision 1 + VK_EXT_shader_atomic_float2 : extension revision 1 + VK_EXT_shader_demote_to_helper_invocation : extension revision 1 + VK_EXT_shader_object : extension revision 1 + VK_EXT_shader_replicated_composites : extension revision 1 + VK_EXT_shader_stencil_export : extension revision 1 + VK_EXT_shader_subgroup_ballot : extension revision 1 + VK_EXT_shader_subgroup_vote : extension revision 1 + VK_EXT_shader_viewport_index_layer : extension revision 1 + VK_EXT_subgroup_size_control : extension revision 2 + VK_EXT_swapchain_maintenance1 : extension revision 1 + VK_EXT_texel_buffer_alignment : extension revision 1 + VK_EXT_transform_feedback : extension revision 1 + VK_EXT_vertex_attribute_divisor : extension revision 3 + VK_EXT_vertex_input_dynamic_state : extension revision 2 + VK_EXT_ycbcr_2plane_444_formats : extension revision 1 + VK_EXT_ycbcr_image_arrays : extension revision 1 + VK_GOOGLE_decorate_string : extension revision 1 + VK_GOOGLE_hlsl_functionality1 : extension revision 1 + VK_KHR_16bit_storage : extension revision 1 + VK_KHR_8bit_storage : extension revision 1 + VK_KHR_acceleration_structure : extension revision 13 + VK_KHR_bind_memory2 : extension revision 1 + VK_KHR_buffer_device_address : extension revision 1 + VK_KHR_compute_shader_derivatives : extension revision 1 + VK_KHR_copy_commands2 : extension revision 1 + VK_KHR_create_renderpass2 : extension revision 1 + VK_KHR_dedicated_allocation : extension revision 3 + VK_KHR_deferred_host_operations : extension revision 4 + VK_KHR_depth_stencil_resolve : extension revision 1 + VK_KHR_descriptor_update_template : extension revision 1 + VK_KHR_device_group : extension revision 4 + VK_KHR_draw_indirect_count : extension revision 1 + 
VK_KHR_driver_properties : extension revision 1 + VK_KHR_dynamic_rendering : extension revision 1 + VK_KHR_dynamic_rendering_local_read : extension revision 1 + VK_KHR_external_fence : extension revision 1 + VK_KHR_external_fence_fd : extension revision 1 + VK_KHR_external_memory : extension revision 1 + VK_KHR_external_memory_fd : extension revision 1 + VK_KHR_external_semaphore : extension revision 1 + VK_KHR_external_semaphore_fd : extension revision 1 + VK_KHR_format_feature_flags2 : extension revision 2 + VK_KHR_get_memory_requirements2 : extension revision 1 + VK_KHR_global_priority : extension revision 1 + VK_KHR_image_format_list : extension revision 1 + VK_KHR_imageless_framebuffer : extension revision 1 + VK_KHR_incremental_present : extension revision 2 + VK_KHR_index_type_uint8 : extension revision 1 + VK_KHR_line_rasterization : extension revision 1 + VK_KHR_load_store_op_none : extension revision 1 + VK_KHR_maintenance1 : extension revision 2 + VK_KHR_maintenance2 : extension revision 1 + VK_KHR_maintenance3 : extension revision 1 + VK_KHR_maintenance4 : extension revision 2 + VK_KHR_maintenance5 : extension revision 1 + VK_KHR_maintenance6 : extension revision 1 + VK_KHR_maintenance7 : extension revision 1 + VK_KHR_maintenance8 : extension revision 1 + VK_KHR_map_memory2 : extension revision 1 + VK_KHR_multiview : extension revision 1 + VK_KHR_pipeline_library : extension revision 1 + VK_KHR_push_descriptor : extension revision 2 + VK_KHR_ray_query : extension revision 1 + VK_KHR_ray_tracing_maintenance1 : extension revision 1 + VK_KHR_ray_tracing_pipeline : extension revision 1 + VK_KHR_ray_tracing_position_fetch : extension revision 1 + VK_KHR_relaxed_block_layout : extension revision 1 + VK_KHR_sampler_mirror_clamp_to_edge : extension revision 3 + VK_KHR_sampler_ycbcr_conversion : extension revision 14 + VK_KHR_separate_depth_stencil_layouts : extension revision 1 + VK_KHR_shader_atomic_int64 : extension revision 1 + VK_KHR_shader_clock : 
extension revision 1 + VK_KHR_shader_draw_parameters : extension revision 1 + VK_KHR_shader_expect_assume : extension revision 1 + VK_KHR_shader_float16_int8 : extension revision 1 + VK_KHR_shader_float_controls : extension revision 4 + VK_KHR_shader_float_controls2 : extension revision 1 + VK_KHR_shader_integer_dot_product : extension revision 1 + VK_KHR_shader_maximal_reconvergence : extension revision 1 + VK_KHR_shader_non_semantic_info : extension revision 1 + VK_KHR_shader_relaxed_extended_instruction : extension revision 1 + VK_KHR_shader_subgroup_extended_types : extension revision 1 + VK_KHR_shader_subgroup_rotate : extension revision 2 + VK_KHR_shader_terminate_invocation : extension revision 1 + VK_KHR_spirv_1_4 : extension revision 1 + VK_KHR_storage_buffer_storage_class : extension revision 1 + VK_KHR_swapchain : extension revision 70 + VK_KHR_swapchain_mutable_format : extension revision 1 + VK_KHR_synchronization2 : extension revision 1 + VK_KHR_timeline_semaphore : extension revision 2 + VK_KHR_uniform_buffer_standard_layout : extension revision 1 + VK_KHR_variable_pointers : extension revision 1 + VK_KHR_vertex_attribute_divisor : extension revision 1 + VK_KHR_vulkan_memory_model : extension revision 3 + VK_KHR_zero_initialize_workgroup_memory : extension revision 1 + +VkQueueFamilyProperties: +======================== + queueProperties[0]: + ------------------- + minImageTransferGranularity = (1,1,1) + queueCount = 1 + queueFlags = QUEUE_GRAPHICS_BIT | QUEUE_COMPUTE_BIT | QUEUE_TRANSFER_BIT | QUEUE_SPARSE_BINDING_BIT + timestampValidBits = 64 + present support = false + +VkPhysicalDeviceMemoryProperties: +================================= +memoryHeaps: count = 1 + memoryHeaps[0]: + size = 8454619136 (0x1f7ef4000) (7.87 GiB) + budget = 8454619136 (0x1f7ef4000) (7.87 GiB) + usage = 4796039168 (0x11dddc000) (4.47 GiB) + flags: count = 1 + MEMORY_HEAP_DEVICE_LOCAL_BIT +memoryTypes: count = 1 + memoryTypes[0]: + heapIndex = 0 + propertyFlags = 0x000f: 
count = 4 + MEMORY_PROPERTY_DEVICE_LOCAL_BIT + MEMORY_PROPERTY_HOST_VISIBLE_BIT + MEMORY_PROPERTY_HOST_COHERENT_BIT + MEMORY_PROPERTY_HOST_CACHED_BIT + usable for: + IMAGE_TILING_OPTIMAL: + color images + FORMAT_D16_UNORM + FORMAT_X8_D24_UNORM_PACK32 + FORMAT_D32_SFLOAT + FORMAT_S8_UINT + FORMAT_D24_UNORM_S8_UINT + FORMAT_D32_SFLOAT_S8_UINT + IMAGE_TILING_LINEAR: + color images + +VkPhysicalDeviceFeatures: +========================= + robustBufferAccess = true + fullDrawIndexUint32 = true + imageCubeArray = true + independentBlend = true + geometryShader = true + tessellationShader = true + sampleRateShading = true + dualSrcBlend = true + logicOp = true + multiDrawIndirect = true + drawIndirectFirstInstance = true + depthClamp = true + depthBiasClamp = true + fillModeNonSolid = true + depthBounds = false + wideLines = true + largePoints = true + alphaToOne = true + multiViewport = true + samplerAnisotropy = true + textureCompressionETC2 = false + textureCompressionASTC_LDR = false + textureCompressionBC = true + occlusionQueryPrecise = true + pipelineStatisticsQuery = true + vertexPipelineStoresAndAtomics = true + fragmentStoresAndAtomics = true + shaderTessellationAndGeometryPointSize = true + shaderImageGatherExtended = true + shaderStorageImageExtendedFormats = true + shaderStorageImageMultisample = true + shaderStorageImageReadWithoutFormat = true + shaderStorageImageWriteWithoutFormat = true + shaderUniformBufferArrayDynamicIndexing = true + shaderSampledImageArrayDynamicIndexing = true + shaderStorageBufferArrayDynamicIndexing = true + shaderStorageImageArrayDynamicIndexing = true + shaderClipDistance = true + shaderCullDistance = true + shaderFloat64 = true + shaderInt64 = true + shaderInt16 = true + shaderResourceResidency = true + shaderResourceMinLod = false + sparseBinding = true + sparseResidencyBuffer = true + sparseResidencyImage2D = true + sparseResidencyImage3D = true + sparseResidency2Samples = false + sparseResidency4Samples = false + 
sparseResidency8Samples = false + sparseResidency16Samples = false + sparseResidencyAliased = true + variableMultisampleRate = false + inheritedQueries = false + +VkPhysicalDevice4444FormatsFeaturesEXT: +--------------------------------------- + formatA4R4G4B4 = true + formatA4B4G4R4 = true + +VkPhysicalDeviceAccelerationStructureFeaturesKHR: +------------------------------------------------- + accelerationStructure = true + accelerationStructureCaptureReplay = false + accelerationStructureIndirectBuild = false + accelerationStructureHostCommands = false + descriptorBindingAccelerationStructureUpdateAfterBind = true + +VkPhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT: +-------------------------------------------------------------- + attachmentFeedbackLoopDynamicState = true + +VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT: +-------------------------------------------------------- + attachmentFeedbackLoopLayout = true + +VkPhysicalDeviceBorderColorSwizzleFeaturesEXT: +---------------------------------------------- + borderColorSwizzle = true + borderColorSwizzleFromImage = true + +VkPhysicalDeviceColorWriteEnableFeaturesEXT: +-------------------------------------------- + colorWriteEnable = true + +VkPhysicalDeviceComputeShaderDerivativesFeaturesKHR: +---------------------------------------------------- + computeDerivativeGroupQuads = true + computeDerivativeGroupLinear = true + +VkPhysicalDeviceConditionalRenderingFeaturesEXT: +------------------------------------------------ + conditionalRendering = true + inheritedConditionalRendering = false + +VkPhysicalDeviceCustomBorderColorFeaturesEXT: +--------------------------------------------- + customBorderColors = true + customBorderColorWithoutFormat = true + +VkPhysicalDeviceDepthClipControlFeaturesEXT: +-------------------------------------------- + depthClipControl = true + +VkPhysicalDeviceDepthClipEnableFeaturesEXT: +------------------------------------------- + depthClipEnable = true + 
+VkPhysicalDeviceDescriptorBufferFeaturesEXT: +-------------------------------------------- + descriptorBuffer = true + descriptorBufferCaptureReplay = false + descriptorBufferImageLayoutIgnored = true + descriptorBufferPushDescriptors = true + +VkPhysicalDeviceDeviceGeneratedCommandsFeaturesEXT: +--------------------------------------------------- + deviceGeneratedCommands = true + dynamicGeneratedPipelineLayout = true + +VkPhysicalDeviceDynamicRenderingUnusedAttachmentsFeaturesEXT: +------------------------------------------------------------- + dynamicRenderingUnusedAttachments = true + +VkPhysicalDeviceExtendedDynamicState2FeaturesEXT: +------------------------------------------------- + extendedDynamicState2 = true + extendedDynamicState2LogicOp = true + extendedDynamicState2PatchControlPoints = true + +VkPhysicalDeviceExtendedDynamicState3FeaturesEXT: +------------------------------------------------- + extendedDynamicState3TessellationDomainOrigin = true + extendedDynamicState3DepthClampEnable = true + extendedDynamicState3PolygonMode = true + extendedDynamicState3RasterizationSamples = true + extendedDynamicState3SampleMask = true + extendedDynamicState3AlphaToCoverageEnable = true + extendedDynamicState3AlphaToOneEnable = true + extendedDynamicState3LogicOpEnable = true + extendedDynamicState3ColorBlendEnable = true + extendedDynamicState3ColorBlendEquation = true + extendedDynamicState3ColorWriteMask = true + extendedDynamicState3RasterizationStream = false + extendedDynamicState3ConservativeRasterizationMode = false + extendedDynamicState3ExtraPrimitiveOverestimationSize = false + extendedDynamicState3DepthClipEnable = true + extendedDynamicState3SampleLocationsEnable = false + extendedDynamicState3ColorBlendAdvanced = false + extendedDynamicState3ProvokingVertexMode = true + extendedDynamicState3LineRasterizationMode = true + extendedDynamicState3LineStippleEnable = true + extendedDynamicState3DepthClipNegativeOneToOne = true + 
extendedDynamicState3ViewportWScalingEnable = false + extendedDynamicState3ViewportSwizzle = false + extendedDynamicState3CoverageToColorEnable = false + extendedDynamicState3CoverageToColorLocation = false + extendedDynamicState3CoverageModulationMode = false + extendedDynamicState3CoverageModulationTableEnable = false + extendedDynamicState3CoverageModulationTable = false + extendedDynamicState3CoverageReductionMode = false + extendedDynamicState3RepresentativeFragmentTestEnable = false + extendedDynamicState3ShadingRateImageEnable = false + +VkPhysicalDeviceExtendedDynamicStateFeaturesEXT: +------------------------------------------------ + extendedDynamicState = true + +VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT: +--------------------------------------------------- + graphicsPipelineLibrary = true + +VkPhysicalDeviceImage2DViewOf3DFeaturesEXT: +------------------------------------------- + image2DViewOf3D = true + sampler2DViewOf3D = true + +VkPhysicalDeviceImageSlicedViewOf3DFeaturesEXT: +----------------------------------------------- + imageSlicedViewOf3D = true + +VkPhysicalDeviceLegacyVertexAttributesFeaturesEXT: +-------------------------------------------------- + legacyVertexAttributes = true + +VkPhysicalDeviceMaintenance7FeaturesKHR: +---------------------------------------- + maintenance7 = true + +VkPhysicalDeviceMemoryPriorityFeaturesEXT: +------------------------------------------ + memoryPriority = true + +VkPhysicalDeviceMeshShaderFeaturesEXT: +-------------------------------------- + taskShader = true + meshShader = true + multiviewMeshShader = false + primitiveFragmentShadingRateMeshShader = false + meshShaderQueries = true + +VkPhysicalDeviceMultiDrawFeaturesEXT: +------------------------------------- + multiDraw = true + +VkPhysicalDeviceMultisampledRenderToSingleSampledFeaturesEXT: +------------------------------------------------------------- + multisampledRenderToSingleSampled = true + 
+VkPhysicalDeviceMutableDescriptorTypeFeaturesEXT: +------------------------------------------------- + mutableDescriptorType = true + +VkPhysicalDeviceNestedCommandBufferFeaturesEXT: +----------------------------------------------- + nestedCommandBuffer = true + nestedCommandBufferRendering = true + nestedCommandBufferSimultaneousUse = true + +VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT: +---------------------------------------------- + nonSeamlessCubeMap = true + +VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT: +----------------------------------------------------- + pageableDeviceLocalMemory = true + +VkPhysicalDevicePipelineLibraryGroupHandlesFeaturesEXT: +------------------------------------------------------- + pipelineLibraryGroupHandles = true + +VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT: +-------------------------------------------------------- + primitiveTopologyListRestart = true + primitiveTopologyPatchListRestart = true + +VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT: +---------------------------------------------------- + primitivesGeneratedQuery = true + primitivesGeneratedQueryWithRasterizerDiscard = true + primitivesGeneratedQueryWithNonZeroStreams = true + +VkPhysicalDeviceProvokingVertexFeaturesEXT: +------------------------------------------- + provokingVertexLast = true + transformFeedbackPreservesProvokingVertex = true + +VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT: +-------------------------------------------------------------- + rasterizationOrderColorAttachmentAccess = true + rasterizationOrderDepthAttachmentAccess = true + rasterizationOrderStencilAttachmentAccess = true + +VkPhysicalDeviceRayQueryFeaturesKHR: +------------------------------------ + rayQuery = true + +VkPhysicalDeviceRayTracingMaintenance1FeaturesKHR: +-------------------------------------------------- + rayTracingMaintenance1 = true + rayTracingPipelineTraceRaysIndirect2 = true + 
+VkPhysicalDeviceRayTracingPipelineFeaturesKHR: +---------------------------------------------- + rayTracingPipeline = true + rayTracingPipelineShaderGroupHandleCaptureReplay = false + rayTracingPipelineShaderGroupHandleCaptureReplayMixed = false + rayTracingPipelineTraceRaysIndirect = true + rayTraversalPrimitiveCulling = true + +VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR: +--------------------------------------------------- + rayTracingPositionFetch = true + +VkPhysicalDeviceRobustness2FeaturesEXT: +--------------------------------------- + robustBufferAccess2 = true + robustImageAccess2 = true + nullDescriptor = true + +VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT: +---------------------------------------------- + shaderBufferFloat16Atomics = false + shaderBufferFloat16AtomicAdd = false + shaderBufferFloat16AtomicMinMax = false + shaderBufferFloat32AtomicMinMax = true + shaderBufferFloat64AtomicMinMax = false + shaderSharedFloat16Atomics = false + shaderSharedFloat16AtomicAdd = false + shaderSharedFloat16AtomicMinMax = false + shaderSharedFloat32AtomicMinMax = true + shaderSharedFloat64AtomicMinMax = false + shaderImageFloat32AtomicMinMax = true + sparseImageFloat32AtomicMinMax = false + +VkPhysicalDeviceShaderAtomicFloatFeaturesEXT: +--------------------------------------------- + shaderBufferFloat32Atomics = true + shaderBufferFloat32AtomicAdd = true + shaderBufferFloat64Atomics = false + shaderBufferFloat64AtomicAdd = false + shaderSharedFloat32Atomics = true + shaderSharedFloat32AtomicAdd = true + shaderSharedFloat64Atomics = false + shaderSharedFloat64AtomicAdd = false + shaderImageFloat32Atomics = true + shaderImageFloat32AtomicAdd = true + sparseImageFloat32Atomics = true + sparseImageFloat32AtomicAdd = true + +VkPhysicalDeviceShaderClockFeaturesKHR: +--------------------------------------- + shaderSubgroupClock = true + shaderDeviceClock = true + +VkPhysicalDeviceShaderMaximalReconvergenceFeaturesKHR: 
+------------------------------------------------------ + shaderMaximalReconvergence = true + +VkPhysicalDeviceShaderObjectFeaturesEXT: +---------------------------------------- + shaderObject = true + +VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR: +------------------------------------------------------------ + shaderRelaxedExtendedInstruction = true + +VkPhysicalDeviceShaderReplicatedCompositesFeaturesEXT: +------------------------------------------------------ + shaderReplicatedComposites = true + +VkPhysicalDeviceSwapchainMaintenance1FeaturesEXT: +------------------------------------------------- + swapchainMaintenance1 = true + +VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT: +------------------------------------------------ + texelBufferAlignment = true + +VkPhysicalDeviceTransformFeedbackFeaturesEXT: +--------------------------------------------- + transformFeedback = true + geometryStreams = true + +VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT: +--------------------------------------------------- + vertexInputDynamicState = true + +VkPhysicalDeviceVulkan11Features: +--------------------------------- + storageBuffer16BitAccess = true + uniformAndStorageBuffer16BitAccess = true + storagePushConstant16 = true + storageInputOutput16 = false + multiview = true + multiviewGeometryShader = true + multiviewTessellationShader = true + variablePointersStorageBuffer = true + variablePointers = true + protectedMemory = false + samplerYcbcrConversion = true + shaderDrawParameters = true + +VkPhysicalDeviceVulkan12Features: +--------------------------------- + samplerMirrorClampToEdge = true + drawIndirectCount = true + storageBuffer8BitAccess = true + uniformAndStorageBuffer8BitAccess = true + storagePushConstant8 = true + shaderBufferInt64Atomics = true + shaderSharedInt64Atomics = true + shaderFloat16 = true + shaderInt8 = true + descriptorIndexing = true + shaderInputAttachmentArrayDynamicIndexing = true + 
shaderUniformTexelBufferArrayDynamicIndexing = true + shaderStorageTexelBufferArrayDynamicIndexing = true + shaderUniformBufferArrayNonUniformIndexing = true + shaderSampledImageArrayNonUniformIndexing = true + shaderStorageBufferArrayNonUniformIndexing = true + shaderStorageImageArrayNonUniformIndexing = true + shaderInputAttachmentArrayNonUniformIndexing = true + shaderUniformTexelBufferArrayNonUniformIndexing = true + shaderStorageTexelBufferArrayNonUniformIndexing = true + descriptorBindingUniformBufferUpdateAfterBind = true + descriptorBindingSampledImageUpdateAfterBind = true + descriptorBindingStorageImageUpdateAfterBind = true + descriptorBindingStorageBufferUpdateAfterBind = true + descriptorBindingUniformTexelBufferUpdateAfterBind = true + descriptorBindingStorageTexelBufferUpdateAfterBind = true + descriptorBindingUpdateUnusedWhilePending = true + descriptorBindingPartiallyBound = true + descriptorBindingVariableDescriptorCount = true + runtimeDescriptorArray = true + samplerFilterMinmax = true + scalarBlockLayout = true + imagelessFramebuffer = true + uniformBufferStandardLayout = true + shaderSubgroupExtendedTypes = true + separateDepthStencilLayouts = true + hostQueryReset = true + timelineSemaphore = true + bufferDeviceAddress = true + bufferDeviceAddressCaptureReplay = false + bufferDeviceAddressMultiDevice = false + vulkanMemoryModel = true + vulkanMemoryModelDeviceScope = true + vulkanMemoryModelAvailabilityVisibilityChains = true + shaderOutputViewportIndex = true + shaderOutputLayer = true + subgroupBroadcastDynamicId = true + +VkPhysicalDeviceVulkan13Features: +--------------------------------- + robustImageAccess = true + inlineUniformBlock = true + descriptorBindingInlineUniformBlockUpdateAfterBind = true + pipelineCreationCacheControl = true + privateData = true + shaderDemoteToHelperInvocation = true + shaderTerminateInvocation = true + subgroupSizeControl = true + computeFullSubgroups = true + synchronization2 = true + 
textureCompressionASTC_HDR = false + shaderZeroInitializeWorkgroupMemory = true + dynamicRendering = true + shaderIntegerDotProduct = true + maintenance4 = true + +VkPhysicalDeviceVulkan14Features: +--------------------------------- + globalPriorityQuery = true + shaderSubgroupRotate = true + shaderSubgroupRotateClustered = true + shaderFloatControls2 = true + shaderExpectAssume = true + rectangularLines = true + bresenhamLines = true + smoothLines = true + stippledRectangularLines = true + stippledBresenhamLines = true + stippledSmoothLines = true + vertexAttributeInstanceRateDivisor = true + vertexAttributeInstanceRateZeroDivisor = true + indexTypeUint8 = true + dynamicRenderingLocalRead = true + maintenance5 = true + maintenance6 = true + pipelineProtectedAccess = true + pipelineRobustness = true + hostImageCopy = true + pushDescriptor = true + +VkPhysicalDeviceYcbcr2Plane444FormatsFeaturesEXT: +------------------------------------------------- + ycbcr2plane444Formats = true + +VkPhysicalDeviceYcbcrImageArraysFeaturesEXT: +-------------------------------------------- + ycbcrImageArrays = true + + diff --git a/external/ffmpeg-snapshot/COPYING.LGPLv2.1 b/external/ffmpeg-snapshot/COPYING.LGPLv2.1 new file mode 100644 index 0000000..58af0d3 --- /dev/null +++ b/external/ffmpeg-snapshot/COPYING.LGPLv2.1 @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. 
+ + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. 
However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". 
+ + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. 
+ + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/external/ffmpeg-snapshot/PROVENANCE.md b/external/ffmpeg-snapshot/PROVENANCE.md new file mode 100644 index 0000000..e04d294 --- /dev/null +++ b/external/ffmpeg-snapshot/PROVENANCE.md @@ -0,0 +1,92 @@ +# FFmpeg source snapshot + +Verbatim subset of FFmpeg source pinned for use as reference +implementations of the VP9 8×8 inverse DCT (Phase 1 target of +`daedalus-fourier`). See `../../docs/phase2.md §2` and `§5` for +the rationale. 
+ +## Upstream pin + +- **Repository**: https://github.com/FFmpeg/FFmpeg +- **Tag**: `n7.1.3` (matches `libavcodec61 8:7.1.3-0+deb13u1+rpt1` + shipping in Debian Trixie on the dev host `hertz`) +- **Annotated tag object**: `0a9a757e96fdf053697084bbd1f620edeac9d084` +- **Commit object (tag target)**: `f46e514491172d15bd74b4abb1814cd2f05a763e` +- **Snapshot fetched**: 2026-05-18 (UTC), via + `https://raw.githubusercontent.com/FFmpeg/FFmpeg/n7.1.3/` + +## Files in this snapshot + +All files are byte-for-byte copies of the upstream source at the +tagged commit, no modifications. + +| Path | Lines | Bytes | SHA-256 | +|---|---|---|---| +| `libavcodec/vp9dsp_template.c` | 2578 | 89045 | `41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f` | +| `libavcodec/aarch64/vp9itxfm_neon.S` | 1580 | 63534 | `82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6` | +| `libavcodec/aarch64/neon.S` | 173 | 7496 | `72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538` | +| `libavutil/aarch64/asm.S` | 260 | 8069 | `c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3` | +| `COPYING.LGPLv2.1` | 502 | — | `b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe` | + +Verify with: + +```sh +( cd external/ffmpeg-snapshot && sha256sum -c <<'EOF' +41b21f667a6c497b620aa1637d8269badc45d1ac7e621d694441c5bf39356e4f libavcodec/vp9dsp_template.c +82ee3ceed4735c63576bafdcee28e2215652743ade55a9eab46a16d9530369f6 libavcodec/aarch64/vp9itxfm_neon.S +72d36ce6c3fcc5e53de869cfe10fda16225ebe580c32891bccc240a30a85a538 libavcodec/aarch64/neon.S +c0d03143b1bc5a9e358222d08d2d449d595271844fe7a3dc23bffb91abe8b0e3 libavutil/aarch64/asm.S +b634ab5640e258563c536e658cad87080553df6f34f62269a21d554844e58bfe COPYING.LGPLv2.1 +EOF +) +``` + +## License + +LGPL-2.1-or-later. See `COPYING.LGPLv2.1`. Original copyright +holders include the FFmpeg authors and Google Inc. (2016) for +the aarch64 NEON paths. The snapshot inherits FFmpeg's license +in full. 
+ +## Why each file is in this snapshot + +- `libavcodec/vp9dsp_template.c` — contains `idct_idct_8x8_add_c`, + the bit-exact C reference for the Phase 1 kernel under test (M1). +- `libavcodec/aarch64/vp9itxfm_neon.S` — contains + `ff_vp9_idct_idct_8x8_add_neon`, the NEON throughput baseline + (M3). Also defines `idct8`, `dmbutterfly0`, `dmbutterfly`, + `dmbutterfly_l`, `butterfly_8h`, and the `idct_coeffs` constant + table. +- `libavcodec/aarch64/neon.S` — defines `transpose_8x8H` used by + `vp9itxfm_neon.S`. +- `libavutil/aarch64/asm.S` — defines `function`, `endfunc`, + `movrel`, `const`, `endconst`, and other assembly preamble + macros required to assemble the above NEON files. + +## Re-vendoring procedure + +If the upstream pin needs to change (e.g., hertz updates to a +newer libavcodec): + +```sh +TAG=nX.Y.Z +BASE=https://raw.githubusercontent.com/FFmpeg/FFmpeg/$TAG +cd external/ffmpeg-snapshot +for f in libavcodec/vp9dsp_template.c \ + libavcodec/aarch64/vp9itxfm_neon.S \ + libavcodec/aarch64/neon.S \ + libavutil/aarch64/asm.S \ + COPYING.LGPLv2.1; do + curl -sSf -o "$f" "$BASE/$f" +done +sha256sum libavcodec/vp9dsp_template.c \ + libavcodec/aarch64/vp9itxfm_neon.S \ + libavcodec/aarch64/neon.S \ + libavutil/aarch64/asm.S \ + COPYING.LGPLv2.1 +# update this PROVENANCE.md with the new tag, commit hash, and hashes +``` + +After re-vendoring, re-run the bit-exact gate (M1) and throughput +baseline (M3) — both can shift across FFmpeg versions even when +the VP9 spec doesn't change (e.g., NEON micro-optimizations). diff --git a/external/ffmpeg-snapshot/config.h b/external/ffmpeg-snapshot/config.h new file mode 100644 index 0000000..a5586a6 --- /dev/null +++ b/external/ffmpeg-snapshot/config.h @@ -0,0 +1,27 @@ +/* + * Minimal config.h shim for assembling the vendored FFmpeg .S files + * outside the FFmpeg build tree. + * + * The vendored .S files (vp9itxfm_neon.S, neon.S, asm.S) reference + * exactly 7 preprocessor symbols, enumerated below. 
Values target + * aarch64-Linux with modern binutils (≥2.41) — matches the Debian + * Trixie environment on hertz (the project's dev host). + * + * See ../../docs/phase2.md §5 for the source-copy rationale and + * PROVENANCE.md for the upstream pin (FFmpeg n7.1.3). + */ +#pragma once + +#define HAVE_AS_FUNC 1 +#define HAVE_AS_ARCH_DIRECTIVE 1 +#define AS_ARCH_LEVEL armv8-a +#define HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE 1 +#define HAVE_AS_ARCHEXT_I8MM_DIRECTIVE 1 +#define HAVE_SECTION_DATA_REL_RO 1 +#define CONFIG_PIC 1 + +/* Symbol prefix for exported labels. On ELF/Linux this is empty + * (no leading underscore). FFmpeg's configure script normally + * defines this in the generated config.h; we replicate the + * Linux-target value here (an eighth symbol, beyond the seven above). */ +#define EXTERN_ASM diff --git a/external/ffmpeg-snapshot/libavcodec/aarch64/neon.S b/external/ffmpeg-snapshot/libavcodec/aarch64/neon.S new file mode 100644 index 0000000..f6fb13b --- /dev/null +++ b/external/ffmpeg-snapshot/libavcodec/aarch64/neon.S @@ -0,0 +1,173 @@ +/* + * This file is part of FFmpeg. + * + * Copyright (c) 2023 J. Dekker + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +.macro clip min, max, regs:vararg +.irp x, \regs + smax \x, \x, \min +.endr +.irp x, \regs + smin \x, \x, \max +.endr +.endm + +.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 + trn1 \r8\().8b, \r0\().8b, \r1\().8b + trn2 \r9\().8b, \r0\().8b, \r1\().8b + trn1 \r1\().8b, \r2\().8b, \r3\().8b + trn2 \r3\().8b, \r2\().8b, \r3\().8b + trn1 \r0\().8b, \r4\().8b, \r5\().8b + trn2 \r5\().8b, \r4\().8b, \r5\().8b + trn1 \r2\().8b, \r6\().8b, \r7\().8b + trn2 \r7\().8b, \r6\().8b, \r7\().8b + + trn1 \r4\().4h, \r0\().4h, \r2\().4h + trn2 \r2\().4h, \r0\().4h, \r2\().4h + trn1 \r6\().4h, \r5\().4h, \r7\().4h + trn2 \r7\().4h, \r5\().4h, \r7\().4h + trn1 \r5\().4h, \r9\().4h, \r3\().4h + trn2 \r9\().4h, \r9\().4h, \r3\().4h + trn1 \r3\().4h, \r8\().4h, \r1\().4h + trn2 \r8\().4h, \r8\().4h, \r1\().4h + + trn1 \r0\().2s, \r3\().2s, \r4\().2s + trn2 \r4\().2s, \r3\().2s, \r4\().2s + + trn1 \r1\().2s, \r5\().2s, \r6\().2s + trn2 \r5\().2s, \r5\().2s, \r6\().2s + + trn2 \r6\().2s, \r8\().2s, \r2\().2s + trn1 \r2\().2s, \r8\().2s, \r2\().2s + + trn1 \r3\().2s, \r9\().2s, \r7\().2s + trn2 \r7\().2s, \r9\().2s, \r7\().2s +.endm + +.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 + trn1 \t0\().16b, \r0\().16b, \r1\().16b + trn2 \t1\().16b, \r0\().16b, \r1\().16b + trn1 \r1\().16b, \r2\().16b, \r3\().16b + trn2 \r3\().16b, \r2\().16b, \r3\().16b + trn1 \r0\().16b, \r4\().16b, \r5\().16b + trn2 \r5\().16b, \r4\().16b, \r5\().16b + trn1 \r2\().16b, \r6\().16b, \r7\().16b + trn2 \r7\().16b, \r6\().16b, \r7\().16b + + trn1 \r4\().8h, \r0\().8h, \r2\().8h + trn2 \r2\().8h, \r0\().8h, \r2\().8h + trn1 \r6\().8h, \r5\().8h, \r7\().8h + trn2 \r7\().8h, \r5\().8h, \r7\().8h + trn1 \r5\().8h, \t1\().8h, \r3\().8h + trn2 \t1\().8h, \t1\().8h, 
\r3\().8h + trn1 \r3\().8h, \t0\().8h, \r1\().8h + trn2 \t0\().8h, \t0\().8h, \r1\().8h + + trn1 \r0\().4s, \r3\().4s, \r4\().4s + trn2 \r4\().4s, \r3\().4s, \r4\().4s + + trn1 \r1\().4s, \r5\().4s, \r6\().4s + trn2 \r5\().4s, \r5\().4s, \r6\().4s + + trn2 \r6\().4s, \t0\().4s, \r2\().4s + trn1 \r2\().4s, \t0\().4s, \r2\().4s + + trn1 \r3\().4s, \t1\().4s, \r7\().4s + trn2 \r7\().4s, \t1\().4s, \r7\().4s +.endm + +.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().16b, \r0\().16b, \r1\().16b + trn2 \t5\().16b, \r0\().16b, \r1\().16b + trn1 \t6\().16b, \r2\().16b, \r3\().16b + trn2 \t7\().16b, \r2\().16b, \r3\().16b + + trn1 \r0\().8h, \t4\().8h, \t6\().8h + trn2 \r2\().8h, \t4\().8h, \t6\().8h + trn1 \r1\().8h, \t5\().8h, \t7\().8h + trn2 \r3\().8h, \t5\().8h, \t7\().8h +.endm + +.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8b, \r0\().8b, \r1\().8b + trn2 \t5\().8b, \r0\().8b, \r1\().8b + trn1 \t6\().8b, \r2\().8b, \r3\().8b + trn2 \t7\().8b, \r2\().8b, \r3\().8b + + trn1 \r0\().4h, \t4\().4h, \t6\().4h + trn2 \r2\().4h, \t4\().4h, \t6\().4h + trn1 \r1\().4h, \t5\().4h, \t7\().4h + trn2 \r3\().4h, \t5\().4h, \t7\().4h +.endm + +.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7 + trn1 \r4\().4h, \r0\().4h, \r1\().4h + trn2 \r5\().4h, \r0\().4h, \r1\().4h + trn1 \r6\().4h, \r2\().4h, \r3\().4h + trn2 \r7\().4h, \r2\().4h, \r3\().4h + + trn1 \r0\().2s, \r4\().2s, \r6\().2s + trn2 \r2\().2s, \r4\().2s, \r6\().2s + trn1 \r1\().2s, \r5\().2s, \r7\().2s + trn2 \r3\().2s, \r5\().2s, \r7\().2s +.endm + +.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \r0\().4s, \t4\().4s, \t6\().4s + trn2 \r2\().4s, \t4\().4s, \t6\().4s + trn1 \r1\().4s, \t5\().4s, \t7\().4s + trn2 \r3\().4s, \t5\().4s, \t7\().4s +.endm + +.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 
+ trn1 \r8\().8h, \r0\().8h, \r1\().8h + trn2 \r9\().8h, \r0\().8h, \r1\().8h + trn1 \r1\().8h, \r2\().8h, \r3\().8h + trn2 \r3\().8h, \r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 \r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \r9\().4s, \r3\().4s + trn2 \r9\().4s, \r9\().4s, \r3\().4s + trn1 \r3\().4s, \r8\().4s, \r1\().4s + trn2 \r8\().4s, \r8\().4s, \r1\().4s + + trn1 \r0\().2d, \r3\().2d, \r4\().2d + trn2 \r4\().2d, \r3\().2d, \r4\().2d + + trn1 \r1\().2d, \r5\().2d, \r6\().2d + trn2 \r5\().2d, \r5\().2d, \r6\().2d + + trn2 \r6\().2d, \r8\().2d, \r2\().2d + trn1 \r2\().2d, \r8\().2d, \r2\().2d + + trn1 \r3\().2d, \r9\().2d, \r7\().2d + trn2 \r7\().2d, \r9\().2d, \r7\().2d + +.endm diff --git a/external/ffmpeg-snapshot/libavcodec/aarch64/vp9itxfm_neon.S b/external/ffmpeg-snapshot/libavcodec/aarch64/vp9itxfm_neon.S new file mode 100644 index 0000000..a27f7b8 --- /dev/null +++ b/external/ffmpeg-snapshot/libavcodec/aarch64/vp9itxfm_neon.S @@ -0,0 +1,1580 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +const itxfm4_coeffs, align=4 + .short 11585, 0, 6270, 15137 +iadst4_coeffs: + .short 5283, 15212, 9929, 13377 +endconst + +const iadst8_coeffs, align=4 + .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 +idct_coeffs: + .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102 + .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756 + .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 + .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 +endconst + +const iadst16_coeffs, align=4 + .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053 + .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207 +endconst + +// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14 +// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14 +// in/out are .8h registers; this can do with 4 temp registers, but is +// more efficient if 6 temp registers are available. 
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0 +.if \neg > 0 + neg \tmp4\().4h, v0.4h +.endif + add \tmp1\().8h, \in1\().8h, \in2\().8h + sub \tmp2\().8h, \in1\().8h, \in2\().8h +.if \neg > 0 + smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0] + smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0] +.else + smull \tmp3\().4s, \tmp1\().4h, v0.h[0] + smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0] +.endif +.ifb \tmp5 + rshrn \out1\().4h, \tmp3\().4s, #14 + rshrn2 \out1\().8h, \tmp4\().4s, #14 + smull \tmp3\().4s, \tmp2\().4h, v0.h[0] + smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0] + rshrn \out2\().4h, \tmp3\().4s, #14 + rshrn2 \out2\().8h, \tmp4\().4s, #14 +.else + smull \tmp5\().4s, \tmp2\().4h, v0.h[0] + smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0] + rshrn \out1\().4h, \tmp3\().4s, #14 + rshrn2 \out1\().8h, \tmp4\().4s, #14 + rshrn \out2\().4h, \tmp5\().4s, #14 + rshrn2 \out2\().8h, \tmp6\().4s, #14 +.endif +.endm + +// Same as dmbutterfly0 above, but treating the input in in2 as zero, +// writing the same output into both out1 and out2. 
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 + smull \tmp1\().4s, \in1\().4h, v0.h[0] + smull2 \tmp2\().4s, \in1\().8h, v0.h[0] + rshrn \out1\().4h, \tmp1\().4s, #14 + rshrn2 \out1\().8h, \tmp2\().4s, #14 + rshrn \out2\().4h, \tmp1\().4s, #14 + rshrn2 \out2\().8h, \tmp2\().4s, #14 +.endm + +// out1,out2 = in1 * coef1 - in2 * coef2 +// out3,out4 = in1 * coef2 + in2 * coef1 +// out are 4 x .4s registers, in are 2 x .8h registers +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2 + smull \out1\().4s, \in1\().4h, \coef1 + smull2 \out2\().4s, \in1\().8h, \coef1 + smull \out3\().4s, \in1\().4h, \coef2 + smull2 \out4\().4s, \in1\().8h, \coef2 + smlsl \out1\().4s, \in2\().4h, \coef2 + smlsl2 \out2\().4s, \in2\().8h, \coef2 + smlal \out3\().4s, \in2\().4h, \coef1 + smlal2 \out4\().4s, \in2\().8h, \coef1 +.endm + +// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14 +// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 +// inout are 2 x .8h registers +.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0 + dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2 +.if \neg > 0 + neg \tmp3\().4s, \tmp3\().4s + neg \tmp4\().4s, \tmp4\().4s +.endif + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout1\().4h, \coef1 + smull2 \tmp2\().4s, \inout1\().8h, \coef1 + smull \tmp3\().4s, \inout1\().4h, \coef2 + smull2 \tmp4\().4s, \inout1\().8h, \coef2 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1 
as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout2\().4h, \coef2 + smull2 \tmp2\().4s, \inout2\().8h, \coef2 + smull \tmp3\().4s, \inout2\().4h, \coef1 + smull2 \tmp4\().4s, \inout2\().8h, \coef1 + neg \tmp1\().4s, \tmp1\().4s + neg \tmp2\().4s, \tmp2\().4s + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 +.endm + +.macro dsmull_h out1, out2, in, coef + smull \out1\().4s, \in\().4h, \coef + smull2 \out2\().4s, \in\().8h, \coef +.endm + +.macro drshrn_h out, in1, in2, shift + rshrn \out\().4h, \in1\().4s, \shift + rshrn2 \out\().8h, \in2\().4s, \shift +.endm + + +// out1 = in1 + in2 +// out2 = in1 - in2 +.macro butterfly_8h out1, out2, in1, in2 + add \out1\().8h, \in1\().8h, \in2\().8h + sub \out2\().8h, \in1\().8h, \in2\().8h +.endm + +// out1 = in1 - in2 +// out2 = in1 + in2 +.macro butterfly_8h_r out1, out2, in1, in2 + sub \out1\().8h, \in1\().8h, \in2\().8h + add \out2\().8h, \in1\().8h, \in2\().8h +.endm + +// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 +// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 +// out are 2 x .8h registers, in are 4 x .4s registers +.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4 + add \tmp1\().4s, \in1\().4s, \in3\().4s + add \tmp2\().4s, \in2\().4s, \in4\().4s + sub \tmp3\().4s, \in1\().4s, \in3\().4s + sub \tmp4\().4s, \in2\().4s, \in4\().4s + rshrn \out1\().4h, \tmp1\().4s, #14 + rshrn2 \out1\().8h, \tmp2\().4s, #14 + rshrn \out2\().4h, \tmp3\().4s, #14 + rshrn2 \out2\().8h, \tmp4\().4s, #14 +.endm + +.macro iwht4 c0, c1, c2, c3 + add \c0\().4h, \c0\().4h, \c1\().4h + sub v17.4h, \c2\().4h, \c3\().4h + sub v16.4h, \c0\().4h, v17.4h + sshr v16.4h, v16.4h, #1 + sub \c2\().4h, v16.4h, \c1\().4h + sub \c1\().4h, v16.4h, \c3\().4h + add \c3\().4h, v17.4h, \c2\().4h + sub \c0\().4h, \c0\().4h, \c1\().4h +.endm + +.macro idct4 c0, c1, c2, 
c3 + smull v22.4s, \c1\().4h, v0.h[3] + smull v20.4s, \c1\().4h, v0.h[2] + add v16.4h, \c0\().4h, \c2\().4h + sub v17.4h, \c0\().4h, \c2\().4h + smlal v22.4s, \c3\().4h, v0.h[2] + smull v18.4s, v16.4h, v0.h[0] + smull v19.4s, v17.4h, v0.h[0] + smlsl v20.4s, \c3\().4h, v0.h[3] + rshrn v22.4h, v22.4s, #14 + rshrn v18.4h, v18.4s, #14 + rshrn v19.4h, v19.4s, #14 + rshrn v20.4h, v20.4s, #14 + add \c0\().4h, v18.4h, v22.4h + sub \c3\().4h, v18.4h, v22.4h + add \c1\().4h, v19.4h, v20.4h + sub \c2\().4h, v19.4h, v20.4h +.endm + +.macro iadst4 c0, c1, c2, c3 + smull v16.4s, \c0\().4h, v0.h[4] + smlal v16.4s, \c2\().4h, v0.h[5] + smlal v16.4s, \c3\().4h, v0.h[6] + smull v17.4s, \c0\().4h, v0.h[6] + smlsl v17.4s, \c2\().4h, v0.h[4] + sub \c0\().4h, \c0\().4h, \c2\().4h + smlsl v17.4s, \c3\().4h, v0.h[5] + add \c0\().4h, \c0\().4h, \c3\().4h + smull v19.4s, \c1\().4h, v0.h[7] + smull v18.4s, \c0\().4h, v0.h[7] + add v20.4s, v16.4s, v19.4s + add v21.4s, v17.4s, v19.4s + rshrn \c0\().4h, v20.4s, #14 + add v16.4s, v16.4s, v17.4s + rshrn \c1\().4h, v21.4s, #14 + sub v16.4s, v16.4s, v19.4s + rshrn \c2\().4h, v18.4s, #14 + rshrn \c3\().4h, v16.4s, #14 +.endm + +// The public functions in this file have got the following signature: +// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +.macro itxfm_func4x4 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 +.ifc \txfm1,\txfm2 +.ifc \txfm1,idct + movrel x4, itxfm4_coeffs + ld1 {v0.4h}, [x4] +.endif +.ifc \txfm1,iadst + movrel x4, iadst4_coeffs + ld1 {v0.d}[1], [x4] +.endif +.else + movrel x4, itxfm4_coeffs + ld1 {v0.8h}, [x4] +.endif + + movi v31.8h, #0 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.ne 1f + // DC-only for idct/idct + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + st1 {v31.h}[0], [x2] + dup v4.4h, v2.h[0] + mov v5.16b, v4.16b + mov v6.16b, v4.16b + mov v7.16b, v4.16b + b 2f +.endif + 
+1: + ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2] + st1 {v31.8h}, [x2], #16 + +.ifc \txfm1,iwht + sshr v4.4h, v4.4h, #2 + sshr v5.4h, v5.4h, #2 + sshr v6.4h, v6.4h, #2 + sshr v7.4h, v7.4h, #2 +.endif + + \txfm1\()4 v4, v5, v6, v7 + + st1 {v31.8h}, [x2], #16 + // Transpose 4x4 with 16 bit elements + transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19 + + \txfm2\()4 v4, v5, v6, v7 +2: + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[0], [x0], x1 +.ifnc \txfm1,iwht + srshr v4.4h, v4.4h, #4 + srshr v5.4h, v5.4h, #4 + srshr v6.4h, v6.4h, #4 + srshr v7.4h, v7.4h, #4 +.endif + uaddw v4.8h, v4.8h, v0.8b + uaddw v5.8h, v5.8h, v1.8b + ld1 {v2.s}[0], [x0], x1 + ld1 {v3.s}[0], [x0], x1 + sqxtun v0.8b, v4.8h + sqxtun v1.8b, v5.8h + sub x0, x0, x1, lsl #2 + + uaddw v6.8h, v6.8h, v2.8b + uaddw v7.8h, v7.8h, v3.8b + st1 {v0.s}[0], [x0], x1 + sqxtun v2.8b, v6.8h + sqxtun v3.8b, v7.8h + + st1 {v1.s}[0], [x0], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + + ret +endfunc +.endm + +itxfm_func4x4 idct, idct +itxfm_func4x4 iadst, idct +itxfm_func4x4 idct, iadst +itxfm_func4x4 iadst, iadst +itxfm_func4x4 iwht, iwht + + +.macro idct8 + dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a + dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a + dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a + dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a + + butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3 + butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a + butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a + butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2 + + dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5 + + butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7] + butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6] + butterfly_8h v18, v21, v27, v29 // q13 = out[2], q10 = out[5] + butterfly_8h v19, v20, v25, v28 // v17 = out[3], 
q12 = out[4] +.endm + +.macro iadst8 + dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a + dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a + dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a + dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a + + dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4 + dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5 + dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6 + dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7 + + butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2 + butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3 + neg v23.8h, v23.8h // v23 = out[7] + + dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4] + neg v19.8h, v19.8h // v19 = out[3] + + dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a + dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a + + dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6 + dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7 + neg v17.8h, v17.8h // v17 = out[1] + + dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5] + neg v21.8h, v21.8h // v21 = out[5] +.endm + + +.macro itxfm_func8x8 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 + // The iadst also uses a few coefficients from + // idct, so those always need to be loaded. 
+.ifc \txfm1\()_\txfm2,idct_idct + movrel x4, idct_coeffs +.else + movrel x4, iadst8_coeffs + ld1 {v1.8h}, [x4], #16 +.endif + ld1 {v0.8h}, [x4] + + movi v2.8h, #0 + movi v3.8h, #0 + movi v4.8h, #0 + movi v5.8h, #0 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.ne 1f + // DC-only for idct/idct + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + st1 {v3.h}[0], [x2] + dup v16.8h, v2.h[0] + mov v17.16b, v16.16b + mov v18.16b, v16.16b + mov v19.16b, v16.16b + mov v20.16b, v16.16b + mov v21.16b, v16.16b + mov v22.16b, v16.16b + mov v23.16b, v16.16b + b 2f +.endif +1: + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64 + ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64 + sub x2, x2, #128 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 + + \txfm1\()8 + + // Transpose 8x8 with 16 bit elements + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + + \txfm2\()8 +2: + mov x3, x0 + // Add into the destination + ld1 {v0.8b}, [x0], x1 + srshr v16.8h, v16.8h, #5 + ld1 {v1.8b}, [x0], x1 + srshr v17.8h, v17.8h, #5 + ld1 {v2.8b}, [x0], x1 + srshr v18.8h, v18.8h, #5 + uaddw v16.8h, v16.8h, v0.8b + ld1 {v3.8b}, [x0], x1 + srshr v19.8h, v19.8h, #5 + uaddw v17.8h, v17.8h, v1.8b + ld1 {v4.8b}, [x0], x1 + srshr v20.8h, v20.8h, #5 + uaddw v18.8h, v18.8h, v2.8b + sqxtun v0.8b, v16.8h + ld1 {v5.8b}, [x0], x1 + srshr v21.8h, v21.8h, #5 + uaddw v19.8h, v19.8h, v3.8b + sqxtun v1.8b, v17.8h + ld1 {v6.8b}, [x0], x1 + srshr v22.8h, v22.8h, #5 + uaddw v20.8h, v20.8h, v4.8b + sqxtun v2.8b, v18.8h + ld1 {v7.8b}, [x0], x1 + srshr v23.8h, v23.8h, #5 + uaddw v21.8h, v21.8h, v5.8b + sqxtun v3.8b, v19.8h + + st1 {v0.8b}, [x3], x1 + uaddw v22.8h, v22.8h, v6.8b + st1 {v1.8b}, [x3], x1 + sqxtun v4.8b, v20.8h + st1 {v2.8b}, [x3], x1 + uaddw v23.8h, v23.8h, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v5.8b, v21.8h + st1 {v4.8b}, [x3], x1 + sqxtun v6.8b, v22.8h + st1 {v5.8b}, [x3], x1 + 
sqxtun v7.8b, v23.8h + + st1 {v6.8b}, [x3], x1 + st1 {v7.8b}, [x3], x1 + + ret +endfunc +.endm + +itxfm_func8x8 idct, idct +itxfm_func8x8 iadst, idct +itxfm_func8x8 idct, iadst +itxfm_func8x8 iadst, iadst + + +function idct16x16_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + dup v2.8h, v2.h[0] + st1 {v1.h}[0], [x2] + + srshr v2.8h, v2.8h, #6 + + mov x3, x0 + mov x4, #16 +1: + // Loop to add the constant from v2 into all 16x16 outputs + subs x4, x4, #2 + ld1 {v3.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 + uaddw v16.8h, v2.8h, v3.8b + uaddw2 v17.8h, v2.8h, v3.16b + uaddw v18.8h, v2.8h, v4.8b + uaddw2 v19.8h, v2.8h, v4.16b + sqxtun v3.8b, v16.8h + sqxtun2 v3.16b, v17.8h + sqxtun v4.8b, v18.8h + sqxtun2 v4.16b, v19.8h + st1 {v3.16b}, [x3], x1 + st1 {v4.16b}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct16_end + butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a + butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6 + butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5 + butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4 + butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a + butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10 + butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13 + butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a + + dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a + dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 + + butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15] + butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14] + butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] + butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8] + butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13] + butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12] 
+ butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11] + butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10] + ret +.endm + +function idct16 + dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + dmbutterfly v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + dmbutterfly v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a + + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_half + dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly_h1 v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly_h1 v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly_h2 v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + dmbutterfly_h1 v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + 
dmbutterfly_h2 v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly_h1 v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly_h2 v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a + + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_quarter + dsmull_h v24, v25, v19, v1.h[7] + dsmull_h v4, v5, v17, v1.h[0] + dsmull_h v7, v6, v18, v0.h[5] + dsmull_h v30, v31, v18, v0.h[4] + neg v24.4s, v24.4s + neg v25.4s, v25.4s + dsmull_h v29, v28, v17, v1.h[1] + dsmull_h v26, v27, v19, v1.h[6] + dsmull_h v22, v23, v16, v0.h[0] + drshrn_h v24, v24, v25, #14 + drshrn_h v16, v4, v5, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v6, v30, v31, #14 + drshrn_h v29, v29, v28, #14 + drshrn_h v17, v26, v27, #14 + drshrn_h v28, v22, v23, #14 + + dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3] + dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3] + neg v22.4s, v22.4s + neg v23.4s, v23.4s + drshrn_h v27, v20, v21, #14 + drshrn_h v21, v22, v23, #14 + drshrn_h v23, v18, v19, #14 + drshrn_h v25, v30, v31, #14 + mov v4.16b, v28.16b + mov v5.16b, v28.16b + dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31 + mov v20.16b, v28.16b + idct16_end +endfunc + +function iadst16 + ld1 {v0.8h,v1.8h}, [x11] + + dmbutterfly_l v6, 
v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0 + dmbutterfly_l v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8 + dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a + dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2 + dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a + + dmbutterfly_l v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10 + dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a + dmbutterfly_l v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4 + dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a + + dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12 + dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a + dmbutterfly_l v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6 + dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a + + dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14 + ld1 {v0.8h}, [x10] + dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a + dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8 + dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a + + dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13 + dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a + dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10 + butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0 + dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a + + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15 + 
butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1 + dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a + dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a + + butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2 + butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3 + + dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12 + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15 + + dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a + dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a + neg v29.8h, v29.8h // v29 = out[13] + + dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a + dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a + + butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a + butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10 + + dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 + neg v19.8h, v19.8h // v19 = out[3] + dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 + + butterfly_8h v5, v8, v20, v22 // v5 =-out[15],v8 = t3a + butterfly_8h v4, v9, v24, v26 // v4 = out[14],v9 = t11 + + dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] + dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] + dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] + dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] + + neg v31.8h, v5.8h // v31 = out[15] + neg v17.8h, v3.8h // v17 = out[1] + + mov v16.16b, v2.16b + mov v30.16b, v4.16b + ret +endfunc + +// Helper macros; we can't use these expressions directly within +// e.g. 
.irp due to the extra concatenation \(). Therefore wrap +// them in macros to allow using .irp below. +.macro load i, src, inc + ld1 {v\i\().8h}, [\src], \inc +.endm +.macro store i, dst, inc + st1 {v\i\().8h}, [\dst], \inc +.endm +.macro movi_v i, size, imm + movi v\i\()\size, \imm +.endm +.macro load_clear i, src, inc + ld1 {v\i\().8h}, [\src] + st1 {v2.8h}, [\src], \inc +.endm + +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 + srshr \coef0, \coef0, #6 + ld1 {v2.8b}, [x0], x1 + srshr \coef1, \coef1, #6 + ld1 {v3.8b}, [x3], x1 + srshr \coef2, \coef2, #6 + ld1 {v4.8b}, [x0], x1 + srshr \coef3, \coef3, #6 + uaddw \coef0, \coef0, v2.8b + ld1 {v5.8b}, [x3], x1 + uaddw \coef1, \coef1, v3.8b + srshr \coef4, \coef4, #6 + ld1 {v6.8b}, [x0], x1 + srshr \coef5, \coef5, #6 + ld1 {v7.8b}, [x3], x1 + sqxtun v2.8b, \coef0 + srshr \coef6, \coef6, #6 + sqxtun v3.8b, \coef1 + srshr \coef7, \coef7, #6 + uaddw \coef2, \coef2, v4.8b + ld1 {\tmp1}, [x0], x1 + uaddw \coef3, \coef3, v5.8b + ld1 {\tmp2}, [x3], x1 + sqxtun v4.8b, \coef2 + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + sqxtun v5.8b, \coef3 + uaddw \coef4, \coef4, v6.8b + st1 {v2.8b}, [x0], x1 + uaddw \coef5, \coef5, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v6.8b, \coef4 + st1 {v4.8b}, [x0], x1 + sqxtun v7.8b, \coef5 + st1 {v5.8b}, [x3], x1 + uaddw \coef6, \coef6, \tmp1 + st1 {v6.8b}, [x0], x1 + uaddw \coef7, \coef7, \tmp2 + st1 {v7.8b}, [x3], x1 + sqxtun \tmp1, \coef6 + sqxtun \tmp2, \coef7 + st1 {\tmp1}, [x0], x1 + st1 {\tmp2}, [x3], x1 +.endm + +// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, +// transpose into a horizontal 16x8 slice and store. 
+// x0 = dst (temp buffer) +// x1 = slice offset +// x2 = src +// x9 = input stride +.macro itxfm16_1d_funcs txfm +function \txfm\()16_1d_8x16_pass1_neon + mov x14, x30 + + movi v2.8h, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr + + bl \txfm\()16 + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + cmp x1, #8 + b.eq 1f +.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 + store \i, x0, #16 +.endr + ret x14 +1: + // Special case: For the last input column (x1 == 8), + // which would be stored as the last row in the temp buffer, + // don't store the first 8x8 block, but keep it in registers + // for the first slice of the second pass (where it is the + // last 8x8 block). +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + add x0, x0, #16 + store \i, x0, #16 +.endr + mov v24.16b, v16.16b + mov v25.16b, v17.16b + mov v26.16b, v18.16b + mov v27.16b, v19.16b + mov v28.16b, v20.16b + mov v29.16b, v21.16b + mov v30.16b, v22.16b + mov v31.16b, v23.16b + ret x14 +endfunc + +// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, +// load the destination pixels (from a similar 8x16 slice), add and store back. 
+// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x3 = slice offset +// x9 = temp buffer stride +function \txfm\()16_1d_8x16_pass2_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + cbz x3, 1f +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl \txfm\()16 + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + ret x14 +endfunc +.endm + +itxfm16_1d_funcs idct +itxfm16_1d_funcs iadst + +.macro itxfm_func16x16 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.eq idct16x16_dc_add_neon +.endif + mov x15, x30 + // iadst16 requires clobbering v8-v15, but idct16 doesn't need to. +.ifnc \txfm1\()_\txfm2,idct_idct + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] +.endif + + sub sp, sp, #512 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + movrel x10, idct_coeffs +.ifnc \txfm1\()_\txfm2,idct_idct + movrel x11, iadst16_coeffs +.endif +.ifc \txfm1,idct + ld1 {v0.8h,v1.8h}, [x10] +.endif + mov x9, #32 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #10 + b.le idct16x16_quarter_add_neon + cmp w3, #38 + b.le idct16x16_half_add_neon +.endif + +.irp i, 0, 8 + add x0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i == 8 + cmp w3, #38 + b.le 1f +.endif +.endif + mov x1, #\i + add x2, x6, #(\i*2) + bl \txfm1\()16_1d_8x16_pass1_neon +.endr +.ifc \txfm1\()_\txfm2,iadst_idct + ld1 {v0.8h,v1.8h}, [x10] +.endif + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + // Set v24-v31 to zero, for the in-register passthrough of + // coefficients to pass 2. Since we only do two slices, this can + // only ever happen for the second slice. 
So we only need to store + // zeros to the temp buffer for the second half of the buffer. + // Move x0 to the second half, and use x9 == 32 as increment. + add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + movi_v \i, .16b, #0 + st1 {v24.8h}, [x0], x9 +.endr +3: +.endif + +.irp i, 0, 8 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + mov x3, #\i + bl \txfm2\()16_1d_8x16_pass2_neon +.endr + + add sp, sp, #512 +.ifnc \txfm1\()_\txfm2,idct_idct + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 +.endif + ret x15 +endfunc +.endm + +itxfm_func16x16 idct, idct +itxfm_func16x16 iadst, idct +itxfm_func16x16 idct, iadst +itxfm_func16x16 iadst, iadst + +function idct16_1d_8x16_pass1_quarter_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr + + bl idct16_quarter + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. + // Since only a 4x4 part of the input was nonzero, this means that + // only 4 rows are nonzero after transposing, and the second pass + // only reads the topmost 4 rows. Therefore only store the topmost + // 4 rows. 
+ add x0, x0, #16 +.irp i, 24, 25, 26, 27 + store \i, x0, x9 +.endr + ret x14 +endfunc + +function idct16_1d_8x16_pass2_quarter_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_quarter + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + ret x14 +endfunc + +function idct16_1d_8x16_pass1_half_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr + + bl idct16_half + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. 
+ add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x0, x9 +.endr + ret x14 +endfunc + +function idct16_1d_8x16_pass2_half_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_half + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + ret x14 +endfunc + +.macro idct16_partial size +function idct16x16_\size\()_add_neon + add x0, sp, #(0*32) + add x2, x6, #(0*2) + bl idct16_1d_8x16_pass1_\size\()_neon +.irp i, 0, 8 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + mov x3, #\i + bl idct16_1d_8x16_pass2_\size\()_neon +.endr + + add sp, sp, #512 + ret x15 +endfunc +.endm + +idct16_partial quarter +idct16_partial half + +function idct32x32_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + dup v2.8h, v2.h[0] + st1 {v1.h}[0], [x2] + + srshr v0.8h, v2.8h, #6 + + mov x3, x0 + mov x4, #32 +1: + // Loop to add the constant v0 into all 32x32 outputs + subs x4, x4, #2 + ld1 {v1.16b,v2.16b}, [x0], x1 + uaddw v16.8h, v0.8h, v1.8b + uaddw2 v17.8h, v0.8h, v1.16b + ld1 {v3.16b,v4.16b}, [x0], x1 + uaddw v18.8h, v0.8h, v2.8b + uaddw2 v19.8h, v0.8h, v2.16b + uaddw v20.8h, v0.8h, v3.8b + uaddw2 v21.8h, v0.8h, v3.16b + uaddw v22.8h, v0.8h, v4.8b + uaddw2 v23.8h, v0.8h, v4.16b + sqxtun v1.8b, v16.8h + sqxtun2 v1.16b, v17.8h + sqxtun v2.8b, v18.8h + sqxtun2 v2.16b, v19.8h + sqxtun v3.8b, v20.8h + sqxtun2 v3.16b, v21.8h + st1 {v1.16b,v2.16b}, [x3], x1 + sqxtun v4.8b, v22.8h + sqxtun2 v4.16b, v23.8h + st1 {v3.16b,v4.16b}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct32_end + butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a + butterfly_8h v17, v20, v23, v20 // 
v17 = t17, v20 = t18 + butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a + butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21 + butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a + butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26 + butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a + butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29 + + dmbutterfly v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a + dmbutterfly v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28 + dmbutterfly v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20 + dmbutterfly v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a + + butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24 + butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a + butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16 + butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a + butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21 + butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a + butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26 + butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20 + + dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20 + dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a + dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22 + dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a + ret +.endm + +function idct32_odd + dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, 
v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_half + dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 
= t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_quarter + dsmull_h v4, v5, v16, v8.h[0] + dsmull_h v28, v29, v19, v8.h[7] + dsmull_h v30, v31, v16, v8.h[1] + dsmull_h v22, v23, v17, v9.h[6] + dsmull_h v7, v6, v17, v9.h[7] + dsmull_h v26, v27, v19, v8.h[6] + dsmull_h v20, v21, v18, v9.h[0] + dsmull_h v24, v25, v18, v9.h[1] + + neg v28.4s, v28.4s + neg v29.4s, v29.4s + neg v7.4s, v7.4s + neg v6.4s, v6.4s + + drshrn_h v4, v4, v5, #14 + drshrn_h v5, v28, v29, #14 + drshrn_h v29, v30, v31, #14 + drshrn_h v28, v22, v23, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v31, v26, v27, #14 + drshrn_h v6, v20, v21, #14 + drshrn_h v30, v24, v25, #14 + + dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5] + dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5] + drshrn_h v23, v16, v17, #14 + drshrn_h v24, v18, v19, #14 + neg v20.4s, v20.4s + neg v21.4s, v21.4s + drshrn_h v27, v27, v26, #14 + drshrn_h v20, v20, v21, #14 + dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7] + drshrn_h v21, v16, v17, #14 + drshrn_h v26, v18, v19, #14 + dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7] + drshrn_h v25, v16, v17, #14 + neg v18.4s, v18.4s + neg v19.4s, v19.4s + drshrn_h v22, v18, v19, #14 + + idct32_end +endfunc + +.macro idct32_funcs suffix +// Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix. 
+// The 32-point IDCT can be decomposed into two 16-point IDCTs; +// a normal IDCT16 with every other input component (the even ones, with +// each output written twice), followed by a separate 16-point IDCT +// of the odd inputs, added/subtracted onto the outputs of the first idct16. +// x0 = dst (temp buffer) +// x1 = unused +// x2 = src +// x9 = double input stride +function idct32_1d_8x32_pass1\suffix\()_neon + mov x14, x30 + movi v2.8h, #0 + + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct16\suffix + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the + // two transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the registers a, b horizontally, followed by the + // same registers b, a mirrored. +.macro store_rev a, b + // There's no rev128 instruction, but we reverse each 64 bit + // half, and then flip them using an ext with 8 bytes offset. 
+ rev64 v3.8h, \b + st1 {\a}, [x0], #16 + rev64 v2.8h, \a + ext v3.16b, v3.16b, v3.16b, #8 + st1 {\b}, [x0], #16 + ext v2.16b, v2.16b, v2.16b, #8 + st1 {v3.8h}, [x0], #16 + st1 {v2.8h}, [x0], #16 +.endm + store_rev v16.8h, v24.8h + store_rev v17.8h, v25.8h + store_rev v18.8h, v26.8h + store_rev v19.8h, v27.8h + store_rev v20.8h, v28.8h + store_rev v21.8h, v29.8h + store_rev v22.8h, v30.8h + store_rev v23.8h, v31.8h + sub x0, x0, #512 +.purgem store_rev + + // Move x2 back to the start of the input, and move + // to the first odd row +.ifb \suffix + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half + sub x2, x2, x9, lsl #3 +.endif + add x2, x2, #64 + + movi v2.8h, #0 + // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct32_odd\suffix + + transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 + transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 + + // Store the registers a, b horizontally, + // adding into the output first, and the mirrored, + // subtracted from the output. 
+.macro store_rev a, b + ld1 {v4.8h}, [x0] + rev64 v3.8h, \b + add v4.8h, v4.8h, \a + rev64 v2.8h, \a + st1 {v4.8h}, [x0], #16 + ext v3.16b, v3.16b, v3.16b, #8 + ld1 {v5.8h}, [x0] + ext v2.16b, v2.16b, v2.16b, #8 + add v5.8h, v5.8h, \b + st1 {v5.8h}, [x0], #16 + ld1 {v6.8h}, [x0] + sub v6.8h, v6.8h, v3.8h + st1 {v6.8h}, [x0], #16 + ld1 {v7.8h}, [x0] + sub v7.8h, v7.8h, v2.8h + st1 {v7.8h}, [x0], #16 +.endm + + store_rev v31.8h, v23.8h + store_rev v30.8h, v22.8h + store_rev v29.8h, v21.8h + store_rev v28.8h, v20.8h + store_rev v27.8h, v19.8h + store_rev v26.8h, v18.8h + store_rev v25.8h, v17.8h + store_rev v24.8h, v16.8h +.purgem store_rev + ret x14 +endfunc + +// This is mostly the same as 8x32_pass1, but without the transpose, +// and use the source as temp buffer between the two idct passes, and +// add into the destination. +// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x7 = negative double temp buffer stride +// x9 = double temp buffer stride +function idct32_1d_8x32_pass2\suffix\()_neon + mov x14, x30 + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + + bl idct16\suffix + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x2, x9 +.endr + + sub x2, x2, x9, lsl #4 + add x2, x2, #64 + + // v16 = IN(1), v17 = IN(3) ... 
v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + sub x2, x2, #64 + + bl idct32_odd\suffix + +.macro load_acc_store a, b, c, d, neg=0 +.if \neg == 0 + ld1 {v4.8h}, [x2], x9 + ld1 {v5.8h}, [x2], x9 + add v4.8h, v4.8h, \a + ld1 {v6.8h}, [x2], x9 + add v5.8h, v5.8h, \b + ld1 {v7.8h}, [x2], x9 + add v6.8h, v6.8h, \c + add v7.8h, v7.8h, \d +.else + ld1 {v4.8h}, [x2], x7 + ld1 {v5.8h}, [x2], x7 + sub v4.8h, v4.8h, \a + ld1 {v6.8h}, [x2], x7 + sub v5.8h, v5.8h, \b + ld1 {v7.8h}, [x2], x7 + sub v6.8h, v6.8h, \c + sub v7.8h, v7.8h, \d +.endif + ld1 {v10.8b}, [x0], x1 + ld1 {v11.8b}, [x0], x1 + srshr v4.8h, v4.8h, #6 + ld1 {v2.8b}, [x0], x1 + srshr v5.8h, v5.8h, #6 + uaddw v4.8h, v4.8h, v10.8b + ld1 {v3.8b}, [x0], x1 + srshr v6.8h, v6.8h, #6 + uaddw v5.8h, v5.8h, v11.8b + srshr v7.8h, v7.8h, #6 + sub x0, x0, x1, lsl #2 + uaddw v6.8h, v6.8h, v2.8b + sqxtun v4.8b, v4.8h + uaddw v7.8h, v7.8h, v3.8b + sqxtun v5.8b, v5.8h + st1 {v4.8b}, [x0], x1 + sqxtun v6.8b, v6.8h + st1 {v5.8b}, [x0], x1 + sqxtun v7.8b, v7.8h + st1 {v6.8b}, [x0], x1 + st1 {v7.8b}, [x0], x1 +.endm + load_acc_store v31.8h, v30.8h, v29.8h, v28.8h + load_acc_store v27.8h, v26.8h, v25.8h, v24.8h + load_acc_store v23.8h, v22.8h, v21.8h, v20.8h + load_acc_store v19.8h, v18.8h, v17.8h, v16.8h + sub x2, x2, x9 + load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1 + load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1 + load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1 + load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1 +.purgem load_acc_store + ret x14 +endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half + +const min_eob_idct_idct_32, align=4 + .short 0, 34, 135, 336 +endconst + 
+function ff_vp9_idct_idct_32x32_add_neon, export=1 + cmp w3, #1 + b.eq idct32x32_dc_add_neon + + movrel x10, idct_coeffs + + mov x15, x30 + + stp d8, d9, [sp, #-0x20]! + stp d10, d11, [sp, #0x10] + + sub sp, sp, #2048 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + // Double stride of the input, since we only read every other line + mov x9, #128 + neg x7, x9 + + ld1 {v0.8h,v1.8h}, [x10], #32 + ld1 {v8.8h,v9.8h}, [x10] + + cmp w3, #34 + b.le idct32x32_quarter_add_neon + cmp w3, #135 + b.le idct32x32_half_add_neon + + movrel x12, min_eob_idct_idct_32, 2 + +.irp i, 0, 8, 16, 24 + add x0, sp, #(\i*64) +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(32 - \i)/4 + b.le 1f +.endif + add x2, x6, #(\i*2) + bl idct32_1d_8x32_pass1_neon +.endr + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64 +.endr + b.ne 2b +3: +.irp i, 0, 8, 16, 24 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + bl idct32_1d_8x32_pass2_neon +.endr + + add sp, sp, #2048 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], #0x20 + + ret x15 +endfunc + +.macro idct32_partial size +function idct32x32_\size\()_add_neon + add x0, sp, #(0*64) + add x2, x6, #(0*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.ifc \size,half + add x0, sp, #(8*64) + add x2, x6, #(8*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.endif +.irp i, 0, 8, 16, 24 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + bl idct32_1d_8x32_pass2_\size\()_neon +.endr + + add sp, sp, #2048 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], #0x20 + + ret x15 +endfunc +.endm + +idct32_partial quarter +idct32_partial half diff --git a/external/ffmpeg-snapshot/libavcodec/vp9dsp_template.c b/external/ffmpeg-snapshot/libavcodec/vp9dsp_template.c new file mode 100644 index 0000000..9e5b251 --- /dev/null +++ b/external/ffmpeg-snapshot/libavcodec/vp9dsp_template.c @@ -0,0 +1,2578 @@ +/* + * 
VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje + * Copyright (C) 2013 Clément Bœsch + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "bit_depth_template.c" +#include "vp9dsp.h" + +#if BIT_DEPTH != 12 + +// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8) +// back with h264pred.[ch] + +static void vert_4x4_c(uint8_t *restrict _dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 p4 = AV_RN4PA(top); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, p4); + AV_WN4PA(dst + stride * 1, p4); + AV_WN4PA(dst + stride * 2, p4); + AV_WN4PA(dst + stride * 3, p4); +} + +static void vert_8x8_c(uint8_t *restrict _dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; +#if BIT_DEPTH == 8 + uint64_t p8 = AV_RN64A(top); +#else + pixel4 p4a = AV_RN4PA(top + 0); + pixel4 p4b = AV_RN4PA(top + 4); +#endif + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { +#if BIT_DEPTH == 8 + AV_WN64A(dst, p8); +#else + AV_WN4PA(dst + 0, p4a); + AV_WN4PA(dst + 4, p4b); +#endif + dst 
+= stride; + } +} + +static void vert_16x16_c(uint8_t *restrict _dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; +#if BIT_DEPTH == 8 + uint64_t p8a = AV_RN64A(top); + uint64_t p8b = AV_RN64A(top + 8); +#else + pixel4 p4a = AV_RN4PA(top + 0); + pixel4 p4b = AV_RN4PA(top + 4); + pixel4 p4c = AV_RN4PA(top + 8); + pixel4 p4d = AV_RN4PA(top + 12); +#endif + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { +#if BIT_DEPTH == 8 + AV_WN64A(dst + 0, p8a); + AV_WN64A(dst + 8, p8b); +#else + AV_WN4PA(dst + 0, p4a); + AV_WN4PA(dst + 4, p4b); + AV_WN4PA(dst + 8, p4c); + AV_WN4PA(dst + 12, p4d); +#endif + dst += stride; + } +} + +static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; +#if BIT_DEPTH == 8 + uint64_t p8a = AV_RN64A(top); + uint64_t p8b = AV_RN64A(top + 8); + uint64_t p8c = AV_RN64A(top + 16); + uint64_t p8d = AV_RN64A(top + 24); +#else + pixel4 p4a = AV_RN4PA(top + 0); + pixel4 p4b = AV_RN4PA(top + 4); + pixel4 p4c = AV_RN4PA(top + 8); + pixel4 p4d = AV_RN4PA(top + 12); + pixel4 p4e = AV_RN4PA(top + 16); + pixel4 p4f = AV_RN4PA(top + 20); + pixel4 p4g = AV_RN4PA(top + 24); + pixel4 p4h = AV_RN4PA(top + 28); +#endif + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { +#if BIT_DEPTH == 8 + AV_WN64A(dst + 0, p8a); + AV_WN64A(dst + 8, p8b); + AV_WN64A(dst + 16, p8c); + AV_WN64A(dst + 24, p8d); +#else + AV_WN4PA(dst + 0, p4a); + AV_WN4PA(dst + 4, p4b); + AV_WN4PA(dst + 8, p4c); + AV_WN4PA(dst + 12, p4d); + AV_WN4PA(dst + 16, p4e); + AV_WN4PA(dst + 20, p4f); + AV_WN4PA(dst + 24, p4g); + AV_WN4PA(dst + 28, p4h); +#endif + dst += stride; + } +} + +static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) 
_left; + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3])); + AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2])); + AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1])); + AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0])); +} + +static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]); + + AV_WN4PA(dst + 0, p4); + AV_WN4PA(dst + 4, p4); + dst += stride; + } +} + +static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]); + + AV_WN4PA(dst + 0, p4); + AV_WN4PA(dst + 4, p4); + AV_WN4PA(dst + 8, p4); + AV_WN4PA(dst + 12, p4); + dst += stride; + } +} + +static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]); + + AV_WN4PA(dst + 0, p4); + AV_WN4PA(dst + 4, p4); + AV_WN4PA(dst + 8, p4); + AV_WN4PA(dst + 12, p4); + AV_WN4PA(dst + 16, p4); + AV_WN4PA(dst + 20, p4); + AV_WN4PA(dst + 24, p4); + AV_WN4PA(dst + 28, p4); + dst += stride; + } +} + +#endif /* BIT_DEPTH != 12 */ + +static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 4; y++) { + int l_m_tl = left[3 - y] - tl; + + dst[0] = av_clip_pixel(top[0] + l_m_tl); + dst[1] = 
av_clip_pixel(top[1] + l_m_tl); + dst[2] = av_clip_pixel(top[2] + l_m_tl); + dst[3] = av_clip_pixel(top[3] + l_m_tl); + dst += stride; + } +} + +static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + int l_m_tl = left[7 - y] - tl; + + dst[0] = av_clip_pixel(top[0] + l_m_tl); + dst[1] = av_clip_pixel(top[1] + l_m_tl); + dst[2] = av_clip_pixel(top[2] + l_m_tl); + dst[3] = av_clip_pixel(top[3] + l_m_tl); + dst[4] = av_clip_pixel(top[4] + l_m_tl); + dst[5] = av_clip_pixel(top[5] + l_m_tl); + dst[6] = av_clip_pixel(top[6] + l_m_tl); + dst[7] = av_clip_pixel(top[7] + l_m_tl); + dst += stride; + } +} + +static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + int l_m_tl = left[15 - y] - tl; + + dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl); + dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl); + dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl); + dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl); + dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl); + dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl); + dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl); + dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl); + dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl); + dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl); + dst[10] = av_clip_pixel(top[10] + l_m_tl); + dst[11] = av_clip_pixel(top[11] + l_m_tl); + dst[12] = av_clip_pixel(top[12] + l_m_tl); + dst[13] = av_clip_pixel(top[13] + l_m_tl); + dst[14] = av_clip_pixel(top[14] + l_m_tl); + dst[15] = av_clip_pixel(top[15] + l_m_tl); + dst += stride; + } +} + +static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride, + 
const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + int l_m_tl = left[31 - y] - tl; + + dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl); + dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl); + dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl); + dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl); + dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl); + dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl); + dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl); + dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl); + dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl); + dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl); + dst[10] = av_clip_pixel(top[10] + l_m_tl); + dst[11] = av_clip_pixel(top[11] + l_m_tl); + dst[12] = av_clip_pixel(top[12] + l_m_tl); + dst[13] = av_clip_pixel(top[13] + l_m_tl); + dst[14] = av_clip_pixel(top[14] + l_m_tl); + dst[15] = av_clip_pixel(top[15] + l_m_tl); + dst[16] = av_clip_pixel(top[16] + l_m_tl); + dst[17] = av_clip_pixel(top[17] + l_m_tl); + dst[18] = av_clip_pixel(top[18] + l_m_tl); + dst[19] = av_clip_pixel(top[19] + l_m_tl); + dst[20] = av_clip_pixel(top[20] + l_m_tl); + dst[21] = av_clip_pixel(top[21] + l_m_tl); + dst[22] = av_clip_pixel(top[22] + l_m_tl); + dst[23] = av_clip_pixel(top[23] + l_m_tl); + dst[24] = av_clip_pixel(top[24] + l_m_tl); + dst[25] = av_clip_pixel(top[25] + l_m_tl); + dst[26] = av_clip_pixel(top[26] + l_m_tl); + dst[27] = av_clip_pixel(top[27] + l_m_tl); + dst[28] = av_clip_pixel(top[28] + l_m_tl); + dst[29] = av_clip_pixel(top[29] + l_m_tl); + dst[30] = av_clip_pixel(top[30] + l_m_tl); + dst[31] = av_clip_pixel(top[31] + l_m_tl); + dst += stride; + } +} + +#if BIT_DEPTH != 12 + +static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + 
pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + + top[0] + top[1] + top[2] + top[3] + 4) >> 3); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, dc); + AV_WN4PA(dst + stride * 1, dc); + AV_WN4PA(dst + stride * 2, dc); + AV_WN4PA(dst + stride * 3, dc); +} + +static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + + left[6] + left[7] + top[0] + top[1] + top[2] + top[3] + + top[4] + top[5] + top[6] + top[7] + 8) >> 4); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + dst += stride; + } +} + +static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] + + left[7] + left[8] + left[9] + left[10] + left[11] + left[12] + + left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] + + top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] + + top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + dst += stride; + } +} + +static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] + + left[7] + left[8] + left[9] + 
left[10] + left[11] + left[12] + + left[13] + left[14] + left[15] + left[16] + left[17] + left[18] + + left[19] + left[20] + left[21] + left[22] + left[23] + left[24] + + left[25] + left[26] + left[27] + left[28] + left[29] + left[30] + + left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] + + top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] + + top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] + + top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] + + top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + AV_WN4PA(dst + 16, dc); + AV_WN4PA(dst + 20, dc); + AV_WN4PA(dst + 24, dc); + AV_WN4PA(dst + 28, dc); + dst += stride; + } +} + +static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, dc); + AV_WN4PA(dst + stride * 1, dc); + AV_WN4PA(dst + stride * 2, dc); + AV_WN4PA(dst + stride * 3, dc); +} + +static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + + left[4] + left[5] + left[6] + left[7] + 4) >> 3); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + dst += stride; + } +} + +static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + 
left[2] + left[3] + left[4] + left[5] + + left[6] + left[7] + left[8] + left[9] + left[10] + left[11] + + left[12] + left[13] + left[14] + left[15] + 8) >> 4); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + dst += stride; + } +} + +static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + + left[6] + left[7] + left[8] + left[9] + left[10] + left[11] + + left[12] + left[13] + left[14] + left[15] + left[16] + left[17] + + left[18] + left[19] + left[20] + left[21] + left[22] + left[23] + + left[24] + left[25] + left[26] + left[27] + left[28] + left[29] + + left[30] + left[31] + 16) >> 5); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + AV_WN4PA(dst + 16, dc); + AV_WN4PA(dst + 20, dc); + AV_WN4PA(dst + 24, dc); + AV_WN4PA(dst + 28, dc); + dst += stride; + } +} + +static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, dc); + AV_WN4PA(dst + stride * 1, dc); + AV_WN4PA(dst + stride * 2, dc); + AV_WN4PA(dst + stride * 3, dc); +} + +static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((top[0] + top[1] + top[2] + top[3] + + top[4] + top[5] + top[6] + top[7] + 4) >> 3); + int y; + + stride /= sizeof(pixel); + for 
(y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + dst += stride; + } +} + +static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] + + top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + + top[12] + top[13] + top[14] + top[15] + 8) >> 4); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + dst += stride; + } +} + +static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] + + top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + + top[12] + top[13] + top[14] + top[15] + top[16] + top[17] + + top[18] + top[19] + top[20] + top[21] + top[22] + top[23] + + top[24] + top[25] + top[26] + top[27] + top[28] + top[29] + + top[30] + top[31] + 16) >> 5); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + AV_WN4PA(dst + 16, dc); + AV_WN4PA(dst + 20, dc); + AV_WN4PA(dst + 24, dc); + AV_WN4PA(dst + 28, dc); + dst += stride; + } +} + +#endif /* BIT_DEPTH != 12 */ + +static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8)); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, val); + AV_WN4PA(dst + stride * 1, val); + AV_WN4PA(dst + stride * 2, val); + AV_WN4PA(dst + stride * 3, val); +} + +static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const 
uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8)); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + dst += stride; + } +} + +static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8)); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + dst += stride; + } +} + +static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8)); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + AV_WN4PA(dst + 16, val); + AV_WN4PA(dst + 20, val); + AV_WN4PA(dst + 24, val); + AV_WN4PA(dst + 28, val); + dst += stride; + } +} + +static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, val); + AV_WN4PA(dst + stride * 1, val); + AV_WN4PA(dst + stride * 2, val); + AV_WN4PA(dst + stride * 3, val);} + +static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + dst += stride; + } +} + +static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = 
PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + dst += stride; + } +} + +static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + AV_WN4PA(dst + 16, val); + AV_WN4PA(dst + 20, val); + AV_WN4PA(dst + 24, val); + AV_WN4PA(dst + 28, val); + dst += stride; + } +} + +static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, val); + AV_WN4PA(dst + stride * 1, val); + AV_WN4PA(dst + stride * 2, val); + AV_WN4PA(dst + stride * 3, val); +} + +static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + dst += stride; + } +} + +static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + dst += stride; + } +} + +static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + 
pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + AV_WN4PA(dst + 16, val); + AV_WN4PA(dst + 20, val); + AV_WN4PA(dst + 24, val); + AV_WN4PA(dst + 28, val); + dst += stride; + } +} + +#if BIT_DEPTH != 12 + +#if BIT_DEPTH == 8 +#define memset_bpc memset +#else +static inline void memset_bpc(uint16_t *dst, int val, int len) { + int n; + for (n = 0; n < len; n++) { + dst[n] = val; + } +} +#endif + +#define DST(x, y) dst[(x) + (y) * stride] + +static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7]; + + stride /= sizeof(pixel); + DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2; + DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2; + DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2; + DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2; + DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2; + DST(3,3) = a7; // note: this is different from vp8 and such +} + +#define def_diag_downleft(size) \ +static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + int i, j; \ + pixel v[size - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) \ + v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \ + v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \ +\ + for (j = 0; j < size; j++) { \ + memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \ + memset_bpc(dst + j*stride + size - 1 - j, 
top[size - 1], j + 1); \ + } \ +} + +def_diag_downleft(8) +def_diag_downleft(16) +def_diag_downleft(32) + +static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + const pixel *left = (const pixel *) _left; + int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0]; + + stride /= sizeof(pixel); + DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2; + DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2; + DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2; + DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2; + DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2; +} + +#define def_diag_downright(size) \ +static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel v[size + size - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + v[i ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \ + v[size + 1 + i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \ + } \ + v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \ + v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \ + v[size ] = (top[-1] + top[0] * 2 + top[ 1] + 2) >> 2; \ +\ + for (j = 0; j < size; j++) \ + memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \ +} + +def_diag_downright(8) +def_diag_downright(16) +def_diag_downright(32) + +static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const 
pixel *) _top; + const pixel *left = (const pixel *) _left; + int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + l0 = left[3], l1 = left[2], l2 = left[1]; + + stride /= sizeof(pixel); + DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2; + DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1; + DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2; + DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1; + DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2; + DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1; + DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(3,0) = (a2 + a3 + 1) >> 1; + DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2; +} + +#define def_vert_right(size) \ +static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size/2 - 2; i++) { \ + vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \ + ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \ + } \ + vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \ + ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \ +\ + ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \ + vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \ + for (i = 0; i < size - 1; i++) { \ + ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \ + vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \ + } \ +\ + for (j = 0; j < size / 2; j++) { \ + memcpy(dst + j*2 *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \ + memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \ + } \ +} + +def_vert_right(8) +def_vert_right(16) +def_vert_right(32) + +static void hor_down_4x4_c(uint8_t 
*_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + const pixel *left = (const pixel *) _left; + int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0], + tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2]; + + stride /= sizeof(pixel); + DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2; + DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1; + DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2; + DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1; + DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2; + DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1; + DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,3) = (l2 + l3 + 1) >> 1; + DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2; +} + +#define def_hor_down(size) \ +static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel v[size * 3 - 2]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + v[i*2 ] = (left[i + 1] + left[i + 0] + 1) >> 1; \ + v[i*2 + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \ + v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \ + } \ + v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \ + v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \ + v[size*2 - 1] = (top[0] + top[-1] * 2 + left[size - 1] + 2) >> 2; \ + v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \ +\ + for (j = 0; j < size; j++) \ + memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \ +} + +def_hor_down(8) +def_hor_down(16) +def_hor_down(32) + +static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) 
_top; + int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + a4 = top[4], a5 = top[5], a6 = top[6]; + + stride /= sizeof(pixel); + DST(0,0) = (a0 + a1 + 1) >> 1; + DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1; + DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2; + DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1; + DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2; + DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1; + DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2; + DST(3,2) = (a4 + a5 + 1) >> 1; + DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2; +} + +#define def_vert_left(size) \ +static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + int i, j; \ + pixel ve[size - 1], vo[size - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + ve[i] = (top[i] + top[i + 1] + 1) >> 1; \ + vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \ + } \ + ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \ + vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \ +\ + for (j = 0; j < size / 2; j++) { \ + memcpy(dst + j*2 * stride, ve + j, (size - j - 1) * sizeof(pixel)); \ + memset_bpc(dst + j*2 * stride + size - j - 1, top[size - 1], j + 1); \ + memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \ + memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \ + } \ +} + +def_vert_left(8) +def_vert_left(16) +def_vert_left(32) + +static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3]; + + stride /= sizeof(pixel); + DST(0,0) = (l0 + l1 + 1) >> 1; + DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1; + DST(1,1) = 
DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2; + DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1; + DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2; + DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3; +} + +#define def_hor_up(size) \ +static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel v[size*2 - 2]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + v[i*2 ] = (left[i] + left[i + 1] + 1) >> 1; \ + v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \ + } \ + v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \ + v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \ +\ + for (j = 0; j < size / 2; j++) \ + memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \ + for (j = size / 2; j < size; j++) { \ + memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \ + memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \ + 2 + j*2 - size); \ + } \ +} + +def_hor_up(8) +def_hor_up(16) +def_hor_up(32) + +#undef DST + +#endif /* BIT_DEPTH != 12 */ + +#if BIT_DEPTH != 8 +void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp); +#endif +#if BIT_DEPTH != 10 +static +#endif +av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp) +{ +#define init_intra_pred_bd_aware(tx, sz) \ + dsp->intra_pred[tx][TM_VP8_PRED] = tm_##sz##_c; \ + dsp->intra_pred[tx][DC_128_PRED] = dc_128_##sz##_c; \ + dsp->intra_pred[tx][DC_127_PRED] = dc_127_##sz##_c; \ + dsp->intra_pred[tx][DC_129_PRED] = dc_129_##sz##_c + +#if BIT_DEPTH == 12 + ff_vp9dsp_intrapred_init_10(dsp); +#define init_intra_pred(tx, sz) \ + init_intra_pred_bd_aware(tx, sz) +#else + #define init_intra_pred(tx, sz) \ + dsp->intra_pred[tx][VERT_PRED] = vert_##sz##_c; \ + dsp->intra_pred[tx][HOR_PRED] = hor_##sz##_c; \ + dsp->intra_pred[tx][DC_PRED] = dc_##sz##_c; \ + 
dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = diag_downleft_##sz##_c; \ + dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \ + dsp->intra_pred[tx][VERT_RIGHT_PRED] = vert_right_##sz##_c; \ + dsp->intra_pred[tx][HOR_DOWN_PRED] = hor_down_##sz##_c; \ + dsp->intra_pred[tx][VERT_LEFT_PRED] = vert_left_##sz##_c; \ + dsp->intra_pred[tx][HOR_UP_PRED] = hor_up_##sz##_c; \ + dsp->intra_pred[tx][LEFT_DC_PRED] = dc_left_##sz##_c; \ + dsp->intra_pred[tx][TOP_DC_PRED] = dc_top_##sz##_c; \ + init_intra_pred_bd_aware(tx, sz) +#endif + + init_intra_pred(TX_4X4, 4x4); + init_intra_pred(TX_8X8, 8x8); + init_intra_pred(TX_16X16, 16x16); + init_intra_pred(TX_32X32, 32x32); + +#undef init_intra_pred +#undef init_intra_pred_bd_aware +} + +#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \ +static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \ + ptrdiff_t stride, \ + int16_t *_block, int eob) \ +{ \ + int i, j; \ + pixel *dst = (pixel *) _dst; \ + dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \ +\ + stride /= sizeof(pixel); \ + if (has_dconly && eob == 1) { \ + const int t = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \ + * 11585 + (1 << 13)) >> 14; \ + block[0] = 0; \ + for (i = 0; i < sz; i++) { \ + for (j = 0; j < sz; j++) \ + dst[j * stride] = av_clip_pixel(dst[j * stride] + \ + (bits ? \ + (int)(t + (1U << (bits - 1))) >> bits : \ + t)); \ + dst++; \ + } \ + return; \ + } \ +\ + for (i = 0; i < sz; i++) \ + type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \ + memset(block, 0, sz * sz * sizeof(*block)); \ + for (i = 0; i < sz; i++) { \ + type_b##sz##_1d(tmp + i, sz, out, 1); \ + for (j = 0; j < sz; j++) \ + dst[j * stride] = av_clip_pixel(dst[j * stride] + \ + (bits ? 
\ + (int)(out[j] + (1U << (bits - 1))) >> bits : \ + out[j])); \ + dst++; \ + } \ +} + +#define itxfm_wrap(sz, bits) \ +itxfm_wrapper(idct, idct, sz, bits, 1) \ +itxfm_wrapper(iadst, idct, sz, bits, 0) \ +itxfm_wrapper(idct, iadst, sz, bits, 0) \ +itxfm_wrapper(iadst, iadst, sz, bits, 0) + +#define IN(x) ((dctint) in[(x) * stride]) + +static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3; + + t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14; + t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14; + t2 = (IN(1) * 6270 - IN(3) * 15137 + (1 << 13)) >> 14; + t3 = (IN(1) * 15137 + IN(3) * 6270 + (1 << 13)) >> 14; + + out[0] = t0 + t3; + out[1] = t1 + t2; + out[2] = t1 - t2; + out[3] = t0 - t3; +} + +static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3; + + t0 = 5283 * IN(0) + 15212 * IN(2) + 9929 * IN(3); + t1 = 9929 * IN(0) - 5283 * IN(2) - 15212 * IN(3); + t2 = 13377 * (IN(0) - IN(2) + IN(3)); + t3 = 13377 * IN(1); + + out[0] = (t0 + t3 + (1 << 13)) >> 14; + out[1] = (t1 + t3 + (1 << 13)) >> 14; + out[2] = (t2 + (1 << 13)) >> 14; + out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14; +} + +itxfm_wrap(4, 4) + +static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a; + + t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14; + t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14; + t2a = (IN(2) * 6270 - IN(6) * 15137 + (1 << 13)) >> 14; + t3a = (IN(2) * 15137 + IN(6) * 6270 + (1 << 13)) >> 14; + t4a = (IN(1) * 3196 - IN(7) * 16069 + (1 << 13)) >> 14; + t5a = (IN(5) * 13623 - IN(3) * 9102 + (1 << 13)) >> 14; + t6a = (IN(5) * 9102 + IN(3) * 13623 + (1 << 13)) >> 14; + t7a = (IN(1) * 16069 + IN(7) * 3196 + (1 << 13)) >> 14; + + t0 = t0a + t3a; + t1 = t1a + t2a; + t2 = t1a - t2a; + t3 = t0a - t3a; + t4 = t4a + t5a; + t5a 
= t4a - t5a; + t7 = t7a + t6a; + t6a = t7a - t6a; + + t5 = ((t6a - t5a) * 11585 + (1 << 13)) >> 14; + t6 = ((t6a + t5a) * 11585 + (1 << 13)) >> 14; + + out[0] = t0 + t7; + out[1] = t1 + t6; + out[2] = t2 + t5; + out[3] = t3 + t4; + out[4] = t3 - t4; + out[5] = t2 - t5; + out[6] = t1 - t6; + out[7] = t0 - t7; +} + +static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a; + + t0a = 16305 * IN(7) + 1606 * IN(0); + t1a = 1606 * IN(7) - 16305 * IN(0); + t2a = 14449 * IN(5) + 7723 * IN(2); + t3a = 7723 * IN(5) - 14449 * IN(2); + t4a = 10394 * IN(3) + 12665 * IN(4); + t5a = 12665 * IN(3) - 10394 * IN(4); + t6a = 4756 * IN(1) + 15679 * IN(6); + t7a = 15679 * IN(1) - 4756 * IN(6); + + t0 = (t0a + t4a + (1 << 13)) >> 14; + t1 = (t1a + t5a + (1 << 13)) >> 14; + t2 = (t2a + t6a + (1 << 13)) >> 14; + t3 = (t3a + t7a + (1 << 13)) >> 14; + t4 = (t0a - t4a + (1 << 13)) >> 14; + t5 = (t1a - t5a + (1 << 13)) >> 14; + t6 = (t2a - t6a + (1 << 13)) >> 14; + t7 = (t3a - t7a + (1 << 13)) >> 14; + + t4a = 15137U * t4 + 6270U * t5; + t5a = 6270U * t4 - 15137U * t5; + t6a = 15137U * t7 - 6270U * t6; + t7a = 6270U * t7 + 15137U * t6; + + out[0] = t0 + t2; + out[7] = -(t1 + t3); + t2 = t0 - t2; + t3 = t1 - t3; + + out[1] = -((dctint)((1U << 13) + t4a + t6a) >> 14); + out[6] = (dctint)((1U << 13) + t5a + t7a) >> 14; + t6 = (dctint)((1U << 13) + t4a - t6a) >> 14; + t7 = (dctint)((1U << 13) + t5a - t7a) >> 14; + + out[3] = -((dctint)((t2 + t3) * 11585U + (1 << 13)) >> 14); + out[4] = (dctint)((t2 - t3) * 11585U + (1 << 13)) >> 14; + out[2] = (dctint)((t6 + t7) * 11585U + (1 << 13)) >> 14; + out[5] = -((dctint)((t6 - t7) * 11585U + (1 << 13)) >> 14); +} + +itxfm_wrap(8, 5) + +static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + dctint t0a, 
t1a, t2a, t3a, t4a, t5a, t6a, t7a; + dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a; + + t0a = (dctint)((IN(0) + IN(8)) * 11585U + (1 << 13)) >> 14; + t1a = (dctint)((IN(0) - IN(8)) * 11585U + (1 << 13)) >> 14; + t2a = (dctint)(IN(4) * 6270U - IN(12) * 15137U + (1 << 13)) >> 14; + t3a = (dctint)(IN(4) * 15137U + IN(12) * 6270U + (1 << 13)) >> 14; + t4a = (dctint)(IN(2) * 3196U - IN(14) * 16069U + (1 << 13)) >> 14; + t7a = (dctint)(IN(2) * 16069U + IN(14) * 3196U + (1 << 13)) >> 14; + t5a = (dctint)(IN(10) * 13623U - IN(6) * 9102U + (1 << 13)) >> 14; + t6a = (dctint)(IN(10) * 9102U + IN(6) * 13623U + (1 << 13)) >> 14; + t8a = (dctint)(IN(1) * 1606U - IN(15) * 16305U + (1 << 13)) >> 14; + t15a = (dctint)(IN(1) * 16305U + IN(15) * 1606U + (1 << 13)) >> 14; + t9a = (dctint)(IN(9) * 12665U - IN(7) * 10394U + (1 << 13)) >> 14; + t14a = (dctint)(IN(9) * 10394U + IN(7) * 12665U + (1 << 13)) >> 14; + t10a = (dctint)(IN(5) * 7723U - IN(11) * 14449U + (1 << 13)) >> 14; + t13a = (dctint)(IN(5) * 14449U + IN(11) * 7723U + (1 << 13)) >> 14; + t11a = (dctint)(IN(13) * 15679U - IN(3) * 4756U + (1 << 13)) >> 14; + t12a = (dctint)(IN(13) * 4756U + IN(3) * 15679U + (1 << 13)) >> 14; + + t0 = t0a + t3a; + t1 = t1a + t2a; + t2 = t1a - t2a; + t3 = t0a - t3a; + t4 = t4a + t5a; + t5 = t4a - t5a; + t6 = t7a - t6a; + t7 = t7a + t6a; + t8 = t8a + t9a; + t9 = t8a - t9a; + t10 = t11a - t10a; + t11 = t11a + t10a; + t12 = t12a + t13a; + t13 = t12a - t13a; + t14 = t15a - t14a; + t15 = t15a + t14a; + + t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14; + t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14; + t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14; + t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14; + t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14; + t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14; + + t0a = t0 + t7; + t1a = t1 + t6a; + t2a = t2 + t5a; + t3a = t3 + t4; + t4 = t3 - t4; + t5 = t2 - t5a; + t6 = t1 - t6a; + t7 
= t0 - t7; + t8a = t8 + t11; + t9 = t9a + t10a; + t10 = t9a - t10a; + t11a = t8 - t11; + t12a = t15 - t12; + t13 = t14a - t13a; + t14 = t14a + t13a; + t15a = t15 + t12; + + t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14; + t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14; + t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14; + t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14; + + out[ 0] = t0a + t15a; + out[ 1] = t1a + t14; + out[ 2] = t2a + t13a; + out[ 3] = t3a + t12; + out[ 4] = t4 + t11; + out[ 5] = t5 + t10a; + out[ 6] = t6 + t9; + out[ 7] = t7 + t8a; + out[ 8] = t7 - t8a; + out[ 9] = t6 - t9; + out[10] = t5 - t10a; + out[11] = t4 - t11; + out[12] = t3a - t12; + out[13] = t2a - t13a; + out[14] = t1a - t14; + out[15] = t0a - t15a; +} + +static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a; + dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a; + + t0 = IN(15) * 16364U + IN(0) * 804U; + t1 = IN(15) * 804U - IN(0) * 16364U; + t2 = IN(13) * 15893U + IN(2) * 3981U; + t3 = IN(13) * 3981U - IN(2) * 15893U; + t4 = IN(11) * 14811U + IN(4) * 7005U; + t5 = IN(11) * 7005U - IN(4) * 14811U; + t6 = IN(9) * 13160U + IN(6) * 9760U; + t7 = IN(9) * 9760U - IN(6) * 13160U; + t8 = IN(7) * 11003U + IN(8) * 12140U; + t9 = IN(7) * 12140U - IN(8) * 11003U; + t10 = IN(5) * 8423U + IN(10) * 14053U; + t11 = IN(5) * 14053U - IN(10) * 8423U; + t12 = IN(3) * 5520U + IN(12) * 15426U; + t13 = IN(3) * 15426U - IN(12) * 5520U; + t14 = IN(1) * 2404U + IN(14) * 16207U; + t15 = IN(1) * 16207U - IN(14) * 2404U; + + t0a = (dctint)((1U << 13) + t0 + t8 ) >> 14; + t1a = (dctint)((1U << 13) + t1 + t9 ) >> 14; + t2a = (dctint)((1U << 13) + t2 + t10) >> 14; + t3a = (dctint)((1U << 13) + t3 + t11) >> 14; + t4a = (dctint)((1U << 13) + t4 + t12) >> 14; + t5a = (dctint)((1U << 13) + t5 + t13) >> 14; + 
t6a = (dctint)((1U << 13) + t6 + t14) >> 14; + t7a = (dctint)((1U << 13) + t7 + t15) >> 14; + t8a = (dctint)((1U << 13) + t0 - t8 ) >> 14; + t9a = (dctint)((1U << 13) + t1 - t9 ) >> 14; + t10a = (dctint)((1U << 13) + t2 - t10) >> 14; + t11a = (dctint)((1U << 13) + t3 - t11) >> 14; + t12a = (dctint)((1U << 13) + t4 - t12) >> 14; + t13a = (dctint)((1U << 13) + t5 - t13) >> 14; + t14a = (dctint)((1U << 13) + t6 - t14) >> 14; + t15a = (dctint)((1U << 13) + t7 - t15) >> 14; + + t8 = t8a * 16069U + t9a * 3196U; + t9 = t8a * 3196U - t9a * 16069U; + t10 = t10a * 9102U + t11a * 13623U; + t11 = t10a * 13623U - t11a * 9102U; + t12 = t13a * 16069U - t12a * 3196U; + t13 = t13a * 3196U + t12a * 16069U; + t14 = t15a * 9102U - t14a * 13623U; + t15 = t15a * 13623U + t14a * 9102U; + + t0 = t0a + t4a; + t1 = t1a + t5a; + t2 = t2a + t6a; + t3 = t3a + t7a; + t4 = t0a - t4a; + t5 = t1a - t5a; + t6 = t2a - t6a; + t7 = t3a - t7a; + t8a = (dctint)((1U << 13) + t8 + t12) >> 14; + t9a = (dctint)((1U << 13) + t9 + t13) >> 14; + t10a = (dctint)((1U << 13) + t10 + t14) >> 14; + t11a = (dctint)((1U << 13) + t11 + t15) >> 14; + t12a = (dctint)((1U << 13) + t8 - t12) >> 14; + t13a = (dctint)((1U << 13) + t9 - t13) >> 14; + t14a = (dctint)((1U << 13) + t10 - t14) >> 14; + t15a = (dctint)((1U << 13) + t11 - t15) >> 14; + + t4a = t4 * 15137U + t5 * 6270U; + t5a = t4 * 6270U - t5 * 15137U; + t6a = t7 * 15137U - t6 * 6270U; + t7a = t7 * 6270U + t6 * 15137U; + t12 = t12a * 15137U + t13a * 6270U; + t13 = t12a * 6270U - t13a * 15137U; + t14 = t15a * 15137U - t14a * 6270U; + t15 = t15a * 6270U + t14a * 15137U; + + out[ 0] = t0 + t2; + out[15] = -(t1 + t3); + t2a = t0 - t2; + t3a = t1 - t3; + out[ 3] = -((dctint)((1U << 13) + t4a + t6a) >> 14); + out[12] = (dctint)((1U << 13) + t5a + t7a) >> 14; + t6 = (dctint)((1U << 13) + t4a - t6a) >> 14; + t7 = (dctint)((1U << 13) + t5a - t7a) >> 14; + out[ 1] = -(t8a + t10a); + out[14] = t9a + t11a; + t10 = t8a - t10a; + t11 = t9a - t11a; + out[ 2] = (dctint)((1U << 
13) + t12 + t14) >> 14; + out[13] = -((dctint)((1U << 13) + t13 + t15) >> 14); + t14a = (dctint)((1U << 13) + t12 - t14) >> 14; + t15a = (dctint)((1U << 13) + t13 - t15) >> 14; + + out[ 7] = (dctint)(-(t2a + t3a) * 11585U + (1 << 13)) >> 14; + out[ 8] = (dctint)( (t2a - t3a) * 11585U + (1 << 13)) >> 14; + out[ 4] = (dctint)( (t7 + t6) * 11585U + (1 << 13)) >> 14; + out[11] = (dctint)( (t7 - t6) * 11585U + (1 << 13)) >> 14; + out[ 6] = (dctint)( (t11 + t10) * 11585U + (1 << 13)) >> 14; + out[ 9] = (dctint)( (t11 - t10) * 11585U + (1 << 13)) >> 14; + out[ 5] = (dctint)(-(t14a + t15a) * 11585U + (1 << 13)) >> 14; + out[10] = (dctint)( (t14a - t15a) * 11585U + (1 << 13)) >> 14; +} + +itxfm_wrap(16, 6) + +static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0a = (dctint)((IN(0) + IN(16)) * 11585U + (1 << 13)) >> 14; + dctint t1a = (dctint)((IN(0) - IN(16)) * 11585U + (1 << 13)) >> 14; + dctint t2a = (dctint)(IN( 8) * 6270U - IN(24) * 15137U + (1 << 13)) >> 14; + dctint t3a = (dctint)(IN( 8) * 15137U + IN(24) * 6270U + (1 << 13)) >> 14; + dctint t4a = (dctint)(IN( 4) * 3196U - IN(28) * 16069U + (1 << 13)) >> 14; + dctint t7a = (dctint)(IN( 4) * 16069U + IN(28) * 3196U + (1 << 13)) >> 14; + dctint t5a = (dctint)(IN(20) * 13623U - IN(12) * 9102U + (1 << 13)) >> 14; + dctint t6a = (dctint)(IN(20) * 9102U + IN(12) * 13623U + (1 << 13)) >> 14; + dctint t8a = (dctint)(IN( 2) * 1606U - IN(30) * 16305U + (1 << 13)) >> 14; + dctint t15a = (dctint)(IN( 2) * 16305U + IN(30) * 1606U + (1 << 13)) >> 14; + dctint t9a = (dctint)(IN(18) * 12665U - IN(14) * 10394U + (1 << 13)) >> 14; + dctint t14a = (dctint)(IN(18) * 10394U + IN(14) * 12665U + (1 << 13)) >> 14; + dctint t10a = (dctint)(IN(10) * 7723U - IN(22) * 14449U + (1 << 13)) >> 14; + dctint t13a = (dctint)(IN(10) * 14449U + IN(22) * 7723U + (1 << 13)) >> 14; + dctint t11a = (dctint)(IN(26) * 15679U - IN( 6) * 4756U + (1 << 13)) >> 14; + dctint t12a = (dctint)(IN(26) * 
4756U + IN( 6) * 15679U + (1 << 13)) >> 14; + dctint t16a = (dctint)(IN( 1) * 804U - IN(31) * 16364U + (1 << 13)) >> 14; + dctint t31a = (dctint)(IN( 1) * 16364U + IN(31) * 804U + (1 << 13)) >> 14; + dctint t17a = (dctint)(IN(17) * 12140U - IN(15) * 11003U + (1 << 13)) >> 14; + dctint t30a = (dctint)(IN(17) * 11003U + IN(15) * 12140U + (1 << 13)) >> 14; + dctint t18a = (dctint)(IN( 9) * 7005U - IN(23) * 14811U + (1 << 13)) >> 14; + dctint t29a = (dctint)(IN( 9) * 14811U + IN(23) * 7005U + (1 << 13)) >> 14; + dctint t19a = (dctint)(IN(25) * 15426U - IN( 7) * 5520U + (1 << 13)) >> 14; + dctint t28a = (dctint)(IN(25) * 5520U + IN( 7) * 15426U + (1 << 13)) >> 14; + dctint t20a = (dctint)(IN( 5) * 3981U - IN(27) * 15893U + (1 << 13)) >> 14; + dctint t27a = (dctint)(IN( 5) * 15893U + IN(27) * 3981U + (1 << 13)) >> 14; + dctint t21a = (dctint)(IN(21) * 14053U - IN(11) * 8423U + (1 << 13)) >> 14; + dctint t26a = (dctint)(IN(21) * 8423U + IN(11) * 14053U + (1 << 13)) >> 14; + dctint t22a = (dctint)(IN(13) * 9760U - IN(19) * 13160U + (1 << 13)) >> 14; + dctint t25a = (dctint)(IN(13) * 13160U + IN(19) * 9760U + (1 << 13)) >> 14; + dctint t23a = (dctint)(IN(29) * 16207U - IN( 3) * 2404U + (1 << 13)) >> 14; + dctint t24a = (dctint)(IN(29) * 2404U + IN( 3) * 16207U + (1 << 13)) >> 14; + + dctint t0 = t0a + t3a; + dctint t1 = t1a + t2a; + dctint t2 = t1a - t2a; + dctint t3 = t0a - t3a; + dctint t4 = t4a + t5a; + dctint t5 = t4a - t5a; + dctint t6 = t7a - t6a; + dctint t7 = t7a + t6a; + dctint t8 = t8a + t9a; + dctint t9 = t8a - t9a; + dctint t10 = t11a - t10a; + dctint t11 = t11a + t10a; + dctint t12 = t12a + t13a; + dctint t13 = t12a - t13a; + dctint t14 = t15a - t14a; + dctint t15 = t15a + t14a; + dctint t16 = t16a + t17a; + dctint t17 = t16a - t17a; + dctint t18 = t19a - t18a; + dctint t19 = t19a + t18a; + dctint t20 = t20a + t21a; + dctint t21 = t20a - t21a; + dctint t22 = t23a - t22a; + dctint t23 = t23a + t22a; + dctint t24 = t24a + t25a; + dctint t25 = t24a - t25a; + 
dctint t26 = t27a - t26a; + dctint t27 = t27a + t26a; + dctint t28 = t28a + t29a; + dctint t29 = t28a - t29a; + dctint t30 = t31a - t30a; + dctint t31 = t31a + t30a; + + t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14; + t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14; + t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14; + t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14; + t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14; + t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14; + t17a = (dctint)( t30 * 3196U - t17 * 16069U + (1 << 13)) >> 14; + t30a = (dctint)( t30 * 16069U + t17 * 3196U + (1 << 13)) >> 14; + t18a = (dctint)(-(t29 * 16069U + t18 * 3196U) + (1 << 13)) >> 14; + t29a = (dctint)( t29 * 3196U - t18 * 16069U + (1 << 13)) >> 14; + t21a = (dctint)( t26 * 13623U - t21 * 9102U + (1 << 13)) >> 14; + t26a = (dctint)( t26 * 9102U + t21 * 13623U + (1 << 13)) >> 14; + t22a = (dctint)(-(t25 * 9102U + t22 * 13623U) + (1 << 13)) >> 14; + t25a = (dctint)( t25 * 13623U - t22 * 9102U + (1 << 13)) >> 14; + + t0a = t0 + t7; + t1a = t1 + t6a; + t2a = t2 + t5a; + t3a = t3 + t4; + t4a = t3 - t4; + t5 = t2 - t5a; + t6 = t1 - t6a; + t7a = t0 - t7; + t8a = t8 + t11; + t9 = t9a + t10a; + t10 = t9a - t10a; + t11a = t8 - t11; + t12a = t15 - t12; + t13 = t14a - t13a; + t14 = t14a + t13a; + t15a = t15 + t12; + t16a = t16 + t19; + t17 = t17a + t18a; + t18 = t17a - t18a; + t19a = t16 - t19; + t20a = t23 - t20; + t21 = t22a - t21a; + t22 = t22a + t21a; + t23a = t23 + t20; + t24a = t24 + t27; + t25 = t25a + t26a; + t26 = t25a - t26a; + t27a = t24 - t27; + t28a = t31 - t28; + t29 = t30a - t29a; + t30 = t30a + t29a; + t31a = t31 + t28; + + t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14; + t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14; + t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14; + t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14; + t18a = (dctint)( t29 * 6270U - t18 * 15137U + (1 << 13)) 
>> 14; + t29a = (dctint)( t29 * 15137U + t18 * 6270U + (1 << 13)) >> 14; + t19 = (dctint)( t28a * 6270U - t19a * 15137U + (1 << 13)) >> 14; + t28 = (dctint)( t28a * 15137U + t19a * 6270U + (1 << 13)) >> 14; + t20 = (dctint)(-(t27a * 15137U + t20a * 6270U) + (1 << 13)) >> 14; + t27 = (dctint)( t27a * 6270U - t20a * 15137U + (1 << 13)) >> 14; + t21a = (dctint)(-(t26 * 15137U + t21 * 6270U) + (1 << 13)) >> 14; + t26a = (dctint)( t26 * 6270U - t21 * 15137U + (1 << 13)) >> 14; + + t0 = t0a + t15a; + t1 = t1a + t14; + t2 = t2a + t13a; + t3 = t3a + t12; + t4 = t4a + t11; + t5a = t5 + t10a; + t6a = t6 + t9; + t7 = t7a + t8a; + t8 = t7a - t8a; + t9a = t6 - t9; + t10 = t5 - t10a; + t11a = t4a - t11; + t12a = t3a - t12; + t13 = t2a - t13a; + t14a = t1a - t14; + t15 = t0a - t15a; + t16 = t16a + t23a; + t17a = t17 + t22; + t18 = t18a + t21a; + t19a = t19 + t20; + t20a = t19 - t20; + t21 = t18a - t21a; + t22a = t17 - t22; + t23 = t16a - t23a; + t24 = t31a - t24a; + t25a = t30 - t25; + t26 = t29a - t26a; + t27a = t28 - t27; + t28a = t28 + t27; + t29 = t29a + t26a; + t30a = t30 + t25; + t31 = t31a + t24a; + + t20 = (dctint)((t27a - t20a) * 11585U + (1 << 13)) >> 14; + t27 = (dctint)((t27a + t20a) * 11585U + (1 << 13)) >> 14; + t21a = (dctint)((t26 - t21 ) * 11585U + (1 << 13)) >> 14; + t26a = (dctint)((t26 + t21 ) * 11585U + (1 << 13)) >> 14; + t22 = (dctint)((t25a - t22a) * 11585U + (1 << 13)) >> 14; + t25 = (dctint)((t25a + t22a) * 11585U + (1 << 13)) >> 14; + t23a = (dctint)((t24 - t23 ) * 11585U + (1 << 13)) >> 14; + t24a = (dctint)((t24 + t23 ) * 11585U + (1 << 13)) >> 14; + + out[ 0] = t0 + t31; + out[ 1] = t1 + t30a; + out[ 2] = t2 + t29; + out[ 3] = t3 + t28a; + out[ 4] = t4 + t27; + out[ 5] = t5a + t26a; + out[ 6] = t6a + t25; + out[ 7] = t7 + t24a; + out[ 8] = t8 + t23a; + out[ 9] = t9a + t22; + out[10] = t10 + t21a; + out[11] = t11a + t20; + out[12] = t12a + t19a; + out[13] = t13 + t18; + out[14] = t14a + t17a; + out[15] = t15 + t16; + out[16] = t15 - t16; + out[17] = 
t14a - t17a; + out[18] = t13 - t18; + out[19] = t12a - t19a; + out[20] = t11a - t20; + out[21] = t10 - t21a; + out[22] = t9a - t22; + out[23] = t8 - t23a; + out[24] = t7 - t24a; + out[25] = t6a - t25; + out[26] = t5a - t26a; + out[27] = t4 - t27; + out[28] = t3 - t28a; + out[29] = t2 - t29; + out[30] = t1 - t30a; + out[31] = t0 - t31; +} + +itxfm_wrapper(idct, idct, 32, 6, 1) + +static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + int t0, t1, t2, t3, t4; + + if (pass == 0) { + t0 = IN(0) >> 2; + t1 = IN(3) >> 2; + t2 = IN(1) >> 2; + t3 = IN(2) >> 2; + } else { + t0 = IN(0); + t1 = IN(3); + t2 = IN(1); + t3 = IN(2); + } + + t0 += t2; + t3 -= t1; + t4 = (t0 - t3) >> 1; + t1 = t4 - t1; + t2 = t4 - t2; + t0 -= t1; + t3 += t2; + + out[0] = t0; + out[1] = t1; + out[2] = t2; + out[3] = t3; +} + +itxfm_wrapper(iwht, iwht, 4, 0, 0) + +#undef IN +#undef itxfm_wrapper +#undef itxfm_wrap + +static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp) +{ +#define init_itxfm(tx, sz) \ + dsp->itxfm_add[tx][DCT_DCT] = idct_idct_##sz##_add_c; \ + dsp->itxfm_add[tx][DCT_ADST] = iadst_idct_##sz##_add_c; \ + dsp->itxfm_add[tx][ADST_DCT] = idct_iadst_##sz##_add_c; \ + dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c + +#define init_idct(tx, nm) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c + + init_itxfm(TX_4X4, 4x4); + init_itxfm(TX_8X8, 8x8); + init_itxfm(TX_16X16, 16x16); + init_idct(TX_32X32, idct_idct_32x32); + init_idct(4 /* lossless */, iwht_iwht_4x4); + +#undef init_itxfm +#undef init_idct +} + +static av_always_inline void loop_filter(pixel *dst, int E, int I, int H, + ptrdiff_t stridea, ptrdiff_t strideb, + int wd) +{ + int i, F = 1 << (BIT_DEPTH - 8); + + E <<= (BIT_DEPTH - 8); + I <<= (BIT_DEPTH - 8); + H <<= (BIT_DEPTH - 8); + for (i = 0; i < 8; i++, dst += stridea) { + int p7, p6, p5, p4; + int p3 = dst[strideb 
* -4], p2 = dst[strideb * -3]; + int p1 = dst[strideb * -2], p0 = dst[strideb * -1]; + int q0 = dst[strideb * +0], q1 = dst[strideb * +1]; + int q2 = dst[strideb * +2], q3 = dst[strideb * +3]; + int q4, q5, q6, q7; + int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I && + FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I && + FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I && + FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E; + int flat8out, flat8in; + + if (!fm) + continue; + + if (wd >= 16) { + p7 = dst[strideb * -8]; + p6 = dst[strideb * -7]; + p5 = dst[strideb * -6]; + p4 = dst[strideb * -5]; + q4 = dst[strideb * +4]; + q5 = dst[strideb * +5]; + q6 = dst[strideb * +6]; + q7 = dst[strideb * +7]; + + flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F && + FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F && + FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F && + FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F; + } + + if (wd >= 8) + flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F && + FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F && + FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F; + + if (wd >= 16 && flat8out && flat8in) { + dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 + + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4; + dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 + + p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4; + dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 + + p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4; + dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 + + p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4; + dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 + + p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4; + dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4; + dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4; + dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 
+ q6 + q7 + 8) >> 4; + dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4; + dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + + q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + + q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + + q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + + q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4; + } else if (wd >= 8 && flat8in) { + dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3; + dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3; + dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3; + dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3; + dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3; + dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3; + } else { + int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H; + + if (hev) { + int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2; + f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1); + + f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + + dst[strideb * -1] = av_clip_pixel(p0 + f2); + dst[strideb * +0] = av_clip_pixel(q0 - f1); + } else { + int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2; + + f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + + dst[strideb * -1] = av_clip_pixel(p0 + f2); + dst[strideb * +0] = av_clip_pixel(q0 - f1); + + f = (f1 + 1) >> 1; + dst[strideb * -2] = av_clip_pixel(p1 + f); + dst[strideb * +1] = av_clip_pixel(q1 - f); + } + } + } +} + +#define lf_8_fn(dir, wd, stridea, 
strideb) \ +static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \ + ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + pixel *dst = (pixel *) _dst; \ + stride /= sizeof(pixel); \ + loop_filter(dst, E, I, H, stridea, strideb, wd); \ +} + +#define lf_8_fns(wd) \ +lf_8_fn(h, wd, stride, 1) \ +lf_8_fn(v, wd, 1, stride) + +lf_8_fns(4) +lf_8_fns(8) +lf_8_fns(16) + +#undef lf_8_fn +#undef lf_8_fns + +#define lf_16_fn(dir, stridea) \ +static void loop_filter_##dir##_16_16_c(uint8_t *dst, \ + ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \ + loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \ +} + +lf_16_fn(h, stride) +lf_16_fn(v, sizeof(pixel)) + +#undef lf_16_fn + +#define lf_mix_fn(dir, wd1, wd2, stridea) \ +static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \ + ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \ + loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \ +} + +#define lf_mix_fns(wd1, wd2) \ +lf_mix_fn(h, wd1, wd2, stride) \ +lf_mix_fn(v, wd1, wd2, sizeof(pixel)) + +lf_mix_fns(4, 4) +lf_mix_fns(4, 8) +lf_mix_fns(8, 4) +lf_mix_fns(8, 8) + +#undef lf_mix_fn +#undef lf_mix_fns + +static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp) +{ + dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c; + dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c; + dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c; + dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c; + dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c; + dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c; + + dsp->loop_filter_16[0] = loop_filter_h_16_16_c; + dsp->loop_filter_16[1] = loop_filter_v_16_16_c; + + dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c; + dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c; + dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c; + dsp->loop_filter_mix2[0][1][1] = 
loop_filter_v_48_16_c; + dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c; + dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c; + dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c; + dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c; +} + +#if BIT_DEPTH != 12 + +static av_always_inline void copy_c(uint8_t *restrict dst, ptrdiff_t dst_stride, + const uint8_t *restrict src, + ptrdiff_t src_stride, int w, int h) +{ + do { + memcpy(dst, src, w * sizeof(pixel)); + + dst += dst_stride; + src += src_stride; + } while (--h); +} + +static av_always_inline void avg_c(uint8_t *restrict _dst, ptrdiff_t dst_stride, + const uint8_t *restrict _src, + ptrdiff_t src_stride, int w, int h) +{ + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + do { + int x; + + for (x = 0; x < w; x += 4) + AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x]))); + + dst += dst_stride; + src += src_stride; + } while (--h); +} + +#define fpel_fn(type, sz) \ +static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + type##_c(dst, dst_stride, src, src_stride, sz, h); \ +} + +#define copy_avg_fn(sz) \ +fpel_fn(copy, sz) \ +fpel_fn(avg, sz) + +copy_avg_fn(64) +copy_avg_fn(32) +copy_avg_fn(16) +copy_avg_fn(8) +copy_avg_fn(4) + +#undef fpel_fn +#undef copy_avg_fn + +#endif /* BIT_DEPTH != 12 */ + +#define FILTER_8TAP(src, x, F, stride) \ + av_clip_pixel((F[0] * src[x + -3 * stride] + \ + F[1] * src[x + -2 * stride] + \ + F[2] * src[x + -1 * stride] + \ + F[3] * src[x + +0 * stride] + \ + F[4] * src[x + +1 * stride] + \ + F[5] * src[x + +2 * stride] + \ + F[6] * src[x + +3 * stride] + \ + F[7] * src[x + +4 * stride] + 64) >> 7) + +static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, ptrdiff_t ds, + const int16_t 
*filter, int avg) +{ + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1; + } else { + dst[x] = FILTER_8TAP(src, x, filter, ds); + } + + dst += dst_stride; + src += src_stride; + } while (--h); +} + +#define filter_8tap_1d_fn(opn, opa, dir, ds) \ +static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, const int16_t *filter) \ +{ \ + do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \ +} + +filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel)) +filter_8tap_1d_fn(put, 0, h, 1) +filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel)) +filter_8tap_1d_fn(avg, 1, h, 1) + +#undef filter_8tap_1d_fn + +static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, const int16_t *filterx, + const int16_t *filtery, int avg) +{ + int tmp_h = h + 7; + pixel tmp[64 * 71], *tmp_ptr = tmp; + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + src -= src_stride * 3; + do { + int x; + + for (x = 0; x < w; x++) + tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1); + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp + 64 * 3; + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1; + } else { + dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64); + } + + tmp_ptr += 64; + dst += dst_stride; + } while (--h); +} + +#define filter_8tap_2d_fn(opn, opa) \ +static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, const int16_t *filterx, \ + const 
int16_t *filtery) \ +{ \ + do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \ +} + +filter_8tap_2d_fn(put, 0) +filter_8tap_2d_fn(avg, 1) + +#undef filter_8tap_2d_fn + +#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \ +static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \ + ff_vp9_subpel_filters[type_idx][dir_m]); \ +} + +#define filter_fn_2d(sz, type, type_idx, avg) \ +static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \ + ff_vp9_subpel_filters[type_idx][mx], \ + ff_vp9_subpel_filters[type_idx][my]); \ +} + +#if BIT_DEPTH != 12 + +#define FILTER_BILIN(src, x, mxy, stride) \ + (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4)) + +static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, ptrdiff_t ds, int mxy, int avg) +{ + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1; + } else { + dst[x] = FILTER_BILIN(src, x, mxy, ds); + } + + dst += dst_stride; + src += src_stride; + } while (--h); +} + +#define bilin_1d_fn(opn, opa, dir, ds) \ +static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mxy) \ +{ \ + do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \ +} + +bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel)) +bilin_1d_fn(put, 0, h, 1) +bilin_1d_fn(avg, 1, v, src_stride / 
sizeof(pixel)) +bilin_1d_fn(avg, 1, h, 1) + +#undef bilin_1d_fn + +static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, int mx, int my, int avg) +{ + pixel tmp[64 * 65], *tmp_ptr = tmp; + int tmp_h = h + 1; + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + do { + int x; + + for (x = 0; x < w; x++) + tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1); + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp; + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1; + } else { + dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64); + } + + tmp_ptr += 64; + dst += dst_stride; + } while (--h); +} + +#define bilin_2d_fn(opn, opa) \ +static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my) \ +{ \ + do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \ +} + +bilin_2d_fn(put, 0) +bilin_2d_fn(avg, 1) + +#undef bilin_2d_fn + +#define bilinf_fn_1d(sz, dir, dir_m, avg) \ +static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \ +} + +#define bilinf_fn_2d(sz, avg) \ +static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \ +} + +#else + +#define bilinf_fn_1d(a, b, c, d) +#define bilinf_fn_2d(a, b) + +#endif + +#define filter_fn(sz, avg) \ +filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \ +filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \ +filter_fn_2d(sz, regular, 
FILTER_8TAP_REGULAR, avg) \ +filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg) \ +filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg) \ +filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg) \ +filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg) \ +filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg) \ +filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg) \ +bilinf_fn_1d(sz, h, mx, avg) \ +bilinf_fn_1d(sz, v, my, avg) \ +bilinf_fn_2d(sz, avg) + +#define filter_fn_set(avg) \ +filter_fn(64, avg) \ +filter_fn(32, avg) \ +filter_fn(16, avg) \ +filter_fn(8, avg) \ +filter_fn(4, avg) + +filter_fn_set(put) +filter_fn_set(avg) + +#undef filter_fn +#undef filter_fn_set +#undef filter_fn_1d +#undef filter_fn_2d +#undef bilinf_fn_1d +#undef bilinf_fn_2d + +#if BIT_DEPTH != 8 +void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp); +#endif +#if BIT_DEPTH != 10 +static +#endif +av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp) +{ +#if BIT_DEPTH == 12 + ff_vp9dsp_mc_init_10(dsp); +#else /* BIT_DEPTH == 12 */ + +#define init_fpel(idx1, idx2, sz, type) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = type##sz##_c; \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = type##sz##_c + +#define init_copy_avg(idx, sz) \ + init_fpel(idx, 0, sz, copy); \ + init_fpel(idx, 1, sz, avg) + + init_copy_avg(0, 64); + init_copy_avg(1, 32); + init_copy_avg(2, 16); + init_copy_avg(3, 8); + init_copy_avg(4, 4); + +#undef init_copy_avg +#undef init_fpel + +#endif /* BIT_DEPTH == 12 */ + +#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c + +#if BIT_DEPTH == 12 +#define 
init_subpel1 init_subpel1_bd_aware +#else +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \ + init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c +#endif + +#define init_subpel2(idx, idxh, idxv, dir, type) \ + init_subpel1(0, idx, idxh, idxv, 64, dir, type); \ + init_subpel1(1, idx, idxh, idxv, 32, dir, type); \ + init_subpel1(2, idx, idxh, idxv, 16, dir, type); \ + init_subpel1(3, idx, idxh, idxv, 8, dir, type); \ + init_subpel1(4, idx, idxh, idxv, 4, dir, type) + +#define init_subpel3(idx, type) \ + init_subpel2(idx, 1, 1, hv, type); \ + init_subpel2(idx, 0, 1, v, type); \ + init_subpel2(idx, 1, 0, h, type) + + init_subpel3(0, put); + init_subpel3(1, avg); + +#undef init_subpel1 +#undef init_subpel2 +#undef init_subpel3 +#undef init_subpel1_bd_aware +} + +static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, int mx, int my, + int dx, int dy, int avg, + const int16_t (*filters)[8]) +{ + int tmp_h = (((h - 1) * dy + my) >> 4) + 8; + pixel tmp[64 * 135], *tmp_ptr = tmp; + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + src -= src_stride * 3; + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1); + imx += dx; + ioff += imx >> 4; + imx &= 0xf; + } + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp + 64 * 3; + do { + int x; + const int16_t *filter = filters[my]; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1; + } else { + dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64); + } + + my += dy; + tmp_ptr += (my >> 4) * 64; + my &= 0xf; + dst += dst_stride; + } while (--h); +} + +#define scaled_filter_8tap_fn(opn, opa) \ +static av_noinline 
void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy, \ + const int16_t (*filters)[8]) \ +{ \ + do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \ + opa, filters); \ +} + +scaled_filter_8tap_fn(put, 0) +scaled_filter_8tap_fn(avg, 1) + +#undef scaled_filter_8tap_fn + +#undef FILTER_8TAP + +#define scaled_filter_fn(sz, type, type_idx, avg) \ +static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my, int dx, int dy) \ +{ \ + avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \ + ff_vp9_subpel_filters[type_idx]); \ +} + +#if BIT_DEPTH != 12 + +static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, int mx, int my, + int dx, int dy, int avg) +{ + pixel tmp[64 * 129], *tmp_ptr = tmp; + int tmp_h = (((h - 1) * dy + my) >> 4) + 2; + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1); + imx += dx; + ioff += imx >> 4; + imx &= 0xf; + } + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp; + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1; + } else { + dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64); + } + + my += dy; + tmp_ptr += (my >> 4) * 64; + my &= 0xf; + dst += dst_stride; + } while (--h); +} + +#define scaled_bilin_fn(opn, opa) \ +static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy) \ +{ \ + do_scaled_bilin_c(dst, dst_stride, 
src, src_stride, w, h, mx, my, dx, dy, opa); \ +} + +scaled_bilin_fn(put, 0) +scaled_bilin_fn(avg, 1) + +#undef scaled_bilin_fn + +#undef FILTER_BILIN + +#define scaled_bilinf_fn(sz, avg) \ +static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my, int dx, int dy) \ +{ \ + avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \ +} + +#else + +#define scaled_bilinf_fn(a, b) + +#endif + +#define scaled_filter_fns(sz, avg) \ +scaled_filter_fn(sz, regular, FILTER_8TAP_REGULAR, avg) \ +scaled_filter_fn(sz, smooth, FILTER_8TAP_SMOOTH, avg) \ +scaled_filter_fn(sz, sharp, FILTER_8TAP_SHARP, avg) \ +scaled_bilinf_fn(sz, avg) + +#define scaled_filter_fn_set(avg) \ +scaled_filter_fns(64, avg) \ +scaled_filter_fns(32, avg) \ +scaled_filter_fns(16, avg) \ +scaled_filter_fns(8, avg) \ +scaled_filter_fns(4, avg) + +scaled_filter_fn_set(put) +scaled_filter_fn_set(avg) + +#undef scaled_filter_fns +#undef scaled_filter_fn_set +#undef scaled_filter_fn +#undef scaled_bilinf_fn + +#if BIT_DEPTH != 8 +void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp); +#endif +#if BIT_DEPTH != 10 +static +#endif +av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp) +{ +#define init_scaled_bd_aware(idx1, idx2, sz, type) \ + dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \ + dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \ + dsp->smc[idx1][FILTER_8TAP_SHARP ][idx2] = type##_scaled_sharp_##sz##_c + +#if BIT_DEPTH == 12 + ff_vp9dsp_scaled_mc_init_10(dsp); +#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d) +#else +#define init_scaled(idx1, idx2, sz, type) \ + init_scaled_bd_aware(idx1, idx2, sz, type); \ + dsp->smc[idx1][FILTER_BILINEAR ][idx2] = type##_scaled_bilin_##sz##_c +#endif + +#define init_scaled_put_avg(idx, sz) \ + init_scaled(idx, 0, sz, put); \ + init_scaled(idx, 1, sz, avg) + + init_scaled_put_avg(0, 
64); + init_scaled_put_avg(1, 32); + init_scaled_put_avg(2, 16); + init_scaled_put_avg(3, 8); + init_scaled_put_avg(4, 4); + +#undef init_scaled_put_avg +#undef init_scaled +#undef init_scaled_bd_aware +} + +av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp) +{ + FUNC(ff_vp9dsp_intrapred_init)(dsp); + vp9dsp_itxfm_init(dsp); + vp9dsp_loopfilter_init(dsp); + FUNC(ff_vp9dsp_mc_init)(dsp); + FUNC(ff_vp9dsp_scaled_mc_init)(dsp); +} diff --git a/external/ffmpeg-snapshot/libavutil/aarch64/asm.S b/external/ffmpeg-snapshot/libavutil/aarch64/asm.S new file mode 100644 index 0000000..1840f9f --- /dev/null +++ b/external/ffmpeg-snapshot/libavutil/aarch64/asm.S @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __ELF__ +# define ELF +#else +# define ELF # +#endif + +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC # +#endif + +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +#if HAVE_AS_ARCH_DIRECTIVE + .arch AS_ARCH_LEVEL +#endif + +#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE +#define ENABLE_DOTPROD .arch_extension dotprod +#define DISABLE_DOTPROD .arch_extension nodotprod +#else +#define ENABLE_DOTPROD +#define DISABLE_DOTPROD +#endif + +#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE +#define ENABLE_I8MM .arch_extension i8mm +#define DISABLE_I8MM .arch_extension noi8mm +#else +#define ENABLE_I8MM +#define DISABLE_I8MM +#endif + +DISABLE_DOTPROD +DISABLE_I8MM + + +/* Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * + * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to + * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be + * used immediately before saving the LR register (x30) to the stack. + * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring + * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone + * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also + * have the same value at the two points. For example: + * + * .global f + * f: + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. 
Either it, or + * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an + * indirect call target. In particular, all symbols exported from a file must + * begin with one of these macros. For example, a leaf function that does not + * save LR can instead use |AARCH64_VALID_CALL_TARGET|: + * + * .globl return_zero + * return_zero: + * AARCH64_VALID_CALL_TARGET + * mov x0, #0 + * ret + * + * A non-leaf function which does not immediately save LR may need both macros + * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function + * may jump to an alternate implementation before setting up the stack: + * + * .globl with_early_jump + * with_early_jump: + * AARCH64_VALID_CALL_TARGET + * cmp x0, #128 + * b.lt .Lwith_early_jump_128 + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * .Lwith_early_jump_128: + * ... + * ret + * + * These annotations are only required with indirect calls. Private symbols that + * are only the target of direct calls do not require annotations. Also note + * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not + * indirect jumps (BR). Indirect jumps in assembly are supported through + * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and + * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. + * + * Although not necessary, it is safe to use these macros in 32-bit ARM + * assembly. This may be used to simplify dual 32-bit and 64-bit files. 
+ * + * References: + * - "ELF for the Arm® 64-bit Architecture" + * https: *github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst + * - "Providing protection for complex software" + * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + */ +#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) +# define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has BTI +# define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' +# define AARCH64_VALID_JUMP_TARGET hint #38 // BTI 'j' +#else +# define GNU_PROPERTY_AARCH64_BTI 0 // No BTI +# define AARCH64_VALID_CALL_TARGET +# define AARCH64_VALID_JUMP_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) +# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A +# define AARCH64_SIGN_LINK_REGISTER paciasp +# define AARCH64_VALIDATE_LINK_REGISTER autiasp +# elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B +# define AARCH64_SIGN_LINK_REGISTER pacibsp +# define AARCH64_VALIDATE_LINK_REGISTER autibsp +# else +# error Pointer authentication defines no valid key! +# endif +# if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) +# error Authentication of leaf functions is enabled but not supported in FFmpeg! +# endif +# define GNU_PROPERTY_AARCH64_PAC (1 << 1) +#else +# define GNU_PROPERTY_AARCH64_PAC 0 +# define AARCH64_SIGN_LINK_REGISTER +# define AARCH64_VALIDATE_LINK_REGISTER +#endif + + +#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) + .pushsection .note.gnu.property, "a" + .balign 8 + .long 4 + .long 0x10 + .long 0x5 + .asciz "GNU" + .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4 + .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) + .long 0 + .popsection +#endif + +.macro function name, export=0, align=2 + .macro endfunc +ELF .size \name, . 
- \name +FUNC .endfunc + .purgem endfunc + .endm + .text + .align \align + .if \export + .global EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name +EXTERN_ASM\name: + AARCH64_VALID_CALL_TARGET + .else +ELF .type \name, %function +FUNC .func \name +\name: + .endif +.endm + +.macro const name, align=2, relocate=0 + .macro endconst +ELF .size \name, . - \name + .purgem endconst + .endm +#if HAVE_SECTION_DATA_REL_RO +.if \relocate + .section .data.rel.ro +.else + .section .rodata +.endif +#elif defined(_WIN32) + .section .rdata +#elif !defined(__MACH__) + .section .rodata +#else + .const_data +#endif + .align \align +\name: +.endm + +.macro movrel rd, val, offset=0 +#if CONFIG_PIC && defined(__APPLE__) + .if \offset < 0 + adrp \rd, \val@PAGE + add \rd, \rd, \val@PAGEOFF + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset)@PAGE + add \rd, \rd, \val+(\offset)@PAGEOFF + .endif +#elif CONFIG_PIC && defined(_WIN32) + .if \offset < 0 + adrp \rd, \val + add \rd, \rd, :lo12:\val + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset) + add \rd, \rd, :lo12:\val+(\offset) + .endif +#elif CONFIG_PIC +# if __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) +# else + adrp \rd, \val+(\offset) +# endif + add \rd, \rd, :lo12:\val+(\offset) +#else + ldr \rd, =\val+\offset +#endif +.endm + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 diff --git a/src/.gitkeep b/src/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/.gitkeep b/tests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/bench_neon_idct.c b/tests/bench_neon_idct.c new file mode 100644 index 0000000..0ea5556 --- /dev/null +++ b/tests/bench_neon_idct.c @@ -0,0 +1,248 @@ +/* + * Phase 3 — NEON baseline microbench for VP9 8×8 DCT_DCT IDCT add. 
+ * + * Reports two numbers: + * M1 (correctness): bit-exact match rate, our C reference vs + * FFmpeg's NEON, across N random blocks. + * M3 (throughput): NEON sustained MblockS on this host. + * + * Both are gating measurements for Phase 1 (see docs/phase1.md). + * NO QPU work happens here — that's later phases. + * + * Build: see CMakeLists.txt at project root. + * Run: ./bench_neon_idct [--blocks N] [--iters K] [--seed S] + * + * License: BSD-2-Clause (daedalus-fourier), but this binary + * statically links the LGPL-2.1+ FFmpeg NEON snapshot + * — distribute the binary under LGPL-2.1+ in that case. + */ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include +#include +#include +#include + +/* Our C reference (tests/vp9_idct8_ref.c). */ +extern void daedalus_vp9_idct_idct_8x8_add_ref( + uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +/* FFmpeg NEON entry point (vendored vp9itxfm_neon.S). */ +extern void ff_vp9_idct_idct_8x8_add_neon( + uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +/* ---- Random-block generation ----------------------------------- */ + +/* xorshift64 — deterministic per seed, fast enough not to dominate + * the measurement. */ +static uint64_t xs64_state; +static inline uint64_t xs64(void) +{ + uint64_t x = xs64_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs64_state = x; +} + +/* Random VP9-plausible coefficient block: most coefficients zero, + * a handful of nonzero ones in low-frequency positions. Bias chosen + * so eob is typically in [4, 32], hitting the general (non-DC) path. + * For Phase 3 baseline this isn't load-balanced against a real + * bitstream distribution — Phase 7 may revisit. */ +static int gen_block(int16_t block[64]) +{ + memset(block, 0, 64 * sizeof(*block)); + int eob = 0; + int n_nonzero = 1 + (int)(xs64() % 16); + for (int i = 0; i < n_nonzero; i++) { + /* Bias toward low-freq positions via xs64() % (xs64() % 64 + 1). 
*/ + int pos = (int)(xs64() % 64); + /* Coefficient range: signed 12-bit (typical dequant output). */ + int16_t coef = (int16_t)((int)(xs64() % 8192) - 4096); + block[pos] = coef; + if (pos + 1 > eob) eob = pos + 1; + } + if (eob == 0) eob = 1; + return eob; +} + +static void gen_pred(uint8_t pred[64]) +{ + for (int i = 0; i < 64; i++) + pred[i] = (uint8_t)(xs64() & 0xff); +} + +/* ---- Wall-clock timing (CLOCK_MONOTONIC_RAW) ------------------- */ + +static double now_seconds(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +/* ---- Phase 1 M1: bit-exact gate -------------------------------- */ + +static int correctness_check(uint64_t seed, int n_blocks) +{ + xs64_state = seed ? seed : 0xdeadbeefcafebabeULL; + int mismatches = 0; + int dc_only_seen = 0; + + int16_t block_a[64], block_b[64]; + uint8_t pred[64]; + uint8_t dst_a[64], dst_b[64]; + + for (int i = 0; i < n_blocks; i++) { + int eob = gen_block(block_a); + memcpy(block_b, block_a, sizeof(block_a)); + gen_pred(pred); + memcpy(dst_a, pred, 64); + memcpy(dst_b, pred, 64); + + daedalus_vp9_idct_idct_8x8_add_ref(dst_a, 8, block_a, eob); + ff_vp9_idct_idct_8x8_add_neon(dst_b, 8, block_b, eob); + + if (memcmp(dst_a, dst_b, 64) != 0) { + if (mismatches < 4) { + fprintf(stderr, "MISMATCH block %d eob=%d:\n", i, eob); + for (int r = 0; r < 8; r++) { + fprintf(stderr, " row %d ref ", r); + for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_a[r * 8 + c]); + fprintf(stderr, " neon "); + for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_b[r * 8 + c]); + fprintf(stderr, "\n"); + } + } + mismatches++; + } + if (eob == 1) dc_only_seen++; + } + + printf("M1 correctness: %d / %d blocks bit-exact match (%.4f%%)\n", + n_blocks - mismatches, n_blocks, + 100.0 * (n_blocks - mismatches) / n_blocks); + printf(" dc-only path frequency: %d / %d (%.2f%%)\n", + dc_only_seen, n_blocks, 100.0 * dc_only_seen / n_blocks); + return mismatches; +} + +/* ---- 
Phase 1 M3: NEON throughput ------------------------------- */ + +static void throughput_neon(uint64_t seed, int n_blocks, int iters) +{ + xs64_state = seed ? seed : 0xfeedfacecafebeefULL; + + /* Pre-generate all blocks + preds so generation cost is excluded + * from the timed region. Each block is consumed once per iteration + * (NEON path zeroes the block, so we restore from the master). */ + int16_t *blocks_master = malloc(n_blocks * 64 * sizeof(int16_t)); + int16_t *blocks_work = malloc(n_blocks * 64 * sizeof(int16_t)); + uint8_t *preds = malloc(n_blocks * 64); + uint8_t *dsts = malloc(n_blocks * 64); + int *eobs = malloc(n_blocks * sizeof(int)); + if (!blocks_master || !blocks_work || !preds || !dsts || !eobs) { + fprintf(stderr, "alloc failed\n"); + exit(1); + } + + for (int i = 0; i < n_blocks; i++) { + eobs[i] = gen_block(blocks_master + i * 64); + gen_pred(preds + i * 64); + } + + /* Warm-up. */ + memcpy(blocks_work, blocks_master, n_blocks * 64 * sizeof(int16_t)); + memcpy(dsts, preds, n_blocks * 64); + for (int i = 0; i < n_blocks; i++) + ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8, + blocks_work + i * 64, eobs[i]); + + /* Timed region. */ + double t0 = now_seconds(); + for (int it = 0; it < iters; it++) { + memcpy(blocks_work, blocks_master, n_blocks * 64 * sizeof(int16_t)); + memcpy(dsts, preds, n_blocks * 64); + for (int i = 0; i < n_blocks; i++) + ff_vp9_idct_idct_8x8_add_neon(dsts + i * 64, 8, + blocks_work + i * 64, eobs[i]); + } + double t1 = now_seconds(); + + /* memcpy cost-only run, to subtract setup overhead. 
*/ + double s0 = now_seconds(); + for (int it = 0; it < iters; it++) { + memcpy(blocks_work, blocks_master, n_blocks * 64 * sizeof(int16_t)); + memcpy(dsts, preds, n_blocks * 64); + } + double s1 = now_seconds(); + + double total_seconds = (t1 - t0) - (s1 - s0); + double total_blocks = (double) n_blocks * iters; + double mblocks_s = total_blocks / total_seconds / 1e6; + + printf("M3 NEON throughput:\n"); + printf(" blocks=%d iters=%d total=%.0f\n", n_blocks, iters, total_blocks); + printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", total_seconds); + printf(" elapsed (setup) =%.6f s\n", s1 - s0); + printf(" throughput = %.3f Mblock/s\n", mblocks_s); + printf(" per-block = %.1f ns\n", total_seconds / total_blocks * 1e9); + + /* Equivalent at 1920x1080: 32 400 blocks/frame -> FPS. */ + printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n", + mblocks_s * 1e6 / 32400.0); + + free(blocks_master); free(blocks_work); free(preds); + free(dsts); free(eobs); +} + +/* ---- CLI ------------------------------------------------------- */ + +static void usage(const char *p) +{ + fprintf(stderr, + "Usage: %s [--blocks N] [--iters K] [--seed S] [--no-correctness]\n" + "Defaults: N=1000000, K=10, S=0 (uses fixed default).\n", p); +} + +int main(int argc, char **argv) +{ + int n_blocks = 1000000; + int iters = 10; + uint64_t seed = 0; + int do_correctness = 1; + + static struct option opts[] = { + {"blocks", required_argument, 0, 'b'}, + {"iters", required_argument, 0, 'i'}, + {"seed", required_argument, 0, 's'}, + {"no-correctness", no_argument, 0, 'C'}, + {"help", no_argument, 0, 'h'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "b:i:s:Ch", opts, 0)) != -1;) { + switch (c) { + case 'b': n_blocks = atoi(optarg); break; + case 'i': iters = atoi(optarg); break; + case 's': seed = strtoull(optarg, 0, 0); break; + case 'C': do_correctness = 0; break; + case 'h': usage(argv[0]); return 0; + default: usage(argv[0]); return 2; + } + } + + if (do_correctness) { + 
printf("=== M1: bit-exact correctness (10000 random blocks) ===\n"); + int miss = correctness_check(seed, 10000); + if (miss != 0) { + fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); + return 1; + } + printf("\n"); + } + + printf("=== M3: NEON throughput ===\n"); + throughput_neon(seed, n_blocks, iters); + return 0; +} diff --git a/tests/bench_vulkan_dispatch.c b/tests/bench_vulkan_dispatch.c new file mode 100644 index 0000000..657ca6b --- /dev/null +++ b/tests/bench_vulkan_dispatch.c @@ -0,0 +1,279 @@ +/* + * Phase 3 — Vulkan compute dispatch-overhead microbench (M5). + * + * Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa + * v3dv: vkQueueSubmit + vkQueueWaitIdle round-trip cost for a + * noop compute shader. Establishes the floor below which kernel + * batching is mandatory. + * + * Two measurements: + * M5a: empty command-buffer submit (no dispatch at all) + * M5b: 1-workgroup dispatch of an empty shader + * + * The delta M5b - M5a isolates the per-vkCmdDispatch cost from + * the per-vkQueueSubmit cost. + * + * Build: cmake -DDAEDALUS_BUILD_VULKAN=ON .. + * Run: ./bench_vulkan_dispatch [--iters N] [--spv PATH] + * + * License: BSD-2-Clause (daedalus-fourier). 
+ */ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include +#include +#include +#include + +#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \ + fprintf(stderr, "vulkan error %d at %s:%d (%s)\n", r__, __FILE__, __LINE__, #call); \ + exit(1); } } while (0) + +static double now_seconds(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +static uint32_t *read_spv(const char *path, size_t *out_size) +{ + FILE *f = fopen(path, "rb"); + if (!f) { perror(path); exit(1); } + fseek(f, 0, SEEK_END); + long sz = ftell(f); + fseek(f, 0, SEEK_SET); + if (sz <= 0 || (sz & 3)) { + fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz); + exit(1); + } + uint32_t *buf = malloc(sz); + if (!buf || fread(buf, 1, sz, f) != (size_t)sz) { + perror("read"); exit(1); + } + fclose(f); + *out_size = sz; + return buf; +} + +int main(int argc, char **argv) +{ + int iters = 100000; + const char *spv_path = "noop.spv"; + + static struct option opts[] = { + {"iters", required_argument, 0, 'i'}, + {"spv", required_argument, 0, 's'}, + {"help", no_argument, 0, 'h'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) { + switch (c) { + case 'i': iters = atoi(optarg); break; + case 's': spv_path = optarg; break; + case 'h': + fprintf(stderr, + "Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]); + return 0; + default: + return 2; + } + } + + /* ---- Instance ---- */ + VkApplicationInfo app = { + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .pApplicationName = "daedalus-fourier-bench", + .apiVersion = VK_API_VERSION_1_3, + }; + VkInstanceCreateInfo ici = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pApplicationInfo = &app, + }; + VkInstance instance; + CHK(vkCreateInstance(&ici, NULL, &instance)); + + /* ---- Pick V3D physical device (skip llvmpipe) ---- */ + uint32_t pd_count = 0; + CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL)); + 
VkPhysicalDevice *pds = malloc(pd_count * sizeof(*pds)); + CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds)); + VkPhysicalDevice phys = VK_NULL_HANDLE; + VkPhysicalDeviceProperties props = {0}; + for (uint32_t i = 0; i < pd_count; i++) { + vkGetPhysicalDeviceProperties(pds[i], &props); + printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n", + i, props.deviceName, + VK_VERSION_MAJOR(props.apiVersion), + VK_VERSION_MINOR(props.apiVersion), + VK_VERSION_PATCH(props.apiVersion), + props.vendorID); + if (strstr(props.deviceName, "V3D") != NULL) { + phys = pds[i]; + } + } + if (phys == VK_NULL_HANDLE) { + fprintf(stderr, "no V3D device found; bailing.\n"); + return 1; + } + vkGetPhysicalDeviceProperties(phys, &props); + printf("selected: %s\n", props.deviceName); + free(pds); + + /* ---- Compute queue family ---- */ + uint32_t qfc = 0; + vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL); + VkQueueFamilyProperties *qfp = malloc(qfc * sizeof(*qfp)); + vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp); + uint32_t qfi = (uint32_t) -1; + for (uint32_t i = 0; i < qfc; i++) { + if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) { + qfi = i; break; + } + } + if (qfi == (uint32_t) -1) { + fprintf(stderr, "no compute queue family\n"); + return 1; + } + free(qfp); + + /* ---- Logical device ---- */ + float qprio = 1.0f; + VkDeviceQueueCreateInfo dqci = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = qfi, + .queueCount = 1, + .pQueuePriorities = &qprio, + }; + VkDeviceCreateInfo dci = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &dqci, + }; + VkDevice dev; + CHK(vkCreateDevice(phys, &dci, NULL, &dev)); + VkQueue queue; + vkGetDeviceQueue(dev, qfi, 0, &queue); + + /* ---- Command pool + buffers ---- */ + VkCommandPoolCreateInfo cpci = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = 
qfi, + }; + VkCommandPool pool; + CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool)); + + VkCommandBuffer cb_empty, cb_dispatch; + VkCommandBufferAllocateInfo cbai = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty)); + CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch)); + + /* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */ + VkPipelineLayoutCreateInfo plci = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + }; + VkPipelineLayout playout; + CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout)); + + /* ---- Compute pipeline from noop SPIR-V ---- */ + size_t spv_size = 0; + uint32_t *spv = read_spv(spv_path, &spv_size); + VkShaderModuleCreateInfo smci = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .codeSize = spv_size, + .pCode = spv, + }; + VkShaderModule shader; + CHK(vkCreateShaderModule(dev, &smci, NULL, &shader)); + free(spv); + + VkComputePipelineCreateInfo cpci2 = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = shader, + .pName = "main", + }, + .layout = playout, + }; + VkPipeline pipe; + CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe)); + + /* ---- Record both command buffers once, reuse for every iteration ---- */ + VkCommandBufferBeginInfo cbbi = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + }; + + CHK(vkBeginCommandBuffer(cb_empty, &cbbi)); + CHK(vkEndCommandBuffer(cb_empty)); + + CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi)); + vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe); + vkCmdDispatch(cb_dispatch, 1, 1, 1); + CHK(vkEndCommandBuffer(cb_dispatch)); + + VkSubmitInfo si_empty = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + 
.commandBufferCount = 1, .pCommandBuffers = &cb_empty, + }; + VkSubmitInfo si_disp = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .commandBufferCount = 1, .pCommandBuffers = &cb_dispatch, + }; + + /* ---- Warm-up ---- */ + for (int i = 0; i < 100; i++) { + CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE)); + CHK(vkQueueWaitIdle(queue)); + } + + /* ---- M5a: empty CB submit+wait ---- */ + double t0 = now_seconds(); + for (int i = 0; i < iters; i++) { + CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE)); + CHK(vkQueueWaitIdle(queue)); + } + double t1 = now_seconds(); + double m5a_per = (t1 - t0) / iters * 1e6; /* µs */ + + /* ---- M5b: 1-WG noop dispatch submit+wait ---- */ + double t2 = now_seconds(); + for (int i = 0; i < iters; i++) { + CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE)); + CHK(vkQueueWaitIdle(queue)); + } + double t3 = now_seconds(); + double m5b_per = (t3 - t2) / iters * 1e6; /* µs */ + + printf("\n=== M5: Vulkan compute dispatch overhead ===\n"); + printf(" iters per measurement: %d\n", iters); + printf(" M5a empty CB submit+wait: %.2f µs/op\n", m5a_per); + printf(" M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per); + printf(" delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n", + m5b_per - m5a_per); + printf("\n"); + printf(" Implication for kernel batching:\n"); + printf(" if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n"); + printf(" a single-block dispatch costs %.0fx more in overhead\n", + m5b_per * 1e3 / 100.0); + printf(" -> batch at least %.0f blocks per dispatch to break even.\n", + m5b_per * 1e3 / 100.0); + + /* ---- Tear down (minimal — process exit handles the rest) ---- */ + vkDestroyPipeline(dev, pipe, NULL); + vkDestroyShaderModule(dev, shader, NULL); + vkDestroyPipelineLayout(dev, playout, NULL); + vkDestroyCommandPool(dev, pool, NULL); + vkDestroyDevice(dev, NULL); + vkDestroyInstance(instance, NULL); + return 0; +} diff --git a/tests/shaders/noop.comp b/tests/shaders/noop.comp new file 
mode 100644 index 0000000..c2bc9fa --- /dev/null +++ b/tests/shaders/noop.comp @@ -0,0 +1,5 @@ +#version 450 +// Empty compute shader for measuring Vulkan dispatch overhead (M5). +// Reads nothing, writes nothing — pure dispatch round-trip floor. +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +void main() {} diff --git a/tests/vp9_idct8_ref.c b/tests/vp9_idct8_ref.c new file mode 100644 index 0000000..f8219df --- /dev/null +++ b/tests/vp9_idct8_ref.c @@ -0,0 +1,114 @@ +/* + * Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse + * transform + add (8-bit pixels), transcribed from the spec + * structure as represented in FFmpeg's libavcodec/vp9dsp_template.c + * (vendored under external/ffmpeg-snapshot/ at commit f46e514). + * + * Provided as a self-contained translation unit so the harness + * doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro + * expansion. Cross-checked against the vendored reference at + * runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()). + * + * License: LGPL-2.1-or-later (matches the upstream reference). + * + * Spec source: VP9 specification §8.7 — Inverse transform process. + */ +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +/* Q14 trig constants — VP9 spec table 8.7.1.4. */ +#define COSPI_16_64 11585 /* cos(pi/4) * 2^14 */ +#define COSPI_24_64 6270 /* cos(3pi/8) * 2^14 */ +#define COSPI_8_64 15137 /* sin(3pi/8) * 2^14 */ +#define COSPI_28_64 3196 /* cos(7pi/16)* 2^14 */ +#define COSPI_4_64 16069 /* sin(7pi/16)* 2^14 */ +#define COSPI_20_64 9102 /* cos(5pi/16)* 2^14 */ +#define COSPI_12_64 13623 /* sin(5pi/16)* 2^14 */ + +/* Q14 round-shift: (x + (1<<13)) >> 14, with overflow-safe widening. */ +static inline int32_t qround14(int64_t x) +{ + return (int32_t) ((x + (1 << 13)) >> 14); +} + +static inline uint8_t clip_u8(int x) +{ + return (uint8_t) (x < 0 ? 0 : x > 255 ? 255 : x); +} + +/* 1-D 8-point inverse DCT, signed int32 throughout.
Matches + * idct8_1d in libavcodec/vp9dsp_template.c (with the stride + * collapsed to indexed access; identical arithmetic). Reads in[0..7] and writes out[0..7]. Every product is widened to int64_t and reduced by qround14(); note that the bracketed sums and differences (in[0] with in[4], t6p with t5p) are formed in int32_t before the widening cast. */ +static void idct8_1d(const int32_t in[8], int32_t out[8]) +{ + int32_t t0a = qround14((int64_t)(in[0] + in[4]) * COSPI_16_64); /* first stage: t0a..t3a come from the even inputs 0, 4, 2, 6 */ + int32_t t1a = qround14((int64_t)(in[0] - in[4]) * COSPI_16_64); + int32_t t2a = qround14((int64_t)in[2] * COSPI_24_64 - (int64_t)in[6] * COSPI_8_64); + int32_t t3a = qround14((int64_t)in[2] * COSPI_8_64 + (int64_t)in[6] * COSPI_24_64); + int32_t t4a = qround14((int64_t)in[1] * COSPI_28_64 - (int64_t)in[7] * COSPI_4_64); /* t4a..t7a come from the odd inputs 1, 7, 5, 3 */ + int32_t t5a = qround14((int64_t)in[5] * COSPI_12_64 - (int64_t)in[3] * COSPI_20_64); + int32_t t6a = qround14((int64_t)in[5] * COSPI_20_64 + (int64_t)in[3] * COSPI_12_64); + int32_t t7a = qround14((int64_t)in[1] * COSPI_4_64 + (int64_t)in[7] * COSPI_28_64); + + int32_t t0 = t0a + t3a, t1 = t1a + t2a; /* second stage: pairwise sums and differences of the first-stage values */ + int32_t t2 = t1a - t2a, t3 = t0a - t3a; + int32_t t4 = t4a + t5a; + int32_t t5p = t4a - t5a; /* t5p and t6p are intermediates: they still get the COSPI_16_64 scaling below */ + int32_t t7 = t7a + t6a; + int32_t t6p = t7a - t6a; + + int32_t t5 = qround14((int64_t)(t6p - t5p) * COSPI_16_64); /* third stage: scale the difference and the sum of t6p and t5p */ + int32_t t6 = qround14((int64_t)(t6p + t5p) * COSPI_16_64); + + out[0] = t0 + t7; out[1] = t1 + t6; /* out[k] and out[7 - k] are the sum and the difference of the same pair */ + out[2] = t2 + t5; out[3] = t3 + t4; + out[4] = t3 - t4; out[5] = t2 - t5; + out[6] = t1 - t6; out[7] = t0 - t7; +} + +/* Public reference entry point. Signature matches + * ff_vp9_idct_idct_8x8_add_neon. After the call, *block is + * zeroed (matches FFmpeg behaviour): the full path clears all 64 entries, the DC-only path clears only block[0] (presumably the remaining entries are already zero when eob == 1 — TODO confirm against callers). dst is updated in place: each of the 8x8 pixels is read, offset by the transform result, clipped to 0..255 and written back, with rows `stride` elements apart. block is read as block[r * 8 + i]. eob == 1 selects the DC-only path; any other value runs both passes. */ +void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob) +{ + int32_t tmp[64]; /* first-pass results, consumed by the second pass */ + int32_t out[8]; /* output of one idct8_1d call */ + int32_t col[8]; /* input gathered for one idct8_1d call */ + + /* DC-only fast path: (((coef * 11585) Q14) * 11585) Q14, then + * broadcast (+16) >> 5 added to every pixel.
*/ + if (eob == 1) { + int32_t dc = qround14(qround14((int64_t)block[0] * COSPI_16_64) + * (int64_t) COSPI_16_64); + block[0] = 0; + int32_t add = (dc + 16) >> 5; /* same (+16) >> 5 rounding shift the full path applies per pixel */ + for (int r = 0; r < 8; r++) + for (int c = 0; c < 8; c++) + dst[r * stride + c] = clip_u8(dst[r * stride + c] + add); + return; + } + + /* 8 column passes, transposed write: IDCT of block column i lands + * in row i of tmp. This matches FFmpeg's idct_idct_8x8_add_c which + * uses `tmp + i*8` as the column-pass output base — the transpose + * is implicit in the offset pattern, making the row pass below + * read columns of tmp and write columns of dst. */ + for (int i = 0; i < 8; i++) { + for (int r = 0; r < 8; r++) col[r] = block[r * 8 + i]; + idct8_1d(col, out); + for (int r = 0; r < 8; r++) tmp[i * 8 + r] = out[r]; + } + memset(block, 0, 64 * sizeof(*block)); /* the first pass no longer needs block: clear all 64 coefficients */ + + /* 8 row passes: column i of tmp -> column i of dst (matches + * FFmpeg's `dst[j*stride] = out[j]; dst++` pattern). */ + for (int i = 0; i < 8; i++) { + for (int r = 0; r < 8; r++) col[r] = tmp[r * 8 + i]; + idct8_1d(col, out); + for (int r = 0; r < 8; r++) + dst[r * stride + i] = clip_u8(dst[r * stride + i] + + ((out[r] + 16) >> 5)); /* rounding shift, add to the existing pixel, then saturate to 0..255 */ + } +}