Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a092ee34aa | |||
| c01754e849 | |||
| 74687d9def | |||
| 65bd5c3fe3 | |||
| 737e87980d | |||
| 98553278dd | |||
| 0a042a8e95 | |||
| 3ecfc8b0ef | |||
| c154253432 | |||
| b3de96b21c | |||
| 68dccd2911 | |||
| 7d6f106919 |
+54
-2
@@ -284,7 +284,29 @@ if (DAEDALUS_BUILD_VULKAN)
|
|||||||
VERBATIM
|
VERBATIM
|
||||||
)
|
)
|
||||||
|
|
||||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV})
|
set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${H264_IDCT4_SPV}
|
||||||
|
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
|
||||||
|
-o ${H264_IDCT4_SPV}
|
||||||
|
${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
|
||||||
|
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
|
||||||
|
COMMENT "glslang: v3d_h264_idct4.comp -> v3d_h264_idct4.spv"
|
||||||
|
VERBATIM
|
||||||
|
)
|
||||||
|
|
||||||
|
set(H264_IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct8.spv)
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${H264_IDCT8_SPV}
|
||||||
|
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
|
||||||
|
-o ${H264_IDCT8_SPV}
|
||||||
|
${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
|
||||||
|
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
|
||||||
|
COMMENT "glslang: v3d_h264_idct8.comp -> v3d_h264_idct8.spv"
|
||||||
|
VERBATIM
|
||||||
|
)
|
||||||
|
|
||||||
|
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV})
|
||||||
|
|
||||||
# v3d_runner — reusable Vulkan plumbing.
|
# v3d_runner — reusable Vulkan plumbing.
|
||||||
add_library(v3d_runner STATIC src/v3d_runner.c)
|
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||||
@@ -412,6 +434,8 @@ if (DAEDALUS_BUILD_VULKAN)
|
|||||||
${LPF8_SPV}
|
${LPF8_SPV}
|
||||||
${CDEF_SPV}
|
${CDEF_SPV}
|
||||||
${H264DEBLOCK_SPV}
|
${H264DEBLOCK_SPV}
|
||||||
|
${H264_IDCT4_SPV}
|
||||||
|
${H264_IDCT8_SPV}
|
||||||
DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
|
DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
@@ -419,9 +443,33 @@ endif()
|
|||||||
# pkg-config file. Vulkan goes in Requires.private (consumer's
|
# pkg-config file. Vulkan goes in Requires.private (consumer's
|
||||||
# pkg-config call gets it via --static). pthread + dl are needed
|
# pkg-config call gets it via --static). pthread + dl are needed
|
||||||
# by the static archive's runtime helpers.
|
# by the static archive's runtime helpers.
|
||||||
|
#
|
||||||
|
# `prefix` is derived from ${pcfiledir} so the .pc is relocatable:
|
||||||
|
# pkg-config substitutes ${pcfiledir} with the directory holding the
|
||||||
|
# .pc at lookup time, and the relative path from
|
||||||
|
# <prefix>/<libdir>/pkgconfig back to <prefix> tells pkg-config the
|
||||||
|
# install prefix without baking it in. This is why
|
||||||
|
# `cmake --install build --prefix /foo` produces a .pc that correctly
|
||||||
|
# resolves `prefix=/foo` instead of baking whatever CMAKE_INSTALL_PREFIX
|
||||||
|
# was at *configure* time (default /usr/local). DESTDIR-staged
|
||||||
|
# installs work too: at runtime pkg-config sees the .pc at its real
|
||||||
|
# install path and computes the right prefix.
|
||||||
|
#
|
||||||
|
# Relative-path depth is computed from CMAKE_INSTALL_LIBDIR (and
|
||||||
|
# whatever multiarch tuple GNUInstallDirs adds) so Debian-style
|
||||||
|
# `lib/aarch64-linux-gnu/pkgconfig/...` resolves with the right number
|
||||||
|
# of `..` components. Layouts where libdir is *not* under prefix are
|
||||||
|
# not supported by this scheme; if a packager overrides libdir to an
|
||||||
|
# absolute path the relative-path machinery falls back to the absolute
|
||||||
|
# value (CMake's file(RELATIVE_PATH) prepends `..` until they meet),
|
||||||
|
# which is also relocatable but no longer prefix-agnostic.
|
||||||
|
file(RELATIVE_PATH PKGCONFIG_PCDIR_TO_PREFIX
|
||||||
|
"${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig"
|
||||||
|
"${CMAKE_INSTALL_PREFIX}")
|
||||||
|
|
||||||
set(PKGCONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/daedalus-fourier.pc)
|
set(PKGCONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/daedalus-fourier.pc)
|
||||||
file(WRITE ${PKGCONFIG_OUT}
|
file(WRITE ${PKGCONFIG_OUT}
|
||||||
"prefix=${CMAKE_INSTALL_PREFIX}
|
"prefix=\${pcfiledir}/${PKGCONFIG_PCDIR_TO_PREFIX}
|
||||||
exec_prefix=\${prefix}
|
exec_prefix=\${prefix}
|
||||||
libdir=\${prefix}/${CMAKE_INSTALL_LIBDIR}
|
libdir=\${prefix}/${CMAKE_INSTALL_LIBDIR}
|
||||||
includedir=\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}
|
includedir=\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}
|
||||||
@@ -468,6 +516,10 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
|
|||||||
target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
|
target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
|
||||||
target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
|
target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
|
||||||
|
|
||||||
|
add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
|
||||||
|
target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
|
||||||
|
target_compile_options(bench_pool_overhead PRIVATE -O2)
|
||||||
|
|
||||||
if (DAEDALUS_BUILD_VULKAN)
|
if (DAEDALUS_BUILD_VULKAN)
|
||||||
# (re-open the conditional so the closing endif() below balances)
|
# (re-open the conditional so the closing endif() below balances)
|
||||||
|
|
||||||
|
|||||||
@@ -4,9 +4,9 @@
|
|||||||
|
|
||||||
This document is forward-looking. It describes the generalized multi-SoC daedalus daemon architecture, but the immediate work block stays "finish Pi 5". Re-read this when:
|
This document is forward-looking. It describes the generalized multi-SoC daedalus daemon architecture, but the immediate work block stays "finish Pi 5". Re-read this when:
|
||||||
|
|
||||||
- A second aarch64 host without a working kernel-side V4L2 stateless decoder shows up in the fleet (most likely candidate: Pi 4, which has V3D 4.x and no rpivid stable upstream).
|
- HW decode on noether (Pi 4, the user's interactive workstation) becomes a real ask and rpivid upstream is still unstable. This is the most likely trigger — same SoC class as Pi 5 but weaker V3D 4.x, so the caps-file mechanism plus an extra row's worth of substrate measurements.
|
||||||
- A specific working-copy slowdown that the current Pi-5-only daedalus can't address motivates the generalization.
|
- AV1 playback on boltzmann (RK3588) starts mattering. rkvdec doesn't cover AV1, so the daedalus path becomes the only HW-accelerated option, and Mali Valhall compute substrate decisions need their own caps row.
|
||||||
- libva-v4l2-request-fourier evolves to need multi-node negotiation (currently it picks the first matching V4L2 node).
|
- libva-v4l2-request-fourier evolves to need multi-node negotiation (today it picks the first matching V4L2 node; a host with both rkvdec and daedalus-v4l2 nodes wants a preference policy).
|
||||||
|
|
||||||
Until then: this is decision context, not a TODO.
|
Until then: this is decision context, not a TODO.
|
||||||
|
|
||||||
@@ -51,13 +51,17 @@ The mfritsche fleet has heterogeneous aarch64 hardware decoders:
|
|||||||
|
|
||||||
| SoC | Host(s) | H.264 | HEVC | VP9 | AV1 | GPU compute |
|
| SoC | Host(s) | H.264 | HEVC | VP9 | AV1 | GPU compute |
|
||||||
|---|---|---|---|---|---|---|
|
|---|---|---|---|---|---|---|
|
||||||
| BCM2712 (Pi 5) | higgs, broglie | none | V3D7 (rpi-hevc-dec — SPS quirks) | none | none | V3D7 (Vulkan compute, queryable) |
|
| BCM2712 (Pi 5) | higgs, hertz, broglie, tesla (LXD on hertz) | none | V3D7 (rpi-hevc-dec — SPS quirks) | none | none | V3D7 (Vulkan compute, queryable) |
|
||||||
| BCM2711 (Pi 4) | dcw3 | rpivid (out of tree, unstable) | rpivid (out of tree, unstable) | none | none | V3D4 (Vulkan compute, weaker) |
|
| BCM2711 (Pi 4) | noether (interactive workstation), dcw3, dcw2 | rpivid (out of tree, unstable) | rpivid (out of tree, unstable) | none | none | V3D4 (Vulkan compute, weaker) |
|
||||||
| RK3588 | hertz, tesla | rkvdec V4L2 stateless (upstream) | rkvdec V4L2 stateless | rkvdec V4L2 stateless | none (rkvdec lacks AV1) | Mali Valhall (panvk) + RK NPU |
|
| RK3588 | boltzmann (32 GB, kernel-dev / MCP hub, 8 W always-on) | rkvdec V4L2 stateless (upstream) | rkvdec V4L2 stateless | rkvdec V4L2 stateless | none (rkvdec lacks AV1) | Mali Valhall (panvk-bifrost-video in dev) + RK NPU |
|
||||||
| Allwinner H6 | (not in current fleet, but Cedrus exists) | Cedrus V4L2 | Cedrus V4L2 | none | none | Mali Bifrost |
|
| Allwinner H6 | (not in current fleet, but Cedrus exists upstream) | Cedrus V4L2 | Cedrus V4L2 | none | none | Mali Bifrost |
|
||||||
|
|
||||||
No single SoC has a complete codec set. RK3588 lacks AV1; Pi 5 lacks H.264 + VP9 + AV1; Pi 4 has rpivid (out-of-tree, kernel-version-fragile); Allwinner Cedrus is H.264/HEVC only.
|
No single SoC has a complete codec set. RK3588 lacks AV1; Pi 5 lacks H.264 + VP9 + AV1; Pi 4 has rpivid (out-of-tree, kernel-version-fragile); Allwinner Cedrus is H.264/HEVC only.
|
||||||
|
|
||||||
|
A note on the Pi 5 row: hertz and tesla share hardware (tesla is an LXD container hosted on hertz) but are operationally distinct — tesla is the distcc/MCP worker, hertz is the LXD host with all the cron automations and the 17-tool lmcp hub. From a daedalus deployment perspective they count as **one** Pi 5 substrate; from a workflow perspective they're separate boxes.
|
||||||
|
|
||||||
|
A note on noether: it's the user's interactive workstation (Pi 4, BCM2711). Firefox + mpv run here. Any "I want HW decode on my main box" pressure lands first on this host, which puts Pi 4 (V3D4 + maybe-rpivid) closer to the front of the queue than the original draft of this document suggested.
|
||||||
|
|
||||||
The current daedalus model — "kernel substitution + libavcodec front end" — is the right answer for **Pi 5 specifically**, where no usable kernel V4L2 stateless decoder exists for the codecs we care about, and a Vulkan-capable GPU (V3D7) is available to help on a few kernels.
|
The current daedalus model — "kernel substitution + libavcodec front end" — is the right answer for **Pi 5 specifically**, where no usable kernel V4L2 stateless decoder exists for the codecs we care about, and a Vulkan-capable GPU (V3D7) is available to help on a few kernels.
|
||||||
|
|
||||||
The model is **not** the right answer for SoCs that already have working V4L2 stateless decoders for the requested codec — those should be passed through, not re-implemented through libavcodec + kernel substitution.
|
The model is **not** the right answer for SoCs that already have working V4L2 stateless decoders for the requested codec — those should be passed through, not re-implemented through libavcodec + kernel substitution.
|
||||||
@@ -207,15 +211,15 @@ Pass-through plugins are *thin* — they translate the daedalus daemon's wire pr
|
|||||||
|
|
||||||
**Today's calculus:**
|
**Today's calculus:**
|
||||||
|
|
||||||
- Pi 5 daedalus path is the only thing in the fleet that uses daedalus daemon. Generalizing for a single user is overdesign.
|
- Pi 5 (higgs + hertz + broglie + tesla) is **four hosts**, but **one SoC**. Adding a fifth Pi 5 host wouldn't pressure-test the architecture; they all share BCM2712 caps so the substrate decisions are identical across the row.
|
||||||
- RK3588 uses rkvdec directly through libva-v4l2-request-fourier; daedalus daemon is **not in the path** for any RK3588 codec. The "RK3588 support" the architecture above proposes is mostly a no-op routing decision plus an AV1 fallback that doesn't yet measure on Mali.
|
- boltzmann (RK3588) is the only non-Pi-5 always-on host in the fleet, and it uses rkvdec directly through libva-v4l2-request-fourier — daedalus daemon is **not in the path** for any RK3588 codec on it. The "RK3588 support" the architecture above proposes is mostly a no-op routing decision plus an AV1 fallback that doesn't yet measure on Mali. No forcing pressure from boltzmann today.
|
||||||
- Pi 4 with rpivid is the only realistic second motivator. rpivid upstream stability is the gate — if it lands cleanly, Pi 4 takes the pass-through path with no kernel substitution needed. If it stays out-of-tree-fragile, **then** the substrate-composed path with V3D4 + NEON becomes the right backend for Pi 4, and we need the per-SoC caps mechanism to handle V3D4's weaker compute.
|
- noether (Pi 4, this user's interactive workstation) and dcw3/dcw2 (also Pi 4) are the real second-SoC candidates. The gate is rpivid upstream stability: if it lands cleanly, Pi 4 takes the pass-through path with zero kernel substitution work. If it stays out-of-tree-fragile, **then** the substrate-composed path with V3D4 + NEON becomes the right backend for Pi 4, and we need the per-SoC caps mechanism to handle V3D4's weaker compute.
|
||||||
- The recipe layer in daedalus-fourier already scales cleanly. Adding more substrates is incremental, not architectural.
|
- The recipe layer in daedalus-fourier already scales cleanly. Adding more substrates is incremental, not architectural.
|
||||||
|
|
||||||
**The forcing function that flips this from "deferred" to "do it":**
|
**The forcing function that flips this from "deferred" to "do it":**
|
||||||
|
|
||||||
- Pi 4 enters daily use and rpivid is still not stable upstream — implies we need a Pi 4 substrate-composed path, which means at minimum a second caps file and the loader for it. At that point, building the full pluggable scaffold becomes proportionate.
|
- **noether-as-Firefox-host** — the user starts wanting HW decode on their main workstation and rpivid is still not stable upstream. Implies a Pi 4 substrate-composed path, which means at minimum a second caps file and the loader for it. At that point, building the full pluggable scaffold becomes proportionate. This is the most likely trigger; noether is already a daily-driver Pi 4.
|
||||||
- **Or:** an x86 host enters the fleet running mesa-panvk on a Pi-CM5-like board, and we need the daedalus daemon to discover it dynamically rather than being baked at build time.
|
- **boltzmann-as-AV1-decoder** — RK3588 has no AV1 HW decoder, and the user wants AV1 playback there (currently CPU-only). Triggers a cycle-5–equivalent measurement campaign on Mali Valhall to see whether `daedalus_recipe_dispatch_cdef_8x8` (or follow-on AV1 kernels) is worth running on Mali compute. If yes, we need an RK3588 caps file that overrides only the AV1 row while leaving H.264/HEVC/VP9 on rkvdec pass-through.
|
||||||
- **Or:** a third-party Pi 5 user needs to swap shaders for V3D firmware experiments without rebuilding the daemon — at that point dynamic shader loading + caps overrides become a feature ask.
|
- **Or:** a third-party Pi 5 user needs to swap shaders for V3D firmware experiments without rebuilding the daemon — at that point dynamic shader loading + caps overrides become a feature ask.
|
||||||
|
|
||||||
Until one of those happens: keep daedalus daemon Pi 5 specific. Push cross-SoC abstraction *up* to libva-v4l2-request-fourier (which already does most of it) rather than *down* into the daemon.
|
Until one of those happens: keep daedalus daemon Pi 5 specific. Push cross-SoC abstraction *up* to libva-v4l2-request-fourier (which already does most of it) rather than *down* into the daemon.
|
||||||
@@ -242,6 +246,7 @@ Until one of those happens: keep daedalus daemon Pi 5 specific. Push cross-SoC a
|
|||||||
|---|---|---|
|
|---|---|---|
|
||||||
| 2026-05-23 | **Defer generalization.** Finish Pi 5 substitution arc (cycle 9 PR #90 pending), then pivot to bug-fix backlog (daemon SEGV #145, D-state #146) before architecture work. | Architecture pivot is a multi-week scope; Pi 5 path is the only user-visible motivator today; deferring loses nothing because the recipe layer already abstracts kernels and libva-v4l2-request-fourier already abstracts V4L2 nodes. |
|
| 2026-05-23 | **Defer generalization.** Finish Pi 5 substitution arc (cycle 9 PR #90 pending), then pivot to bug-fix backlog (daemon SEGV #145, D-state #146) before architecture work. | Architecture pivot is a multi-week scope; Pi 5 path is the only user-visible motivator today; deferring loses nothing because the recipe layer already abstracts kernels and libva-v4l2-request-fourier already abstracts V4L2 nodes. |
|
||||||
| 2026-05-23 | **Document the design now, even though it's deferred.** | Captures the conceptual gap (shaders ≠ hardware decoders) and the two-backend conclusion while the analysis is fresh; saves re-litigating in 3–6 months. |
|
| 2026-05-23 | **Document the design now, even though it's deferred.** | Captures the conceptual gap (shaders ≠ hardware decoders) and the two-backend conclusion while the analysis is fresh; saves re-litigating in 3–6 months. |
|
||||||
|
| 2026-05-23 | **Correct fleet hardware mapping.** Original draft had hertz/tesla under RK3588 and omitted boltzmann + noether entirely. Verified via `/proc/device-tree/compatible`: hertz + tesla are Pi 5 (BCM2712), noether is Pi 4 (BCM2711), boltzmann is the only RK3588 in the fleet. Adjusted "Why deferred" / forcing-function reasoning accordingly — Pi 5 row is now 4 hosts (one SoC), noether is the realistic Pi 4 trigger, boltzmann is the realistic RK3588 trigger via AV1. | Original draft was speculative on host-to-SoC mapping; verified state changes which forcing functions are credible. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
+292
-61
@@ -40,6 +40,10 @@ struct daedalus_ctx {
|
|||||||
v3d_pipeline cdef_pipe;
|
v3d_pipeline cdef_pipe;
|
||||||
int h264deblock_pipe_ready;
|
int h264deblock_pipe_ready;
|
||||||
v3d_pipeline h264deblock_pipe;
|
v3d_pipeline h264deblock_pipe;
|
||||||
|
int h264_idct4_pipe_ready;
|
||||||
|
v3d_pipeline h264_idct4_pipe;
|
||||||
|
int h264_idct8_pipe_ready;
|
||||||
|
v3d_pipeline h264_idct8_pipe;
|
||||||
};
|
};
|
||||||
|
|
||||||
daedalus_ctx *daedalus_ctx_create(void)
|
daedalus_ctx *daedalus_ctx_create(void)
|
||||||
@@ -53,6 +57,25 @@ daedalus_ctx *daedalus_ctx_create(void)
|
|||||||
|
|
||||||
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
|
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Per the "QPU is default substrate" decree 2026-05-23:
|
||||||
|
* setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
|
||||||
|
* function to a full daedalus_ctx_create(), letting the libavcodec
|
||||||
|
* substitution shims (which call create_no_qpu via pthread_once)
|
||||||
|
* fire the V3D shaders that exist for cycles 1/2/4/5/8. Without
|
||||||
|
* this hook each consumer process (firefox, mpv, daemon) would
|
||||||
|
* need its own shim build to opt into QPU.
|
||||||
|
*
|
||||||
|
* Default behaviour (env var unset / not "1") is unchanged: pure
|
||||||
|
* NEON ctx, no implicit Vulkan init. Firefox / mpv consumers
|
||||||
|
* that dlopen libavcodec without opting in stay on the
|
||||||
|
* Vulkan-free path; the daemon explicitly sets
|
||||||
|
* DAEDALUS_FORCE_QPU=1 before loading libavcodec.
|
||||||
|
*/
|
||||||
|
const char *force = getenv("DAEDALUS_FORCE_QPU");
|
||||||
|
if (force && force[0] == '1' && force[1] == 0)
|
||||||
|
return daedalus_ctx_create();
|
||||||
|
|
||||||
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||||||
if (!ctx) return NULL;
|
if (!ctx) return NULL;
|
||||||
ctx->has_qpu = 0;
|
ctx->has_qpu = 0;
|
||||||
@@ -75,6 +98,8 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
|||||||
if (ctx->mc8h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
|
if (ctx->mc8h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
|
||||||
if (ctx->cdef_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
|
if (ctx->cdef_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
|
||||||
if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
|
if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
|
||||||
|
if (ctx->h264_idct4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
|
||||||
|
if (ctx->h264_idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
|
||||||
v3d_runner_destroy(ctx->runner);
|
v3d_runner_destroy(ctx->runner);
|
||||||
}
|
}
|
||||||
free(ctx);
|
free(ctx);
|
||||||
@@ -84,16 +109,25 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
|||||||
|
|
||||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Recipe table per the "QPU is default substrate" decree
|
||||||
|
* 2026-05-23. Any kernel that has a V3D compute shader returns
|
||||||
|
* SUBSTRATE_QPU; CPU is the fallback for kernels without a
|
||||||
|
* shader (now only the case for H.264 qpel
|
||||||
|
* mc20 — covered by follow-on task 165). The dispatch
|
||||||
|
* wrappers already fall back to CPU automatically when the
|
||||||
|
* ctx doesn't have QPU available (daedalus_ctx_has_qpu == 0).
|
||||||
|
*/
|
||||||
switch (k) {
|
switch (k) {
|
||||||
case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU;
|
case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU;
|
||||||
case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
||||||
case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_QPU; /* v3d_mc_8h.spv */
|
||||||
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
||||||
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_cdef.spv */
|
||||||
case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct4.spv */
|
||||||
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */
|
||||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */
|
||||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_CPU; /* TODO task #165 */
|
||||||
}
|
}
|
||||||
return DAEDALUS_SUBSTRATE_CPU;
|
return DAEDALUS_SUBSTRATE_CPU;
|
||||||
}
|
}
|
||||||
@@ -291,13 +325,13 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||||||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||||
if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||||
}
|
}
|
||||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
||||||
@@ -325,8 +359,8 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
|||||||
._pad = 0,
|
._pad = 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail;
|
||||||
if (cb == VK_NULL_HANDLE) goto fail;
|
VkCommandBuffer cb = ctx->idct8_pipe.cb;
|
||||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
vkBeginCommandBuffer(cb, &cbbi);
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
@@ -344,15 +378,15 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
|||||||
/* Read-back dst. */
|
/* Read-back dst. */
|
||||||
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
||||||
|
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
fail:
|
fail:
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -424,9 +458,9 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
|||||||
size_t dst_window_size = hi - lo;
|
size_t dst_window_size = hi - lo;
|
||||||
|
|
||||||
v3d_buffer buf_meta = {0}, buf_dst = {0};
|
v3d_buffer buf_meta = {0}, buf_dst = {0};
|
||||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
||||||
if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1;
|
v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(buf_dst.mapped, dst + lo, dst_window_size);
|
memcpy(buf_dst.mapped, dst + lo, dst_window_size);
|
||||||
@@ -442,8 +476,8 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
|||||||
if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;
|
if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;
|
||||||
|
|
||||||
uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
|
uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
|
||||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail;
|
||||||
if (cb == VK_NULL_HANDLE) goto fail;
|
VkCommandBuffer cb = p->cb;
|
||||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
vkBeginCommandBuffer(cb, &cbbi);
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
|
||||||
@@ -468,12 +502,12 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
|||||||
|
|
||||||
memcpy(dst + lo, buf_dst.mapped, dst_window_size);
|
memcpy(dst + lo, buf_dst.mapped, dst_window_size);
|
||||||
|
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||||
return 0;
|
return 0;
|
||||||
fail:
|
fail:
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -509,9 +543,9 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
v3d_buffer bm = {0}, bd = {0}, bs = {0};
|
v3d_buffer bm = {0}, bd = {0}, bs = {0};
|
||||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||||
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||||
|
|
||||||
memcpy(bs.mapped, src, src_max);
|
memcpy(bs.mapped, src, src_max);
|
||||||
memcpy(bd.mapped, dst, dst_max);
|
memcpy(bd.mapped, dst, dst_max);
|
||||||
@@ -530,8 +564,8 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
|||||||
mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||||||
.dst_stride_u8 = (uint32_t) dst_stride,
|
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||||
.src_stride_u8 = (uint32_t) src_stride };
|
.src_stride_u8 = (uint32_t) src_stride };
|
||||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail;
|
||||||
if (cb == VK_NULL_HANDLE) goto fail;
|
VkCommandBuffer cb = ctx->mc8h_pipe.cb;
|
||||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
vkBeginCommandBuffer(cb, &cbbi);
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
|
||||||
@@ -545,14 +579,14 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
|||||||
|
|
||||||
memcpy(dst, bd.mapped, dst_max);
|
memcpy(dst, bd.mapped, dst_max);
|
||||||
|
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||||
return 0;
|
return 0;
|
||||||
fail:
|
fail:
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -588,9 +622,9 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
|||||||
size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
|
size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
|
||||||
|
|
||||||
v3d_buffer bm = {0}, bd = {0}, bt = {0};
|
v3d_buffer bm = {0}, bd = {0}, bt = {0};
|
||||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||||
if (v3d_runner_create_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||||
|
|
||||||
/* tmp may need padding before block-origin offset (caller-allocated). Just
|
/* tmp may need padding before block-origin offset (caller-allocated). Just
|
||||||
* copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
|
* copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
|
||||||
@@ -615,8 +649,8 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
|||||||
cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||||||
.tmp_stride_u16 = 16,
|
.tmp_stride_u16 = 16,
|
||||||
.dst_stride_u8 = (uint32_t) dst_stride };
|
.dst_stride_u8 = (uint32_t) dst_stride };
|
||||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail;
|
||||||
if (cb == VK_NULL_HANDLE) goto fail;
|
VkCommandBuffer cb = ctx->cdef_pipe.cb;
|
||||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
vkBeginCommandBuffer(cb, &cbbi);
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
|
||||||
@@ -630,14 +664,14 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
|||||||
|
|
||||||
memcpy(dst, bd.mapped, dst_max);
|
memcpy(dst, bd.mapped, dst_max);
|
||||||
|
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||||
return 0;
|
return 0;
|
||||||
fail:
|
fail:
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -670,8 +704,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
v3d_buffer bm = {0}, bd = {0};
|
v3d_buffer bm = {0}, bd = {0};
|
||||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||||
|
|
||||||
memcpy(bd.mapped, dst, dst_max);
|
memcpy(bd.mapped, dst, dst_max);
|
||||||
uint32_t *m = bm.mapped;
|
uint32_t *m = bm.mapped;
|
||||||
@@ -691,8 +725,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
|||||||
uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
|
uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
|
||||||
h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
|
h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
|
||||||
.dst_stride_u8 = (uint32_t) dst_stride };
|
.dst_stride_u8 = (uint32_t) dst_stride };
|
||||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail;
|
||||||
if (cb == VK_NULL_HANDLE) goto fail;
|
VkCommandBuffer cb = ctx->h264deblock_pipe.cb;
|
||||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
vkBeginCommandBuffer(cb, &cbbi);
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
|
||||||
@@ -706,12 +740,193 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
|||||||
|
|
||||||
memcpy(dst, bd.mapped, dst_max);
|
memcpy(dst, bd.mapped, dst_max);
|
||||||
|
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||||
return 0;
|
return 0;
|
||||||
fail:
|
fail:
|
||||||
|
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||||
|
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
|
||||||
|
|
||||||
|
/* Push-constant payload for the H.264 4x4 IDCT dispatch: four 32-bit words.
 * Its size is what the pipeline is created with and what is pushed per
 * dispatch; only the first two words are filled in by the dispatcher. */
typedef struct {
    uint32_t n_blocks;       /* number of 4x4 blocks in this dispatch */
    uint32_t dst_stride_u8;  /* destination row pitch, in bytes */
    uint32_t _pad0, _pad1;   /* padding words; left zero by the dispatcher */
} h264_idct4_pc;
|
||||||
|
|
||||||
|
static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_h264_block_meta *meta)
|
||||||
|
{
|
||||||
|
if (!ctx->h264_idct4_pipe_ready) {
|
||||||
|
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
|
||||||
|
3, sizeof(h264_idct4_pc),
|
||||||
|
&ctx->h264_idct4_pipe) != 0)
|
||||||
|
return -1;
|
||||||
|
ctx->h264_idct4_pipe_ready = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
|
||||||
|
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t); /* uvec4 per block */
|
||||||
|
size_t dst_max = 0;
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) {
|
||||||
|
size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
|
||||||
|
if (e > dst_max) dst_max = e;
|
||||||
|
}
|
||||||
|
|
||||||
|
v3d_buffer bc = {0}, bd = {0}, bm = {0};
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||||
|
}
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(bc.mapped, coeffs, coeff_bytes);
|
||||||
|
memcpy(bd.mapped, dst, dst_max);
|
||||||
|
uint32_t *m = bm.mapped;
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) {
|
||||||
|
m[4*i+0] = meta[i].dst_off;
|
||||||
|
m[4*i+1] = 0;
|
||||||
|
m[4*i+2] = 0;
|
||||||
|
m[4*i+3] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
v3d_buffer binds[3] = { bc, bd, bm };
|
||||||
|
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
|
||||||
|
goto fail;
|
||||||
|
|
||||||
|
uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16); /* 16 blocks/WG */
|
||||||
|
h264_idct4_pc pc = {
|
||||||
|
.n_blocks = (uint32_t) n_blocks,
|
||||||
|
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||||
|
};
|
||||||
|
|
||||||
|
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||||
|
if (cb == VK_NULL_HANDLE) goto fail;
|
||||||
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
|
ctx->h264_idct4_pipe.pipeline);
|
||||||
|
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
|
ctx->h264_idct4_pipe.layout, 0, 1,
|
||||||
|
&ctx->h264_idct4_pipe.desc_set, 0, NULL);
|
||||||
|
vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
|
||||||
|
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||||
|
vkCmdDispatch(cb, wg_count, 1, 1);
|
||||||
|
vkEndCommandBuffer(cb);
|
||||||
|
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||||||
|
|
||||||
|
memcpy(dst, bd.mapped, dst_max);
|
||||||
|
|
||||||
|
/* H.264/FFmpeg convention: zero the coeffs block after the
|
||||||
|
* transform (matches the C ref + NEON .S behaviour). */
|
||||||
|
memset(coeffs, 0, coeff_bytes);
|
||||||
|
|
||||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||||
|
return 0;
|
||||||
|
fail:
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
|
||||||
|
|
||||||
|
/* Push-constant payload for the H.264 8x8 IDCT dispatch: four 32-bit words.
 * Its size is what the pipeline is created with and what is pushed per
 * dispatch; only the first two words are filled in by the dispatcher. */
typedef struct {
    uint32_t n_blocks;       /* number of 8x8 blocks in this dispatch */
    uint32_t dst_stride_u8;  /* destination row pitch, in bytes */
    uint32_t _pad0, _pad1;   /* padding words; left zero by the dispatcher */
} h264_idct8_pc;
|
||||||
|
|
||||||
|
static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_h264_block_meta *meta)
|
||||||
|
{
|
||||||
|
if (!ctx->h264_idct8_pipe_ready) {
|
||||||
|
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
|
||||||
|
3, sizeof(h264_idct8_pc),
|
||||||
|
&ctx->h264_idct8_pipe) != 0)
|
||||||
|
return -1;
|
||||||
|
ctx->h264_idct8_pipe_ready = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
|
||||||
|
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
|
||||||
|
size_t dst_max = 0;
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) {
|
||||||
|
size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
|
||||||
|
if (e > dst_max) dst_max = e;
|
||||||
|
}
|
||||||
|
|
||||||
|
v3d_buffer bc = {0}, bd = {0}, bm = {0};
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||||
|
}
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(bc.mapped, coeffs, coeff_bytes);
|
||||||
|
memcpy(bd.mapped, dst, dst_max);
|
||||||
|
uint32_t *m = bm.mapped;
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) {
|
||||||
|
m[4*i+0] = meta[i].dst_off;
|
||||||
|
m[4*i+1] = 0;
|
||||||
|
m[4*i+2] = 0;
|
||||||
|
m[4*i+3] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
v3d_buffer binds[3] = { bc, bd, bm };
|
||||||
|
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
|
||||||
|
goto fail;
|
||||||
|
|
||||||
|
uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8); /* 8 blocks/WG */
|
||||||
|
h264_idct8_pc pc = {
|
||||||
|
.n_blocks = (uint32_t) n_blocks,
|
||||||
|
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||||
|
};
|
||||||
|
|
||||||
|
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||||
|
if (cb == VK_NULL_HANDLE) goto fail;
|
||||||
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
|
ctx->h264_idct8_pipe.pipeline);
|
||||||
|
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
|
ctx->h264_idct8_pipe.layout, 0, 1,
|
||||||
|
&ctx->h264_idct8_pipe.desc_set, 0, NULL);
|
||||||
|
vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
|
||||||
|
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||||
|
vkCmdDispatch(cb, wg_count, 1, 1);
|
||||||
|
vkEndCommandBuffer(cb);
|
||||||
|
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||||||
|
|
||||||
|
memcpy(dst, bd.mapped, dst_max);
|
||||||
|
memset(coeffs, 0, coeff_bytes);
|
||||||
|
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||||
|
return 0;
|
||||||
|
fail:
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -803,8 +1018,16 @@ int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
int16_t *coeffs, size_t n_blocks,
|
int16_t *coeffs, size_t n_blocks,
|
||||||
const daedalus_h264_block_meta *meta)
|
const daedalus_h264_block_meta *meta)
|
||||||
{
|
{
|
||||||
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
|
daedalus_substrate eff = sub;
|
||||||
dst, dst_stride, coeffs, n_blocks, meta);
|
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||||||
|
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4);
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||||||
|
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||||
|
return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
|
||||||
|
coeffs, n_blocks, meta);
|
||||||
|
return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
|
||||||
|
coeffs, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
@@ -812,8 +1035,16 @@ int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
int16_t *coeffs, size_t n_blocks,
|
int16_t *coeffs, size_t n_blocks,
|
||||||
const daedalus_h264_block_meta *meta)
|
const daedalus_h264_block_meta *meta)
|
||||||
{
|
{
|
||||||
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
|
daedalus_substrate eff = sub;
|
||||||
dst, dst_stride, coeffs, n_blocks, meta);
|
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||||||
|
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||||||
|
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||||
|
return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
|
||||||
|
coeffs, n_blocks, meta);
|
||||||
|
return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
|
||||||
|
coeffs, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
|||||||
@@ -0,0 +1,129 @@
|
|||||||
|
// daedalus-fourier — H.264 4x4 inverse integer transform + add, V3D 7.1.
|
||||||
|
//
|
||||||
|
// H.264 spec §8.5.12.1. Pure integer arithmetic — no trig constants
|
||||||
|
// (unlike VP9 IDCT 8x8). Row pass first, column pass second; round
|
||||||
|
// (+32) >> 6, add to dst, clip to u8.
|
||||||
|
//
|
||||||
|
// Block memory layout: COLUMN-MAJOR. block[c*4 + r] = coefficient at
|
||||||
|
// (row r, column c). Matches FFmpeg `ff_h264_idct_add_neon`.
|
||||||
|
//
|
||||||
|
// Workgroup layout: 64 invocations = 4 lanes/block × 16 blocks/WG.
|
||||||
|
// - row pass: lane k (0..3) reads row k of the block (4 coefficients,
|
||||||
|
// one from each column), runs the butterfly, writes 4
|
||||||
|
// outputs to one row of tmp_shared.
|
||||||
|
// - column pass: lane k reads column k of tmp_shared (4 rows),
|
||||||
|
// runs the butterfly, writes 4 outputs to dst as
|
||||||
|
// column k at rows 0..3.
|
||||||
|
//
|
||||||
|
// shared = 16 × 16 × 4 B = 1 KiB. Well under V3D's 16 KiB limit.
|
||||||
|
//
|
||||||
|
// License: BSD-2-Clause.
|
||||||
|
|
||||||
|
#version 450
|
||||||
|
#extension GL_EXT_shader_8bit_storage : require
|
||||||
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
|
||||||
|
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout(binding = 0) readonly buffer Coeffs {
|
||||||
|
int16_t coeffs[]; // N × 16 column-major
|
||||||
|
} u_coeffs;
|
||||||
|
|
||||||
|
layout(binding = 1) buffer Dst {
|
||||||
|
uint8_t dst[]; // H × stride bytes (caller-provided base)
|
||||||
|
} u_dst;
|
||||||
|
|
||||||
|
layout(binding = 2) readonly buffer Meta {
|
||||||
|
uvec4 meta[]; // .x = dst_off (byte offset into u_dst.dst)
|
||||||
|
} u_meta;
|
||||||
|
|
||||||
|
layout(push_constant) uniform PC {
|
||||||
|
uint n_blocks;
|
||||||
|
uint dst_stride_u8;
|
||||||
|
uint _pad0, _pad1;
|
||||||
|
} pc;
|
||||||
|
|
||||||
|
// 16 blocks per WG × 16 ints per block = 256 ints = 1 KiB shared.
|
||||||
|
shared int tmp_shared[16 * 16];
|
||||||
|
|
||||||
|
// 1D butterfly per H.264 §8.5.12.1. d[0..3] in, o[0..3] out.
|
||||||
|
// One 4-point pass of the H.264 inverse integer transform.
// Inputs d0..d3 are the four samples of a row or column; outputs o0..o3
// are the transformed samples. No rounding or scaling is applied here.
void idct4_1d(int d0, int d1, int d2, int d3,
              out int o0, out int o1, out int o2, out int o3)
{
    // Even half: sum and difference of samples 0 and 2.
    int even_sum  = d0 + d2;
    int even_diff = d0 - d2;

    // Odd half: samples 1 and 3, each combined with the other halved.
    int odd_lo = (d1 >> 1) - d3;
    int odd_hi = d1 + (d3 >> 1);

    // Outer outputs pair even_sum with odd_hi, inner ones even_diff with odd_lo.
    o0 = even_sum + odd_hi;
    o3 = even_sum - odd_hi;
    o1 = even_diff + odd_lo;
    o2 = even_diff - odd_lo;
}
|
||||||
|
|
||||||
|
void main()
{
    // 64 invocations per workgroup = 16 block slots x 4 lanes per block.
    uint invocation = gl_GlobalInvocationID.x;
    uint lane       = invocation % 64u;
    uint slot       = lane / 4u;                       // block slot in WG, 0..15
    uint k          = lane % 4u;                       // lane within block, 0..3
    uint block_idx  = (invocation / 64u) * 16u + slot;

    // Lanes past the last block skip both passes but still reach barrier().
    bool active = block_idx < pc.n_blocks;
    uint tile   = slot * 16u;                          // this block's 4x4 tile in tmp_shared

    // ---- Pass 1: lane k transforms row k -----------------------------
    // Coefficients are column-major, so row k is block[c*4 + k], c = 0..3.
    if (active) {
        uint src = block_idx * 16u + k;
        int r0, r1, r2, r3;
        idct4_1d(int(u_coeffs.coeffs[src]),
                 int(u_coeffs.coeffs[src + 4u]),
                 int(u_coeffs.coeffs[src + 8u]),
                 int(u_coeffs.coeffs[src + 12u]),
                 r0, r1, r2, r3);

        // Store the result as row k of the tile.
        uint row = tile + k * 4u;
        tmp_shared[row]      = r0;
        tmp_shared[row + 1u] = r1;
        tmp_shared[row + 2u] = r2;
        tmp_shared[row + 3u] = r3;
    }

    // Every row of a tile must be written before any column is read.
    barrier();

    // ---- Pass 2: lane k transforms column k --------------------------
    if (active) {
        uint col = tile + k;
        int v[4];
        idct4_1d(tmp_shared[col],
                 tmp_shared[col + 4u],
                 tmp_shared[col + 8u],
                 tmp_shared[col + 12u],
                 v[0], v[1], v[2], v[3]);

        // Destination column k, rows 0..3, relative to this block's dst_off.
        uint stride = pc.dst_stride_u8;
        uint top    = u_meta.meta[block_idx].x + k;

        uint addr[4];
        int  px[4];
        // Read all four pixels first, then write all four.
        for (uint r = 0u; r < 4u; ++r) {
            addr[r] = top + r * stride;
            px[r]   = int(u_dst.dst[addr[r]]);
        }
        // Round ((x + 32) >> 6), add onto the existing pixel, clip to 8 bits.
        for (uint r = 0u; r < 4u; ++r) {
            u_dst.dst[addr[r]] = uint8_t(clamp(px[r] + ((v[r] + 32) >> 6), 0, 255));
        }
    }
}
|
||||||
@@ -0,0 +1,175 @@
|
|||||||
|
// daedalus-fourier — H.264 8x8 inverse integer transform + add, V3D 7.1.
|
||||||
|
//
|
||||||
|
// H.264 spec §8.5.13.2 (High profile 8x8 IT). Pure integer arithmetic
|
||||||
|
// — different butterfly from VP9 IDCT 8x8 (cycle 1, uses cospi
|
||||||
|
// multipliers). Row pass first, column pass second; round (+32) >> 6,
|
||||||
|
// add to dst, clip to u8.
|
||||||
|
//
|
||||||
|
// Block layout: COLUMN-MAJOR. block[c*8 + r] = coefficient at
|
||||||
|
// (row r, column c). Matches FFmpeg `ff_h264_idct8_add_neon`.
|
||||||
|
//
|
||||||
|
// Workgroup layout: 64 invocations = 8 lanes/block × 8 blocks/WG.
|
||||||
|
// - row pass: lane k (0..7) reads row k of the block (8 coefficients,
|
||||||
|
// one from each column), runs the butterfly, writes 8
|
||||||
|
// outputs to one row of tmp_shared.
|
||||||
|
// - column pass: lane k reads column k of tmp_shared (8 rows),
|
||||||
|
// runs the butterfly, writes 8 outputs to dst as
|
||||||
|
// column k at rows 0..7.
|
||||||
|
//
|
||||||
|
// shared = 8 × 64 × 4 B = 2 KiB. Well under V3D's 16 KiB limit.
|
||||||
|
//
|
||||||
|
// License: BSD-2-Clause.
|
||||||
|
|
||||||
|
#version 450
|
||||||
|
#extension GL_EXT_shader_8bit_storage : require
|
||||||
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
|
||||||
|
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout(binding = 0) readonly buffer Coeffs {
|
||||||
|
int16_t coeffs[]; // N × 64 column-major
|
||||||
|
} u_coeffs;
|
||||||
|
|
||||||
|
layout(binding = 1) buffer Dst {
|
||||||
|
uint8_t dst[]; // H × stride bytes
|
||||||
|
} u_dst;
|
||||||
|
|
||||||
|
layout(binding = 2) readonly buffer Meta {
|
||||||
|
uvec4 meta[]; // .x = dst_off
|
||||||
|
} u_meta;
|
||||||
|
|
||||||
|
layout(push_constant) uniform PC {
|
||||||
|
uint n_blocks;
|
||||||
|
uint dst_stride_u8;
|
||||||
|
uint _pad0, _pad1;
|
||||||
|
} pc;
|
||||||
|
|
||||||
|
// 8 blocks/WG × 64 ints/block × 4 B = 2 KiB shared.
|
||||||
|
shared int tmp_shared[8 * 64];
|
||||||
|
|
||||||
|
// 1D 8-element butterfly per H.264 §8.5.13.2.
|
||||||
|
// One 8-point pass of the H.264 8x8 inverse integer transform.
// Inputs d0..d7 are the eight samples of a row or column; outputs g0..g7
// are the transformed samples. No rounding or scaling is applied here.
void idct8_1d(int d0, int d1, int d2, int d3,
              int d4, int d5, int d6, int d7,
              out int g0, out int g1, out int g2, out int g3,
              out int g4, out int g5, out int g6, out int g7)
{
    // Stage 1, even inputs (0, 2, 4, 6).
    int a0 = d0 + d4;
    int a2 = d0 - d4;
    int a4 = (d2 >> 1) - d6;
    int a6 = d2 + (d6 >> 1);

    // Stage 1, odd inputs (1, 3, 5, 7).
    int a1 = -d3 + d5 - d7 - (d7 >> 1);
    int a3 =  d1 + d7 - d3 - (d3 >> 1);
    int a5 = -d1 + d7 + d5 + (d5 >> 1);
    int a7 =  d3 + d5 + d1 + (d1 >> 1);

    // Stage 2, even half.
    int b0 = a0 + a6;
    int b2 = a2 + a4;
    int b4 = a2 - a4;
    int b6 = a0 - a6;

    // Stage 2, odd half.
    int b1 = a1 + (a7 >> 2);
    int b3 = a3 + (a5 >> 2);
    int b5 = (a3 >> 2) - a5;
    int b7 = a7 - (a1 >> 2);

    // Stage 3: mirrored output pairs (0,7), (1,6), (2,5), (3,4).
    g0 = b0 + b7;
    g7 = b0 - b7;
    g1 = b2 + b5;
    g6 = b2 - b5;
    g2 = b4 + b3;
    g5 = b4 - b3;
    g3 = b6 + b1;
    g4 = b6 - b1;
}
|
||||||
|
|
||||||
|
void main()
{
    // 64 invocations per workgroup = 8 block slots x 8 lanes per block.
    uint invocation = gl_GlobalInvocationID.x;
    uint lane       = invocation % 64u;
    uint slot       = lane / 8u;                       // block slot in WG, 0..7
    uint k          = lane % 8u;                       // lane within block, 0..7
    uint block_idx  = (invocation / 64u) * 8u + slot;

    // Lanes past the last block skip both passes but still reach barrier().
    bool active = block_idx < pc.n_blocks;
    uint tile   = slot * 64u;                          // this block's 8x8 tile in tmp_shared

    // ---- Pass 1: lane k transforms row k -----------------------------
    // Coefficients are column-major, so row k is block[c*8 + k], c = 0..7.
    if (active) {
        uint src = block_idx * 64u + k;
        int r0, r1, r2, r3, r4, r5, r6, r7;
        idct8_1d(int(u_coeffs.coeffs[src]),
                 int(u_coeffs.coeffs[src + 8u]),
                 int(u_coeffs.coeffs[src + 16u]),
                 int(u_coeffs.coeffs[src + 24u]),
                 int(u_coeffs.coeffs[src + 32u]),
                 int(u_coeffs.coeffs[src + 40u]),
                 int(u_coeffs.coeffs[src + 48u]),
                 int(u_coeffs.coeffs[src + 56u]),
                 r0, r1, r2, r3, r4, r5, r6, r7);

        // Store the result as row k of the tile.
        uint row = tile + k * 8u;
        tmp_shared[row]      = r0;
        tmp_shared[row + 1u] = r1;
        tmp_shared[row + 2u] = r2;
        tmp_shared[row + 3u] = r3;
        tmp_shared[row + 4u] = r4;
        tmp_shared[row + 5u] = r5;
        tmp_shared[row + 6u] = r6;
        tmp_shared[row + 7u] = r7;
    }

    // Every row of a tile must be written before any column is read.
    barrier();

    // ---- Pass 2: lane k transforms column k --------------------------
    if (active) {
        uint col = tile + k;
        int v[8];
        idct8_1d(tmp_shared[col],
                 tmp_shared[col + 8u],
                 tmp_shared[col + 16u],
                 tmp_shared[col + 24u],
                 tmp_shared[col + 32u],
                 tmp_shared[col + 40u],
                 tmp_shared[col + 48u],
                 tmp_shared[col + 56u],
                 v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);

        // Destination column k, rows 0..7, relative to this block's dst_off.
        uint stride = pc.dst_stride_u8;
        uint top    = u_meta.meta[block_idx].x + k;

        uint addr[8];
        int  px[8];
        // Read all eight pixels first, then write all eight.
        for (uint r = 0u; r < 8u; ++r) {
            addr[r] = top + r * stride;
            px[r]   = int(u_dst.dst[addr[r]]);
        }
        // Round ((x + 32) >> 6), add onto the existing pixel, clip to 8 bits.
        for (uint r = 0u; r < 8u; ++r) {
            u_dst.dst[addr[r]] = uint8_t(clamp(px[r] + ((v[r] + 32) >> 6), 0, 255));
        }
    }
}
|
||||||
@@ -17,6 +17,18 @@
|
|||||||
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||||
r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
|
r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
|
||||||
|
|
||||||
|
/* Power-of-2 size classes from 2^8 (256 B) up to 2^23 (8 MiB). Cycle
|
||||||
|
* 1's largest dispatch with n_blocks ≈ 8K is well under 8 MiB; oversize
|
||||||
|
* requests fall through to non-pooled allocation. */
|
||||||
|
#define V3D_POOL_MIN_LOG2 8
|
||||||
|
#define V3D_POOL_MAX_LOG2 23
|
||||||
|
#define V3D_POOL_BUCKETS (V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1)
|
||||||
|
|
||||||
|
struct v3d_pool_entry {
|
||||||
|
v3d_buffer buf;
|
||||||
|
struct v3d_pool_entry *next;
|
||||||
|
};
|
||||||
|
|
||||||
struct v3d_runner {
|
struct v3d_runner {
|
||||||
VkInstance instance;
|
VkInstance instance;
|
||||||
VkPhysicalDevice phys;
|
VkPhysicalDevice phys;
|
||||||
@@ -26,6 +38,15 @@ struct v3d_runner {
|
|||||||
VkCommandPool pool;
|
VkCommandPool pool;
|
||||||
char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
|
char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
|
||||||
VkPhysicalDeviceMemoryProperties mem_props;
|
VkPhysicalDeviceMemoryProperties mem_props;
|
||||||
|
|
||||||
|
/* Buffer pool: per-bucket freelist of previously-released
|
||||||
|
* v3d_buffer. bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2.
|
||||||
|
* pool_total_bytes accumulates every successful vkAllocateMemory
|
||||||
|
* we've done through the pool — never decreases (the freelist
|
||||||
|
* just hands buffers around, no vkFreeMemory until destroy).
|
||||||
|
*/
|
||||||
|
struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS];
|
||||||
|
size_t pool_total_bytes;
|
||||||
};
|
};
|
||||||
|
|
||||||
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
|
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
|
||||||
@@ -168,6 +189,21 @@ void v3d_runner_destroy(v3d_runner *r)
|
|||||||
{
|
{
|
||||||
if (!r) return;
|
if (!r) return;
|
||||||
if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device);
|
if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device);
|
||||||
|
|
||||||
|
/* Drain the buffer pool BEFORE destroying device — the pool
|
||||||
|
* entries own VkBuffer/VkDeviceMemory handles, which need a live
|
||||||
|
* device for vkDestroyBuffer/vkFreeMemory. */
|
||||||
|
for (int b = 0; b < V3D_POOL_BUCKETS; b++) {
|
||||||
|
struct v3d_pool_entry *e = r->pool_free[b];
|
||||||
|
while (e) {
|
||||||
|
struct v3d_pool_entry *next = e->next;
|
||||||
|
v3d_runner_destroy_buffer(r, &e->buf);
|
||||||
|
free(e);
|
||||||
|
e = next;
|
||||||
|
}
|
||||||
|
r->pool_free[b] = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
if (r->pool != VK_NULL_HANDLE)
|
if (r->pool != VK_NULL_HANDLE)
|
||||||
vkDestroyCommandPool(r->device, r->pool, NULL);
|
vkDestroyCommandPool(r->device, r->pool, NULL);
|
||||||
if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL);
|
if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL);
|
||||||
@@ -175,6 +211,92 @@ void v3d_runner_destroy(v3d_runner *r)
|
|||||||
free(r);
|
free(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---- Buffer pool ----------------------------------------------- */
|
||||||
|
|
||||||
|
/* ceil_log2 for buffer pool bucket selection. */
|
||||||
|
static int v3d_pool_bucket_for(size_t size)
|
||||||
|
{
|
||||||
|
int log2;
|
||||||
|
size_t m;
|
||||||
|
|
||||||
|
if (size <= ((size_t)1 << V3D_POOL_MIN_LOG2))
|
||||||
|
return 0;
|
||||||
|
m = size - 1;
|
||||||
|
log2 = 0;
|
||||||
|
while (m) { log2++; m >>= 1; }
|
||||||
|
if (log2 < V3D_POOL_MIN_LOG2) log2 = V3D_POOL_MIN_LOG2;
|
||||||
|
if (log2 > V3D_POOL_MAX_LOG2) return -1;
|
||||||
|
return log2 - V3D_POOL_MIN_LOG2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Hand out a buffer of at least `size` bytes, preferring a previously
 * released buffer from the matching size class.
 *
 *   r     runner that owns the pool
 *   size  requested byte count; must be non-zero
 *   out   receives the buffer descriptor
 *
 * Returns 0 on success.  Returns -1 on a NULL/zero argument, otherwise
 * whatever v3d_runner_create_buffer() reports when a fresh allocation
 * is needed and fails.
 */
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    struct v3d_pool_entry *head;
    size_t class_bytes;
    int idx;
    int status;

    if (r == NULL || out == NULL || size == 0)
        return -1;

    idx = v3d_pool_bucket_for(size);
    if (idx < 0) {
        /* Too large for any size class: allocate outside the pool.  The
         * caller still releases through v3d_runner_release_buffer(),
         * which recomputes the bucket and destroys such buffers. */
        return v3d_runner_create_buffer(r, size, out);
    }

    head = r->pool_free[idx];
    if (head != NULL) {
        /* Hit: pop the freelist head and pass its buffer to the caller. */
        r->pool_free[idx] = head->next;
        *out = head->buf;
        free(head);
        return 0;
    }

    /* Miss: allocate at the full class size so the buffer can serve any
     * later request that lands in the same bucket. */
    class_bytes = (size_t)1 << (idx + V3D_POOL_MIN_LOG2);
    status = v3d_runner_create_buffer(r, class_bytes, out);
    if (status != 0)
        return status;

    r->pool_total_bytes += class_bytes;
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * Return a buffer obtained from v3d_runner_acquire_buffer().
 *
 * A buffer is parked on the freelist only when its size is exactly the
 * byte size of a pool class.  Everything else is destroyed immediately:
 *   - oversize buffers (no bucket exists for them),
 *   - buffers whose size is not a class size (for example one that came
 *     from v3d_runner_create_buffer() directly) -- parking those would
 *     let a later acquire in the same bucket receive a buffer smaller
 *     than it asked for,
 *   - any buffer when the freelist node cannot be allocated.
 *
 * In every case *buf is zeroed on return so the caller cannot reuse the
 * stale handles.  NULL arguments and a VK_NULL_HANDLE buffer are no-ops.
 */
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf)
{
    struct v3d_pool_entry *e = NULL;
    int bucket;

    if (!r || !buf || buf->buffer == VK_NULL_HANDLE) return;

    bucket = v3d_pool_bucket_for(buf->size);
    if (bucket >= 0 &&
        buf->size == ((size_t)1 << (bucket + V3D_POOL_MIN_LOG2)))
        e = malloc(sizeof(*e));

    if (!e) {
        /* Not poolable, or no memory for the freelist node: fall back to
         * non-pooled behaviour rather than leaking or mis-bucketing. */
        v3d_runner_destroy_buffer(r, buf);
        memset(buf, 0, sizeof(*buf));
        return;
    }

    /* Push onto the head of the bucket's freelist. */
    e->buf = *buf;
    e->next = r->pool_free[bucket];
    r->pool_free[bucket] = e;
    memset(buf, 0, sizeof(*buf));
}
|
||||||
|
|
||||||
|
/* Report the runner's pool_total_bytes counter; a NULL runner yields 0. */
size_t v3d_runner_pool_total_bytes(v3d_runner *r)
{
    if (r == NULL)
        return 0;
    return r->pool_total_bytes;
}
|
||||||
|
|
||||||
VkDevice v3d_runner_device(v3d_runner *r) { return r->device; }
|
VkDevice v3d_runner_device(v3d_runner *r) { return r->device; }
|
||||||
VkQueue v3d_runner_queue(v3d_runner *r) { return r->queue; }
|
VkQueue v3d_runner_queue(v3d_runner *r) { return r->queue; }
|
||||||
uint32_t v3d_runner_queue_family(v3d_runner *r) { return r->queue_family; }
|
uint32_t v3d_runner_queue_family(v3d_runner *r) { return r->queue_family; }
|
||||||
@@ -364,12 +486,27 @@ int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
|
|||||||
.pSetLayouts = &out->ds_layout,
|
.pSetLayouts = &out->ds_layout,
|
||||||
};
|
};
|
||||||
CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set));
|
CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set));
|
||||||
|
|
||||||
|
/* Persistent command buffer — pool was created with
|
||||||
|
* RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch
|
||||||
|
* sites can call vkResetCommandBuffer on this same cb instead
|
||||||
|
* of paying vkAllocateCommandBuffers per call. */
|
||||||
|
VkCommandBufferAllocateInfo cbai = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
|
||||||
|
.commandPool = r->pool,
|
||||||
|
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
|
||||||
|
.commandBufferCount = 1,
|
||||||
|
};
|
||||||
|
CHK(vkAllocateCommandBuffers(r->device, &cbai, &out->cb));
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
|
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
|
||||||
{
|
{
|
||||||
if (!p || p->pipeline == VK_NULL_HANDLE) return;
|
if (!p || p->pipeline == VK_NULL_HANDLE) return;
|
||||||
|
if (p->cb != VK_NULL_HANDLE)
|
||||||
|
vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb);
|
||||||
vkDestroyPipeline(r->device, p->pipeline, NULL);
|
vkDestroyPipeline(r->device, p->pipeline, NULL);
|
||||||
vkDestroyPipelineLayout(r->device, p->layout, NULL);
|
vkDestroyPipelineLayout(r->device, p->layout, NULL);
|
||||||
vkDestroyDescriptorPool(r->device, p->pool, NULL); /* frees its set */
|
vkDestroyDescriptorPool(r->device, p->pool, NULL); /* frees its set */
|
||||||
@@ -377,6 +514,13 @@ void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
|
|||||||
memset(p, 0, sizeof(*p));
|
memset(p, 0, sizeof(*p));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Reset the pipeline's command buffer (p->cb) with no reset flags.
 *
 * Returns 0 when vkResetCommandBuffer reports VK_SUCCESS, and -1 when
 * `p` is NULL, p->cb is VK_NULL_HANDLE, or the reset fails.  The runner
 * argument is accepted but not used.
 */
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p)
{
    VkResult res;

    (void) r;

    if (p == NULL || p->cb == VK_NULL_HANDLE)
        return -1;

    res = vkResetCommandBuffer(p->cb, 0);
    if (res != VK_SUCCESS)
        return -1;
    return 0;
}
|
||||||
|
|
||||||
int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
|
int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
|
||||||
const v3d_buffer *bufs, uint32_t n)
|
const v3d_buffer *bufs, uint32_t n)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -34,6 +34,12 @@ typedef struct {
|
|||||||
VkDescriptorSet desc_set;
|
VkDescriptorSet desc_set;
|
||||||
uint32_t n_ssbos;
|
uint32_t n_ssbos;
|
||||||
uint32_t push_const_size;
|
uint32_t push_const_size;
|
||||||
|
/* Persistent command buffer. Allocated at create-pipeline time;
|
||||||
|
* dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to
|
||||||
|
* vkResetCommandBuffer instead of paying vkAllocateCommandBuffers
|
||||||
|
* per dispatch. Pool flagged RESET_COMMAND_BUFFER_BIT so reset
|
||||||
|
* is permitted. */
|
||||||
|
VkCommandBuffer cb;
|
||||||
} v3d_pipeline;
|
} v3d_pipeline;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -57,10 +63,43 @@ const char *v3d_runner_device_name(v3d_runner *r);
|
|||||||
* host side. The mapping persists for the lifetime of the buffer.
|
* host side. The mapping persists for the lifetime of the buffer.
|
||||||
*
|
*
|
||||||
* Returns 0 on success, non-zero on failure.
|
* Returns 0 on success, non-zero on failure.
|
||||||
|
*
|
||||||
|
* NOTE: prefer v3d_runner_acquire_buffer() on the dispatch hot path —
|
||||||
|
* create_buffer/destroy_buffer go straight to vkAllocateMemory each
|
||||||
|
* call, which on V3D7's Mesa stack costs ~10-50us. The acquire/
|
||||||
|
* release pair pulls from a freelist and pays vkAllocateMemory only
|
||||||
|
* on a cache miss.
|
||||||
*/
|
*/
|
||||||
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
||||||
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
|
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
|
||||||
|
|
||||||
|
/*
|
||||||
|
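 * Pooled buffer acquisition. Returns a v3d_buffer whose .size is the
 * request rounded up to the pool's power-of-2 size class, so callers
 * can pool across similar-sized requests. Requests below the smallest
 * class get the smallest class; requests above the largest class
 * bypass the pool and are served by v3d_runner_create_buffer() at the
 * requested size. Backed by HOST_VISIBLE |
 * HOST_COHERENT memory; mapped pointer is valid.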
|
||||||
|
*
|
||||||
|
* On cache hit: zero-cost reuse of a previously-released buffer.
|
||||||
|
* On miss: falls through to v3d_runner_create_buffer(). Release with
|
||||||
|
* v3d_runner_release_buffer(); pool drains in v3d_runner_destroy().
|
||||||
|
*
|
||||||
|
* Lifetime contract: the returned buffer's .mapped contents are
|
||||||
|
* UNINITIALISED — the previous user's data may still be present.
|
||||||
|
* Callers that need a clean buffer must memset themselves. This is
|
||||||
|
* deliberate; the dispatch hot paths immediately overwrite the
|
||||||
|
* buffer with new coefficients / meta anyway.
|
||||||
|
*
|
||||||
|
* Thread-safety: NOT thread-safe. A daedalus_ctx is single-threaded
|
||||||
|
* by API contract; the pool inherits that constraint.
|
||||||
|
*/
|
||||||
|
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
||||||
|
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf);
|
||||||
|
|
||||||
|
/* Pool diagnostics: total allocated bytes (sum across all size
|
||||||
|
* classes, including currently-released entries). Useful for
|
||||||
|
* watermark logging. */
|
||||||
|
size_t v3d_runner_pool_total_bytes(v3d_runner *r);
|
||||||
|
|
||||||
/* Compute pipeline from a SPIR-V file path. The descriptor-set
|
/* Compute pipeline from a SPIR-V file path. The descriptor-set
|
||||||
* layout exposes `n_ssbos` storage buffer bindings at binding
|
* layout exposes `n_ssbos` storage buffer bindings at binding
|
||||||
* indices 0..n_ssbos-1, all visible to the compute stage. A push
|
* indices 0..n_ssbos-1, all visible to the compute stage. A push
|
||||||
@@ -88,6 +127,12 @@ int v3d_runner_bind_buffers(v3d_runner *r,
|
|||||||
/* Allocate a primary command buffer from the runner's pool. */
|
/* Allocate a primary command buffer from the runner's pool. */
|
||||||
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
|
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
|
||||||
|
|
||||||
|
/* Reset @p->cb so it can be re-recorded. Returns 0 on success.
|
||||||
|
* Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path —
|
||||||
|
* vkResetCommandBuffer is O(1) vs vkAllocateCommandBuffers' ~1-5us
|
||||||
|
* driver cost. */
|
||||||
|
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p);
|
||||||
|
|
||||||
/* Submit `cb` to the queue and wait for completion. The classic
|
/* Submit `cb` to the queue and wait for completion. The classic
|
||||||
* timed operation. Returns 0 on success.
|
* timed operation. Returns 0 on success.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -0,0 +1,120 @@
|
|||||||
|
/*
|
||||||
|
* bench_pool_overhead — measure QPU dispatch overhead with and without
|
||||||
|
* the v3d_runner buffer pool warm.
|
||||||
|
*
|
||||||
|
 * Times N consecutive daedalus_dispatch_vp9_idct8 calls (QPU substrate) and
|
||||||
|
* prints the per-call distribution. The first call pays
|
||||||
|
* vkAllocateMemory (typically tens of microseconds on V3D7's Mesa);
|
||||||
|
* the second and subsequent should hit the pool freelist and amortise
|
||||||
|
* to the pure dispatch-floor cost.
|
||||||
|
*
|
||||||
|
* Purpose: provide a concrete before/after number for the QPU-default
|
||||||
|
* substrate decree (2026-05-23). Bench is non-gating and runs in
|
||||||
|
* fractions of a second.
|
||||||
|
*
|
||||||
|
* License: BSD-2-Clause.
|
||||||
|
*/
|
||||||
|
#define _POSIX_C_SOURCE 200809L
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
#include "../include/daedalus.h"
|
||||||
|
|
||||||
|
/* Pool watermark accessor, declared here in case the bench wants to
 * report it.  NOTE(review): nothing in this file calls it, and the
 * runner header declares the parameter as v3d_runner * rather than
 * void * -- confirm the prototype before using it. */
extern size_t v3d_runner_pool_total_bytes(void *);
|
||||||
|
|
||||||
|
/* Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus the nanosecond part scaled by 1e-9). */
static double now_seconds(void)
{
    struct timespec stamp;
    double whole, frac;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);
    whole = stamp.tv_sec;
    frac  = stamp.tv_nsec * 1e-9;
    return whole + frac;
}
|
||||||
|
|
||||||
|
/* qsort() comparator for doubles in ascending order: -1 when *a < *b,
 * 1 when *a > *b, and 0 otherwise. */
static int cmp_double(const void *a, const void *b)
{
    const double lhs = *(const double *)a;
    const double rhs = *(const double *)b;

    /* Each comparison evaluates to 0 or 1, so the difference is -1/0/1. */
    return (lhs > rhs) - (lhs < rhs);
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int n_calls = argc > 1 ? atoi(argv[1]) : 200;
|
||||||
|
int n_blocks = 8; /* one MB column of 8x8 IDCT blocks */
|
||||||
|
int stride = 64;
|
||||||
|
|
||||||
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
|
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
||||||
|
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||||
|
printf("ctx: has_qpu=%d\n", has_qpu);
|
||||||
|
if (!has_qpu) {
|
||||||
|
fprintf(stderr, "QPU not available on this device; bench needs V3D\n");
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Build a representative IDCT 8x8 batch and warm a dst buffer. */
|
||||||
|
int16_t *coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t));
|
||||||
|
uint8_t *dst = calloc((size_t) n_blocks * 8 * stride, 1);
|
||||||
|
daedalus_idct8_meta *meta = calloc((size_t) n_blocks, sizeof(*meta));
|
||||||
|
if (!coeffs || !dst || !meta) { fprintf(stderr, "alloc fail\n"); return 1; }
|
||||||
|
|
||||||
|
uint64_t s = 0x1234567abcdefULL;
|
||||||
|
for (size_t i = 0; i < (size_t) n_blocks * 64; i++) {
|
||||||
|
s ^= s << 13; s ^= s >> 7; s ^= s << 17;
|
||||||
|
coeffs[i] = (int16_t)(s & 0x7ff) - 0x400;
|
||||||
|
}
|
||||||
|
for (int b = 0; b < n_blocks; b++) {
|
||||||
|
meta[b].dst_off = (uint32_t) b * 8;
|
||||||
|
meta[b].block_x = (uint32_t) b;
|
||||||
|
meta[b].block_y = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double *t = malloc((size_t) n_calls * sizeof(double));
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
printf("=== dispatching %d times, n_blocks=%d/call ===\n",
|
||||||
|
n_calls, n_blocks);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_calls; i++) {
|
||||||
|
double t0 = now_seconds();
|
||||||
|
rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU,
|
||||||
|
dst, (size_t) stride,
|
||||||
|
coeffs, (size_t) n_blocks, meta);
|
||||||
|
double t1 = now_seconds();
|
||||||
|
if (rc) { fprintf(stderr, "dispatch %d rc=%d\n", i, rc); return 1; }
|
||||||
|
t[i] = (t1 - t0) * 1e6; /* us */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Per-call distribution (first few + sorted summary on the steady-state) */
|
||||||
|
printf("\nfirst 5 calls (cold-warm transition):\n");
|
||||||
|
for (int i = 0; i < 5 && i < n_calls; i++)
|
||||||
|
printf(" call %d: %.2f us\n", i, t[i]);
|
||||||
|
|
||||||
|
int skip = 10; /* drop warm-up calls from the steady-state stats */
|
||||||
|
if (n_calls > skip + 10) {
|
||||||
|
int n = n_calls - skip;
|
||||||
|
double *s_arr = malloc((size_t) n * sizeof(double));
|
||||||
|
memcpy(s_arr, t + skip, (size_t) n * sizeof(double));
|
||||||
|
qsort(s_arr, (size_t) n, sizeof(double), cmp_double);
|
||||||
|
double sum = 0;
|
||||||
|
for (int i = 0; i < n; i++) sum += s_arr[i];
|
||||||
|
printf("\nsteady-state stats (calls %d..%d, n=%d):\n",
|
||||||
|
skip, n_calls - 1, n);
|
||||||
|
printf(" min: %.2f us\n", s_arr[0]);
|
||||||
|
printf(" p50: %.2f us\n", s_arr[n / 2]);
|
||||||
|
printf(" p90: %.2f us\n", s_arr[(int)(n * 0.9)]);
|
||||||
|
printf(" p99: %.2f us\n", s_arr[(int)(n * 0.99)]);
|
||||||
|
printf(" max: %.2f us\n", s_arr[n - 1]);
|
||||||
|
printf(" mean: %.2f us\n", sum / n);
|
||||||
|
printf("\nfirst-call / steady-state median ratio: %.1fx\n",
|
||||||
|
t[0] / s_arr[n / 2]);
|
||||||
|
free(s_arr);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(t); free(coeffs); free(dst); free(meta);
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user