Merge pull request 'h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)' (#10 ) from noether/h264-deblock-chroma into main

Reviewed-on: #10
h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)
2026-05-24 21:55:57 +00:00 · 2026-05-24 23:53:09 +02:00 · 2026-05-24 21:47:57 +00:00 · 2026-05-24 23:29:06 +02:00 · 2026-05-24 23:28:56 +02:00 · 2026-05-23 19:14:44 +00:00
14 changed files with 1742 additions and 78 deletions
@@ -11,3 +11,4 @@ build-*/
 # Forensic snapshot of the corrupted .git from 2026-05-18 10:25
 # working-tree wipe. Retained on disk for inspection; not tracked.
 .git-broken-2026-05-18/
 .claude/
@@ -284,7 +284,40 @@ if (DAEDALUS_BUILD_VULKAN)
        VERBATIM
    )
-    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV})
+    set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
    # Build rule for the H.264 4x4 IDCT compute shader: runs
    # ${GLSLANG_VALIDATOR} on src/v3d_h264_idct4.comp (Vulkan 1.3 target
    # environment) and writes the result to ${H264_IDCT4_SPV}.  The rule
    # re-runs whenever the .comp source changes; the output is pulled
    # into the build through the daedalus_shaders target's DEPENDS list.
    add_custom_command(
        OUTPUT ${H264_IDCT4_SPV}
        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
                -o ${H264_IDCT4_SPV}
                ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
        COMMENT "glslang: v3d_h264_idct4.comp -> v3d_h264_idct4.spv"
        VERBATIM
    )
    # H.264 8x8 IDCT compute shader.  H264_IDCT8_SPV names the compiled
    # module in the top-level build directory; the custom command below
    # produces it from src/v3d_h264_idct8.comp with ${GLSLANG_VALIDATOR}
    # (Vulkan 1.3 target environment) and re-runs when the .comp source
    # changes.
    set(H264_IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct8.spv)
    add_custom_command(
        OUTPUT ${H264_IDCT8_SPV}
        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
                -o ${H264_IDCT8_SPV}
                ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
        COMMENT "glslang: v3d_h264_idct8.comp -> v3d_h264_idct8.spv"
        VERBATIM
    )
    # H.264 qpel mc20 compute shader.  H264_QPEL_MC20_SPV names the
    # compiled module in the top-level build directory; the custom
    # command below produces it from src/v3d_h264_qpel_mc20.comp with
    # ${GLSLANG_VALIDATOR} (Vulkan 1.3 target environment) and re-runs
    # when the .comp source changes.
    set(H264_QPEL_MC20_SPV ${CMAKE_BINARY_DIR}/v3d_h264_qpel_mc20.spv)
    add_custom_command(
        OUTPUT ${H264_QPEL_MC20_SPV}
        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
                -o ${H264_QPEL_MC20_SPV}
                ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_mc20.comp
        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_mc20.comp
        COMMENT "glslang: v3d_h264_qpel_mc20.comp -> v3d_h264_qpel_mc20.spv"
        VERBATIM
    )
    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV})
    # v3d_runner — reusable Vulkan plumbing.
    add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -412,6 +445,9 @@ if (DAEDALUS_BUILD_VULKAN)
        ${LPF8_SPV}
        ${CDEF_SPV}
        ${H264DEBLOCK_SPV}
        ${H264_IDCT4_SPV}
        ${H264_IDCT8_SPV}
        ${H264_QPEL_MC20_SPV}
        DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
    )
 endif()
@@ -419,9 +455,33 @@ endif()
 # pkg-config file.  Vulkan goes in Requires.private (consumer's
 # pkg-config call gets it via --static).  pthread + dl are needed
 # by the static archive's runtime helpers.
 #
 # `prefix` is derived from ${pcfiledir} so the .pc is relocatable:
 # pkg-config substitutes ${pcfiledir} with the directory holding the
 # .pc at lookup time, and the relative path from
 # <prefix>/<libdir>/pkgconfig back to <prefix> tells pkg-config the
 # install prefix without baking it in.  This is why
 # `cmake --install build --prefix /foo` produces a .pc that correctly
 # resolves `prefix=/foo` instead of baking whatever CMAKE_INSTALL_PREFIX
 # was at *configure* time (default /usr/local).  DESTDIR-staged
 # installs work too: at runtime pkg-config sees the .pc at its real
 # install path and computes the right prefix.
 #
 # Relative-path depth is computed from CMAKE_INSTALL_LIBDIR (and
 # whatever multiarch tuple GNUInstallDirs adds) so Debian-style
 # `lib/aarch64-linux-gnu/pkgconfig/...` resolves with the right number
 # of `..` components.  Layouts where libdir is *not* under prefix are
 # not supported by this scheme; if a packager overrides libdir to an
 # absolute path the relative-path machinery falls back to the absolute
 # value (CMake's file(RELATIVE_PATH) prepends `..` until they meet),
 # which is also relocatable but no longer prefix-agnostic.
 file(RELATIVE_PATH PKGCONFIG_PCDIR_TO_PREFIX
    "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig"
    "${CMAKE_INSTALL_PREFIX}")
 set(PKGCONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/daedalus-fourier.pc)
 file(WRITE ${PKGCONFIG_OUT}
-"prefix=${CMAKE_INSTALL_PREFIX}
+"prefix=\${pcfiledir}/${PKGCONFIG_PCDIR_TO_PREFIX}
 exec_prefix=\${prefix}
 libdir=\${prefix}/${CMAKE_INSTALL_LIBDIR}
 includedir=\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}
@@ -459,6 +519,8 @@ add_executable(test_api_h264
    tests/h264_idct4_ref.c
    tests/h264_idct8_ref.c
    tests/h264_deblock_ref.c
    tests/h264_h_loop_filter_luma_ref.c
    tests/h264_chroma_loop_filter_ref.c
    tests/h264_qpel8_mc20_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
@@ -468,6 +530,10 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
 target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
 target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
 add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
 target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
 target_compile_options(bench_pool_overhead PRIVATE -O2)
 if (DAEDALUS_BUILD_VULKAN)
 # (re-open the conditional so the closing endif() below balances)
@@ -4,9 +4,9 @@
 This document is forward-looking. It describes the generalized multi-SoC daedalus daemon architecture, but the immediate work block stays "finish Pi 5". Re-read this when:
- A second aarch64 host without a working kernel-side V4L2 stateless decoder shows up in the fleet (most likely candidate: Pi 4, which has V3D 4.x and no rpivid stable upstream).
+- HW decode on noether (Pi 4, the user's interactive workstation) becomes a real ask and rpivid upstream is still unstable. This is the most likely trigger — same SoC class as Pi 5 but weaker V3D 4.x, so it would need the caps-file mechanism plus an extra row's worth of substrate measurements.
- A specific working-copy slowdown that the current Pi-5-only daedalus can't address motivates the generalization.
+- AV1 playback on boltzmann (RK3588) starts mattering. rkvdec doesn't cover AV1, so the daedalus path becomes the only HW-accelerated option, and Mali Valhall compute substrate decisions need their own caps row.
- libva-v4l2-request-fourier evolves to need multi-node negotiation (currently it picks the first matching V4L2 node).
+- libva-v4l2-request-fourier evolves to need multi-node negotiation (today it picks the first matching V4L2 node; a host with both rkvdec and daedalus-v4l2 nodes wants a preference policy).
 Until then: this is decision context, not a TODO.
@@ -51,13 +51,17 @@ The mfritsche fleet has heterogeneous aarch64 hardware decoders:
 | SoC | Host(s) | H.264 | HEVC | VP9 | AV1 | GPU compute |
 |---|---|---|---|---|---|---|
-| BCM2712 (Pi 5) | higgs, broglie | none | V3D7 (rpi-hevc-dec — SPS quirks) | none | none | V3D7 (Vulkan compute, queryable) |
+| BCM2712 (Pi 5) | higgs, hertz, broglie, tesla (LXD on hertz) | none | V3D7 (rpi-hevc-dec — SPS quirks) | none | none | V3D7 (Vulkan compute, queryable) |
-| BCM2711 (Pi 4) | dcw3 | rpivid (out of tree, unstable) | rpivid (out of tree, unstable) | none | none | V3D4 (Vulkan compute, weaker) |
+| BCM2711 (Pi 4) | noether (interactive workstation), dcw3, dcw2 | rpivid (out of tree, unstable) | rpivid (out of tree, unstable) | none | none | V3D4 (Vulkan compute, weaker) |
-| RK3588 | hertz, tesla | rkvdec V4L2 stateless (upstream) | rkvdec V4L2 stateless | rkvdec V4L2 stateless | none (rkvdec lacks AV1) | Mali Valhall (panvk) + RK NPU |
+| RK3588 | boltzmann (32 GB, kernel-dev / MCP hub, 8 W always-on) | rkvdec V4L2 stateless (upstream) | rkvdec V4L2 stateless | rkvdec V4L2 stateless | none (rkvdec lacks AV1) | Mali Valhall (panvk-bifrost-video in dev) + RK NPU |
-| Allwinner H6 | (not in current fleet, but Cedrus exists) | Cedrus V4L2 | Cedrus V4L2 | none | none | Mali Bifrost |
+| Allwinner H6 | (not in current fleet, but Cedrus exists upstream) | Cedrus V4L2 | Cedrus V4L2 | none | none | Mali Bifrost |
 No single SoC has a complete codec set. RK3588 lacks AV1; Pi 5 lacks H.264 + VP9 + AV1; Pi 4 has rpivid (out-of-tree, kernel-version-fragile); Allwinner Cedrus is H.264/HEVC only.
 A note on the Pi 5 row: hertz and tesla share hardware (tesla is an LXD container hosted on hertz) but are operationally distinct — tesla is the distcc/MCP worker, hertz is the LXD host with all the cron automations and the 17-tool lmcp hub. From a daedalus deployment perspective they count as **one** Pi 5 substrate; from a workflow perspective they're separate boxes.
 A note on noether: it's the user's interactive workstation (Pi 4, BCM2711). Firefox + mpv run here. Any "I want HW decode on my main box" pressure lands first on this host, which puts Pi 4 (V3D4 + maybe-rpivid) closer to the front of the queue than the original draft of this document suggested.
 The current daedalus model — "kernel substitution + libavcodec front end" — is the right answer for **Pi 5 specifically**, where no usable kernel V4L2 stateless decoder exists for the codecs we care about, and a Vulkan-capable GPU (V3D7) is available to help on a few kernels.
 The model is **not** the right answer for SoCs that already have working V4L2 stateless decoders for the requested codec — those should be passed through, not re-implemented through libavcodec + kernel substitution.
@@ -207,15 +211,15 @@ Pass-through plugins are *thin* — they translate the daedalus daemon's wire pr
 **Today's calculus:**
- Pi 5 daedalus path is the only thing in the fleet that uses daedalus daemon. Generalizing for a single user is overdesign.
+- Pi 5 (higgs + hertz + broglie + tesla) is **four hosts**, but **one SoC**. Adding a fifth Pi 5 host wouldn't pressure-test the architecture; they all share BCM2712 caps so the substrate decisions are identical across the row.
- RK3588 uses rkvdec directly through libva-v4l2-request-fourier; daedalus daemon is **not in the path** for any RK3588 codec. The "RK3588 support" the architecture above proposes is mostly a no-op routing decision plus an AV1 fallback that doesn't yet measure on Mali.
+- boltzmann (RK3588) is the only non-Pi-5 always-on host in the fleet, and it uses rkvdec directly through libva-v4l2-request-fourier — daedalus daemon is **not in the path** for any RK3588 codec on it. The "RK3588 support" the architecture above proposes is mostly a no-op routing decision plus an AV1 fallback that doesn't yet measure on Mali. No forcing pressure from boltzmann today.
- Pi 4 with rpivid is the only realistic second motivator. rpivid upstream stability is the gate — if it lands cleanly, Pi 4 takes the pass-through path with no kernel substitution needed. If it stays out-of-tree-fragile, **then** the substrate-composed path with V3D4 + NEON becomes the right backend for Pi 4, and we need the per-SoC caps mechanism to handle V3D4's weaker compute.
+- noether (Pi 4, this user's interactive workstation) and dcw3/dcw2 (also Pi 4) are the real second-SoC candidates. The gate is rpivid upstream stability: if it lands cleanly, Pi 4 takes the pass-through path with zero kernel substitution work. If it stays out-of-tree-fragile, **then** the substrate-composed path with V3D4 + NEON becomes the right backend for Pi 4, and we need the per-SoC caps mechanism to handle V3D4's weaker compute.
 - The recipe layer in daedalus-fourier already scales cleanly. Adding more substrates is incremental, not architectural.
 **The forcing function that flips this from "deferred" to "do it":**
- Pi 4 enters daily use and rpivid is still not stable upstream — implies we need a Pi 4 substrate-composed path, which means at minimum a second caps file and the loader for it. At that point, building the full pluggable scaffold becomes proportionate.
+- **noether-as-Firefox-host** — the user starts wanting HW decode on their main workstation and rpivid is still not stable upstream. Implies a Pi 4 substrate-composed path, which means at minimum a second caps file and the loader for it. At that point, building the full pluggable scaffold becomes proportionate. This is the most likely trigger; noether is already a daily-driver Pi 4.
- **Or:** an x86 host enters the fleet running mesa-panvk on a Pi-CM5-like board, and we need the daedalus daemon to discover it dynamically rather than being baked at build time.
+- **boltzmann-as-AV1-decoder** — RK3588 has no AV1 HW decoder, and the user wants AV1 playback there (currently CPU-only). Triggers a cycle-5–equivalent measurement campaign on Mali Valhall to see whether `daedalus_recipe_dispatch_cdef_8x8` (or follow-on AV1 kernels) is worth running on Mali compute. If yes, we need an RK3588 caps file that overrides only the AV1 row while leaving H.264/HEVC/VP9 on rkvdec pass-through.
 - **Or:** a third-party Pi 5 user needs to swap shaders for V3D firmware experiments without rebuilding the daemon — at that point dynamic shader loading + caps overrides become a feature ask.
 Until one of those happens: keep daedalus daemon Pi 5 specific. Push cross-SoC abstraction *up* to libva-v4l2-request-fourier (which already does most of it) rather than *down* into the daemon.
@@ -242,6 +246,7 @@ Until one of those happens: keep daedalus daemon Pi 5 specific. Push cross-SoC a
 |---|---|---|
 | 2026-05-23 | **Defer generalization.** Finish Pi 5 substitution arc (cycle 9 PR #90 pending), then pivot to bug-fix backlog (daemon SEGV #145, D-state #146) before architecture work. | Architecture pivot is a multi-week scope; Pi 5 path is the only user-visible motivator today; deferring loses nothing because the recipe layer already abstracts kernels and libva-v4l2-request-fourier already abstracts V4L2 nodes. |
 | 2026-05-23 | **Document the design now, even though it's deferred.** | Captures the conceptual gap (shaders ≠ hardware decoders) and the two-backend conclusion while the analysis is fresh; saves re-litigating in 3–6 months. |
 | 2026-05-23 | **Correct fleet hardware mapping.** Original draft had hertz/tesla under RK3588 and omitted boltzmann + noether entirely. Verified via `/proc/device-tree/compatible`: hertz + tesla are Pi 5 (BCM2712), noether is Pi 4 (BCM2711), boltzmann is the only RK3588 in the fleet. Adjusted "Why deferred" / forcing-function reasoning accordingly — Pi 5 row is now 4 hosts (one SoC), noether is the realistic Pi 4 trigger, boltzmann is the realistic RK3588 trigger via AV1. | Original draft was speculative on host-to-SoC mapping; verified state changes which forcing functions are credible. |
 ---
@@ -263,6 +263,58 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* H.264 luma "h_loop_filter" — sibling of _v, applies filter
 * HORIZONTALLY across a VERTICAL edge (16 rows tall; pix points to
 * row 0 of the right block, col 0 = leftmost output column).  Same
 * non-intra (bS < 4) variant.
 *
 * Each tile is 8 cols x 16 rows of context (cols -4..+3 around the
 * edge).  dst_off points to row 0 col 0 of the RIGHT block.
 *
 * Constraint: (dst_off % dst_stride) >= 4 (the kernel reads p3 at
 * pix[-4]).  Caller must ensure this.
 *
 * QPU shader for the H variant is not yet implemented; recipe table
 * routes AUTO to CPU NEON.  An explicit DAEDALUS_SUBSTRATE_QPU on
 * the _h dispatch returns -1 rather than silently degrading.
 */
 int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* H.264 chroma (4:2:0) loop filters — bS<4 variant.  Chroma uses
 * the SAME daedalus_h264_deblock_meta struct as luma but on smaller
 * tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8
 * rows for H (4 segments of 2 rows).  Each segment has its own tc0
 * strength (tc0[s] applies to both cells in segment s).
 *
 * Algorithm difference vs luma: chroma updates only p0 and q0
 * (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no
 * luma-style ap/aq side-condition bonus).
 *
 * QPU shaders for chroma deblock not implemented yet; recipe table
 * routes AUTO to CPU NEON.  Explicit SUBSTRATE_QPU returns -1.
 */
 int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* -------------------------------------------------------------------
 * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
@@ -309,6 +361,9 @@ typedef enum {
    DAEDALUS_KERNEL_H264_IDCT8      = 7,
    DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
    DAEDALUS_KERNEL_H264_QPEL_MC20  = 9,
    DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
    DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
    DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
 } daedalus_kernel;
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
@@ -40,6 +40,12 @@ struct daedalus_ctx {
    v3d_pipeline  cdef_pipe;
    int           h264deblock_pipe_ready;
    v3d_pipeline  h264deblock_pipe;
    int           h264_idct4_pipe_ready;
    v3d_pipeline  h264_idct4_pipe;
    int           h264_idct8_pipe_ready;
    v3d_pipeline  h264_idct8_pipe;
    int           h264_qpel_mc20_pipe_ready;
    v3d_pipeline  h264_qpel_mc20_pipe;
 };
 daedalus_ctx *daedalus_ctx_create(void)
@@ -53,6 +59,25 @@ daedalus_ctx *daedalus_ctx_create(void)
 daedalus_ctx *daedalus_ctx_create_no_qpu(void)
 {
    /*
     * Per the "QPU is default substrate" decree 2026-05-23:
     * setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
     * function to a full daedalus_ctx_create(), letting the libavcodec
     * substitution shims (which call create_no_qpu via pthread_once)
     * fire the V3D shaders that exist for cycles 1/2/4/5/8.  Without
     * this hook each consumer process (firefox, mpv, daemon) would
     * need its own shim build to opt into QPU.
     *
     * Default behaviour (env var unset / not "1") is unchanged: pure
     * NEON ctx, no implicit Vulkan init.  Firefox / mpv consumers
     * that dlopen libavcodec without opting in stay on the
     * Vulkan-free path; the daemon explicitly sets
     * DAEDALUS_FORCE_QPU=1 before loading libavcodec.
     */
    const char *force = getenv("DAEDALUS_FORCE_QPU");
    if (force && force[0] == '1' && force[1] == 0)
        return daedalus_ctx_create();
    daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
    if (!ctx) return NULL;
    ctx->has_qpu = 0;
@@ -75,6 +100,9 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
        if (ctx->mc8h_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
        if (ctx->cdef_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
        if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
        if (ctx->h264_idct4_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
        if (ctx->h264_idct8_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
        if (ctx->h264_qpel_mc20_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc20_pipe);
        v3d_runner_destroy(ctx->runner);
    }
    free(ctx);
@@ -84,16 +112,28 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
 {
     /*
      * Recipe table per the "QPU is default substrate" decree
      * 2026-05-23.  Any kernel that has a V3D compute shader returns
      * SUBSTRATE_QPU; CPU is the fallback for kernels without a
      * shader (currently the H.264 luma-H and chroma V/H deblock
      * kernels; the H.264 IDCT 4x4 / IDCT 8x8 / qpel mc20 shaders
      * tracked as follow-on task 165 are routed to QPU below).  The
      * dispatch wrappers already fall back to CPU automatically when
      * the ctx doesn't have QPU available (daedalus_ctx_has_qpu == 0).
      */
    switch (k) {
    case DAEDALUS_KERNEL_VP9_IDCT8:        return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:   return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_VP9_MC_8H:        return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_VP9_MC_8H:        return DAEDALUS_SUBSTRATE_QPU;	/* v3d_mc_8h.spv */
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:   return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_QPU;	/* v3d_cdef.spv */
-    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct4.spv */
-    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct8.spv */
-    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264deblock.spv */
-    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:  return DAEDALUS_SUBSTRATE_CPU;	/* QPU H shader pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -118,6 +158,12 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride
 extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
 extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
 extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta, int8_t *tc0);
 extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta, int8_t *tc0);
 extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);
@@ -229,6 +275,51 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
    return 0;
 }
 /*
  * CPU path for the H.264 luma h_loop_filter dispatch.
  *
  * Visits the n_edges entries of meta in order and runs
  * ff_h264_h_loop_filter_luma_neon on dst + dst_off for each one,
  * using that entry's alpha, beta and four tc0 values.  ctx is not
  * used.  Always returns 0.
  */
 static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
     uint8_t *dst, size_t dst_stride,
     size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
     const ptrdiff_t stride = (ptrdiff_t) dst_stride;
     size_t edge_idx;

     (void) ctx;
     for (edge_idx = 0; edge_idx < n_edges; edge_idx++) {
         const daedalus_h264_deblock_meta *edge = &meta[edge_idx];
         /* The NEON routine takes a non-const tc0 pointer while meta
          * is const, so it gets a per-edge scratch copy. */
         int8_t tc0_scratch[4];

         tc0_scratch[0] = edge->tc0[0];
         tc0_scratch[1] = edge->tc0[1];
         tc0_scratch[2] = edge->tc0[2];
         tc0_scratch[3] = edge->tc0[3];
         ff_h264_h_loop_filter_luma_neon(dst + edge->dst_off, stride,
                                          edge->alpha, edge->beta,
                                          tc0_scratch);
     }
     return 0;
 }
 /*
  * CPU path for the H.264 chroma v_loop_filter dispatch.
  *
  * Visits the n_edges entries of meta in order and runs
  * ff_h264_v_loop_filter_chroma_neon on dst + dst_off for each one,
  * using that entry's alpha, beta and four tc0 values.  ctx is not
  * used.  Always returns 0.
  */
 static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
     uint8_t *dst, size_t dst_stride,
     size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
     const ptrdiff_t stride = (ptrdiff_t) dst_stride;
     size_t edge_idx;

     (void) ctx;
     for (edge_idx = 0; edge_idx < n_edges; edge_idx++) {
         const daedalus_h264_deblock_meta *edge = &meta[edge_idx];
         /* The NEON routine takes a non-const tc0 pointer while meta
          * is const, so it gets a per-edge scratch copy. */
         int8_t tc0_scratch[4];

         tc0_scratch[0] = edge->tc0[0];
         tc0_scratch[1] = edge->tc0[1];
         tc0_scratch[2] = edge->tc0[2];
         tc0_scratch[3] = edge->tc0[3];
         ff_h264_v_loop_filter_chroma_neon(dst + edge->dst_off, stride,
                                             edge->alpha, edge->beta,
                                             tc0_scratch);
     }
     return 0;
 }
 /*
  * CPU path for the H.264 chroma h_loop_filter dispatch.
  *
  * Visits the n_edges entries of meta in order and runs
  * ff_h264_h_loop_filter_chroma_neon on dst + dst_off for each one,
  * using that entry's alpha, beta and four tc0 values.  ctx is not
  * used.  Always returns 0.
  */
 static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
     uint8_t *dst, size_t dst_stride,
     size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
     const ptrdiff_t stride = (ptrdiff_t) dst_stride;
     size_t edge_idx;

     (void) ctx;
     for (edge_idx = 0; edge_idx < n_edges; edge_idx++) {
         const daedalus_h264_deblock_meta *edge = &meta[edge_idx];
         /* The NEON routine takes a non-const tc0 pointer while meta
          * is const, so it gets a per-edge scratch copy. */
         int8_t tc0_scratch[4];

         tc0_scratch[0] = edge->tc0[0];
         tc0_scratch[1] = edge->tc0[1];
         tc0_scratch[2] = edge->tc0[2];
         tc0_scratch[3] = edge->tc0[3];
         ff_h264_h_loop_filter_chroma_neon(dst + edge->dst_off, stride,
                                             edge->alpha, edge->beta,
                                             tc0_scratch);
     }
     return 0;
 }
 static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -291,13 +382,13 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
    }
    v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
-    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
+    if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
-        v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
+        v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
    }
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
-        v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+        v3d_runner_release_buffer(ctx->runner, &buf_dst);
-        v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
+        v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
    }
    /* Upload. Coeffs and meta are straight copies. Dst we copy the
@@ -325,8 +416,8 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
        ._pad = 0,
    };
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail;
-    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBuffer cb = ctx->idct8_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    vkBeginCommandBuffer(cb, &cbbi);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
@@ -344,15 +435,15 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
    /* Read-back dst. */
    memcpy(dst, buf_dst.mapped, max_byte_touched);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
+    v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
    return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
+    v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
    return -1;
 }
@@ -424,9 +515,9 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
    size_t dst_window_size = hi - lo;
    v3d_buffer buf_meta = {0}, buf_dst = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
-        v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1;
+        v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1;
    }
    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
@@ -442,8 +533,8 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;
    uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail;
-    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBuffer cb = p->cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    vkBeginCommandBuffer(cb, &cbbi);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
@@ -468,12 +559,12 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
    memcpy(dst + lo, buf_dst.mapped, dst_window_size);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    return -1;
 }
@@ -509,9 +600,9 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
    }
    v3d_buffer bm = {0}, bd = {0}, bs = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_max,     &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,     &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
-    if (v3d_runner_create_buffer(ctx->runner, src_max,     &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, src_max,     &bs)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);
@@ -530,8 +621,8 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
    mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
                 .dst_stride_u8 = (uint32_t) dst_stride,
                 .src_stride_u8 = (uint32_t) src_stride };
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail;
-    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBuffer cb = ctx->mc8h_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    vkBeginCommandBuffer(cb, &cbbi);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
@@ -545,14 +636,14 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
    memcpy(dst, bd.mapped, dst_max);
-    v3d_runner_destroy_buffer(ctx->runner, &bs);
+    v3d_runner_release_buffer(ctx->runner, &bs);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &bs);
+    v3d_runner_release_buffer(ctx->runner, &bs);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
 }
@@ -588,9 +679,9 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
    size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
    v3d_buffer bm = {0}, bd = {0}, bt = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
-    if (v3d_runner_create_buffer(ctx->runner, tmp_bytes,  &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes,  &bt)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
    /* tmp may need padding before block-origin offset (caller-allocated). Just
     * copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
@@ -615,8 +706,8 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
    cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
                   .tmp_stride_u16 = 16,
                   .dst_stride_u8 = (uint32_t) dst_stride };
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail;
-    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBuffer cb = ctx->cdef_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    vkBeginCommandBuffer(cb, &cbbi);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
@@ -630,14 +721,14 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
    memcpy(dst, bd.mapped, dst_max);
-    v3d_runner_destroy_buffer(ctx->runner, &bt);
+    v3d_runner_release_buffer(ctx->runner, &bt);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &bt);
+    v3d_runner_release_buffer(ctx->runner, &bt);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
 }
@@ -670,8 +761,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
    }
    v3d_buffer bm = {0}, bd = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
@@ -691,8 +782,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
                          .dst_stride_u8 = (uint32_t) dst_stride };
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail;
-    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBuffer cb = ctx->h264deblock_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    vkBeginCommandBuffer(cb, &cbbi);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
@@ -706,12 +797,294 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
    memcpy(dst, bd.mapped, dst_max);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
 }
 /* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
 /* Push-constant payload for v3d_h264_idct4.spv.  The four uint32 fields
  * mirror the shader's `PC` push_constant block field-for-field; its size
  * is what the dispatcher passes to v3d_runner_create_pipeline(). */
 typedef struct {
    uint32_t n_blocks;       /* number of 4x4 blocks in this dispatch */
    uint32_t dst_stride_u8;  /* destination row stride, in bytes */
    uint32_t _pad0;          /* unused; never set explicitly by the dispatcher */
    uint32_t _pad1;          /* unused; never set explicitly by the dispatcher */
 } h264_idct4_pc;
 /*
  * QPU path for the H.264 4x4 inverse transform + add.
  *
  * Stages the coefficients, the touched dst window and the per-block
  * metadata into runner buffers, dispatches v3d_h264_idct4.spv (16 blocks
  * per workgroup), copies the dst window back and finally zeroes the
  * caller's coefficient array.
  *
  *   dst, dst_stride  picture base pointer and row stride in bytes
  *   coeffs           n_blocks * 16 int16 coefficients; zeroed on success
  *   n_blocks         number of blocks; 0 is a successful no-op
  *   meta             per-block dst_off (byte offset from dst)
  *
  * Returns 0 on success, -1 on any pipeline / buffer / record / submit
  * failure.  On failure neither dst nor coeffs has been modified.
  *
  * Buffers and the command buffer go through the same acquire/release and
  * per-pipeline command-buffer helpers as the other QPU dispatchers in this
  * file, instead of creating and destroying Vulkan objects per call.
  */
 static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
    /* An empty batch would otherwise request zero-sized buffers, which
     * Vulkan does not allow (VkBufferCreateInfo::size must be > 0). */
    if (n_blocks == 0)
        return 0;
    /* Lazily build the pipeline on first use. */
    if (!ctx->h264_idct4_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
                                       3, sizeof(h264_idct4_pc),
                                       &ctx->h264_idct4_pipe) != 0)
            return -1;
        ctx->h264_idct4_pipe_ready = 1;
    }
    size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);    /* uvec4 per block */
    /* dst window = [0, end of the last byte any block writes). */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
        if (e > dst_max) dst_max = e;
    }
    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,     &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bc); return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes,  &bm)) {
        v3d_runner_release_buffer(ctx->runner, &bd);
        v3d_runner_release_buffer(ctx->runner, &bc); return -1;
    }
    memcpy(bc.mapped, coeffs, coeff_bytes);
    memcpy(bd.mapped, dst,    dst_max);
    /* The shader only consumes meta.x; the other lanes are zero-filled. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }
    /* Order matches the shader bindings: 0 = coeffs, 1 = dst, 2 = meta. */
    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
        goto fail;
    uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16);   /* 16 blocks/WG */
    h264_idct4_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264_idct4_pipe))
        goto fail;
    VkCommandBuffer cb = ctx->h264_idct4_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_idct4_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_idct4_pipe.layout, 0, 1,
                            &ctx->h264_idct4_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported by vkEndCommandBuffer; do not submit
     * a command buffer that failed to record. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto fail;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
    memcpy(dst, bd.mapped, dst_max);
    /* H.264/FFmpeg convention: zero the coeffs block after the
     * transform (matches the C ref + NEON .S behaviour). */
    memset(coeffs, 0, coeff_bytes);
    v3d_runner_release_buffer(ctx->runner, &bm);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bc);
    return 0;
 fail:
    v3d_runner_release_buffer(ctx->runner, &bm);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bc);
    return -1;
 }
 /* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
 /* Push-constant payload for v3d_h264_idct8.spv.  The four uint32 fields
  * mirror the shader's `PC` push_constant block field-for-field; its size
  * is what the dispatcher passes to v3d_runner_create_pipeline(). */
 typedef struct {
    uint32_t n_blocks;       /* number of 8x8 blocks in this dispatch */
    uint32_t dst_stride_u8;  /* destination row stride, in bytes */
    uint32_t _pad0;          /* unused; never set explicitly by the dispatcher */
    uint32_t _pad1;          /* unused; never set explicitly by the dispatcher */
 } h264_idct8_pc;
 /*
  * QPU path for the H.264 8x8 inverse transform + add.
  *
  * Stages the coefficients, the touched dst window and the per-block
  * metadata into runner buffers, dispatches v3d_h264_idct8.spv (8 blocks
  * per workgroup), copies the dst window back and finally zeroes the
  * caller's coefficient array.
  *
  *   dst, dst_stride  picture base pointer and row stride in bytes
  *   coeffs           n_blocks * 64 int16 coefficients; zeroed on success
  *   n_blocks         number of blocks; 0 is a successful no-op
  *   meta             per-block dst_off (byte offset from dst)
  *
  * Returns 0 on success, -1 on any pipeline / buffer / record / submit
  * failure.  On failure neither dst nor coeffs has been modified.
  *
  * Buffers and the command buffer go through the same acquire/release and
  * per-pipeline command-buffer helpers as the other QPU dispatchers in this
  * file, instead of creating and destroying Vulkan objects per call.
  */
 static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
    /* An empty batch would otherwise request zero-sized buffers, which
     * Vulkan does not allow (VkBufferCreateInfo::size must be > 0). */
    if (n_blocks == 0)
        return 0;
    /* Lazily build the pipeline on first use. */
    if (!ctx->h264_idct8_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
                                       3, sizeof(h264_idct8_pc),
                                       &ctx->h264_idct8_pipe) != 0)
            return -1;
        ctx->h264_idct8_pipe_ready = 1;
    }
    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);    /* uvec4 per block */
    /* dst window = [0, end of the last byte any block writes). */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
        if (e > dst_max) dst_max = e;
    }
    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,     &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bc); return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes,  &bm)) {
        v3d_runner_release_buffer(ctx->runner, &bd);
        v3d_runner_release_buffer(ctx->runner, &bc); return -1;
    }
    memcpy(bc.mapped, coeffs, coeff_bytes);
    memcpy(bd.mapped, dst,    dst_max);
    /* The shader only consumes meta.x; the other lanes are zero-filled. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }
    /* Order matches the shader bindings: 0 = coeffs, 1 = dst, 2 = meta. */
    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
        goto fail;
    uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8);   /* 8 blocks/WG */
    h264_idct8_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264_idct8_pipe))
        goto fail;
    VkCommandBuffer cb = ctx->h264_idct8_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_idct8_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_idct8_pipe.layout, 0, 1,
                            &ctx->h264_idct8_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported by vkEndCommandBuffer; do not submit
     * a command buffer that failed to record. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto fail;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
    memcpy(dst, bd.mapped, dst_max);
    /* Same convention as the 4x4 path: coefficients are cleared once the
     * transform has been applied. */
    memset(coeffs, 0, coeff_bytes);
    v3d_runner_release_buffer(ctx->runner, &bm);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bc);
    return 0;
 fail:
    v3d_runner_release_buffer(ctx->runner, &bm);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bc);
    return -1;
 }
 /* -------------------- H.264 qpel mc20 QPU dispatch (cycle 9) --- */
 /* Push-constant payload for v3d_h264_qpel_mc20.spv.  The four uint32
  * fields mirror the shader's `PC` push_constant block field-for-field;
  * its size is what the dispatcher passes to v3d_runner_create_pipeline(). */
 typedef struct {
    uint32_t n_blocks;   /* number of 8x8 blocks in this dispatch */
    uint32_t stride_u8;  /* row stride in bytes, shared by src and dst */
    uint32_t _pad0;      /* unused; never set explicitly by the dispatcher */
    uint32_t _pad1;      /* unused; never set explicitly by the dispatcher */
 } h264_qpel_mc20_pc;
 /*
  * QPU path for H.264 luma qpel mc20 (8x8, horizontal half-pel).
  *
  * Stages the src and dst windows plus per-block metadata into runner
  * buffers, dispatches v3d_h264_qpel_mc20.spv with one workgroup per block
  * and copies the dst window back.
  *
  *   dst, src   picture base pointers; both use the same `stride` (bytes)
  *   n_blocks   number of 8x8 blocks; 0 is a successful no-op
  *   meta       per-block dst_off / src_off (byte offsets from dst / src)
  *
  * Returns 0 on success, -1 on any pipeline / buffer / record / submit
  * failure.  On failure dst has not been modified.
  *
  * Buffers and the command buffer go through the same acquire/release and
  * per-pipeline command-buffer helpers as the other QPU dispatchers in this
  * file, instead of creating and destroying Vulkan objects per call.
  */
 static int dispatch_h264_qpel_mc20_qpu(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
 {
    /* An empty batch would otherwise request zero-sized buffers, which
     * Vulkan does not allow (VkBufferCreateInfo::size must be > 0). */
    if (n_blocks == 0)
        return 0;
    /* Lazily build the pipeline on first use. */
    if (!ctx->h264_qpel_mc20_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc20.spv",
                                       3, sizeof(h264_qpel_mc20_pc),
                                       &ctx->h264_qpel_mc20_pipe) != 0)
            return -1;
        ctx->h264_qpel_mc20_pipe_ready = 1;
    }
    /* Compute the smallest contiguous src/dst window that covers
     * every block's read/write footprint.
     *
     * src: filter reads cols (c-2)..(c+3) for c=0..7 across rows 0..7.
     *      Highest read = src_off + 7*stride + (7 + 3) = src_off + 7*stride + 10.
     *      Plus 1 for the byte-count semantic of memcpy (length=N copies
     *      indices 0..N-1) → src_max = src_off + 7*stride + 11.
     *
     * dst: writes cols 0..7 across rows 0..7.
     *      Highest write = dst_off + 7*stride + 7; +1 → dst_off + 7*stride + 8. */
    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t src_max = 0, dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t s_end = meta[i].src_off + (size_t) 7 * stride + 11;
        size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
        if (s_end > src_max) src_max = s_end;
        if (d_end > dst_max) dst_max = d_end;
    }
    v3d_buffer bs = {0}, bd = {0}, bm = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, src_max,    &bs)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,    &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bs); return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_release_buffer(ctx->runner, &bd);
        v3d_runner_release_buffer(ctx->runner, &bs); return -1;
    }
    /* Copy src window (filter needs cols -2..+3, captured by src_max
     * upper bound above; the lower bound is implicit in src_off >= 2
     * which the caller guarantees per the public API contract). */
    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);
    /* meta.x = dst_off, meta.y = src_off; .z/.w are zero-filled. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }
    /* Order matches the shader bindings: 0 = src, 1 = dst, 2 = meta. */
    v3d_buffer binds[3] = { bs, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_qpel_mc20_pipe, binds, 3))
        goto fail;
    uint32_t wg_count = (uint32_t) n_blocks;   /* 1 block per WG */
    h264_qpel_mc20_pc pc = {
        .n_blocks  = (uint32_t) n_blocks,
        .stride_u8 = (uint32_t) stride,
    };
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264_qpel_mc20_pipe))
        goto fail;
    VkCommandBuffer cb = ctx->h264_qpel_mc20_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_qpel_mc20_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_qpel_mc20_pipe.layout, 0, 1,
                            &ctx->h264_qpel_mc20_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_qpel_mc20_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported by vkEndCommandBuffer; do not submit
     * a command buffer that failed to record. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto fail;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
    memcpy(dst, bd.mapped, dst_max);
    v3d_runner_release_buffer(ctx->runner, &bm);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bs);
    return 0;
 fail:
    v3d_runner_release_buffer(ctx->runner, &bm);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bs);
    return -1;
 }
@@ -803,8 +1176,16 @@ int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
+    daedalus_substrate eff = sub;
-                   dst, dst_stride, coeffs, n_blocks, meta);
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4);
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
                                       coeffs, n_blocks, meta);
    return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
                                   coeffs, n_blocks, meta);
 }
 int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -812,8 +1193,16 @@ int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
+    daedalus_substrate eff = sub;
-                   dst, dst_stride, coeffs, n_blocks, meta);
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
                                       coeffs, n_blocks, meta);
    return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
                                   coeffs, n_blocks, meta);
 }
 int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -830,12 +1219,71 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
 }
 /*
  * Substrate router for the horizontal-edge luma deblock kernel.
  *
  * AUTO is resolved through the recipe table.  There is no QPU shader for
  * this variant, so the outcome is one of:
  *   - QPU requested and the context has a QPU: fail with -1 (an explicit
  *     QPU request is not silently downgraded);
  *   - QPU requested but the context has no QPU: run on the CPU;
  *   - anything else: run on the CPU.
  */
 int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH)
            : sub;

    /* The QPU capability is only queried when QPU was actually selected. */
    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
 }
 /*
  * Substrate router for the vertical-edge chroma deblock kernel.
  *
  * AUTO is resolved through the recipe table.  No chroma QPU shader exists
  * yet: a QPU selection on a QPU-capable context returns -1, every other
  * case (including QPU on a context without a QPU) runs the CPU kernel.
  */
 int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)
            : sub;

    /* The QPU capability is only queried when QPU was actually selected. */
    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
 }
 /*
  * Substrate router for the horizontal-edge chroma deblock kernel.
  *
  * AUTO is resolved through the recipe table.  No chroma QPU shader exists
  * yet: a QPU selection on a QPU-capable context returns -1, every other
  * case (including QPU on a context without a QPU) runs the CPU kernel.
  */
 int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)
            : sub;

    /* The QPU capability is only queried when QPU was actually selected. */
    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
 }
 int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_QPEL_MC20, dispatch_h264_qpel_mc20_cpu,
+    daedalus_substrate eff = sub;
-                   dst, src, stride, n_blocks, meta);
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20);
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_qpel_mc20_cpu(ctx, dst, src, stride,
                                           n_blocks, meta);
    return dispatch_h264_qpel_mc20_qpu(ctx, dst, src, stride,
                                       n_blocks, meta);
 }
 /* -------------------- Recipe convenience wrappers --------------- */
@@ -909,6 +1357,30 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
                                                  dst, dst_stride, n_edges, meta);
 }
 /* Recipe convenience wrapper: horizontal-edge luma deblock with the
  * substrate left to the dispatcher (DAEDALUS_SUBSTRATE_AUTO).  Returns
  * whatever the dispatcher returns. */
 int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    const daedalus_substrate by_recipe = DAEDALUS_SUBSTRATE_AUTO;
    int rc = daedalus_dispatch_h264_deblock_luma_h(ctx, by_recipe,
                                                   dst, dst_stride,
                                                   n_edges, meta);
    return rc;
 }
 /* Recipe convenience wrapper: vertical-edge chroma deblock with the
  * substrate left to the dispatcher (DAEDALUS_SUBSTRATE_AUTO).  Returns
  * whatever the dispatcher returns. */
 int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    const daedalus_substrate by_recipe = DAEDALUS_SUBSTRATE_AUTO;
    int rc = daedalus_dispatch_h264_deblock_chroma_v(ctx, by_recipe,
                                                     dst, dst_stride,
                                                     n_edges, meta);
    return rc;
 }
 /* Recipe convenience wrapper: horizontal-edge chroma deblock with the
  * substrate left to the dispatcher (DAEDALUS_SUBSTRATE_AUTO).  Returns
  * whatever the dispatcher returns. */
 int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    const daedalus_substrate by_recipe = DAEDALUS_SUBSTRATE_AUTO;
    int rc = daedalus_dispatch_h264_deblock_chroma_h(ctx, by_recipe,
                                                     dst, dst_stride,
                                                     n_edges, meta);
    return rc;
 }
 int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -0,0 +1,129 @@
 // daedalus-fourier — H.264 4x4 inverse integer transform + add, V3D 7.1.
 //
 // H.264 spec §8.5.12.1.  Pure integer arithmetic — no trig constants
 // (unlike VP9 IDCT 8x8).  Row pass first, column pass second; round
 // (+32) >> 6, add to dst, clip to u8.
 //
 // Block memory layout: COLUMN-MAJOR.  block[c*4 + r] = coefficient at
 // (row r, column c).  Matches FFmpeg `ff_h264_idct_add_neon`.
 //
 // Workgroup layout: 64 invocations = 4 lanes/block × 16 blocks/WG.
 //   - row pass: lane k (0..3) reads row k of the block (4 coefficients,
 //               one from each column), runs the butterfly, writes 4
 //               outputs to one row of tmp_shared.
 //   - column pass: lane k reads column k of tmp_shared (4 rows),
 //                  runs the butterfly, writes 4 outputs to dst as
 //                  column k at rows 0..3.
 //
 // shared = 16 × 16 × 4 B = 1 KiB.  Well under V3D's 16 KiB limit.
 //
 // License: BSD-2-Clause.
 #version 450
 // 8-bit storage for the uint8_t dst buffer, 16-bit storage for the int16_t
 // coefficient buffer, explicit arithmetic types for the sized scalar types.
 #extension GL_EXT_shader_8bit_storage             : require
 #extension GL_EXT_shader_16bit_storage            : require
 #extension GL_EXT_shader_explicit_arithmetic_types : require
 // 64 invocations per workgroup = 16 blocks × 4 lanes per block.
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 // binding 0: input coefficients, read-only.
 layout(binding = 0) readonly buffer Coeffs {
    int16_t coeffs[];   // N × 16 column-major
 } u_coeffs;
 // binding 1: destination pixels, read-modify-write (residual is added).
 layout(binding = 1) buffer Dst {
    uint8_t dst[];      // H × stride bytes (caller-provided base)
 } u_dst;
 // binding 2: one uvec4 per block; only .x is read by this shader.
 layout(binding = 2) readonly buffer Meta {
    uvec4 meta[];       // .x = dst_off (byte offset into u_dst.dst)
 } u_meta;
 // Push constants; _pad0/_pad1 are not read by this shader.
 layout(push_constant) uniform PC {
    uint n_blocks;
    uint dst_stride_u8;
    uint _pad0, _pad1;
 } pc;
 // 16 blocks per WG × 16 ints per block = 256 ints = 1 KiB shared.
 // Holds the row-pass output so the column pass can read it transposed.
 shared int tmp_shared[16 * 16];
 // 1D butterfly per H.264 §8.5.12.1.  d[0..3] in, o[0..3] out.
 // One-dimensional 4-point H.264 inverse-transform butterfly.
 // Inputs d0..d3, outputs o0..o3.  The even half combines d0/d2, the odd
 // half combines d1/d3 with the arithmetic right shifts by one.
 void idct4_1d(int d0, int d1, int d2, int d3,
               out int o0, out int o1, out int o2, out int o3)
 {
    // Odd half.
    int odd_lo = (d1 >> 1) - d3;
    int odd_hi = d1 + (d3 >> 1);
    // Even half.
    int even_sum  = d0 + d2;
    int even_diff = d0 - d2;
    // Outer pair uses sum/odd_hi, inner pair uses diff/odd_lo.
    o0 = even_sum  + odd_hi;
    o3 = even_sum  - odd_hi;
    o1 = even_diff + odd_lo;
    o2 = even_diff - odd_lo;
 }
 // Entry point: each group of 4 consecutive lanes transforms one 4x4 block.
 // Pass 1 (rows) goes through tmp_shared; pass 2 (columns) adds the rounded
 // result to dst and clamps to 0..255.
 void main()
 {
    // 64 lanes per workgroup = 16 block slots × 4 lanes.
    uint gid       = gl_GlobalInvocationID.x;
    uint wg        = gid >> 6;
    uint lane      = gid & 63u;
    uint slot      = lane >> 2;            // block slot within the WG, 0..15
    uint k         = lane & 3u;            // row (pass 1) / column (pass 2), 0..3
    uint block_idx = wg * 16u + slot;
    bool in_range  = block_idx < pc.n_blocks;
    uint tile      = slot * 16u;           // this block's region of tmp_shared

    // ---- Row pass: lane k transforms row k ---------------------------
    // Column-major input, so row k is every 4th element starting at k.
    if (in_range) {
        uint cbase = block_idx * 16u + k;
        int d[4];
        for (uint c = 0u; c < 4u; ++c)
            d[c] = int(u_coeffs.coeffs[cbase + c * 4u]);
        int row[4];
        idct4_1d(d[0], d[1], d[2], d[3], row[0], row[1], row[2], row[3]);
        uint trow = tile + k * 4u;
        for (uint j = 0u; j < 4u; ++j)
            tmp_shared[trow + j] = row[j];
    }
    // Outside the conditional so every lane in the workgroup reaches it.
    barrier();
    // ---- Column pass: lane k transforms column k ---------------------
    if (in_range) {
        int s[4];
        for (uint r = 0u; r < 4u; ++r)
            s[r] = tmp_shared[tile + r * 4u + k];
        int col[4];
        idct4_1d(s[0], s[1], s[2], s[3], col[0], col[1], col[2], col[3]);
        // Column k of the block inside dst, rows 0..3.
        uint origin = u_meta.meta[block_idx].x + k;
        uint stride = pc.dst_stride_u8;
        // Load all four predictions before storing anything.
        int px[4];
        for (uint r = 0u; r < 4u; ++r)
            px[r] = int(u_dst.dst[origin + r * stride]);
        for (uint r = 0u; r < 4u; ++r)
            u_dst.dst[origin + r * stride] =
                uint8_t(clamp(px[r] + ((col[r] + 32) >> 6), 0, 255));
    }
 }
@@ -0,0 +1,175 @@
 // daedalus-fourier — H.264 8x8 inverse integer transform + add, V3D 7.1.
 //
 // H.264 spec §8.5.13.2 (High profile 8x8 IT).  Pure integer arithmetic
 // — different butterfly from VP9 IDCT 8x8 (cycle 1, uses cospi
 // multipliers).  Row pass first, column pass second; round (+32) >> 6,
 // add to dst, clip to u8.
 //
 // Block layout: COLUMN-MAJOR.  block[c*8 + r] = coefficient at
 // (row r, column c).  Matches FFmpeg `ff_h264_idct8_add_neon`.
 //
 // Workgroup layout: 64 invocations = 8 lanes/block × 8 blocks/WG.
 //   - row pass: lane k (0..7) reads row k of the block (8 coefficients,
 //               one from each column), runs the butterfly, writes 8
 //               outputs to one row of tmp_shared.
 //   - column pass: lane k reads column k of tmp_shared (8 rows),
 //                  runs the butterfly, writes 8 outputs to dst as
 //                  column k at rows 0..7.
 //
 // shared = 8 × 64 × 4 B = 2 KiB.  Well under V3D's 16 KiB limit.
 //
 // License: BSD-2-Clause.
 #version 450
 // 8-bit storage for the uint8_t dst buffer, 16-bit storage for the int16_t
 // coefficient buffer, explicit arithmetic types for the sized scalar types.
 #extension GL_EXT_shader_8bit_storage             : require
 #extension GL_EXT_shader_16bit_storage            : require
 #extension GL_EXT_shader_explicit_arithmetic_types : require
 // 64 invocations per workgroup = 8 blocks × 8 lanes per block.
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 // binding 0: input coefficients, read-only.
 layout(binding = 0) readonly buffer Coeffs {
    int16_t coeffs[];   // N × 64 column-major
 } u_coeffs;
 // binding 1: destination pixels, read-modify-write (residual is added).
 layout(binding = 1) buffer Dst {
    uint8_t dst[];      // H × stride bytes
 } u_dst;
 // binding 2: one uvec4 per block; only .x is read by this shader.
 layout(binding = 2) readonly buffer Meta {
    uvec4 meta[];       // .x = dst_off
 } u_meta;
 // Push constants; _pad0/_pad1 are not read by this shader.
 layout(push_constant) uniform PC {
    uint n_blocks;
    uint dst_stride_u8;
    uint _pad0, _pad1;
 } pc;
 // 8 blocks/WG × 64 ints/block × 4 B = 2 KiB shared.
 // Holds the row-pass output so the column pass can read it transposed.
 shared int tmp_shared[8 * 64];
 // 1D 8-element butterfly per H.264 §8.5.13.2.
 // One-dimensional 8-point H.264 inverse-transform butterfly.
 // Inputs d0..d7, outputs g0..g7.  Even-indexed inputs feed the even half,
 // odd-indexed inputs feed the odd half; the two are merged at the end.
 void idct8_1d(int d0, int d1, int d2, int d3,
               int d4, int d5, int d6, int d7,
               out int g0, out int g1, out int g2, out int g3,
               out int g4, out int g5, out int g6, out int g7)
 {
    // ---- Even half (d0, d2, d4, d6) ----
    int ev0 = d0 + d4;
    int ev2 = d0 - d4;
    int ev4 = (d2 >> 1) - d6;
    int ev6 = d2 + (d6 >> 1);
    int even0 = ev0 + ev6;
    int even1 = ev2 + ev4;
    int even2 = ev2 - ev4;
    int even3 = ev0 - ev6;
    // ---- Odd half (d1, d3, d5, d7) ----
    int od1 = -d3 + d5 - d7 - (d7 >> 1);
    int od3 = d1 + d7 - d3 - (d3 >> 1);
    int od5 = -d1 + d7 + d5 + (d5 >> 1);
    int od7 = d3 + d5 + d1 + (d1 >> 1);
    int odd0 = od7 - (od1 >> 2);
    int odd1 = (od3 >> 2) - od5;
    int odd2 = od3 + (od5 >> 2);
    int odd3 = od1 + (od7 >> 2);
    // ---- Merge: output i and output 7-i share one even/odd pair ----
    g0 = even0 + odd0;
    g7 = even0 - odd0;
    g1 = even1 + odd1;
    g6 = even1 - odd1;
    g2 = even2 + odd2;
    g5 = even2 - odd2;
    g3 = even3 + odd3;
    g4 = even3 - odd3;
 }
 // Entry point: each group of 8 consecutive lanes transforms one 8x8 block.
 // Pass 1 (rows) goes through tmp_shared; pass 2 (columns) adds the rounded
 // result to dst and clamps to 0..255.
 void main()
 {
    // 64 lanes per workgroup = 8 block slots × 8 lanes.
    uint gid       = gl_GlobalInvocationID.x;
    uint wg        = gid >> 6;
    uint lane      = gid & 63u;
    uint slot      = lane >> 3;            // block slot within the WG, 0..7
    uint k         = lane & 7u;            // row (pass 1) / column (pass 2), 0..7
    uint block_idx = wg * 8u + slot;
    bool in_range  = block_idx < pc.n_blocks;
    uint tile      = slot * 64u;           // this block's region of tmp_shared

    // ---- Row pass: lane k transforms row k ---------------------------
    // Column-major input, so row k is every 8th element starting at k.
    if (in_range) {
        uint cbase = block_idx * 64u + k;
        int d[8];
        for (uint c = 0u; c < 8u; ++c)
            d[c] = int(u_coeffs.coeffs[cbase + c * 8u]);
        int row[8];
        idct8_1d(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7],
                 row[0], row[1], row[2], row[3],
                 row[4], row[5], row[6], row[7]);
        uint trow = tile + k * 8u;
        for (uint j = 0u; j < 8u; ++j)
            tmp_shared[trow + j] = row[j];
    }
    // Outside the conditional so every lane in the workgroup reaches it.
    barrier();
    // ---- Column pass: lane k transforms column k ---------------------
    if (in_range) {
        int s[8];
        for (uint r = 0u; r < 8u; ++r)
            s[r] = tmp_shared[tile + r * 8u + k];
        int col[8];
        idct8_1d(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
                 col[0], col[1], col[2], col[3],
                 col[4], col[5], col[6], col[7]);
        // Column k of the block inside dst, rows 0..7.
        uint origin = u_meta.meta[block_idx].x + k;
        uint stride = pc.dst_stride_u8;
        // Load all eight predictions before storing anything.
        int px[8];
        for (uint r = 0u; r < 8u; ++r)
            px[r] = int(u_dst.dst[origin + r * stride]);
        for (uint r = 0u; r < 8u; ++r)
            u_dst.dst[origin + r * stride] =
                uint8_t(clamp(px[r] + ((col[r] + 32) >> 6), 0, 255));
    }
 }
@@ -0,0 +1,83 @@
 // daedalus-fourier — H.264 luma qpel mc20 (8x8, horizontal half-pel), V3D 7.1.
 //
 // H.264 spec §8.4.2.2.1 horizontal 6-tap luma interpolation:
 //
 //   dst[r,c] = clip255(
 //       ( s[r,c-2]
 //         - 5 * s[r,c-1]
 //         + 20 * s[r,c]
 //         + 20 * s[r,c+1]
 //         -  5 * s[r,c+2]
 //         +      s[r,c+3]
 //         + 16
 //       ) >> 5)
 //
 // Single-stride: dst and src share `stride` (H264QpelContext
 // convention).  src+src_off already points at the leftmost output
 // column (col 0); the filter reads cols -2..+3.  Caller guarantees
 // edge-padding context per the public API docstring.
 //
 // Workgroup layout: 64 invocations = 1 lane per output pixel.
 // 1 block per WG; n_blocks WGs total.  This is the simplest layout
 // that avoids any inter-lane communication — each lane independently
 // reads its 6 src samples and writes its 1 dst sample.  V3D's L2
 // cache handles the redundant reads from adjacent lanes.
 //
 // License: BSD-2-Clause.
 #version 450
 // uint8_t is used as the element type of the Src/Dst storage buffers
 // below, hence the 8-bit storage + explicit arithmetic type extensions.
 #extension GL_EXT_shader_8bit_storage             : require
 #extension GL_EXT_shader_explicit_arithmetic_types : require
 // 64 invocations per workgroup; main() maps them onto one 8x8 block.
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 // Source samples; main() only reads from this buffer.
 layout(binding = 0) readonly buffer Src {
    uint8_t src[];
 } u_src;
 // Destination samples; main() writes one byte per invocation.
 layout(binding = 1) buffer Dst {
    uint8_t dst[];
 } u_dst;
 // One entry per block, indexed by the workgroup id in main().
 // Only .x and .y are read by this shader.
 layout(binding = 2) readonly buffer Meta {
    uvec4 meta[];       // .x = dst_off, .y = src_off
 } u_meta;
 // n_blocks bounds the workgroup id; stride_u8 is the single stride
 // main() applies to both the src and dst addressing.  _pad0/_pad1
 // are not referenced by main().
 layout(push_constant) uniform PC {
    uint n_blocks;
    uint stride_u8;
    uint _pad0, _pad1;
 } pc;
 void main()
 {
    // One workgroup per 8x8 block; surplus workgroups exit immediately.
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) return;

    // Row-major mapping of the 64 lanes onto the 8x8 output block.
    uint lane = gl_LocalInvocationID.x;
    uint row  = lane / 8u;    // 0..7
    uint col  = lane % 8u;    // 0..7

    uvec4 m     = u_meta.meta[blk];   // .x = dst_off, .y = src_off
    uint  pitch = pc.stride_u8;
    uint  pixel = row * pitch + col;  // offset of this lane inside the block

    // First of the six taps sits two columns left of the output column.
    // The subtraction is done in unsigned arithmetic; the caller
    // guarantees at least two bytes of left context (src_off >= 2).
    uint tap0 = m.y + pixel - 2u;
    int a = int(u_src.src[tap0]);
    int b = int(u_src.src[tap0 + 1u]);
    int c = int(u_src.src[tap0 + 2u]);
    int d = int(u_src.src[tap0 + 3u]);
    int e = int(u_src.src[tap0 + 4u]);
    int f = int(u_src.src[tap0 + 5u]);

    // Symmetric 6-tap kernel (1, -5, 20, 20, -5, 1), rounded, >> 5.
    int acc = (a + f) - 5 * (b + e) + 20 * (c + d) + 16;
    u_dst.dst[m.x + pixel] = uint8_t(clamp(acc >> 5, 0, 255));
 }
@@ -17,6 +17,18 @@
    fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
            r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
 /* Power-of-2 size classes from 2^8 (256 B) up to 2^23 (8 MiB).  Cycle
 * 1's largest dispatch with n_blocks ≈ 8K is well under 8 MiB; oversize
 * requests fall through to non-pooled allocation. */
 #define V3D_POOL_MIN_LOG2	8
 #define V3D_POOL_MAX_LOG2	23
 #define V3D_POOL_BUCKETS	(V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1)
 /* Node of a per-bucket freelist: a singly linked list whose head
 * lives in v3d_runner.pool_free[bucket].  Nodes are malloc'd when a
 * buffer is released into the pool and freed when it is handed back
 * out or when the runner is destroyed. */
 struct v3d_pool_entry {
    v3d_buffer             buf;   /* the parked buffer, stored by value */
    struct v3d_pool_entry *next;  /* next free entry in the same bucket, or NULL */
 };
 struct v3d_runner {
    VkInstance       instance;
    VkPhysicalDevice phys;
@@ -26,6 +38,15 @@ struct v3d_runner {
    VkCommandPool    pool;
    char             device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
    VkPhysicalDeviceMemoryProperties mem_props;
    /* Buffer pool: per-bucket freelist of previously-released
     * v3d_buffer.  bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2.
     * pool_total_bytes accumulates every successful vkAllocateMemory
     * we've done through the pool — never decreases (the freelist
     * just hands buffers around, no vkFreeMemory until destroy).
     */
    struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS];
    size_t                 pool_total_bytes;
 };
 static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
@@ -168,6 +189,21 @@ void v3d_runner_destroy(v3d_runner *r)
 {
    if (!r) return;
    if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device);
    /* Drain the buffer pool BEFORE destroying device — the pool
     * entries own VkBuffer/VkDeviceMemory handles, which need a live
     * device for vkDestroyBuffer/vkFreeMemory. */
    for (int b = 0; b < V3D_POOL_BUCKETS; b++) {
        struct v3d_pool_entry *e = r->pool_free[b];
        while (e) {
            struct v3d_pool_entry *next = e->next;
            v3d_runner_destroy_buffer(r, &e->buf);
            free(e);
            e = next;
        }
        r->pool_free[b] = NULL;
    }
    if (r->pool != VK_NULL_HANDLE)
        vkDestroyCommandPool(r->device, r->pool, NULL);
    if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL);
@@ -175,6 +211,92 @@ void v3d_runner_destroy(v3d_runner *r)
    free(r);
 }
 /* ---- Buffer pool ----------------------------------------------- */
 /* Map a byte size to its buffer-pool bucket index.
 *
 * The bucket for a size is ceil(log2(size)) - V3D_POOL_MIN_LOG2.
 * Every size up to the smallest class (2^V3D_POOL_MIN_LOG2 bytes,
 * including 0) lands in bucket 0.
 *
 * Returns the bucket index in [0, V3D_POOL_BUCKETS), or -1 when the
 * size exceeds the largest class (2^V3D_POOL_MAX_LOG2 bytes) and must
 * therefore bypass the pool.
 */
 static int v3d_pool_bucket_for(size_t size)
 {
    size_t rem;
    int bits = 0;

    if (size <= ((size_t)1 << V3D_POOL_MIN_LOG2))
        return 0;

    /* For size > 1, ceil(log2(size)) is the bit length of size - 1. */
    for (rem = size - 1; rem != 0; rem >>= 1)
        bits++;

    /* size > 2^MIN at this point, so bits > V3D_POOL_MIN_LOG2 already;
     * only the upper bound needs checking. */
    if (bits > V3D_POOL_MAX_LOG2)
        return -1;
    return bits - V3D_POOL_MIN_LOG2;
 }
 /* Hand out a buffer of at least `size` bytes, preferring a parked one.
 *
 * Returns 0 on success and non-zero on failure (-1 for NULL arguments
 * or size == 0, otherwise whatever v3d_runner_create_buffer() returns).
 * Pair every successful call with v3d_runner_release_buffer().
 */
 int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
 {
    if (!r || !out || size == 0)
        return -1;

    const int idx = v3d_pool_bucket_for(size);
    if (idx < 0) {
        /* Too large for any size class: allocate at the exact size,
         * outside the pool.  v3d_runner_release_buffer() recognises
         * such buffers by the same bucket lookup and destroys them. */
        return v3d_runner_create_buffer(r, size, out);
    }

    /* Hit: pop the head of this bucket's freelist. */
    struct v3d_pool_entry *head = r->pool_free[idx];
    if (head != NULL) {
        r->pool_free[idx] = head->next;
        *out = head->buf;
        free(head);
        return 0;
    }

    /* Miss: allocate at the full class size so that the buffer can
     * serve any later request mapping to the same bucket. */
    const size_t class_bytes = (size_t)1 << (idx + V3D_POOL_MIN_LOG2);
    const int status = v3d_runner_create_buffer(r, class_bytes, out);
    if (status != 0)
        return status;
    r->pool_total_bytes += class_bytes;
    return 0;
 }
 /* Return a buffer to the pool, or destroy it when it cannot be pooled.
 *
 * A buffer is parked on its bucket's freelist only when its size is
 * exactly that bucket's class size.  Anything else is destroyed
 * outright: oversize buffers, buffers whose size is not a class size
 * (for instance one obtained straight from v3d_runner_create_buffer()
 * with an arbitrary size), and the case where the freelist node
 * cannot be allocated.  Parking an odd-sized buffer would let a later
 * acquire receive fewer bytes than the size class promises.
 *
 * No-op for NULL arguments or a buffer with a null VkBuffer handle.
 * On every other path *buf is zeroed before returning, so the caller
 * cannot reuse the stale handles.
 */
 void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf)
 {
    int bucket;
    struct v3d_pool_entry *e = NULL;

    if (!r || !buf || buf->buffer == VK_NULL_HANDLE) return;

    bucket = v3d_pool_bucket_for(buf->size);
    if (bucket >= 0 &&
        buf->size == ((size_t)1 << (bucket + V3D_POOL_MIN_LOG2)))
        e = malloc(sizeof(*e));

    if (!e) {
        /* Not poolable (oversize / off-class size) or allocator
         * failure: destroy.  The pool degenerates to non-pooled
         * behaviour for this buffer but does not leak. */
        v3d_runner_destroy_buffer(r, buf);
        memset(buf, 0, sizeof(*buf));
        return;
    }

    /* Push onto the head of the bucket's freelist. */
    e->buf = *buf;
    e->next = r->pool_free[bucket];
    r->pool_free[bucket] = e;
    memset(buf, 0, sizeof(*buf));
 }
 /* Pool diagnostics: bytes allocated through the pool so far.
 * Returns 0 for a NULL runner. */
 size_t v3d_runner_pool_total_bytes(v3d_runner *r)
 {
    if (r == NULL)
        return 0;
    return r->pool_total_bytes;
 }
 /* Plain accessors for the runner's Vulkan handles.  None of them
 * checks `r`; passing NULL dereferences a null pointer. */
 VkDevice v3d_runner_device(v3d_runner *r)
 {
    return r->device;
 }
 VkQueue v3d_runner_queue(v3d_runner *r)
 {
    return r->queue;
 }
 uint32_t v3d_runner_queue_family(v3d_runner *r)
 {
    return r->queue_family;
 }
@@ -364,12 +486,27 @@ int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
        .pSetLayouts = &out->ds_layout,
    };
    CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set));
    /* Persistent command buffer — pool was created with
     * RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch
     * sites can call vkResetCommandBuffer on this same cb instead
     * of paying vkAllocateCommandBuffers per call. */
    VkCommandBufferAllocateInfo cbai = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = r->pool,
        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };
    CHK(vkAllocateCommandBuffers(r->device, &cbai, &out->cb));
    return 0;
 }
 void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
 {
    if (!p || p->pipeline == VK_NULL_HANDLE) return;
    if (p->cb != VK_NULL_HANDLE)
        vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb);
    vkDestroyPipeline(r->device, p->pipeline, NULL);
    vkDestroyPipelineLayout(r->device, p->layout, NULL);
    vkDestroyDescriptorPool(r->device, p->pool, NULL);  /* frees its set */
@@ -377,6 +514,13 @@ void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
    memset(p, 0, sizeof(*p));
 }
 /* Reset the pipeline's persistent command buffer so that it can be
 * recorded again.  `r` is accepted for API symmetry and not used.
 * Returns 0 on success, -1 when `p` is NULL, has no command buffer,
 * or vkResetCommandBuffer reports anything other than VK_SUCCESS. */
 int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p)
 {
    (void) r;
    if (p == NULL || p->cb == VK_NULL_HANDLE)
        return -1;
    if (vkResetCommandBuffer(p->cb, 0) != VK_SUCCESS)
        return -1;
    return 0;
 }
 int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
                            const v3d_buffer *bufs, uint32_t n)
 {
@@ -34,6 +34,12 @@ typedef struct {
    VkDescriptorSet        desc_set;
    uint32_t               n_ssbos;
    uint32_t               push_const_size;
    /* Persistent command buffer.  Allocated at create-pipeline time;
     * dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to
     * vkResetCommandBuffer instead of paying vkAllocateCommandBuffers
     * per dispatch.  Pool flagged RESET_COMMAND_BUFFER_BIT so reset
     * is permitted. */
    VkCommandBuffer        cb;
 } v3d_pipeline;
 /*
@@ -57,10 +63,43 @@ const char      *v3d_runner_device_name(v3d_runner *r);
 * host side. The mapping persists for the lifetime of the buffer.
 *
 * Returns 0 on success, non-zero on failure.
 *
 * NOTE: prefer v3d_runner_acquire_buffer() on the dispatch hot path —
 * create_buffer/destroy_buffer go straight to vkAllocateMemory each
 * call, which on V3D7's Mesa stack costs ~10-50us.  The acquire/
 * release pair pulls from a freelist and pays vkAllocateMemory only
 * on a cache miss.
 */
 int  v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
 void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
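 /*
 * Pooled buffer acquisition.  Returns a v3d_buffer whose .size is the
 * pool size class for the request: the smallest power-of-2 >= the
 * requested size, with a 256 B floor (so callers can pool across
 * similar-sized requests).  Requests above the largest class (8 MiB)
 * bypass the pool and are passed to v3d_runner_create_buffer() at the
 * requested size.  Backed by HOST_VISIBLE | HOST_COHERENT memory;
 * mapped pointer is valid.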
 *
 * On cache hit: zero-cost reuse of a previously-released buffer.
 * On miss: falls through to v3d_runner_create_buffer().  Release with
 * v3d_runner_release_buffer(); pool drains in v3d_runner_destroy().
 *
 * Lifetime contract: the returned buffer's .mapped contents are
 * UNINITIALISED — the previous user's data may still be present.
 * Callers that need a clean buffer must memset themselves.  This is
 * deliberate; the dispatch hot paths immediately overwrite the
 * buffer with new coefficients / meta anyway.
 *
 * Thread-safety: NOT thread-safe.  A daedalus_ctx is single-threaded
 * by API contract; the pool inherits that constraint.
 */
 int  v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
 void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf);
 /* Pool diagnostics: total allocated bytes (sum across all size
 * classes, including currently-released entries).  Useful for
 * watermark logging. */
 size_t v3d_runner_pool_total_bytes(v3d_runner *r);
 /* Compute pipeline from a SPIR-V file path. The descriptor-set
 * layout exposes `n_ssbos` storage buffer bindings at binding
 * indices 0..n_ssbos-1, all visible to the compute stage. A push
@@ -88,6 +127,12 @@ int  v3d_runner_bind_buffers(v3d_runner   *r,
 /* Allocate a primary command buffer from the runner's pool. */
 VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
 /* Reset @p->cb so it can be re-recorded.  Returns 0 on success.
 * Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path —
 * vkResetCommandBuffer is O(1) vs vkAllocateCommandBuffers' ~1-5us
 * driver cost. */
 int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p);
 /* Submit `cb` to the queue and wait for completion. The classic
 * timed operation. Returns 0 on success.
 */
@@ -0,0 +1,120 @@
 /*
 * bench_pool_overhead — measure QPU dispatch overhead with and without
 * the v3d_runner buffer pool warm.
 *
 * Times N consecutive daedalus_dispatch_vp9_idct8 (QPU substrate) calls and
 * prints the per-call distribution.  The first call pays
 * vkAllocateMemory (typically tens of microseconds on V3D7's Mesa);
 * the second and subsequent should hit the pool freelist and amortise
 * to the pure dispatch-floor cost.
 *
 * Purpose: provide a concrete before/after number for the QPU-default
 * substrate decree (2026-05-23).  Bench is non-gating and runs in
 * fractions of a second.
 *
 * License: BSD-2-Clause.
 */
 #define _POSIX_C_SOURCE 200809L
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <time.h>
 #include "../include/daedalus.h"
 extern size_t v3d_runner_pool_total_bytes(void *);  /* exposed if we wanted it */
 /* Current CLOCK_MONOTONIC_RAW reading, in seconds, as a double.
 * The return value of clock_gettime() is not checked. */
 static double now_seconds(void)
 {
 	struct timespec ts;
 	double whole, frac;

 	clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
 	whole = (double) ts.tv_sec;
 	frac  = (double) ts.tv_nsec * 1e-9;
 	return whole + frac;
 }
 /* qsort() comparator for doubles, ascending.  Returns -1, 0 or 1;
 * operands that compare neither less nor greater yield 0. */
 static int cmp_double(const void *a, const void *b)
 {
 	const double *lhs = a;
 	const double *rhs = b;

 	return (*lhs > *rhs) - (*lhs < *rhs);
 }
 /* Bench entry point.
 *
 * argv[1] (optional): number of dispatches to time; defaults to 200
 * and must be a positive integer.
 *
 * Exit status: 0 on success, 2 when the context reports no QPU,
 * 1 on any other failure (bad argument, allocation failure, context
 * creation failure, dispatch error).  All heap buffers and the
 * context are released on every path that gets past context creation.
 */
 int main(int argc, char **argv)
 {
 	int n_calls = argc > 1 ? atoi(argv[1]) : 200;
 	int n_blocks = 8;	/* one MB column of 8x8 IDCT blocks */
 	int stride = 64;
 	int status = 1;		/* flipped to 0 only after the full run */
 	int has_qpu;
 	int rc;
 	daedalus_ctx *ctx = NULL;
 	int16_t *coeffs = NULL;
 	uint8_t *dst = NULL;
 	daedalus_idct8_meta *meta = NULL;
 	double *t = NULL;
 	double *s_arr = NULL;

 	/* atoi() yields 0 for garbage; a non-positive count would also
 	 * turn into a huge size_t in the malloc below. */
 	if (n_calls <= 0) {
 		fprintf(stderr, "n_calls must be a positive integer\n");
 		return 1;
 	}

 	ctx = daedalus_ctx_create();
 	if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }

 	has_qpu = daedalus_ctx_has_qpu(ctx);
 	printf("ctx: has_qpu=%d\n", has_qpu);
 	if (!has_qpu) {
 		fprintf(stderr, "QPU not available on this device; bench needs V3D\n");
 		status = 2;
 		goto out;
 	}

 	/* Build a representative IDCT 8x8 batch and warm a dst buffer. */
 	coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t));
 	dst    = calloc((size_t) n_blocks * 8 * stride, 1);
 	meta   = calloc((size_t) n_blocks, sizeof(*meta));
 	t      = malloc((size_t) n_calls * sizeof(double));
 	if (!coeffs || !dst || !meta || !t) {
 		fprintf(stderr, "alloc fail\n");
 		goto out;
 	}

 	/* xorshift-style generator: deterministic pseudo-random
 	 * coefficients in [-0x400, 0x3ff]. */
 	uint64_t s = 0x1234567abcdefULL;
 	for (size_t i = 0; i < (size_t) n_blocks * 64; i++) {
 		s ^= s << 13; s ^= s >> 7; s ^= s << 17;
 		coeffs[i] = (int16_t)(s & 0x7ff) - 0x400;
 	}
 	/* Blocks sit side by side, 8 bytes apart, in the first rows of dst. */
 	for (int b = 0; b < n_blocks; b++) {
 		meta[b].dst_off = (uint32_t) b * 8;
 		meta[b].block_x = (uint32_t) b;
 		meta[b].block_y = 0;
 	}

 	printf("=== dispatching %d times, n_blocks=%d/call ===\n",
 	       n_calls, n_blocks);
 	for (int i = 0; i < n_calls; i++) {
 		double t0 = now_seconds();
 		rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU,
 						  dst, (size_t) stride,
 						  coeffs, (size_t) n_blocks, meta);
 		double t1 = now_seconds();
 		if (rc) { fprintf(stderr, "dispatch %d rc=%d\n", i, rc); goto out; }
 		t[i] = (t1 - t0) * 1e6;	/* us */
 	}

 	/* Per-call distribution (first few + sorted summary on the steady-state) */
 	printf("\nfirst 5 calls (cold-warm transition):\n");
 	for (int i = 0; i < 5 && i < n_calls; i++)
 		printf("  call %d:  %.2f us\n", i, t[i]);

 	int skip = 10;	/* drop warm-up calls from the steady-state stats */
 	if (n_calls > skip + 10) {
 		int n = n_calls - skip;
 		s_arr = malloc((size_t) n * sizeof(double));
 		if (!s_arr) { fprintf(stderr, "alloc fail\n"); goto out; }
 		memcpy(s_arr, t + skip, (size_t) n * sizeof(double));
 		qsort(s_arr, (size_t) n, sizeof(double), cmp_double);
 		double sum = 0;
 		for (int i = 0; i < n; i++) sum += s_arr[i];
 		printf("\nsteady-state stats (calls %d..%d, n=%d):\n",
 		       skip, n_calls - 1, n);
 		printf("  min:    %.2f us\n", s_arr[0]);
 		printf("  p50:    %.2f us\n", s_arr[n / 2]);
 		printf("  p90:    %.2f us\n", s_arr[(int)(n * 0.9)]);
 		printf("  p99:    %.2f us\n", s_arr[(int)(n * 0.99)]);
 		printf("  max:    %.2f us\n", s_arr[n - 1]);
 		printf("  mean:   %.2f us\n", sum / n);
 		printf("\nfirst-call / steady-state median ratio: %.1fx\n",
 		       t[0] / s_arr[n / 2]);
 	}
 	status = 0;

 out:
 	/* free(NULL) is a no-op, so partially-built state is fine here. */
 	free(s_arr); free(t); free(coeffs); free(dst); free(meta);
 	daedalus_ctx_destroy(ctx);
 	return status;
 }
@@ -0,0 +1,110 @@
 /*
 * Standalone bit-exact C reference for H.264 chroma loop filters
 * (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
 * when added).  Covers both orientations:
 *
 *   v_loop_filter_chroma: filter applied VERTICALLY across a
 *     HORIZONTAL edge.  Tile is 8 cols × 4 rows of context
 *     (rows -2..+1); pix points to row 0 of the bottom block.
 *   h_loop_filter_chroma: filter applied HORIZONTALLY across a
 *     VERTICAL edge.  Tile is 4 cols × 8 rows of context
 *     (cols -2..+1); pix points to col 0 of the right block.
 *
 * Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
 * `ff_h264_h_loop_filter_chroma_neon` (line 430) in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
 *
 * Algorithm per H.264 §8.7.2.4 (chroma bS<4 inter):
 *   - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
 *   - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
 *   - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
 *   - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
 *   - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
 *
 * tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge
 * (matches both 4:2:0 chroma plane geometry — 8 cols for V edge or
 * 8 rows for H edge).
 *
 * Signature (matches FFmpeg + the existing luma refs):
 *   void(uint8_t *pix, ptrdiff_t stride,
 *        int alpha, int beta, int8_t tc0[4]);
 *
 * License: LGPL-2.1-or-later (matches FFmpeg upstream).
 */
 #include <stdint.h>
 #include <stddef.h>
 /* Clamp v to the 8-bit sample range [0, 255]. */
 static inline int clip_u8(int v)
 {
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return v;
 }
 /* Clamp v to [lo, hi]; the lower bound is tested first. */
 static inline int clip3(int v, int lo, int hi)
 {
    if (v < lo) return lo;
    if (v > hi) return hi;
    return v;
 }
 /* Absolute value of x. */
 static inline int abs_i(int x)
 {
    return x >= 0 ? x : -x;
 }
 /* Chroma filter for a single column crossing a horizontal edge.
 * Samples are taken along the stride axis: p1 = pix[-2*stride],
 * p0 = pix[-stride], q0 = pix[0], q1 = pix[+stride].  Only p0 and q0
 * are ever written, and only when all three threshold tests pass. */
 static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
                                 int alpha, int beta, int tc0_s)
 {
    uint8_t *above = pix - stride;          /* address of p0 */
    const int p1 = pix[-2 * stride];
    const int p0 = *above;
    const int q0 = pix[0];
    const int q1 = pix[stride];

    const int filter_on = abs_i(p0 - q0) < alpha
                       && abs_i(p1 - p0) < beta
                       && abs_i(q1 - q0) < beta;
    if (!filter_on)
        return;

    const int limit = tc0_s + 1;
    const int raw   = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
    const int delta = clip3(raw, -limit, limit);
    *above = (uint8_t) clip_u8(p0 + delta);
    pix[0] = (uint8_t) clip_u8(q0 - delta);
 }
 /* Chroma filter for a single row crossing a vertical edge.
 * Samples are adjacent bytes: p1 = pix[-2], p0 = pix[-1],
 * q0 = pix[0], q1 = pix[+1].  Only p0 and q0 are ever written, and
 * only when all three threshold tests pass. */
 static void h264_chroma_cell_h(uint8_t *pix,
                                 int alpha, int beta, int tc0_s)
 {
    const int p1 = pix[-2];
    const int p0 = pix[-1];
    const int q0 = pix[0];
    const int q1 = pix[1];

    const int filter_on = abs_i(p0 - q0) < alpha
                       && abs_i(p1 - p0) < beta
                       && abs_i(q1 - q0) < beta;
    if (!filter_on)
        return;

    const int limit = tc0_s + 1;
    const int raw   = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
    const int delta = clip3(raw, -limit, limit);
    pix[-1] = (uint8_t) clip_u8(p0 + delta);
    pix[0]  = (uint8_t) clip_u8(q0 - delta);
 }
 /* Reference chroma filter across a horizontal edge (bS < 4).
 * Walks the 8 columns at pix[0..7]; columns 2s and 2s+1 use tc0[s].
 * Nothing is filtered when alpha or beta is 0, and any segment with a
 * negative tc0 is left untouched. */
 void daedalus_h264_v_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
 {
    if (alpha == 0 || beta == 0) return;
    /* Whole edge disabled when no segment has a non-negative tc0. */
    if (!(tc0[0] >= 0 || tc0[1] >= 0 || tc0[2] >= 0 || tc0[3] >= 0)) return;

    for (int col = 0; col < 8; col++) {
        const int strength = tc0[col >> 1];   /* two columns per segment */
        if (strength >= 0)
            h264_chroma_cell_v(pix + col, stride, alpha, beta, strength);
    }
 }
 /* Reference chroma filter across a vertical edge (bS < 4).
 * Walks the 8 rows starting at pix, `stride` bytes apart; rows 2s and
 * 2s+1 use tc0[s].  Nothing is filtered when alpha or beta is 0, and
 * any segment with a negative tc0 is left untouched. */
 void daedalus_h264_h_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
 {
    if (alpha == 0 || beta == 0) return;
    /* Whole edge disabled when no segment has a non-negative tc0. */
    if (!(tc0[0] >= 0 || tc0[1] >= 0 || tc0[2] >= 0 || tc0[3] >= 0)) return;

    for (int row = 0; row < 8; row++) {
        const int strength = tc0[row >> 1];   /* two rows per segment */
        if (strength >= 0)
            h264_chroma_cell_h(pix + row * stride, alpha, beta, strength);
    }
 }
@@ -0,0 +1,116 @@
 /*
 * Standalone bit-exact C reference for H.264 luma "horizontal"
 * loop filter (h_loop_filter_luma): applies filter HORIZONTALLY
 * across a VERTICAL edge. The edge spans the 16-row macroblock
 * height, between columns -1 and 0.
 *
 * Mirrors FFmpeg `ff_h264_h_loop_filter_luma_neon` in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S
 * line 134. Operates on an 8-col × 16-row region:
 *   pix[r*stride + c] for r in 0..15, c in -4..+3
 * With pix pointing to row 0, col 0 of the right block (= the
 * leftmost column of the bottom-/right-block half of the edge).
 *
 * 16 rows divided into 4 segments of 4 rows; each segment has its
 * own tc0 strength (tc0[0..3]).
 *
 * Note: FFmpeg's "h_loop_filter" naming uses the FILTER DIRECTION
 * (horizontal = across the edge from the left), not the edge
 * orientation (vertical). H.264 spec calls this the "vertical
 * edge" filter.
 *
 * This is the column-axis transpose of h264_v_loop_filter_luma_ref:
 *   - v variant: p3..p0 above the edge (pix[-4*stride..-1*stride]),
 *     q0..q3 below (pix[0..+3*stride]).  16 columns × 4 segments.
 *   - h variant: p3..p0 left of the edge (pix[-4..-1]),
 *     q0..q3 right (pix[0..+3]).            16 rows × 4 segments.
 * Same per-segment kernel; only the address arithmetic transposes.
 *
 * Signature:
 *   void(uint8_t *pix, ptrdiff_t stride,
 *        int alpha, int beta, int8_t tc0[4]);
 *
 * License: LGPL-2.1-or-later (matches FFmpeg upstream).
 */
 #include <stdint.h>
 #include <stddef.h>
 /* Clamp v to the 8-bit sample range [0, 255]. */
 static inline int clip_u8(int v)
 {
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return v;
 }
 /* Clamp v to [lo, hi]; the lower bound is tested first. */
 static inline int clip3(int v, int lo, int hi)
 {
    if (v < lo) return lo;
    if (v > hi) return hi;
    return v;
 }
 /* Absolute value of x. */
 static inline int abs_i(int x)
 {
    return x >= 0 ? x : -x;
 }
 /* Luma deblock (bS < 4) of one row crossing a vertical edge.
 * p3..p0 are the four bytes left of the edge (pix[-4..-1]), q0..q3
 * the four to the right (pix[0..+3]).  tc0_s is the segment strength;
 * the caller only passes values >= 0.
 *
 * When the three edge threshold tests pass, pix[-2], pix[-1], pix[0]
 * and pix[+1] (p1, p0, q0, q1) are stored back; p1 / q1 change only
 * if their side condition holds.  Otherwise nothing is written.
 */
 static void h264_deblock_luma_row(uint8_t *pix,
                                    int alpha, int beta, int tc0_s)
 {
    const int p3 = pix[-4], p2 = pix[-3], p1 = pix[-2], p0 = pix[-1];
    const int q0 = pix[ 0], q1 = pix[ 1], q2 = pix[ 2], q3 = pix[ 3];
    (void) p3; (void) q3;   /* outermost samples are unused for bS<4 */

    /* Edge thresholds: bail out unless all three hold. */
    if (abs_i(p0 - q0) >= alpha || abs_i(p1 - p0) >= beta ||
        abs_i(q1 - q0) >= beta)
        return;

    /* Rounded average of the two edge samples, shared by both the
     * p1 and q1 corrections. */
    const int mid = (p0 + q0 + 1) >> 1;
    int tc = tc0_s;          /* grows by one per side that qualifies */
    int new_p1 = p1;
    int new_q1 = q1;

    if (abs_i(p2 - p0) < beta) {
        tc++;
        new_p1 += clip3((p2 + mid - 2 * p1) >> 1, -tc0_s, tc0_s);
    }
    if (abs_i(q2 - q0) < beta) {
        tc++;
        new_q1 += clip3((q2 + mid - 2 * q1) >> 1, -tc0_s, tc0_s);
    }

    const int delta = clip3((4 * (q0 - p0) + (p1 - q1) + 4) >> 3, -tc, tc);

    pix[-2] = (uint8_t) new_p1;
    pix[-1] = (uint8_t) clip_u8(p0 + delta);
    pix[ 0] = (uint8_t) clip_u8(q0 - delta);
    pix[ 1] = (uint8_t) new_q1;
 }
 /* Reference luma filter across a vertical edge (bS < 4).
 * Walks the 16 rows starting at pix, `stride` bytes apart; rows
 * 4s..4s+3 use tc0[s].  Nothing is filtered when alpha or beta is 0,
 * and any segment with a negative tc0 is left untouched. */
 void daedalus_h264_h_loop_filter_luma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
 {
    /* Outer precondition: a zero alpha or beta disables the edge. */
    if (alpha == 0 || beta == 0) return;
    /* Likewise when no segment carries a non-negative tc0. */
    if (!(tc0[0] >= 0 || tc0[1] >= 0 || tc0[2] >= 0 || tc0[3] >= 0)) return;

    for (int row = 0; row < 16; row++) {
        const int strength = tc0[row >> 2];   /* four rows per segment */
        if (strength >= 0)
            h264_deblock_luma_row(pix + row * stride, alpha, beta, strength);
    }
 }
@@ -16,6 +16,12 @@
 extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
                                                     int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
                                                     int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -145,6 +151,133 @@ static int test_deblock(void)
    return diff == 0 ? 0 : 1;
 }
 /* Bit-exactness check of the luma H deblock dispatch against the C
 * reference.  Returns 0 when every byte matches, 1 otherwise (or when
 * the context cannot be created or the dispatch fails). */
 static int test_deblock_h(void)
 {
    /* Mirror of test_deblock but for the H variant.  Per-tile layout
     * is now 8 cols x 16 rows (one vertical edge between cols 3 and 4
     * of the tile); EDGE_COL = 4 puts dst_off at the leftmost output
     * column of the right block so the kernel's pix[-4..+3] read sits
     * inside the tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;
    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];
    /* Same random pixels in both the buffer under test and the reference. */
    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta  = (int)(xs() % 16) + 1;
        /* tc0 in {-1, 0..6}; -1 marks a segment that is skipped. */
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }
    for (int i = 0; i < N_EDGES; i++) {
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                              meta[i].alpha, meta[i].beta, tc0_local);
    }
    int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
                                                           N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_h dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }
    int diff = 0;
    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
    printf("  H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
 }
 /* Bit-exactness check of the chroma V deblock dispatch against the C
 * reference.  Returns 0 when every byte matches, 1 otherwise (or when
 * the context cannot be created or the dispatch fails). */
 static int test_deblock_chroma_v(void)
 {
    /* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
     * (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
           EDGE_OFF = EDGE_ROW * TILE_STRIDE };
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;
    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];
    /* Same random pixels in both the buffer under test and the reference. */
    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta  = (int)(xs() % 16) + 1;
        /* tc0 in {-1, 0..6}; -1 marks a segment that is skipped. */
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }
    for (int i = 0; i < N_EDGES; i++) {
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                                 meta[i].alpha, meta[i].beta, tc0_local);
    }
    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
                                                              N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }
    int diff = 0;
    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
    printf("  H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
 }
 /* Bit-exactness check of the chroma H deblock dispatch against the C
 * reference.  Returns 0 when every byte matches, 1 otherwise (or when
 * the context cannot be created or the dispatch fails). */
 static int test_deblock_chroma_h(void)
 {
    /* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
     * (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
    enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;
    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];
    /* Same random pixels in both the buffer under test and the reference. */
    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta  = (int)(xs() % 16) + 1;
        /* tc0 in {-1, 0..6}; -1 marks a segment that is skipped. */
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }
    for (int i = 0; i < N_EDGES; i++) {
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                                 meta[i].alpha, meta[i].beta, tc0_local);
    }
    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
                                                              N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }
    int diff = 0;
    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
    printf("  H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
 }
 static int test_qpel_mc20(void)
 {
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -197,10 +330,20 @@ int main(void)
    printf("  H264_QPEL_MC20 recipe substrate:  %d\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
    printf("  H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
    printf("  H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
    printf("  H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
    int fail = 0;
    fail |= test_idct4();
    fail |= test_idct8();
    fail |= test_deblock();
    fail |= test_deblock_h();
    fail |= test_deblock_chroma_v();
    fail |= test_deblock_chroma_h();
    fail |= test_qpel_mc20();
    return fail;
 }