h264: V3D shaders for all 15 avg_ qpel positions — qpel QPU complete

Generates 15 avg_ shader variants by templating from the existing put_ shaders. Each avg_ shader is identical to its put_ sibling except that the final write does an L2 (rounded) average with the existing dst:

    put_: dst[r,c] = result
    avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1

Per H.264 §8.4.2.3.1 (B-slice biprediction): the caller pre-loads dst with the list0 prediction; the avg_ call folds in list1.

Generated via Python (avg-shader-gen.py): the script reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, and writes v3d_h264_qpel_avg_mcXY.comp. ~88 lines each; 15 new shader files.

Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — same src envelope (10*stride+11 covers any (r±1, c±1) shift); the L2 step only touches dst. This slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03), but the cost is negligible. It eliminates the 15 wrappers + 15 src_max bound calculations that would otherwise be duplicated.

CMake foreach loops compile + install the 15 new SPV files. ctx grows 15 pipeline pairs. The recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers are re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replacing the CPU-only DEFINE_QPEL_DISPATCH instantiations).

Verified on hertz:

    $ ./build/test_api_h264 | grep "qpel avg" | wc -l
    15
    $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%"
    15

All 15 PASS 2048/2048 bytes bit-exact via QPU.
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels:

    Layer                  Coverage
    ─────────────────────────────────────────────────────────────
    IDCT 4x4 luma          ✓ cycle 6 (one QPU shader, also handles chroma)
    IDCT 8x8 luma          ✓ cycle 7
    Chroma DC Hadamard     CPU only (4 adds + 4 subs; not worth offloading)
    Deblock luma_v         ✓ cycle 8
    Deblock luma_h         ✓ PR #28
    Deblock chroma_v/h     ✓ PR #29
    Deblock *_intra        CPU only (less common, structurally different)
    qpel put_ 15 pos       ✓ cycle 9 (mc20) + PRs #30-#33
    qpel avg_ 15 pos       ✓ THIS PR

The H.264 non-intra-deblock hot path is now FULLY on QPU for any consumer that initialises daedalus with a QPU-capable context.
2026-05-25 20:22:33 +02:00
parent 55d3618408
commit 2079fe39c6
17 changed files with 1360 additions and 31 deletions
@@ -389,7 +389,24 @@ if (DAEDALUS_BUILD_VULKAN)
        set(H264_QPEL_${_mc}_SPV ${_spv})
    endforeach()

-    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV})
+    # avg_ biprediction variants: one SPIR-V build rule per qpel position.
+    # Each avg_ shader is its put_ sibling plus a final L2 (rounded average) with the existing dst.
+    foreach(_mc mc20 mc02 mc22 mc10 mc30 mc01 mc03
+                mc11 mc12 mc13 mc21 mc23 mc31 mc32 mc33)
+        set(_spv ${CMAKE_BINARY_DIR}/v3d_h264_qpel_avg_${_mc}.spv)  # compiled output path for this position
+        add_custom_command(
+            OUTPUT ${_spv}
+            COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
+                    -o ${_spv}
+                    ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp
+            DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp  # the .comp source is the only declared input
+            COMMENT "glslang: v3d_h264_qpel_avg_${_mc}.comp -> .spv"
+            VERBATIM
+        )
+        set(H264_QPEL_avg_${_mc}_SPV ${_spv})  # per-position variable referenced by daedalus_shaders and the install list
+    endforeach()
+
+    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV} ${H264_QPEL_avg_mc20_SPV} ${H264_QPEL_avg_mc02_SPV} ${H264_QPEL_avg_mc22_SPV} ${H264_QPEL_avg_mc10_SPV} ${H264_QPEL_avg_mc30_SPV} ${H264_QPEL_avg_mc01_SPV} ${H264_QPEL_avg_mc03_SPV} ${H264_QPEL_avg_mc11_SPV} ${H264_QPEL_avg_mc12_SPV} ${H264_QPEL_avg_mc13_SPV} ${H264_QPEL_avg_mc21_SPV} ${H264_QPEL_avg_mc23_SPV} ${H264_QPEL_avg_mc31_SPV} ${H264_QPEL_avg_mc32_SPV} ${H264_QPEL_avg_mc33_SPV})  # ALL: the put_ and avg_ SPVs are built by the default target

    # v3d_runner — reusable Vulkan plumbing.
    add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -542,6 +559,21 @@ if (DAEDALUS_BUILD_VULKAN)
        ${H264_QPEL_mc31_SPV}
        ${H264_QPEL_mc32_SPV}
        ${H264_QPEL_mc33_SPV}
+        ${H264_QPEL_avg_mc20_SPV}
+        ${H264_QPEL_avg_mc02_SPV}
+        ${H264_QPEL_avg_mc22_SPV}
+        ${H264_QPEL_avg_mc10_SPV}
+        ${H264_QPEL_avg_mc30_SPV}
+        ${H264_QPEL_avg_mc01_SPV}
+        ${H264_QPEL_avg_mc03_SPV}
+        ${H264_QPEL_avg_mc11_SPV}
+        ${H264_QPEL_avg_mc12_SPV}
+        ${H264_QPEL_avg_mc13_SPV}
+        ${H264_QPEL_avg_mc21_SPV}
+        ${H264_QPEL_avg_mc23_SPV}
+        ${H264_QPEL_avg_mc31_SPV}
+        ${H264_QPEL_avg_mc32_SPV}
+        ${H264_QPEL_avg_mc33_SPV}
        DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
    )
 endif()
@@ -72,6 +72,22 @@ struct daedalus_ctx {
    int           h264_qpel_mc31_pipe_ready; v3d_pipeline h264_qpel_mc31_pipe;
    int           h264_qpel_mc32_pipe_ready; v3d_pipeline h264_qpel_mc32_pipe;
    int           h264_qpel_mc33_pipe_ready; v3d_pipeline h264_qpel_mc33_pipe;
+    /* avg_ biprediction pipelines — same shaders + L2 with existing dst. */
+    int           h264_qpel_avg_mc20_pipe_ready; v3d_pipeline h264_qpel_avg_mc20_pipe;
+    int           h264_qpel_avg_mc02_pipe_ready; v3d_pipeline h264_qpel_avg_mc02_pipe;
+    int           h264_qpel_avg_mc22_pipe_ready; v3d_pipeline h264_qpel_avg_mc22_pipe;
+    int           h264_qpel_avg_mc10_pipe_ready; v3d_pipeline h264_qpel_avg_mc10_pipe;
+    int           h264_qpel_avg_mc30_pipe_ready; v3d_pipeline h264_qpel_avg_mc30_pipe;
+    int           h264_qpel_avg_mc01_pipe_ready; v3d_pipeline h264_qpel_avg_mc01_pipe;
+    int           h264_qpel_avg_mc03_pipe_ready; v3d_pipeline h264_qpel_avg_mc03_pipe;
+    int           h264_qpel_avg_mc11_pipe_ready; v3d_pipeline h264_qpel_avg_mc11_pipe;
+    int           h264_qpel_avg_mc12_pipe_ready; v3d_pipeline h264_qpel_avg_mc12_pipe;
+    int           h264_qpel_avg_mc13_pipe_ready; v3d_pipeline h264_qpel_avg_mc13_pipe;
+    int           h264_qpel_avg_mc21_pipe_ready; v3d_pipeline h264_qpel_avg_mc21_pipe;
+    int           h264_qpel_avg_mc23_pipe_ready; v3d_pipeline h264_qpel_avg_mc23_pipe;
+    int           h264_qpel_avg_mc31_pipe_ready; v3d_pipeline h264_qpel_avg_mc31_pipe;
+    int           h264_qpel_avg_mc32_pipe_ready; v3d_pipeline h264_qpel_avg_mc32_pipe;
+    int           h264_qpel_avg_mc33_pipe_ready; v3d_pipeline h264_qpel_avg_mc33_pipe;
 };

 daedalus_ctx *daedalus_ctx_create(void)
@@ -146,6 +162,21 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
        if (ctx->h264_qpel_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc31_pipe);
        if (ctx->h264_qpel_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc32_pipe);
        if (ctx->h264_qpel_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc33_pipe);
+        if (ctx->h264_qpel_avg_mc20_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc20_pipe);
+        if (ctx->h264_qpel_avg_mc02_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc02_pipe);
+        if (ctx->h264_qpel_avg_mc22_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc22_pipe);
+        if (ctx->h264_qpel_avg_mc10_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc10_pipe);
+        if (ctx->h264_qpel_avg_mc30_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc30_pipe);
+        if (ctx->h264_qpel_avg_mc01_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc01_pipe);
+        if (ctx->h264_qpel_avg_mc03_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc03_pipe);
+        if (ctx->h264_qpel_avg_mc11_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc11_pipe);
+        if (ctx->h264_qpel_avg_mc12_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc12_pipe);
+        if (ctx->h264_qpel_avg_mc13_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc13_pipe);
+        if (ctx->h264_qpel_avg_mc21_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc21_pipe);
+        if (ctx->h264_qpel_avg_mc23_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc23_pipe);
+        if (ctx->h264_qpel_avg_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc31_pipe);
+        if (ctx->h264_qpel_avg_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc32_pipe);
+        if (ctx->h264_qpel_avg_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc33_pipe);
        v3d_runner_destroy(ctx->runner);
    }
    free(ctx);
@@ -195,21 +226,21 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_QPEL_MC31:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc31.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC32:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc32.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC33:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc33.spv */
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU;	/* biprediction anchors */
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_CPU;	/* ¼-H L2 avg */
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_CPU;	/* diagonals avg */
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc20.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc02.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc22.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc10.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc30.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc01.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc03.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc11.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc12.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc13.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc21.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc23.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc31.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc32.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_avg_mc33.spv */
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -1837,6 +1868,26 @@ DEFINE_QPEL_DIAG_QPU(mc31)
 DEFINE_QPEL_DIAG_QPU(mc32)
 DEFINE_QPEL_DIAG_QPU(mc33)

+/* avg_ variants — same diag-style src envelope as put_ (10*stride+11
+ * covers any (r±1, c±1) offset the avg_ shaders read), different SPV
+ * file.  Slightly over-allocates for avg_mc20/02/10/30/01/03 (which
+ * need less src context) but the cost is negligible.
+ * NOTE(review): unlike put_, the avg_ shaders read dst before writing
+ * it, so the shared helper must make the caller's dst contents visible
+ * to the GPU — confirm against the macro/helper body (not shown here). */
+DEFINE_QPEL_DIAG_QPU(avg_mc20)
+DEFINE_QPEL_DIAG_QPU(avg_mc02)
+DEFINE_QPEL_DIAG_QPU(avg_mc22)
+DEFINE_QPEL_DIAG_QPU(avg_mc10)
+DEFINE_QPEL_DIAG_QPU(avg_mc30)
+DEFINE_QPEL_DIAG_QPU(avg_mc01)
+DEFINE_QPEL_DIAG_QPU(avg_mc03)
+DEFINE_QPEL_DIAG_QPU(avg_mc11)
+DEFINE_QPEL_DIAG_QPU(avg_mc12)
+DEFINE_QPEL_DIAG_QPU(avg_mc13)
+DEFINE_QPEL_DIAG_QPU(avg_mc21)
+DEFINE_QPEL_DIAG_QPU(avg_mc23)
+DEFINE_QPEL_DIAG_QPU(avg_mc31)
+DEFINE_QPEL_DIAG_QPU(avg_mc32)
+DEFINE_QPEL_DIAG_QPU(avg_mc33)
+
 #undef DEFINE_QPEL_DIAG_QPU

 /* -------------------- Public dispatch entry points -------------- */
@@ -2142,22 +2193,27 @@ DEFINE_QPEL_DIAG_PUBLIC(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
 DEFINE_QPEL_DIAG_PUBLIC(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
 DEFINE_QPEL_DIAG_PUBLIC(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
 DEFINE_QPEL_DIAG_PUBLIC(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
+
+/* avg_ biprediction dispatchers (15 positions) — same macro as the
+ * put_ diagonal positions above.  The underlying _qpu dispatch fns
+ * also reuse the diag QPU helper since the avg_ shaders share the
+ * put_ src envelope (the L2 step only touches dst).
+ * NOTE(review): the macro body is not visible in this hunk; presumably
+ * it selects the substrate from the recipe table for the given kernel
+ * id — confirm. */
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
+DEFINE_QPEL_DIAG_PUBLIC(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)
 #undef DEFINE_QPEL_DIAG_PUBLIC
-DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
-DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
-DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
-DEFINE_QPEL_DISPATCH(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
-DEFINE_QPEL_DISPATCH(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
-DEFINE_QPEL_DISPATCH(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
-DEFINE_QPEL_DISPATCH(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
-DEFINE_QPEL_DISPATCH(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
-DEFINE_QPEL_DISPATCH(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
-DEFINE_QPEL_DISPATCH(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
-DEFINE_QPEL_DISPATCH(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
-DEFINE_QPEL_DISPATCH(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
-DEFINE_QPEL_DISPATCH(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
-DEFINE_QPEL_DISPATCH(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
-DEFINE_QPEL_DISPATCH(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)

 #undef DEFINE_QPEL_DISPATCH

@@ -0,0 +1,52 @@
+// daedalus-fourier — H.264 luma qpel avg_mc01 (biprediction) (8x8, ¼-pel vertical),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 "d" position:
+//
+//   mc01_value[r,c] = ((clip255(mc02(s)[r,c]) + s[r,c] + 1) >> 1)
+//
+// Sibling of v3d_h264_qpel_mc02.comp with L2 step against src[r, c].
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc01_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+void main()   // avg_mc01: per-pixel vertical 6-tap half-pel, averaged with src[r,c], then averaged into the existing dst
+{
+    uint block_idx = gl_WorkGroupID.x;   // one workgroup handles one block
+    if (block_idx >= pc.n_blocks) return;   // ignore workgroups past the block count
+
+    uint lane = gl_LocalInvocationID.x;
+    uint r = lane >> 3, c = lane & 7u;   // 64 lanes map to an 8x8 grid: row = lane / 8, col = lane % 8
+
+    uint dst_off = u_meta.meta[block_idx].x;   // per-block byte offset into dst
+    uint src_off = u_meta.meta[block_idx].y;   // per-block byte offset into src
+    uint stride  = pc.stride_u8;   // the same stride indexes both src and dst
+    uint col_base = src_off + c;
+
+    int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]);   // r - 2u wraps for r < 2; the modular sum is still row r-2 if src_off leaves >= 2 rows of headroom (NOTE(review): caller contract — confirm)
+    int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]);
+    int s_0  = int(u_src.src[col_base +  r       * stride]);   // integer sample at (r, c), reused for the L2 step below
+    int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]);
+    int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]);
+    int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]);
+    int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps over rows r-2..r+3, plus rounding term
+    int vp = clamp(v >> 5, 0, 255);   // taps sum to 32: shift by 5, then clip to 8 bits
+
+    int avg = (vp + s_0 + 1) >> 1;    // mc01 value: rounded average of the half-pel sample and src[r, c]
+    uint final_off = dst_off + r * stride + c;
+    int prev = int(u_dst.dst[final_off]);   // existing dst pixel (per the header, the caller's pre-loaded list0 prediction)
+    u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1);   // avg_ step: rounded average of old dst and the new prediction
+}
@@ -0,0 +1,77 @@
+// daedalus-fourier — H.264 luma qpel avg_mc02 (biprediction) (8x8, vertical half-pel), V3D 7.1.
+//
+// Sibling of cycle 9's v3d_h264_qpel_mc20.comp.  Same 6-tap filter,
+// transposed to vertical direction:
+//
+//   mc02_value[r,c] = clip255(
+//       ( s[r-2,c]
+//         - 5 * s[r-1,c]
+//         + 20 * s[r,  c]
+//         + 20 * s[r+1,c]
+//         -  5 * s[r+2,c]
+//         +      s[r+3,c]
+//         + 16
+//       ) >> 5)
+//
+// src+src_off points at row 0 col 0 of the OUTPUT block; the filter
+// reads rows -2..+3 (2 rows of top context, 3 rows of bottom).
+//
+// Same WG layout as mc20: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc02_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;
+    uint stride_u8;
+    uint _pad0, _pad1;
+} pc;
+
+void main()   // avg_mc02: per-pixel vertical 6-tap half-pel, rounded-averaged into the existing dst
+{
+    uint block_idx = gl_WorkGroupID.x;   // one workgroup handles one block
+    if (block_idx >= pc.n_blocks) return;   // ignore workgroups past the block count
+
+    uint lane = gl_LocalInvocationID.x;
+    uint r = lane >> 3;   // row within the 8x8 block
+    uint c = lane & 7u;   // column within the 8x8 block
+
+    uint dst_off = u_meta.meta[block_idx].x;   // per-block byte offset into dst
+    uint src_off = u_meta.meta[block_idx].y;   // per-block byte offset into src
+    uint stride  = pc.stride_u8;   // the same stride indexes both src and dst
+
+    // Read the 6 rows of vertical context at column c for THIS output row.
+    // src_off+r*stride+c is the OUTPUT pixel position; the kernel samples
+    // rows r-2..r+3 along the column.  (r - 2u) wraps for r < 2, but the
+    // modular sum is correct because the public API contract guarantees src_off >= 2*stride.
+    uint col_base = src_off + c;
+
+    int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]);
+    int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]);
+    int s_0  = int(u_src.src[col_base +  r       * stride]);
+    int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]);
+    int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]);
+    int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]);
+
+    int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps plus rounding term
+    int p = clamp(v >> 5, 0, 255);   // mc02 value: taps sum to 32, so shift by 5 and clip to 8 bits
+
+    uint final_off = dst_off + r * stride + c;
+    int prev = int(u_dst.dst[final_off]);   // existing dst pixel (per the header, the caller's pre-loaded list0 prediction)
+    u_dst.dst[final_off] = uint8_t((prev + p + 1) >> 1);   // avg_ step: rounded average of old dst and the new prediction
+}
@@ -0,0 +1,52 @@
+// daedalus-fourier — H.264 luma qpel avg_mc03 (biprediction) (8x8, ¾-pel vertical),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 "n" position:
+//
+//   mc03_value[r,c] = ((clip255(mc02(s)[r,c]) + s[r+1, c] + 1) >> 1)
+//
+// Same as mc01 but L2-averages with src[r+1, c] instead of src[r, c].
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc03_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+void main()   // avg_mc03: per-pixel vertical 6-tap half-pel, averaged with src[r+1,c], then averaged into the existing dst
+{
+    uint block_idx = gl_WorkGroupID.x;   // one workgroup handles one block
+    if (block_idx >= pc.n_blocks) return;   // ignore workgroups past the block count
+
+    uint lane = gl_LocalInvocationID.x;
+    uint r = lane >> 3, c = lane & 7u;   // 64 lanes map to an 8x8 grid: row = lane / 8, col = lane % 8
+
+    uint dst_off = u_meta.meta[block_idx].x;   // per-block byte offset into dst
+    uint src_off = u_meta.meta[block_idx].y;   // per-block byte offset into src
+    uint stride  = pc.stride_u8;   // the same stride indexes both src and dst
+    uint col_base = src_off + c;
+
+    int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]);   // r - 2u wraps for r < 2; the modular sum is still row r-2 if src_off leaves >= 2 rows of headroom (NOTE(review): caller contract — confirm)
+    int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]);
+    int s_0  = int(u_src.src[col_base +  r       * stride]);
+    int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]);   // integer sample at (r+1, c), reused for the L2 step below
+    int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]);
+    int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]);
+    int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps over rows r-2..r+3, plus rounding term
+    int vp = clamp(v >> 5, 0, 255);   // taps sum to 32: shift by 5, then clip to 8 bits
+
+    int avg = (vp + s_p1 + 1) >> 1;   // mc03 value: rounded average of the half-pel sample and src[r+1, c]
+    uint final_off = dst_off + r * stride + c;
+    int prev = int(u_dst.dst[final_off]);   // existing dst pixel (per the header, the caller's pre-loaded list0 prediction)
+    u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1);   // avg_ step: rounded average of old dst and the new prediction
+}
@@ -0,0 +1,55 @@
+// daedalus-fourier — H.264 luma qpel avg_mc10 (biprediction) (8x8, ¼-pel horizontal),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 "a" position:
+//
+//   mc10_value[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c] + 1) >> 1)
+//
+// = horizontal half-pel filter, clipped to u8, then L2 rounded-averaged
+// with the integer source pixel at the SAME position.  Sibling of
+// v3d_h264_qpel_mc20.comp with the L2 step added at the tail.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc10_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+void main()   // avg_mc10: per-pixel horizontal 6-tap half-pel, averaged with src[r,c], then averaged into the existing dst
+{
+    uint block_idx = gl_WorkGroupID.x;   // one workgroup handles one block
+    if (block_idx >= pc.n_blocks) return;   // ignore workgroups past the block count
+
+    uint lane = gl_LocalInvocationID.x;
+    uint r = lane >> 3, c = lane & 7u;   // 64 lanes map to an 8x8 grid: row = lane / 8, col = lane % 8
+
+    uint dst_off = u_meta.meta[block_idx].x;   // per-block byte offset into dst
+    uint src_off = u_meta.meta[block_idx].y;   // per-block byte offset into src
+    uint stride  = pc.stride_u8;   // the same stride indexes both src and dst
+    uint row_base = src_off + r * stride + c;   // byte index of the integer sample at (r, c)
+
+    int s_m2 = int(u_src.src[row_base - 2u]);   // taps span columns c-2..c+3; needs >= 2 bytes before src_off (NOTE(review): caller contract — confirm)
+    int s_m1 = int(u_src.src[row_base - 1u]);
+    int s_0  = int(u_src.src[row_base       ]);   // integer sample at (r, c), reused for the L2 step below
+    int s_p1 = int(u_src.src[row_base + 1u]);
+    int s_p2 = int(u_src.src[row_base + 2u]);
+    int s_p3 = int(u_src.src[row_base + 3u]);
+    int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps plus rounding term
+    int hp = clamp(v >> 5, 0, 255);   // taps sum to 32: shift by 5, then clip to 8 bits
+
+    // L2 average with the integer source at the SAME (r, c) position: this is the mc10 value.
+    int avg = (hp + s_0 + 1) >> 1;
+    uint final_off = dst_off + r * stride + c;
+    int prev = int(u_dst.dst[final_off]);   // existing dst pixel (per the header, the caller's pre-loaded list0 prediction)
+    u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1);   // avg_ step: rounded average of old dst and the new prediction
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc11 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc11[r,c] = avg(mc20(r, c),
+//                     mc02(r, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc11_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {   // horizontal 6-tap half-pel at (r, c); returns a value clipped to 0..255
+    uint row_base = src_off + r * stride + c;   // byte index of the integer sample at (r, c)
+    int s_m2 = int(u_src.src[row_base - 2u]);   // taps span columns c-2..c+3 of row r
+    int s_m1 = int(u_src.src[row_base - 1u]);
+    int s_0  = int(u_src.src[row_base       ]);
+    int s_p1 = int(u_src.src[row_base + 1u]);
+    int s_p2 = int(u_src.src[row_base + 2u]);
+    int s_p3 = int(u_src.src[row_base + 3u]);
+    int v = s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps plus rounding term
+    return clamp(v >> 5, 0, 255);   // taps sum to 32: shift by 5, then clip to 8 bits
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {   // vertical 6-tap half-pel at (r, c); returns a value clipped to 0..255
+    uint col_base = src_off + c;
+    int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]);   // r - 2u wraps for r < 2; the modular sum is still row r-2 if src_off leaves >= 2 rows of headroom (NOTE(review): caller contract — confirm)
+    int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]);
+    int s_0  = int(u_src.src[col_base +  r       * stride]);
+    int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]);
+    int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]);
+    int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]);
+    int v = s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps over rows r-2..r+3, plus rounding term
+    return clamp(v >> 5, 0, 255);   // taps sum to 32: shift by 5, then clip to 8 bits
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {   // unrounded, unclipped horizontal 6-tap sum for row rr at column c
+    // Horizontal lowpass of a single row, returned as a raw int (no +16,
+    // no shift, NOT clipped) so hpel_hv can run its vertical pass on it.
+    uint row_base = src_off + rr * stride + c;   // rr may be a wrapped (r - 2u) style value; the modular sum still addresses the intended row
+    int s_m2 = int(u_src.src[row_base - 2u]);
+    int s_m1 = int(u_src.src[row_base - 1u]);
+    int s_0  = int(u_src.src[row_base       ]);
+    int s_p1 = int(u_src.src[row_base + 1u]);
+    int s_p2 = int(u_src.src[row_base + 2u]);
+    int s_p3 = int(u_src.src[row_base + 3u]);
+    return s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3;   // for 8-bit inputs the range is -2550..10710
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {   // 2-D (horizontal then vertical) half-pel at (r, c), clipped to 0..255; not called by this shader's main()
+    int t0 = hpel_hv_row(src_off, stride, r - 2u, c);   // horizontal sums for rows r-2..r+3
+    int t1 = hpel_hv_row(src_off, stride, r - 1u, c);
+    int t2 = hpel_hv_row(src_off, stride, r,       c);
+    int t3 = hpel_hv_row(src_off, stride, r + 1u, c);
+    int t4 = hpel_hv_row(src_off, stride, r + 2u, c);
+    int t5 = hpel_hv_row(src_off, stride, r + 3u, c);
+    int v = t0 - 5*t1 + 20*t2 + 20*t3 - 5*t4 + t5 + 512;   // same 6 taps applied vertically; 512 is the rounding term for the shift by 10
+    return clamp(v >> 10, 0, 255);   // two cascaded passes of gain 32 give 1024: shift by 10, then clip
+}
+
+void main()   // avg_mc11: per-pixel average of the horizontal and vertical half-pel values, then averaged into the existing dst
+{
+    uint block_idx = gl_WorkGroupID.x;   // one workgroup handles one block
+    if (block_idx >= pc.n_blocks) return;   // ignore workgroups past the block count
+
+    uint lane = gl_LocalInvocationID.x;
+    uint r = lane >> 3, c = lane & 7u;   // 64 lanes map to an 8x8 grid: row = lane / 8, col = lane % 8
+
+    uint dst_off = u_meta.meta[block_idx].x;   // per-block byte offset into dst
+    uint src_off = u_meta.meta[block_idx].y;   // per-block byte offset into src
+    uint stride  = pc.stride_u8;   // the same stride indexes both src and dst
+
+    int a = hpel_h(src_off, stride, r, c);   // mc20 anchor at (r, c)
+    int b = hpel_v(src_off, stride, r, c);   // mc02 anchor at (r, c)
+    int avg = (a + b + 1) >> 1;   // mc11 value: rounded average of the two anchors
+    uint final_off = dst_off + r * stride + c;
+    int prev = int(u_dst.dst[final_off]);   // existing dst pixel (per the header, the caller's pre-loaded list0 prediction)
+    u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1);   // avg_ step: rounded average of old dst and the new prediction
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc12 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc12[r,c] = avg(mc22(r, c),
+//                     mc02(r, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc12_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {   // horizontal 6-tap half-pel at (r, c), clipped to 0..255; not called by this shader's main()
+    uint row_base = src_off + r * stride + c;   // byte index of the integer sample at (r, c)
+    int s_m2 = int(u_src.src[row_base - 2u]);   // taps span columns c-2..c+3 of row r
+    int s_m1 = int(u_src.src[row_base - 1u]);
+    int s_0  = int(u_src.src[row_base       ]);
+    int s_p1 = int(u_src.src[row_base + 1u]);
+    int s_p2 = int(u_src.src[row_base + 2u]);
+    int s_p3 = int(u_src.src[row_base + 3u]);
+    int v = s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps plus rounding term
+    return clamp(v >> 5, 0, 255);   // taps sum to 32: shift by 5, then clip to 8 bits
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {   // vertical 6-tap half-pel at (r, c); returns a value clipped to 0..255
+    uint col_base = src_off + c;
+    int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]);   // r - 2u wraps for r < 2; the modular sum is still row r-2 if src_off leaves >= 2 rows of headroom (NOTE(review): caller contract — confirm)
+    int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]);
+    int s_0  = int(u_src.src[col_base +  r       * stride]);
+    int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]);
+    int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]);
+    int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]);
+    int v = s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3 + 16;   // (1, -5, 20, 20, -5, 1) taps over rows r-2..r+3, plus rounding term
+    return clamp(v >> 5, 0, 255);   // taps sum to 32: shift by 5, then clip to 8 bits
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {   // unrounded, unclipped horizontal 6-tap sum for row rr at column c
+    // Horizontal lowpass of a single row, returned as a raw int (no +16,
+    // no shift, NOT clipped) so hpel_hv can run its vertical pass on it.
+    uint row_base = src_off + rr * stride + c;   // rr may be a wrapped (r - 2u) style value; the modular sum still addresses the intended row
+    int s_m2 = int(u_src.src[row_base - 2u]);
+    int s_m1 = int(u_src.src[row_base - 1u]);
+    int s_0  = int(u_src.src[row_base       ]);
+    int s_p1 = int(u_src.src[row_base + 1u]);
+    int s_p2 = int(u_src.src[row_base + 2u]);
+    int s_p3 = int(u_src.src[row_base + 3u]);
+    return s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3;   // for 8-bit inputs the range is -2550..10710
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {   // 2-D (horizontal then vertical) half-pel at (r, c); returns a value clipped to 0..255
+    int t0 = hpel_hv_row(src_off, stride, r - 2u, c);   // horizontal sums for rows r-2..r+3
+    int t1 = hpel_hv_row(src_off, stride, r - 1u, c);
+    int t2 = hpel_hv_row(src_off, stride, r,       c);
+    int t3 = hpel_hv_row(src_off, stride, r + 1u, c);
+    int t4 = hpel_hv_row(src_off, stride, r + 2u, c);
+    int t5 = hpel_hv_row(src_off, stride, r + 3u, c);
+    int v = t0 - 5*t1 + 20*t2 + 20*t3 - 5*t4 + t5 + 512;   // same 6 taps applied vertically; 512 is the rounding term for the shift by 10
+    return clamp(v >> 10, 0, 255);   // two cascaded passes of gain 32 give 1024: shift by 10, then clip
+}
+
+void main()   // avg_mc12: per-pixel average of the 2-D and vertical half-pel values, then averaged into the existing dst
+{
+    uint block_idx = gl_WorkGroupID.x;   // one workgroup handles one block
+    if (block_idx >= pc.n_blocks) return;   // ignore workgroups past the block count
+
+    uint lane = gl_LocalInvocationID.x;
+    uint r = lane >> 3, c = lane & 7u;   // 64 lanes map to an 8x8 grid: row = lane / 8, col = lane % 8
+
+    uint dst_off = u_meta.meta[block_idx].x;   // per-block byte offset into dst
+    uint src_off = u_meta.meta[block_idx].y;   // per-block byte offset into src
+    uint stride  = pc.stride_u8;   // the same stride indexes both src and dst
+
+    int a = hpel_hv(src_off, stride, r, c);   // mc22 anchor at (r, c)
+    int b = hpel_v(src_off, stride, r, c);   // mc02 anchor at (r, c)
+    int avg = (a + b + 1) >> 1;   // mc12 value: rounded average of the two anchors
+    uint final_off = dst_off + r * stride + c;
+    int prev = int(u_dst.dst[final_off]);   // existing dst pixel (per the header, the caller's pre-loaded list0 prediction)
+    u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1);   // avg_ step: rounded average of old dst and the new prediction
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc13 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc13[r,c] = avg(mc20(r+1, c),
+//                     mc02(r, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc13_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;   // filter input bytes
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // each lane reads, averages and rewrites one byte
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;   // per block: .x = dst offset, .y = src offset
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;   // _p0/_p1 are never read by main()
+
+// Clipped horizontal half-pel sample: 6-tap (1,-5,20,20,-5,1) over columns
+// c-2..c+3 of row r, rounded with +16, scaled by >>5, clamped to 0..255.
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;
+    // The kernel is symmetric, so fold the taps into three mirrored pairs.
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int inner = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Clipped vertical half-pel sample: the 6-tap (1,-5,20,20,-5,1) kernel run
+// down column c over rows r-2..r+3, +16 rounding, >>5 scaling, 0..255 clamp.
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;   // tap 0; neighbours step by stride
+    // Mirrored taps share a weight, so sum each pair before scaling it.
+    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
+    int mid   = int(u_src.src[centre -      stride]) + int(u_src.src[centre + 2u * stride]);
+    int inner = int(u_src.src[centre              ]) + int(u_src.src[centre +      stride]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Raw horizontal 6-tap sum (1,-5,20,20,-5,1) for row rr at column c.
+// No rounding, shift or clip is applied here: hpel_hv() consumes the
+// full-precision intermediate and scales once after its vertical pass.
+// NOTE(review): dead in this shader — main() never calls hpel_hv().
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    uint centre = src_off + rr * stride + c;
+    // Accumulate the mirrored tap pairs from the heaviest weight outwards.
+    int acc = 20 * (int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]));
+    acc -= 5 * (int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]));
+    acc += int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    return acc;
+}
+
+// 2-D half-pel sample: vertical 6-tap over six raw hpel_hv_row() sums, with
+// +512 rounding and >>10 scaling covering both passes, clamped to 0..255.
+// NOTE(review): main() in this shader never calls hpel_hv().
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int mid   = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int inner = hpel_hv_row(src_off, stride, r,      c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int acc = outer - 5 * mid + 20 * inner + 512;
+    return clamp(acc >> 10, 0, 255);
+}
+
+// Entry point for avg_mc13: one workgroup per 8x8 block, one invocation per
+// output pixel.  The quarter-pel sample is the rounded mean of two half-pel
+// anchors; it is then rounded together with the byte already stored in dst.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint r = gl_LocalInvocationID.x >> 3;
+    uint c = gl_LocalInvocationID.x & 7u;
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+
+    int horiz = hpel_h(m.y, pitch, r + 1u, c);   // horizontal anchor, one row down
+    int vert  = hpel_v(m.y, pitch, r, c);        // vertical anchor
+    uint out_idx = m.x + r * pitch + c;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + ((horiz + vert + 1) >> 1) + 1) >> 1);
+}
@@ -0,0 +1,91 @@
+// daedalus-fourier — H.264 luma qpel avg_mc20 (biprediction) (8x8, horizontal half-pel), V3D 7.1.
+//
+// H.264 spec §8.4.2.2.1 horizontal 6-tap luma interpolation:
+//
+//   mc20_value[r,c] = clip255(
+//       ( s[r,c-2]
+//         - 5 * s[r,c-1]
+//         + 20 * s[r,c]
+//         + 20 * s[r,c+1]
+//         -  5 * s[r,c+2]
+//         +      s[r,c+3]
+//         + 16
+//       ) >> 5)
+//
+// Single-stride: dst and src share `stride` (H264QpelContext
+// convention).  src+src_off already points at the leftmost output
+// column (col 0); the filter reads cols -2..+3.  Caller guarantees
+// edge-padding context per the public API docstring.
+//
+// Workgroup layout: 64 invocations = 1 lane per output pixel.
+// 1 block per WG; n_blocks WGs total.  This is the simplest layout
+// that avoids any inter-lane communication — each lane independently
+// reads its 6 src samples, then reads and rewrites its 1 dst sample.
+// V3D's L2 cache handles the redundant reads from adjacent lanes.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc20_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+
+layout(binding = 0) readonly buffer Src {
+    uint8_t src[];      // filter input bytes
+} u_src;
+
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];      // each lane reads, averages and rewrites one byte
+} u_dst;
+
+layout(binding = 2) readonly buffer Meta {
+    uvec4 meta[];       // .x = dst_off, .y = src_off; .z/.w are never read by main()
+} u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;      // workgroups whose index is at or above this exit early
+    uint stride_u8;     // row pitch in bytes, applied to both src and dst
+    uint _pad0, _pad1;  // never read by main()
+} pc;
+
+// Entry point for avg_mc20: one workgroup per 8x8 block, one invocation per
+// output pixel.  Each lane filters its own horizontal half-pel sample and
+// rounds it together with the byte already present in dst.  Lanes never
+// exchange data, so no shared memory or barrier is involved.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint row = gl_LocalInvocationID.x >> 3;   // 0..7
+    uint col = gl_LocalInvocationID.x & 7u;   // 0..7
+
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+
+    // Centre tap of the 6-tap window (columns col-2..col+3 of this row).
+    // The index arithmetic is unsigned, so the window is only valid when
+    // src holds at least two bytes before the block's first column.
+    uint centre = m.y + row * pitch + col;
+
+    // Symmetric kernel (1,-5,20,20,-5,1): fold the taps into mirrored pairs.
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int inner = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    // +16 rounds, >>5 removes the filter gain of 32, clamp yields a pixel.
+    int hpel = clamp((outer - 5 * mid + 20 * inner + 16) >> 5, 0, 255);
+
+    // avg_ step: rounded mean of the existing dst byte and the new sample.
+    uint out_idx = m.x + row * pitch + col;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + hpel + 1) >> 1);
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc21 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc21[r,c] = avg(mc22(r, c),
+//                     mc20(r, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc21_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;   // filter input bytes
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // each lane reads, averages and rewrites one byte
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;   // per block: .x = dst offset, .y = src offset
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;   // _p0/_p1 are never read by main()
+
+// Clipped horizontal half-pel sample: 6-tap (1,-5,20,20,-5,1) over columns
+// c-2..c+3 of row r, rounded with +16, scaled by >>5, clamped to 0..255.
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;
+    // The kernel is symmetric, so fold the taps into three mirrored pairs.
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int inner = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Clipped vertical half-pel sample: the 6-tap (1,-5,20,20,-5,1) kernel run
+// down column c over rows r-2..r+3, +16 rounding, >>5 scaling, 0..255 clamp.
+// NOTE(review): main() in this shader never calls hpel_v().
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;   // tap 0; neighbours step by stride
+    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
+    int mid   = int(u_src.src[centre -      stride]) + int(u_src.src[centre + 2u * stride]);
+    int inner = int(u_src.src[centre              ]) + int(u_src.src[centre +      stride]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Raw horizontal 6-tap sum (1,-5,20,20,-5,1) for row rr at column c.
+// No rounding, shift or clip is applied here: hpel_hv() consumes the
+// full-precision intermediate and scales once after its vertical pass.
+// With 8-bit samples the result lies in [-2550, 10710].
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    uint centre = src_off + rr * stride + c;
+    // Accumulate the mirrored tap pairs from the heaviest weight outwards.
+    int acc = 20 * (int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]));
+    acc -= 5 * (int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]));
+    acc += int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    return acc;
+}
+
+// 2-D half-pel sample: vertical 6-tap over six raw hpel_hv_row() sums, with
+// +512 rounding and >>10 scaling covering both passes, clamped to 0..255.
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // Rows r-2..r+3, paired by their shared tap weight (1, -5, 20).
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int mid   = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int inner = hpel_hv_row(src_off, stride, r,      c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int acc = outer - 5 * mid + 20 * inner + 512;
+    return clamp(acc >> 10, 0, 255);
+}
+
+// Entry point for avg_mc21: one workgroup per 8x8 block, one invocation per
+// output pixel.  The quarter-pel sample is the rounded mean of two half-pel
+// anchors; it is then rounded together with the byte already stored in dst.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint r = gl_LocalInvocationID.x >> 3;
+    uint c = gl_LocalInvocationID.x & 7u;
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+
+    int both  = hpel_hv(m.y, pitch, r, c);       // 2-D half-pel anchor
+    int horiz = hpel_h(m.y, pitch, r, c);        // horizontal anchor, same row
+    uint out_idx = m.x + r * pitch + c;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + ((both + horiz + 1) >> 1) + 1) >> 1);
+}
@@ -0,0 +1,94 @@
+// daedalus-fourier — H.264 luma qpel avg_mc22 (biprediction) (8x8, 2D half-pel "j" position).
+// V3D 7.1.
+//
+// Cascaded H+V 6-tap per H.264 §8.4.2.2.1 / FFmpeg ff_put_h264_qpel8_mc22_neon:
+//
+//   tmp[r,c] = src[r,c-2] - 5*src[r,c-1] + 20*src[r,c] + 20*src[r,c+1]
+//              - 5*src[r,c+2] + src[r,c+3]                    (int16)
+//
+//   mc22_value[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
+//                              + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
+//                              + 512) >> 10)
+//
+// The +512 >> 10 final scale compensates for both 6-tap scalings.
+// CANNOT just cascade mc20→mc02 because intermediate must be int16
+// (no per-stage clip), so this is a dedicated kernel.
+//
+// Per-lane structure: each lane computes its own (r, c) output by
+// running the FULL cascade — 6 horizontal lowpass int16 values for
+// rows r-2..r+3, then a vertical lowpass on those.  ~50 ALU ops per
+// lane.  No shared memory / barriers needed; V3D L2 absorbs the
+// redundant src reads across lanes.
+//
+// WG layout: 64 lanes / 1 block-per-WG / 1 lane-per-output-pixel
+// (same as mc20 / mc02).
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc22_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;   // filter input bytes
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // each lane reads, averages and rewrites one byte
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;   // per block: .x = dst offset, .y = src offset
+
+layout(push_constant) uniform PC {
+    uint n_blocks;      // workgroups whose index is at or above this exit early
+    uint stride_u8;     // row pitch in bytes, applied to both src and dst
+    uint _pad0, _pad1;  // never read by main()
+} pc;
+
+// Raw horizontal 6-tap sum (1,-5,20,20,-5,1) centred on column c of the row
+// that starts at row_off.  Nothing is rounded, shifted or clipped here; the
+// caller applies the combined +512 >> 10 after its vertical pass.
+// With 8-bit samples the result lies in [-2550, 10710].
+int hpel_h(uint row_off, uint c)
+{
+    uint centre = row_off + c;
+    // Accumulate the mirrored tap pairs from the heaviest weight outwards.
+    int acc = 20 * (int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]));
+    acc -= 5 * (int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]));
+    acc += int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    return acc;
+}
+
+// Entry point for avg_mc22: one workgroup per 8x8 block, one invocation per
+// output pixel.  Each lane runs the whole H-then-V cascade for its pixel and
+// rounds the result together with the byte already present in dst.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint row = gl_LocalInvocationID.x >> 3;
+    uint col = gl_LocalInvocationID.x & 7u;
+
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+
+    // Raw horizontal sums for rows row-2..row+3, paired by shared tap weight.
+    // Offsets are computed in uint (wrapping); the smallest index touched is
+    // m.y - 2*pitch - 2, so the src offset must be at least 2*pitch + 2.
+    uint here = m.y + row * pitch;
+    int outer = hpel_h(here - 2u * pitch, col) + hpel_h(here + 3u * pitch, col);
+    int mid   = hpel_h(here -      pitch, col) + hpel_h(here + 2u * pitch, col);
+    int inner = hpel_h(here,              col) + hpel_h(here +      pitch, col);
+
+    // +512 rounds and >>10 removes the gain of both 6-tap passes (32 * 32).
+    int hv = clamp((outer - 5 * mid + 20 * inner + 512) >> 10, 0, 255);
+
+    // avg_ step: rounded mean of the existing dst byte and the new sample.
+    uint out_idx = m.x + row * pitch + col;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + hv + 1) >> 1);
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc23 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc23[r,c] = avg(mc22(r, c),
+//                     mc20(r+1, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc23_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;   // filter input bytes
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // each lane reads, averages and rewrites one byte
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;   // per block: .x = dst offset, .y = src offset
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;   // _p0/_p1 are never read by main()
+
+// Clipped horizontal half-pel sample: 6-tap (1,-5,20,20,-5,1) over columns
+// c-2..c+3 of row r, rounded with +16, scaled by >>5, clamped to 0..255.
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;
+    // The kernel is symmetric, so fold the taps into three mirrored pairs.
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int inner = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Clipped vertical half-pel sample: the 6-tap (1,-5,20,20,-5,1) kernel run
+// down column c over rows r-2..r+3, +16 rounding, >>5 scaling, 0..255 clamp.
+// NOTE(review): main() in this shader never calls hpel_v().
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;   // tap 0; neighbours step by stride
+    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
+    int mid   = int(u_src.src[centre -      stride]) + int(u_src.src[centre + 2u * stride]);
+    int inner = int(u_src.src[centre              ]) + int(u_src.src[centre +      stride]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Raw horizontal 6-tap sum (1,-5,20,20,-5,1) for row rr at column c.
+// No rounding, shift or clip is applied here: hpel_hv() consumes the
+// full-precision intermediate and scales once after its vertical pass.
+// With 8-bit samples the result lies in [-2550, 10710].
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    uint centre = src_off + rr * stride + c;
+    // Accumulate the mirrored tap pairs from the heaviest weight outwards.
+    int acc = 20 * (int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]));
+    acc -= 5 * (int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]));
+    acc += int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    return acc;
+}
+
+// 2-D half-pel sample: vertical 6-tap over six raw hpel_hv_row() sums, with
+// +512 rounding and >>10 scaling covering both passes, clamped to 0..255.
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // Rows r-2..r+3, paired by their shared tap weight (1, -5, 20).
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int mid   = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int inner = hpel_hv_row(src_off, stride, r,      c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int acc = outer - 5 * mid + 20 * inner + 512;
+    return clamp(acc >> 10, 0, 255);
+}
+
+// Entry point for avg_mc23: one workgroup per 8x8 block, one invocation per
+// output pixel.  The quarter-pel sample is the rounded mean of two half-pel
+// anchors; it is then rounded together with the byte already stored in dst.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint r = gl_LocalInvocationID.x >> 3;
+    uint c = gl_LocalInvocationID.x & 7u;
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+
+    int both  = hpel_hv(m.y, pitch, r, c);       // 2-D half-pel anchor
+    int horiz = hpel_h(m.y, pitch, r + 1u, c);   // horizontal anchor, one row down
+    uint out_idx = m.x + r * pitch + c;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + ((both + horiz + 1) >> 1) + 1) >> 1);
+}
@@ -0,0 +1,52 @@
+// daedalus-fourier — H.264 luma qpel avg_mc30 (biprediction) (8x8, ¾-pel horizontal),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 "c" position:
+//
+//   mc30_value[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c+1] + 1) >> 1)
+//
+// Same as mc10 but L2-averages with src[r, c+1] instead of src[r, c].
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc30_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;   // filter input bytes
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // each lane reads, averages and rewrites one byte
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;   // per block: .x = dst offset, .y = src offset
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;   // _p0/_p1 are never read by main()
+
+// Entry point for avg_mc30: one workgroup per 8x8 block, one invocation per
+// output pixel.  Each lane averages the horizontal half-pel sample with the
+// integer pixel to its right, then rounds that with the byte already in dst.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint row = gl_LocalInvocationID.x >> 3;
+    uint col = gl_LocalInvocationID.x & 7u;
+
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+    uint centre = m.y + row * pitch + col;
+
+    // Symmetric kernel (1,-5,20,20,-5,1): fold the taps into mirrored pairs.
+    int right = int(u_src.src[centre + 1u]);   // src[r, c+1], reused below
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int inner = int(u_src.src[centre     ]) + right;
+    int hpel  = clamp((outer - 5 * mid + 20 * inner + 16) >> 5, 0, 255);
+    int qpel = (hpel + right + 1) >> 1;        // three-quarter-pel sample
+    uint out_idx = m.x + row * pitch + col;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + qpel + 1) >> 1);
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc31 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc31[r,c] = avg(mc20(r, c),
+//                     mc02(r, c+1))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc31_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;   // filter input bytes
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // each lane reads, averages and rewrites one byte
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;   // per block: .x = dst offset, .y = src offset
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;   // _p0/_p1 are never read by main()
+
+// Clipped horizontal half-pel sample: 6-tap (1,-5,20,20,-5,1) over columns
+// c-2..c+3 of row r, rounded with +16, scaled by >>5, clamped to 0..255.
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;
+    // The kernel is symmetric, so fold the taps into three mirrored pairs.
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int inner = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Clipped vertical half-pel sample: the 6-tap (1,-5,20,20,-5,1) kernel run
+// down column c over rows r-2..r+3, +16 rounding, >>5 scaling, 0..255 clamp.
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;   // tap 0; neighbours step by stride
+    // Mirrored taps share a weight, so sum each pair before scaling it.
+    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
+    int mid   = int(u_src.src[centre -      stride]) + int(u_src.src[centre + 2u * stride]);
+    int inner = int(u_src.src[centre              ]) + int(u_src.src[centre +      stride]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Raw horizontal 6-tap sum (1,-5,20,20,-5,1) for row rr at column c.
+// No rounding, shift or clip is applied here: hpel_hv() consumes the
+// full-precision intermediate and scales once after its vertical pass.
+// NOTE(review): dead in this shader — main() never calls hpel_hv().
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    uint centre = src_off + rr * stride + c;
+    // Accumulate the mirrored tap pairs from the heaviest weight outwards.
+    int acc = 20 * (int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]));
+    acc -= 5 * (int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]));
+    acc += int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    return acc;
+}
+
+// 2-D half-pel sample: vertical 6-tap over six raw hpel_hv_row() sums, with
+// +512 rounding and >>10 scaling covering both passes, clamped to 0..255.
+// NOTE(review): main() in this shader never calls hpel_hv().
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int mid   = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int inner = hpel_hv_row(src_off, stride, r,      c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int acc = outer - 5 * mid + 20 * inner + 512;
+    return clamp(acc >> 10, 0, 255);
+}
+
+// Entry point for avg_mc31: one workgroup per 8x8 block, one invocation per
+// output pixel.  The quarter-pel sample is the rounded mean of two half-pel
+// anchors; it is then rounded together with the byte already stored in dst.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint r = gl_LocalInvocationID.x >> 3;
+    uint c = gl_LocalInvocationID.x & 7u;
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+
+    int horiz = hpel_h(m.y, pitch, r, c);        // horizontal anchor, same row
+    int vert  = hpel_v(m.y, pitch, r, c + 1u);   // vertical anchor, one column right
+    uint out_idx = m.x + r * pitch + c;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + ((horiz + vert + 1) >> 1) + 1) >> 1);
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc32 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc32[r,c] = avg(mc22(r, c),
+//                     mc02(r, c+1))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc32_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;   // 64 lanes; main() maps them to an 8x8 grid
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;   // filter input bytes
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // each lane reads, averages and rewrites one byte
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;   // per block: .x = dst offset, .y = src offset
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;   // _p0/_p1 are never read by main()
+
+// Clipped horizontal half-pel sample: 6-tap (1,-5,20,20,-5,1) over columns
+// c-2..c+3 of row r, rounded with +16, scaled by >>5, clamped to 0..255.
+// NOTE(review): main() in this shader never calls hpel_h().
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int inner = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Clipped vertical half-pel sample: the 6-tap (1,-5,20,20,-5,1) kernel run
+// down column c over rows r-2..r+3, +16 rounding, >>5 scaling, 0..255 clamp.
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    uint centre = src_off + r * stride + c;   // tap 0; neighbours step by stride
+    // Mirrored taps share a weight, so sum each pair before scaling it.
+    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
+    int mid   = int(u_src.src[centre -      stride]) + int(u_src.src[centre + 2u * stride]);
+    int inner = int(u_src.src[centre              ]) + int(u_src.src[centre +      stride]);
+    int acc = outer - 5 * mid + 20 * inner + 16;
+    return clamp(acc >> 5, 0, 255);
+}
+
+// Raw horizontal 6-tap sum (1,-5,20,20,-5,1) for row rr at column c.
+// No rounding, shift or clip is applied here: hpel_hv() consumes the
+// full-precision intermediate and scales once after its vertical pass.
+// With 8-bit samples the result lies in [-2550, 10710].
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    uint centre = src_off + rr * stride + c;
+    // Accumulate the mirrored tap pairs from the heaviest weight outwards.
+    int acc = 20 * (int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]));
+    acc -= 5 * (int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]));
+    acc += int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    return acc;
+}
+
+// 2-D half-pel sample: vertical 6-tap over six raw hpel_hv_row() sums, with
+// +512 rounding and >>10 scaling covering both passes, clamped to 0..255.
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // Rows r-2..r+3, paired by their shared tap weight (1, -5, 20).
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int mid   = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int inner = hpel_hv_row(src_off, stride, r,      c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int acc = outer - 5 * mid + 20 * inner + 512;
+    return clamp(acc >> 10, 0, 255);
+}
+
+// Entry point for avg_mc32: one workgroup per 8x8 block, one invocation per
+// output pixel.  The quarter-pel sample is the rounded mean of two half-pel
+// anchors; it is then rounded together with the byte already stored in dst.
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;   // surplus workgroups do nothing
+    // Lane index -> (row, column) inside the 8x8 block.
+    uint r = gl_LocalInvocationID.x >> 3;
+    uint c = gl_LocalInvocationID.x & 7u;
+    uvec4 m = u_meta.meta[blk];       // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;        // one stride serves both src and dst
+
+    int both = hpel_hv(m.y, pitch, r, c);        // 2-D half-pel anchor
+    int vert = hpel_v(m.y, pitch, r, c + 1u);    // vertical anchor, one column right
+    uint out_idx = m.x + r * pitch + c;
+    int prior = int(u_dst.dst[out_idx]);
+    u_dst.dst[out_idx] = uint8_t((prior + ((both + vert + 1) >> 1) + 1) >> 1);
+}
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc33 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.  Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc33[r,c] = avg(mc20(r+1, c),
+//                   mc02(r, c+1))
+//
+// Per-lane structure: each lane computes BOTH anchor values for its
+// own (r, c) output pixel, then L2 averages.  No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc33_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;  // main() maps lane -> (lane >> 3, lane & 7)
+layout(binding = 0) readonly buffer Src  { uint8_t src[]; } u_src;  // filter input samples, read-only
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;  // read-modify-write: main() averages into the stored value
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;  // per block: .x = dst index, .y = src index; .z/.w unread here
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;  // stride_u8 steps rows of src and dst; _p0/_p1 unread here
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Horizontal half-pel: 6-tap (1,-5,20,20,-5,1) over columns c-2..c+3 of row r.
+    uint x0 = src_off + r * stride + c;
+    int outer = int(u_src.src[x0 - 2u]) + int(u_src.src[x0 + 3u]);
+    int inner = int(u_src.src[x0 - 1u]) + int(u_src.src[x0 + 2u]);
+    int mid   = int(u_src.src[x0]) + int(u_src.src[x0 + 1u]);
+    int acc   = outer - 5 * inner + 20 * mid;
+    // Round (+16, >>5), then saturate to the 8-bit sample range.
+    int rounded = (acc + 16) >> 5;
+    return clamp(rounded, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Vertical half-pel: 6-tap (1,-5,20,20,-5,1) over rows r-2..r+3 of column c.
+    uint y0 = src_off + c + r * stride;
+    int outer = int(u_src.src[y0 - 2u * stride]) + int(u_src.src[y0 + 3u * stride]);
+    int inner = int(u_src.src[y0 - stride]) + int(u_src.src[y0 + 2u * stride]);
+    int mid   = int(u_src.src[y0]) + int(u_src.src[y0 + stride]);
+    int acc   = outer - 5 * inner + 20 * mid;
+    // Round (+16, >>5), then saturate to the 8-bit sample range.
+    int rounded = (acc + 16) >> 5;
+    return clamp(rounded, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // 6-tap (1,-5,20,20,-5,1) horizontal sum over columns c-2..c+3 of row rr.
+    // Returned raw (no +16, no >>5, no clamp); its only caller is hpel_hv().
+    uint x0 = src_off + rr * stride + c;
+    int far_l  = int(u_src.src[x0 - 2u]);
+    int near_l = int(u_src.src[x0 - 1u]);
+    int mid_l  = int(u_src.src[x0]);
+    int mid_r  = int(u_src.src[x0 + 1u]);
+    int near_r = int(u_src.src[x0 + 2u]);
+    int far_r  = int(u_src.src[x0 + 3u]);
+    return (far_l + far_r) - 5 * (near_l + near_r) + 20 * (mid_l + mid_r);
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // Vertical 6-tap over raw row sums r-2..r+3; round (+512, >>10), clamp to 0..255.
+    // NOTE(review): main() in this file never calls hpel_hv(), so it is unused here.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
+              + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
+              + hpel_hv_row(src_off, stride, r + 2u, c);
+    int mid   = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    return clamp((outer - 5 * inner + 20 * mid + 512) >> 10, 0, 255);
+}
+
+void main()
+{
+    // gl_WorkGroupID.x selects the block; indices past n_blocks do no work.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) return;
+    uvec4 m = u_meta.meta[blk];          // .x = dst[] index, .y = src[] index of the block
+    uint pitch = pc.stride_u8;           // row pitch used for both src and dst
+
+    // Row-major lane layout: lane = 8*row + col.
+    uint row = gl_LocalInvocationID.x >> 3;
+    uint col = gl_LocalInvocationID.x & 7u;
+
+    // mc33 value: rounded mean of hpel_h at (row+1, col) and hpel_v at (row, col+1).
+    int pred = (hpel_h(m.y, pitch, row + 1u, col) + hpel_v(m.y, pitch, row, col + 1u) + 1) >> 1;
+
+    // avg_ step: rounded mean with the sample already stored in dst.
+    uint out_idx = m.x + row * pitch + col;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}