Merge pull request 'QPU is default substrate: recipe table + ctx env-var override' (#7 ) from noether/qpu-default-recipe-cycles-5-8 into main

Reviewed-on: #7
cycle 7: V3D shader for H.264 IDCT 8x8
2026-05-23 18:59:34 +00:00 · 2026-05-23 20:09:25 +02:00 · 2026-05-23 20:06:20 +02:00 · 2026-05-23 19:59:53 +02:00
4 changed files with 570 additions and 11 deletions
@@ -284,7 +284,29 @@ if (DAEDALUS_BUILD_VULKAN)
        VERBATIM
    )
-    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV})
+    set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
    # Build rule for the H.264 IDCT 4x4 compute shader: runs the tool in
    # GLSLANG_VALIDATOR on src/v3d_h264_idct4.comp (Vulkan 1.3 target
    # environment) and writes the SPIR-V to ${H264_IDCT4_SPV}.  The rule
    # re-runs whenever the .comp source changes.
    add_custom_command(
        OUTPUT ${H264_IDCT4_SPV}
        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
                -o ${H264_IDCT4_SPV}
                ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
        COMMENT "glslang: v3d_h264_idct4.comp -> v3d_h264_idct4.spv"
        VERBATIM
    )
    # Build rule for the H.264 IDCT 8x8 compute shader: the SPIR-V output
    # lands in the top-level binary dir and is regenerated from
    # src/v3d_h264_idct8.comp (Vulkan 1.3 target environment) whenever
    # that source changes.
    set(H264_IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct8.spv)
    add_custom_command(
        OUTPUT ${H264_IDCT8_SPV}
        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
                -o ${H264_IDCT8_SPV}
                ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
        COMMENT "glslang: v3d_h264_idct8.comp -> v3d_h264_idct8.spv"
        VERBATIM
    )
    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV})
    # v3d_runner — reusable Vulkan plumbing.
    add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -412,6 +434,8 @@ if (DAEDALUS_BUILD_VULKAN)
        ${LPF8_SPV}
        ${CDEF_SPV}
        ${H264DEBLOCK_SPV}
        ${H264_IDCT4_SPV}
        ${H264_IDCT8_SPV}
        DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
    )
 endif()
@@ -40,6 +40,10 @@ struct daedalus_ctx {
    v3d_pipeline  cdef_pipe;
    int           h264deblock_pipe_ready;
    v3d_pipeline  h264deblock_pipe;
    int           h264_idct4_pipe_ready;
    v3d_pipeline  h264_idct4_pipe;
    int           h264_idct8_pipe_ready;
    v3d_pipeline  h264_idct8_pipe;
 };
 daedalus_ctx *daedalus_ctx_create(void)
@@ -53,6 +57,25 @@ daedalus_ctx *daedalus_ctx_create(void)
 daedalus_ctx *daedalus_ctx_create_no_qpu(void)
 {
    /*
     * Per the "QPU is default substrate" decree 2026-05-23:
     * setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
     * function to a full daedalus_ctx_create(), letting the libavcodec
     * substitution shims (which call create_no_qpu via pthread_once)
     * fire the V3D shaders that exist for cycles 1-8.  Without
     * this hook each consumer process (firefox, mpv, daemon) would
     * need its own shim build to opt into QPU.
     *
     * Default behaviour (env var unset / not "1") is unchanged: pure
     * NEON ctx, no implicit Vulkan init.  Firefox / mpv consumers
     * that dlopen libavcodec without opting in stay on the
     * Vulkan-free path; the daemon explicitly sets
     * DAEDALUS_FORCE_QPU=1 before loading libavcodec.
     */
    const char *force = getenv("DAEDALUS_FORCE_QPU");
    if (force && force[0] == '1' && force[1] == 0)
        return daedalus_ctx_create();
    daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
    if (!ctx) return NULL;
    ctx->has_qpu = 0;
@@ -75,6 +98,8 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
        if (ctx->mc8h_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
        if (ctx->cdef_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
        if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
        if (ctx->h264_idct4_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
        if (ctx->h264_idct8_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
        v3d_runner_destroy(ctx->runner);
    }
    free(ctx);
@@ -84,16 +109,25 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
 {
    /*
     * Recipe table per the "QPU is default substrate" decree
     * 2026-05-23.  Any kernel that has a V3D compute shader returns
     * SUBSTRATE_QPU; CPU is the fallback for kernels without a
     * shader (currently only H.264 qpel mc20 — covered by
     * follow-on task 165).  The dispatch
     * wrappers already fall back to CPU automatically when the
     * ctx doesn't have QPU available (daedalus_ctx_has_qpu == 0).
     */
    switch (k) {
    case DAEDALUS_KERNEL_VP9_IDCT8:        return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:   return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_VP9_MC_8H:        return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_VP9_MC_8H:        return DAEDALUS_SUBSTRATE_QPU;	/* v3d_mc_8h.spv */
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:   return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_QPU;	/* v3d_cdef.spv */
-    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct4.spv */
-    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct8.spv */
-    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264deblock.spv */
-    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU;	/* TODO task #165 */
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -715,6 +749,187 @@ fail:
    return -1;
 }
 /* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
 /* Push constants for v3d_h264_idct4.spv: block count, dst row stride in
  * bytes, and two padding words (16 bytes total). */
 typedef struct {
    uint32_t n_blocks;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
 } h264_idct4_pc;
 /*
  * Run the H.264 4x4 inverse transform + add on the QPU.
  *
  *   ctx         context whose runner owns the (lazily created) pipeline
  *   dst         base of the destination pixels; updated in place
  *   dst_stride  distance in bytes between consecutive dst rows
  *   coeffs      n_blocks * 16 int16 coefficients; zeroed on success
  *   n_blocks    number of 4x4 blocks to process
  *   meta        per-block metadata; only dst_off (byte offset into dst)
  *               is read here
  *
  * Returns 0 on success and -1 on failure.  Every failure path is taken
  * before the results are copied back, so dst and coeffs are left
  * unmodified on failure.
  */
 static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
    /* No work: succeed without touching Vulkan.  This also avoids asking
     * the runner for zero-sized buffers. */
    if (n_blocks == 0)
        return 0;
    /* Both values travel to the shader as 32-bit push constants; refuse
     * inputs that would be silently truncated. */
    if ((uint64_t) n_blocks > UINT32_MAX || (uint64_t) dst_stride > UINT32_MAX)
        return -1;
    if (!ctx->h264_idct4_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
                                       3, sizeof(h264_idct4_pc),
                                       &ctx->h264_idct4_pipe) != 0)
            return -1;
        ctx->h264_idct4_pipe_ready = 1;
    }
    size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);    /* uvec4 per block */
    /* dst_max = one past the last dst byte any block can touch
     * (last row starts at 3 * stride, 4 pixels wide). */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
        if (e > dst_max) dst_max = e;
    }
    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max,     &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes,  &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
    }
    /* From here on all three buffers exist; every exit goes through `out`. */
    int ret = -1;
    memcpy(bc.mapped, coeffs, coeff_bytes);
    /* The shader adds the residual to the existing pixels, so the current
     * dst contents must be uploaded first. */
    memcpy(bd.mapped, dst,    dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }
    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
        goto out;
    uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16);   /* 16 blocks/WG */
    h264_idct4_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto out;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_idct4_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_idct4_pipe.layout, 0, 1,
                            &ctx->h264_idct4_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported by vkEndCommandBuffer; do not submit
     * a command buffer that failed to record. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out;
    memcpy(dst, bd.mapped, dst_max);
    /* H.264/FFmpeg convention: zero the coeffs block after the
     * transform (matches the C ref + NEON .S behaviour). */
    memset(coeffs, 0, coeff_bytes);
    ret = 0;
 out:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return ret;
 }
 /* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
 /* Push constants for v3d_h264_idct8.spv: block count, dst row stride in
  * bytes, and two padding words (16 bytes total). */
 typedef struct {
    uint32_t n_blocks;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
 } h264_idct8_pc;
 /*
  * Run the H.264 8x8 inverse transform + add on the QPU.
  *
  *   ctx         context whose runner owns the (lazily created) pipeline
  *   dst         base of the destination pixels; updated in place
  *   dst_stride  distance in bytes between consecutive dst rows
  *   coeffs      n_blocks * 64 int16 coefficients; zeroed on success
  *   n_blocks    number of 8x8 blocks to process
  *   meta        per-block metadata; only dst_off (byte offset into dst)
  *               is read here
  *
  * Returns 0 on success and -1 on failure.  Every failure path is taken
  * before the results are copied back, so dst and coeffs are left
  * unmodified on failure.
  */
 static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
    /* No work: succeed without touching Vulkan.  This also avoids asking
     * the runner for zero-sized buffers. */
    if (n_blocks == 0)
        return 0;
    /* Both values travel to the shader as 32-bit push constants; refuse
     * inputs that would be silently truncated. */
    if ((uint64_t) n_blocks > UINT32_MAX || (uint64_t) dst_stride > UINT32_MAX)
        return -1;
    if (!ctx->h264_idct8_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
                                       3, sizeof(h264_idct8_pc),
                                       &ctx->h264_idct8_pipe) != 0)
            return -1;
        ctx->h264_idct8_pipe_ready = 1;
    }
    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);    /* uvec4 per block */
    /* dst_max = one past the last dst byte any block can touch
     * (last row starts at 7 * stride, 8 pixels wide). */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
        if (e > dst_max) dst_max = e;
    }
    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max,     &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes,  &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
    }
    /* From here on all three buffers exist; every exit goes through `out`. */
    int ret = -1;
    memcpy(bc.mapped, coeffs, coeff_bytes);
    /* The shader adds the residual to the existing pixels, so the current
     * dst contents must be uploaded first. */
    memcpy(bd.mapped, dst,    dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }
    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
        goto out;
    uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8);   /* 8 blocks/WG */
    h264_idct8_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto out;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_idct8_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_idct8_pipe.layout, 0, 1,
                            &ctx->h264_idct8_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported by vkEndCommandBuffer; do not submit
     * a command buffer that failed to record. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out;
    memcpy(dst, bd.mapped, dst_max);
    /* Same convention as the 4x4 path: coefficients are cleared once the
     * transform has been applied. */
    memset(coeffs, 0, coeff_bytes);
    ret = 0;
 out:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return ret;
 }
 /* -------------------- Public dispatch entry points -------------- */
 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                 \
@@ -803,8 +1018,16 @@ int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
+    daedalus_substrate eff = sub;
-                   dst, dst_stride, coeffs, n_blocks, meta);
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4);
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
                                       coeffs, n_blocks, meta);
    return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
                                   coeffs, n_blocks, meta);
 }
 int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -812,8 +1035,16 @@ int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
+    daedalus_substrate eff = sub;
-                   dst, dst_stride, coeffs, n_blocks, meta);
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
                                       coeffs, n_blocks, meta);
    return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
                                   coeffs, n_blocks, meta);
 }
 int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -0,0 +1,129 @@
 // daedalus-fourier — H.264 4x4 inverse integer transform + add, V3D 7.1.
 //
 // H.264 spec §8.5.12.1.  Pure integer arithmetic — no trig constants
 // (unlike VP9 IDCT 8x8).  Row pass first, column pass second; round
 // (+32) >> 6, add to dst, clip to u8.
 //
 // Block memory layout: COLUMN-MAJOR.  block[c*4 + r] = coefficient at
 // (row r, column c).  Matches FFmpeg `ff_h264_idct_add_neon`.
 //
 // Workgroup layout: 64 invocations = 4 lanes/block × 16 blocks/WG.
 //   - row pass: lane k (0..3) reads row k of the block (4 coefficients,
 //               one from each column), runs the butterfly, writes 4
 //               outputs to one row of tmp_shared.
 //   - column pass: lane k reads column k of tmp_shared (4 rows),
 //                  runs the butterfly, writes 4 outputs to dst as
 //                  column k at rows 0..3.
 //
 // shared = 16 × 16 × 4 B = 1 KiB.  Well under V3D's 16 KiB limit.
 //
 // License: BSD-2-Clause.
 #version 450
 #extension GL_EXT_shader_8bit_storage             : require
 #extension GL_EXT_shader_16bit_storage            : require
 #extension GL_EXT_shader_explicit_arithmetic_types : require
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 layout(binding = 0) readonly buffer Coeffs {
    int16_t coeffs[];   // N × 16 column-major
 } u_coeffs;
 layout(binding = 1) buffer Dst {
    uint8_t dst[];      // H × stride bytes (caller-provided base)
 } u_dst;
 layout(binding = 2) readonly buffer Meta {
    uvec4 meta[];       // .x = dst_off (byte offset into u_dst.dst)
 } u_meta;
 layout(push_constant) uniform PC {
    uint n_blocks;
    uint dst_stride_u8;
    uint _pad0, _pad1;
 } pc;
 // 16 blocks per WG × 16 ints per block = 256 ints = 1 KiB shared.
 shared int tmp_shared[16 * 16];
 // One-dimensional 4-point inverse transform (H.264 §8.5.12.1).
 // Takes four inputs d0..d3 and produces four outputs o0..o3; the
 // caller applies rounding and the final shift.
 void idct4_1d(int d0, int d1, int d2, int d3,
               out int o0, out int o1, out int o2, out int o3)
 {
    // Even half: sum and difference of inputs 0 and 2.
    int even_sum  = d0 + d2;
    int even_diff = d0 - d2;
    // Odd half: inputs 1 and 3, each combined with the other halved.
    int odd_minor = (d1 >> 1) - d3;
    int odd_major = d1 + (d3 >> 1);
    // Outer pair (0, 3) uses the sum; inner pair (1, 2) the difference.
    o0 = even_sum  + odd_major;
    o3 = even_sum  - odd_major;
    o1 = even_diff + odd_minor;
    o2 = even_diff - odd_minor;
 }
 void main()
 {
    // A workgroup is 64 invocations: 16 blocks with 4 lanes each.
    uint gid       = gl_GlobalInvocationID.x;
    uint group     = gid >> 6;
    uint local     = gid % 64u;
    uint blk_in_wg = local / 4u;                  // 0..15
    uint lane      = local % 4u;                  // 0..3
    uint blk       = group * 16u + blk_in_wg;
    bool active    = blk < pc.n_blocks;
    uint scratch   = blk_in_wg * 16u;             // this block's slice of tmp_shared
    int  v[4];
    int  w[4];
    // ---- Row pass --------------------------------------------------
    // Each lane gathers one row: the element at index `lane` of every
    // column (columns are 4 coefficients apart in the block).
    if (active) {
        uint src = blk * 16u + lane;
        for (uint c = 0u; c < 4u; ++c)
            v[c] = int(u_coeffs.coeffs[src + c * 4u]);
        idct4_1d(v[0], v[1], v[2], v[3], w[0], w[1], w[2], w[3]);
        // Store the transformed row contiguously in the scratch tile.
        uint row = scratch + lane * 4u;
        for (uint c = 0u; c < 4u; ++c)
            tmp_shared[row + c] = w[c];
    }
    // Every invocation reaches the barrier, including inactive ones.
    barrier();
    // ---- Column pass ----------------------------------------------
    // Each lane now walks down column `lane` of the scratch tile.
    if (active) {
        for (uint r = 0u; r < 4u; ++r)
            v[r] = tmp_shared[scratch + r * 4u + lane];
        idct4_1d(v[0], v[1], v[2], v[3], w[0], w[1], w[2], w[3]);
        // Destination column: meta.x is the block's byte offset in dst.
        uint pitch = pc.dst_stride_u8;
        uint first = u_meta.meta[blk].x + lane;
        uint addr[4];
        int  px[4];
        // Read all four pixels before writing any of them.
        for (uint r = 0u; r < 4u; ++r) {
            addr[r] = first + r * pitch;
            px[r]   = int(u_dst.dst[addr[r]]);
        }
        // Round, scale, add to the prediction and saturate to 8 bits.
        for (uint r = 0u; r < 4u; ++r)
            u_dst.dst[addr[r]] = uint8_t(clamp(px[r] + ((w[r] + 32) >> 6), 0, 255));
    }
 }
@@ -0,0 +1,175 @@
 // daedalus-fourier — H.264 8x8 inverse integer transform + add, V3D 7.1.
 //
 // H.264 spec §8.5.13.2 (High profile 8x8 IT).  Pure integer arithmetic
 // — different butterfly from VP9 IDCT 8x8 (cycle 1, uses cospi
 // multipliers).  Row pass first, column pass second; round (+32) >> 6,
 // add to dst, clip to u8.
 //
 // Block layout: COLUMN-MAJOR.  block[c*8 + r] = coefficient at
 // (row r, column c).  Matches FFmpeg `ff_h264_idct8_add_neon`.
 //
 // Workgroup layout: 64 invocations = 8 lanes/block × 8 blocks/WG.
 //   - row pass: lane k (0..7) reads row k of the block (8 coefficients,
 //               one from each column), runs the butterfly, writes 8
 //               outputs to one row of tmp_shared.
 //   - column pass: lane k reads column k of tmp_shared (8 rows),
 //                  runs the butterfly, writes 8 outputs to dst as
 //                  column k at rows 0..7.
 //
 // shared = 8 × 64 × 4 B = 2 KiB.  Well under V3D's 16 KiB limit.
 //
 // License: BSD-2-Clause.
 #version 450
 #extension GL_EXT_shader_8bit_storage             : require
 #extension GL_EXT_shader_16bit_storage            : require
 #extension GL_EXT_shader_explicit_arithmetic_types : require
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 layout(binding = 0) readonly buffer Coeffs {
    int16_t coeffs[];   // N × 64 column-major
 } u_coeffs;
 layout(binding = 1) buffer Dst {
    uint8_t dst[];      // H × stride bytes
 } u_dst;
 layout(binding = 2) readonly buffer Meta {
    uvec4 meta[];       // .x = dst_off
 } u_meta;
 layout(push_constant) uniform PC {
    uint n_blocks;
    uint dst_stride_u8;
    uint _pad0, _pad1;
 } pc;
 // 8 blocks/WG × 64 ints/block × 4 B = 2 KiB shared.
 shared int tmp_shared[8 * 64];
 // 1D 8-element butterfly per H.264 §8.5.13.2.
 // Inputs d0..d7, outputs g0..g7.  Pure integer adds, subtracts and
 // arithmetic right shifts; no rounding is applied here (the caller
 // rounds and shifts after the second pass).
 void idct8_1d(int d0, int d1, int d2, int d3,
               int d4, int d5, int d6, int d7,
               out int g0, out int g1, out int g2, out int g3,
               out int g4, out int g5, out int g6, out int g7)
 {
    // Stage 1: even-indexed e* come from inputs 0/2/4/6, odd-indexed e*
    // from inputs 1/3/5/7.
    int e0 = d0 + d4;
    int e1 = -d3 + d5 - d7 - (d7 >> 1);
    int e2 = d0 - d4;
    int e3 = d1 + d7 - d3 - (d3 >> 1);
    int e4 = (d2 >> 1) - d6;
    int e5 = -d1 + d7 + d5 + (d5 >> 1);
    int e6 = d2 + (d6 >> 1);
    int e7 = d3 + d5 + d1 + (d1 >> 1);
    // Stage 2: combine within the even group (f0/f2/f4/f6) and within
    // the odd group (f1/f3/f5/f7, with quarter-scaled cross terms).
    int f0 = e0 + e6;
    int f1 = e1 + (e7 >> 2);
    int f2 = e2 + e4;
    int f3 = e3 + (e5 >> 2);
    int f4 = e2 - e4;
    int f5 = (e3 >> 2) - e5;
    int f6 = e0 - e6;
    int f7 = e7 - (e1 >> 2);
    // Stage 3: outputs i and 7-i are the sum and difference of the same
    // even/odd pair.
    g0 = f0 + f7;
    g1 = f2 + f5;
    g2 = f4 + f3;
    g3 = f6 + f1;
    g4 = f6 - f1;
    g5 = f4 - f3;
    g6 = f2 - f5;
    g7 = f0 - f7;
 }
 void main()
 {
    // A workgroup is 64 invocations: 8 blocks with 8 lanes each.
    uint gid       = gl_GlobalInvocationID.x;
    uint group     = gid >> 6;
    uint local     = gid % 64u;
    uint blk_in_wg = local / 8u;                  // 0..7
    uint lane      = local % 8u;                  // 0..7
    uint blk       = group * 8u + blk_in_wg;
    bool active    = blk < pc.n_blocks;
    uint scratch   = blk_in_wg * 64u;             // this block's slice of tmp_shared
    int  v[8];
    int  w[8];
    // ---- Row pass --------------------------------------------------
    // Each lane gathers one row: the element at index `lane` of every
    // column (columns are 8 coefficients apart in the block).
    if (active) {
        uint src = blk * 64u + lane;
        for (uint c = 0u; c < 8u; ++c)
            v[c] = int(u_coeffs.coeffs[src + c * 8u]);
        idct8_1d(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
                 w[0], w[1], w[2], w[3], w[4], w[5], w[6], w[7]);
        // Store the transformed row contiguously in the scratch tile.
        uint row = scratch + lane * 8u;
        for (uint c = 0u; c < 8u; ++c)
            tmp_shared[row + c] = w[c];
    }
    // Every invocation reaches the barrier, including inactive ones.
    barrier();
    // ---- Column pass ----------------------------------------------
    // Each lane now walks down column `lane` of the scratch tile.
    if (active) {
        for (uint r = 0u; r < 8u; ++r)
            v[r] = tmp_shared[scratch + r * 8u + lane];
        idct8_1d(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
                 w[0], w[1], w[2], w[3], w[4], w[5], w[6], w[7]);
        // Destination column: meta.x is the block's offset in dst.
        uint pitch = pc.dst_stride_u8;
        uint first = u_meta.meta[blk].x + lane;
        uint addr[8];
        int  px[8];
        // Read all eight pixels before writing any of them.
        for (uint r = 0u; r < 8u; ++r) {
            addr[r] = first + r * pitch;
            px[r]   = int(u_dst.dst[addr[r]]);
        }
        // Round, scale, add to the prediction and saturate to 8 bits.
        for (uint r = 0u; r < 8u; ++r)
            u_dst.dst[addr[r]] = uint8_t(clamp(px[r] + ((w[r] + 32) >> 6), 0, 255));
    }
 }
Author	SHA1	Message	Date
marfrit	a092ee34aa	Merge pull request 'QPU is default substrate: recipe table + ctx env-var override' (#7 ) from noether/qpu-default-recipe-cycles-5-8 into main Reviewed-on: #7	2026-05-23 18:59:34 +00:00
claude-noether	74687d9def	cycle 7: V3D shader for H.264 IDCT 8x8 Mirrors cycle 6 (PR #7 prior commit) but at 8x8 scale: 8 lanes per block, 8 blocks per WG. H.264 §8.5.13.2 1D butterfly twice (row pass, column pass), (val + 32) >> 6 rounded + clipped + added to dst. Bit-exact first try on hertz (Pi 5, V3D 7.1): H264_IDCT4 recipe substrate: 2 (QPU) H264_IDCT8 recipe substrate: 2 (QPU) ← flipped H264_DEBLOCK_LV recipe substrate: 2 (QPU) H264_QPEL_MC20 recipe substrate: 1 (CPU) ← task #165 remaining H.264 IDCT 4x4: 2048/2048 bytes bit-exact H.264 IDCT 8x8: 2048/2048 bytes bit-exact ← QPU H.264 deblock luma v: 2048/2048 bytes bit-exact H.264 qpel mc20: 1024/1024 bytes bit-exact 8 of 9 daedalus-fourier cycles now QPU-by-recipe. Only cycle 9 (H.264 luma qpel mc20) still CPU — different shape (6-tap MC filter, not a transform) so needs its own shader template; task #165 covers it as a follow-on. Same pattern as cycle 6 commit (`65bd5c3`): adds h264_idct8_pipe field + lazy init, dispatch_h264_idct8_qpu() with 3 SSBOs, v3d_h264_idct8.spv install rule. Uses v3d_runner_create_buffer / destroy_buffer (will swap to pool API once PR #6 lands).	2026-05-23 20:09:25 +02:00
claude-noether	65bd5c3fe3	cycle 6: V3D shader for H.264 IDCT 4x4 (first cycle-6 QPU dispatch) Per the QPU-default substrate decree 2026-05-23, cycle 6 (H.264 IDCT 4x4 + add) was the highest-priority H.264 kernel to flip from NEON-only to QPU-capable. The same shape as VP9 IDCT 8x8 (cycle 1) — two-pass butterfly with shared-memory transpose — but at 4x4 scale: 4 lanes per block, 16 blocks per WG. What's added: - src/v3d_h264_idct4.comp: GLSL compute shader implementing the H.264 §8.5.12.1 1D butterfly twice (row pass then column pass), with (val + 32) >> 6 rounding and clip-to-u8 add to dst. Block memory layout is column-major (matches FFmpeg `ff_h264_idct_add_neon` convention). - CMakeLists: glslang rule + install entry for v3d_h264_idct4.spv. - dispatch_h264_idct4_qpu() in daedalus_core.c: lazy pipeline init, 3 SSBOs (coeffs / dst / meta as uvec4), push-constant (n_blocks, dst_stride), 16 blocks per WG dispatch. Matches the existing dispatch_*_qpu patterns; uses v3d_runner_create_buffer / destroy_buffer (will swap to pool API once PR #6 lands). - daedalus_dispatch_h264_idct4() replaces ROUTE_CPU_ONLY with the same CPU/QPU substrate switch the deblock dispatch uses. - daedalus_recipe_substrate_for(H264_IDCT4) returns QPU now that the shader exists. Verification on hertz (Pi 5 + V3D 7.1): $ ./test_api_h264 === Phase 8a API smoke: H.264 kernels via recipe dispatch === H264_IDCT4 recipe substrate: 2 (1=CPU, 2=QPU) H264_IDCT8 recipe substrate: 1 H264_DEBLOCK_LV recipe substrate: 2 H264_QPEL_MC20 recipe substrate: 1 H.264 IDCT 4x4: 2048/2048 bytes bit-exact (100.0000%) ← QPU H.264 IDCT 8x8: 2048/2048 bytes bit-exact H.264 deblock luma v: 2048/2048 bytes bit-exact H.264 qpel mc20: 1024/1024 bytes bit-exact The AUTO-substrate path now picks QPU for H.264 IDCT 4x4, and the output is bit-exact against the C reference (which is identical to the NEON .S code by construction — same FFmpeg upstream). 
Remaining cycle-6/7/9 work in task #165: - cycle 7: H.264 IDCT 8x8 (template same shape; 8 lanes per block, fewer blocks per WG) - cycle 9: H.264 luma qpel mc20 (different shape — 6-tap MC not a transform) This commit lands the cycle-6 piece of task #165.	2026-05-23 20:06:20 +02:00
claude-noether	737e87980d	QPU is default substrate: recipe table + ctx env-var override Per the user decree 2026-05-23 — "what can be done in QPU will be done in QPU" — this lands two coupled changes that flip production-decode kernels with existing V3D shaders from CPU-by-recipe to QPU-by-recipe: 1) daedalus_recipe_substrate_for() returns SUBSTRATE_QPU for every kernel that has a shipped V3D compute shader: cycle 1 VP9 IDCT 8x8 QPU (was QPU; unchanged) cycle 2 VP9 LPF wd=4 QPU (was QPU; unchanged) cycle 3 VP9 MC 8h QPU (FLIPPED from CPU — v3d_mc_8h.spv) cycle 4 VP9 LPF wd=8 QPU (was QPU; unchanged) cycle 5 AV1 CDEF 8x8 QPU (FLIPPED from CPU — v3d_cdef.spv) cycle 6 H.264 IDCT 4x4 CPU (no shader yet; task #165) cycle 7 H.264 IDCT 8x8 CPU (no shader yet; task #165) cycle 8 H.264 deblock luma-v QPU (FLIPPED from CPU — v3d_h264deblock.spv) cycle 9 H.264 qpel mc20 CPU (no shader yet; task #165) The R-band cost/benefit framework still applies but is now superseded for substrate selection by the decree. Where R stays RED, the cost is in dispatch overhead, which is a fixable engineering issue (tasks 160 buffer-pool, 161 persistent cmdbuf, 162 dmabuf import). 2) daedalus_ctx_create_no_qpu() now honours an env-var override: set DAEDALUS_FORCE_QPU=1 in the process and create_no_qpu silently escalates to a full daedalus_ctx_create(). Lets the libavcodec substitution shims in marfrit-packages (which pthread_once a create_no_qpu ctx — see libavcodec/aarch64/h264_idct_daedalus.c) fire QPU paths without rebuilding those patches. Firefox / mpv consumers stay on the Vulkan-free path by default (env var unset). The daedalus-v4l2 daemon will set DAEDALUS_FORCE_QPU=1 explicitly before dlopen'ing libavcodec (separate daedalus-v4l2 follow-up). 
Smoke (hertz, Pi 5, kernel 6.18.29): === test_api_h264 === H264_IDCT4 recipe substrate: 1 (1=CPU, 2=QPU) H264_IDCT8 recipe substrate: 1 H264_DEBLOCK_LV recipe substrate: 2 ← flipped H264_QPEL_MC20 recipe substrate: 1 H.264 IDCT 4x4: 2048/2048 bytes bit-exact H.264 IDCT 8x8: 2048/2048 bytes bit-exact H.264 deblock luma v: 2048/2048 bytes bit-exact ← QPU path H.264 qpel mc20: 1024/1024 bytes bit-exact === test_api_idct === all substrates (CPU/QPU/AUTO) bit-exact === test_api_lpf === all substrates bit-exact wd=4 and wd=8 The dispatch wrapper's fall-through logic (eff == SUBSTRATE_QPU && !ctx_has_qpu(ctx) → eff = SUBSTRATE_CPU) handles the case where the recipe says QPU but the consumer didn't opt in — it falls back to CPU silently, no regression. Closes daedalus-fourier tasks #163, #164. Refs the 2026-05-23 "QPU default substrate" decree.	2026-05-23 19:59:53 +02:00