cycle 7: V3D shader for H.264 IDCT 8x8

Mirrors cycle 6 (PR #7, prior commit) but at 8x8 scale: 8 lanes per block, 8 blocks per WG. The H.264 §8.5.13.2 1D butterfly is applied twice (row pass, then column pass); the result is rounded with (val + 32) >> 6, added to dst, and clipped to u8.

Bit-exact on the first try on hertz (Pi 5, V3D 7.1):

  H264_IDCT4 recipe substrate:      2 (QPU)
  H264_IDCT8 recipe substrate:      2 (QPU)  ← flipped
  H264_DEBLOCK_LV recipe substrate: 2 (QPU)
  H264_QPEL_MC20 recipe substrate:  1 (CPU)  ← task #165 remaining

  H.264 IDCT 4x4:       2048/2048 bytes bit-exact
  H.264 IDCT 8x8:       2048/2048 bytes bit-exact  ← QPU
  H.264 deblock luma v: 2048/2048 bytes bit-exact
  H.264 qpel mc20:      1024/1024 bytes bit-exact

8 of 9 daedalus-fourier cycles are now QPU-by-recipe. Only cycle 9 (H.264 luma qpel mc20) is still on the CPU — it has a different shape (a 6-tap MC filter, not a transform), so it needs its own shader template; task #165 covers it as a follow-on.

Same pattern as the cycle 6 commit (65bd5c3): adds the h264_idct8_pipe field + lazy init, dispatch_h264_idct8_qpu() with 3 SSBOs, and the v3d_h264_idct8.spv install rule. Uses v3d_runner_create_buffer / destroy_buffer (will swap to the pool API once PR #6 lands).
2026-05-23 20:09:25 +02:00
parent 65bd5c3fe3
commit 74687d9def
3 changed files with 291 additions and 4 deletions
@@ -295,7 +295,18 @@ if (DAEDALUS_BUILD_VULKAN)
        VERBATIM
    )
-    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV})
+    set(H264_IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct8.spv)
    # Compile src/v3d_h264_idct8.comp to SPIR-V (Vulkan 1.3 target env) with
    # ${GLSLANG_VALIDATOR}; the rule re-runs whenever the .comp source changes.
    add_custom_command(
        OUTPUT ${H264_IDCT8_SPV}
        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
                -o ${H264_IDCT8_SPV}
                ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
        COMMENT "glslang: v3d_h264_idct8.comp -> v3d_h264_idct8.spv"
        VERBATIM
    )
    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV})
    # v3d_runner — reusable Vulkan plumbing.
    add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -424,6 +435,7 @@ if (DAEDALUS_BUILD_VULKAN)
        ${CDEF_SPV}
        ${H264DEBLOCK_SPV}
        ${H264_IDCT4_SPV}
        ${H264_IDCT8_SPV}
        DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
    )
 endif()
@@ -42,6 +42,8 @@ struct daedalus_ctx {
    v3d_pipeline  h264deblock_pipe;
    int           h264_idct4_pipe_ready;
    v3d_pipeline  h264_idct4_pipe;
    int           h264_idct8_pipe_ready;
    v3d_pipeline  h264_idct8_pipe;
 };
 daedalus_ctx *daedalus_ctx_create(void)
@@ -97,6 +99,7 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
        if (ctx->cdef_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
        if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
        if (ctx->h264_idct4_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
        if (ctx->h264_idct8_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
        v3d_runner_destroy(ctx->runner);
    }
    free(ctx);
@@ -122,7 +125,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:   return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_QPU;	/* v3d_cdef.spv */
    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct4.spv */
-    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_CPU;	/* TODO task #165 */
+    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU;	/* TODO task #165 */
    }
@@ -838,6 +841,95 @@ fail:
    return -1;
 }
 /* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
 /* Push-constant payload handed to v3d_h264_idct8.spv via vkCmdPushConstants.
  * The field order mirrors the shader's `PC` push_constant block
  * (n_blocks, dst_stride_u8, then two pad words), so do not reorder. */
 typedef struct {
    uint32_t n_blocks;       /* number of 8x8 blocks in this dispatch */
    uint32_t dst_stride_u8;  /* dst row pitch, in bytes */
    uint32_t _pad0;          /* unused; left zero by the dispatcher */
    uint32_t _pad1;          /* unused; left zero by the dispatcher */
 } h264_idct8_pc;
 /*
  * Run the H.264 8x8 inverse transform + add on the QPU for `n_blocks` blocks.
  *
  * Stages the coefficients, the touched span of dst, and the per-block
  * metadata into three runner buffers (bindings 0..2), dispatches
  * v3d_h264_idct8.spv with 8 blocks per workgroup, waits for completion,
  * then copies the dst span back and zeroes the coefficients.
  *
  *   ctx         context owning the runner and the lazily created pipeline
  *   dst         destination pixels, read and updated in place
  *   dst_stride  dst row pitch in bytes
  *   coeffs      n_blocks * 64 int16 coefficients; cleared to zero on success
  *   n_blocks    number of 8x8 blocks; 0 is a successful no-op
  *   meta        per-block metadata; only meta[i].dst_off is consumed here
  *
  * Returns 0 on success and -1 on any failure.  dst and coeffs are only
  * written after the GPU work has completed, so they are untouched on failure.
  */
 static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
    int rc = -1;

    /* Nothing to transform: skip buffer creation (which would be asked for
     * zero bytes) and the empty dispatch altogether. */
    if (n_blocks == 0)
        return 0;

    /* The shader addresses coefficients and dst with 32-bit uints
     * (block_idx * 64, dst_off + row * stride + k).  Reject inputs that
     * would silently wrap there or overflow the host-side size math. */
    if (n_blocks > UINT32_MAX / 64u ||
        n_blocks > SIZE_MAX / (64u * sizeof(int16_t)) ||
        dst_stride > UINT32_MAX)
        return -1;

    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);

    /* dst span to stage: one past the last byte any block touches
     * (row 7, column 7 of the block with the largest offset). */
    size_t dst_span = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t end = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
        if (end > UINT32_MAX)
            return -1;
        if (end > dst_span)
            dst_span = end;
    }

    /* Lazily build the pipeline on first use: 3 storage buffers plus the
     * push-constant block. */
    if (!ctx->h264_idct8_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
                                       3, sizeof(h264_idct8_pc),
                                       &ctx->h264_idct8_pipe) != 0)
            return -1;
        ctx->h264_idct8_pipe_ready = 1;
    }

    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc))
        goto out;
    if (v3d_runner_create_buffer(ctx->runner, dst_span, &bd))
        goto out_bc;
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm))
        goto out_bd;

    memcpy(bc.mapped, coeffs, coeff_bytes);
    memcpy(bd.mapped, dst,    dst_span);

    /* One uvec4 per block on the shader side; only .x (dst_off) is used. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = (uint32_t) meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    /* Binding order must match the shader: 0 = coeffs, 1 = dst, 2 = meta. */
    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
        goto out_bm;

    uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8);   /* 8 blocks/WG */
    h264_idct8_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto out_bm;

    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* VkResult: VK_SUCCESS is 0, any failure code is non-zero. */
    if (vkBeginCommandBuffer(cb, &cbbi))
        goto out_bm;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_idct8_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_idct8_pipe.layout, 0, 1,
                            &ctx->h264_idct8_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb))
        goto out_bm;

    if (v3d_runner_submit_wait(ctx->runner, cb))
        goto out_bm;

    /* GPU work is done: publish the pixels and clear the consumed
     * coefficients. */
    memcpy(dst, bd.mapped, dst_span);
    memset(coeffs, 0, coeff_bytes);
    rc = 0;

    /* Unwind in reverse creation order; each label releases only what was
     * successfully created before the jump. */
 out_bm:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
 out_bd:
    v3d_runner_destroy_buffer(ctx->runner, &bd);
 out_bc:
    v3d_runner_destroy_buffer(ctx->runner, &bc);
 out:
    return rc;
 }
 /* -------------------- Public dispatch entry points -------------- */
 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                 \
@@ -943,8 +1035,16 @@ int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
    int16_t *coeffs, size_t n_blocks,
    const daedalus_h264_block_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
+    daedalus_substrate eff = sub;
-                   dst, dst_stride, coeffs, n_blocks, meta);
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
                                       coeffs, n_blocks, meta);
    return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
                                   coeffs, n_blocks, meta);
 }
 int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -0,0 +1,175 @@
 // daedalus-fourier — H.264 8x8 inverse integer transform + add, V3D 7.1.
 //
 // H.264 spec §8.5.13.2 (High profile 8x8 IT).  Pure integer arithmetic
 // — different butterfly from VP9 IDCT 8x8 (cycle 1, uses cospi
 // multipliers).  Row pass first, column pass second; round (+32) >> 6,
 // add to dst, clip to u8.
 //
 // Block layout: COLUMN-MAJOR.  block[c*8 + r] = coefficient at
 // (row r, column c).  Matches FFmpeg `ff_h264_idct8_add_neon`.
 //
 // Workgroup layout: 64 invocations = 8 lanes/block × 8 blocks/WG.
 //   - row pass: lane k (0..7) reads row k of the block (8 coefficients,
 //               one from each column), runs the butterfly, writes 8
 //               outputs to one row of tmp_shared.
 //   - column pass: lane k reads column k of tmp_shared (8 rows),
 //                  runs the butterfly, writes 8 outputs to dst as
 //                  column k at rows 0..7.
 //
 // shared = 8 × 64 × 4 B = 2 KiB.  Well under V3D's 16 KiB limit.
 //
 // License: BSD-2-Clause.
 #version 450
 #extension GL_EXT_shader_8bit_storage             : require
 #extension GL_EXT_shader_16bit_storage            : require
 #extension GL_EXT_shader_explicit_arithmetic_types : require
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 layout(binding = 0) readonly buffer Coeffs {
    int16_t coeffs[];   // N × 64 column-major
 } u_coeffs;
 layout(binding = 1) buffer Dst {
    uint8_t dst[];      // H × stride bytes
 } u_dst;
 layout(binding = 2) readonly buffer Meta {
    uvec4 meta[];       // .x = dst_off
 } u_meta;
 layout(push_constant) uniform PC {
    uint n_blocks;
    uint dst_stride_u8;
    uint _pad0, _pad1;
 } pc;
 // 8 blocks/WG × 64 ints/block × 4 B = 2 KiB shared.
 shared int tmp_shared[8 * 64];
 // One-dimensional 8-point inverse transform butterfly (H.264 §8.5.13.2).
 // d0..d7 are one line of inputs; g0..g7 receive the transformed values.
 // No rounding or scaling happens here.
 void idct8_1d(int d0, int d1, int d2, int d3,
               int d4, int d5, int d6, int d7,
               out int g0, out int g1, out int g2, out int g3,
               out int g4, out int g5, out int g6, out int g7)
 {
    // Even half: built from d0, d2, d4, d6.
    int sum04  = d0 + d4;
    int diff04 = d0 - d4;
    int lo26   = (d2 >> 1) - d6;
    int hi26   = (d6 >> 1) + d2;

    int ev0 = sum04  + hi26;
    int ev1 = diff04 + lo26;
    int ev2 = diff04 - lo26;
    int ev3 = sum04  - hi26;

    // Odd half: built from d1, d3, d5, d7.
    int t1 = d5 - d3 - d7 - (d7 >> 1);
    int t3 = d1 + d7 - d3 - (d3 >> 1);
    int t5 = d7 - d1 + d5 + (d5 >> 1);
    int t7 = d3 + d5 + d1 + (d1 >> 1);

    int od0 = t7 - (t1 >> 2);
    int od1 = (t3 >> 2) - t5;
    int od2 = t3 + (t5 >> 2);
    int od3 = (t7 >> 2) + t1;

    // Final stage: outputs come in mirrored pairs (g[i], g[7-i]).
    g0 = ev0 + od0;
    g7 = ev0 - od0;
    g1 = ev1 + od1;
    g6 = ev1 - od1;
    g2 = ev2 + od2;
    g5 = ev2 - od2;
    g3 = ev3 + od3;
    g4 = ev3 - od3;
 }
 void main()
 {
    // A workgroup is 64 invocations: 8 blocks, 8 lanes per block.
    uint local_id  = gl_LocalInvocationID.x;      // 0..63
    uint blk_in_wg = local_id / 8u;               // 0..7
    uint lane      = local_id % 8u;               // 0..7
    uint blk       = gl_WorkGroupID.x * 8u + blk_in_wg;
    uint scratch   = blk_in_wg * 64u;             // this block's slice of tmp_shared

    // Lanes past the last block stay idle but still reach the barrier.
    bool in_range = blk < pc.n_blocks;

    // ---- Pass 1: rows ---------------------------------------------
    // Lane `lane` gathers row `lane` (element c*8 + lane for each column c),
    // transforms it, and stores the result as row `lane` of the scratch tile.
    if (in_range) {
        uint src = blk * 64u + lane;
        int d[8];
        for (uint c = 0u; c < 8u; ++c)
            d[c] = int(u_coeffs.coeffs[src + c * 8u]);

        int r[8];
        idct8_1d(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7],
                 r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);

        uint row_base = scratch + lane * 8u;
        for (uint j = 0u; j < 8u; ++j)
            tmp_shared[row_base + j] = r[j];
    }

    // Every row of the tile must be written before any column is read.
    barrier();

    // ---- Pass 2: columns ------------------------------------------
    // Lane `lane` gathers column `lane` of the scratch tile, transforms it,
    // and updates column `lane` of the destination block.
    if (in_range) {
        int s[8];
        for (uint y = 0u; y < 8u; ++y)
            s[y] = tmp_shared[scratch + y * 8u + lane];

        int v[8];
        idct8_1d(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
                 v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);

        uint col   = u_meta.meta[blk].x + lane;   // dst_off + column index
        uint pitch = pc.dst_stride_u8;

        // Fetch all eight existing pixels first, then store all eight results.
        int prev[8];
        for (uint y = 0u; y < 8u; ++y)
            prev[y] = int(u_dst.dst[col + y * pitch]);

        for (uint y = 0u; y < 8u; ++y) {
            int px = prev[y] + ((v[y] + 32) >> 6);   // round, then add
            u_dst.dst[col + y * pitch] = uint8_t(clamp(px, 0, 255));
        }
    }
 }