From 65bd5c3fe37423fd49c074b68f8b575d4ca57d3f Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sat, 23 May 2026 20:06:20 +0200 Subject: [PATCH] cycle 6: V3D shader for H.264 IDCT 4x4 (first cycle-6 QPU dispatch) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the QPU-default substrate decree 2026-05-23, cycle 6 (H.264 IDCT 4x4 + add) was the highest-priority H.264 kernel to flip from NEON-only to QPU-capable. The same shape as VP9 IDCT 8x8 (cycle 1) — two-pass butterfly with shared-memory transpose — but at 4x4 scale: 4 lanes per block, 16 blocks per WG. What's added: - src/v3d_h264_idct4.comp: GLSL compute shader implementing the H.264 §8.5.12.1 1D butterfly twice (row pass then column pass), with (val + 32) >> 6 rounding and clip-to-u8 add to dst. Block memory layout is column-major (matches FFmpeg `ff_h264_idct_add_neon` convention). - CMakeLists: glslang rule + install entry for v3d_h264_idct4.spv. - dispatch_h264_idct4_qpu() in daedalus_core.c: lazy pipeline init, 3 SSBOs (coeffs / dst / meta as uvec4), push-constant (n_blocks, dst_stride), 16 blocks per WG dispatch. Matches the existing dispatch_*_qpu patterns; uses v3d_runner_create_buffer / destroy_buffer (will swap to pool API once PR #6 lands). - daedalus_dispatch_h264_idct4() replaces ROUTE_CPU_ONLY with the same CPU/QPU substrate switch the deblock dispatch uses. - daedalus_recipe_substrate_for(H264_IDCT4) returns QPU now that the shader exists. 
Verification on hertz (Pi 5 + V3D 7.1): $ ./test_api_h264 === Phase 8a API smoke: H.264 kernels via recipe dispatch === H264_IDCT4 recipe substrate: 2 (1=CPU, 2=QPU) H264_IDCT8 recipe substrate: 1 H264_DEBLOCK_LV recipe substrate: 2 H264_QPEL_MC20 recipe substrate: 1 H.264 IDCT 4x4: 2048/2048 bytes bit-exact (100.0000%) ← QPU H.264 IDCT 8x8: 2048/2048 bytes bit-exact H.264 deblock luma v: 2048/2048 bytes bit-exact H.264 qpel mc20: 1024/1024 bytes bit-exact The AUTO-substrate path now picks QPU for H.264 IDCT 4x4, and the output is bit-exact against the C reference (which is identical to the NEON .S code by construction — same FFmpeg upstream). Remaining cycle-6/7/9 work in task #165: - cycle 7: H.264 IDCT 8x8 (template same shape; 8 lanes per block, fewer blocks per WG) - cycle 9: H.264 luma qpel mc20 (different shape — 6-tap MC not a transform) This commit lands the cycle-6 piece of task #165. --- CMakeLists.txt | 14 ++++- src/daedalus_core.c | 109 ++++++++++++++++++++++++++++++++- src/v3d_h264_idct4.comp | 129 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 248 insertions(+), 4 deletions(-) create mode 100644 src/v3d_h264_idct4.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c2faa7..7d6bca3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -284,7 +284,18 @@ if (DAEDALUS_BUILD_VULKAN) VERBATIM ) - add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV}) + set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv) + add_custom_command( + OUTPUT ${H264_IDCT4_SPV} + COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3 + -o ${H264_IDCT4_SPV} + ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp + DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp + COMMENT "glslang: v3d_h264_idct4.comp -> v3d_h264_idct4.spv" + VERBATIM + ) + + add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} 
${H264_IDCT4_SPV}) # v3d_runner — reusable Vulkan plumbing. add_library(v3d_runner STATIC src/v3d_runner.c) @@ -412,6 +423,7 @@ if (DAEDALUS_BUILD_VULKAN) ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} + ${H264_IDCT4_SPV} DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders ) endif() diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 7be1500..43d2c66 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -40,6 +40,8 @@ struct daedalus_ctx { v3d_pipeline cdef_pipe; int h264deblock_pipe_ready; v3d_pipeline h264deblock_pipe; + int h264_idct4_pipe_ready; + v3d_pipeline h264_idct4_pipe; }; daedalus_ctx *daedalus_ctx_create(void) @@ -94,6 +96,7 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx) if (ctx->mc8h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe); if (ctx->cdef_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe); if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe); + if (ctx->h264_idct4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe); v3d_runner_destroy(ctx->runner); } free(ctx); @@ -118,7 +121,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_QPU; /* v3d_mc_8h.spv */ case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU; case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_cdef.spv */ - case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU; /* TODO task #165 */ + case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct4.spv */ case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU; /* TODO task #165 */ case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */ case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_CPU; /* TODO task #165 */ @@ -743,6 +746,98 @@ fail: return -1; } +/* -------------------- H.264 IDCT 4x4 QPU dispatch 
(cycle 6) ----- */
+
+/* Push constants for v3d_h264_idct4.spv.  Field order and widths follow
+ * the shader's `PC` block: n_blocks, dst_stride_u8, then two pad words. */
+typedef struct {
+    uint32_t n_blocks;
+    uint32_t dst_stride_u8;
+    uint32_t _pad0;
+    uint32_t _pad1;
+} h264_idct4_pc;
+
+/*
+ * QPU path for the H.264 4x4 inverse transform + add.
+ *
+ * Stages the coefficients, the first dst_max bytes of dst and one
+ * 4-word metadata record per block into three runner buffers,
+ * dispatches v3d_h264_idct4.spv with one workgroup per 16 blocks,
+ * copies the dst bytes back and finally zeroes the coefficients.
+ *
+ *   ctx         context providing the runner and the cached pipeline
+ *   dst         destination pixels, read and written in place
+ *   dst_stride  dst row pitch in bytes (must fit in 32 bits)
+ *   coeffs      n_blocks * 16 int16 coefficients; zeroed on success
+ *   n_blocks    number of 4x4 blocks (must fit in 32 bits)
+ *   meta        per-block records; only dst_off is consumed here
+ *
+ * Returns 0 on success (n_blocks == 0 is a successful no-op) and -1 on
+ * any failure.  On failure dst and coeffs are left unmodified.
+ */
+static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
+                                    uint8_t *dst, size_t dst_stride,
+                                    int16_t *coeffs, size_t n_blocks,
+                                    const daedalus_h264_block_meta *meta)
+{
+    enum { COEFFS_PER_BLOCK = 16, META_WORDS = 4, BLOCKS_PER_WG = 16 };
+
+    /* Nothing to transform.  Returning here also avoids requesting
+     * zero-sized buffers from the runner (presumably backed by
+     * vkCreateBuffer, which requires size > 0 -- TODO confirm). */
+    if (n_blocks == 0)
+        return 0;
+
+    /* n_blocks and dst_stride travel to the shader as 32-bit push
+     * constants; reject values the casts below would truncate, and
+     * sizes whose byte count would overflow size_t. */
+    if (n_blocks > UINT32_MAX || dst_stride > UINT32_MAX)
+        return -1;
+    if (n_blocks > SIZE_MAX / (COEFFS_PER_BLOCK * sizeof(int16_t)))
+        return -1;
+
+    /* Pipeline is built lazily on first use and cached in the ctx. */
+    if (!ctx->h264_idct4_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
+                                       3, sizeof(h264_idct4_pc),
+                                       &ctx->h264_idct4_pipe) != 0)
+            return -1;
+        ctx->h264_idct4_pipe_ready = 1;
+    }
+
+    size_t coeff_bytes = n_blocks * COEFFS_PER_BLOCK * sizeof(int16_t);
+    size_t meta_bytes  = n_blocks * META_WORDS * sizeof(uint32_t); /* uvec4 per block */
+
+    /* dst_max = one past the last byte any block touches: bottom row
+     * (3 strides down) plus the 4 bytes of that row. */
+    size_t dst_max = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
+        if (e > dst_max) dst_max = e;
+    }
+
+    int rc = -1;
+    v3d_buffer bc = {0}, bd = {0}, bm = {0};
+    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc))
+        return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd))
+        goto out_bc;
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm))
+        goto out_bd;
+
+    memcpy(bc.mapped, coeffs, coeff_bytes);
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        /* Only .x (dst_off) is read by the shader; the rest is padding. */
+        m[META_WORDS * i + 0] = meta[i].dst_off;
+        m[META_WORDS * i + 1] = 0;
+        m[META_WORDS * i + 2] = 0;
+        m[META_WORDS * i + 3] = 0;
+    }
+
+    /* Binding order must match the shader: 0 = coeffs, 1 = dst, 2 = meta. */
+    v3d_buffer binds[3] = { bc, bd, bm };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
+        goto out_bm;
+
+    uint32_t wg_count =
+        (uint32_t)((n_blocks + BLOCKS_PER_WG - 1) / BLOCKS_PER_WG);
+    h264_idct4_pc pc = {
+        .n_blocks      = (uint32_t) n_blocks,
+        .dst_stride_u8 = (uint32_t) dst_stride,
+    };
+
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE)
+        goto out_bm;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS)
+        goto out_bm;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                      ctx->h264_idct4_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->h264_idct4_pipe.layout, 0, 1,
+                            &ctx->h264_idct4_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
+                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS)
+        goto out_bm;
+    if (v3d_runner_submit_wait(ctx->runner, cb))
+        goto out_bm;
+
+    memcpy(dst, bd.mapped, dst_max);
+
+    /* H.264/FFmpeg convention: zero the coeffs block after the
+     * transform (matches the C ref + NEON .S behaviour). */
+    memset(coeffs, 0, coeff_bytes);
+    rc = 0;
+
+    /* Staged teardown: each label releases only what was created. */
+out_bm:
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+out_bd:
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+out_bc:
+    v3d_runner_destroy_buffer(ctx->runner, &bc);
+    return rc;
+}
+
 /* -------------------- Public dispatch entry points -------------- */
 
 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)
\ @@ -831,8 +926,16 @@ int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub, int16_t *coeffs, size_t n_blocks, const daedalus_h264_block_meta *meta) { - ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu, - dst, dst_stride, coeffs, n_blocks, meta); + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4); + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; + if (eff == DAEDALUS_SUBSTRATE_CPU) + return dispatch_h264_idct4_cpu(ctx, dst, dst_stride, + coeffs, n_blocks, meta); + return dispatch_h264_idct4_qpu(ctx, dst, dst_stride, + coeffs, n_blocks, meta); } int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub, diff --git a/src/v3d_h264_idct4.comp b/src/v3d_h264_idct4.comp new file mode 100644 index 0000000..dc66b77 --- /dev/null +++ b/src/v3d_h264_idct4.comp @@ -0,0 +1,129 @@ +// daedalus-fourier — H.264 4x4 inverse integer transform + add, V3D 7.1. +// +// H.264 spec §8.5.12.1. Pure integer arithmetic — no trig constants +// (unlike VP9 IDCT 8x8). Row pass first, column pass second; round +// (+32) >> 6, add to dst, clip to u8. +// +// Block memory layout: COLUMN-MAJOR. block[c*4 + r] = coefficient at +// (row r, column c). Matches FFmpeg `ff_h264_idct_add_neon`. +// +// Workgroup layout: 64 invocations = 4 lanes/block × 16 blocks/WG. +// - row pass: lane k (0..3) reads row k of the block (4 coefficients, +// one from each column), runs the butterfly, writes 4 +// outputs to one row of tmp_shared. +// - column pass: lane k reads column k of tmp_shared (4 rows), +// runs the butterfly, writes 4 outputs to dst as +// column k at rows 0..3. +// +// shared = 16 × 16 × 4 B = 1 KiB. Well under V3D's 16 KiB limit. +// +// License: BSD-2-Clause. 
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+// Tiling: 64 invocations per workgroup = 16 blocks x 4 lanes per block.
+const uint LANES_PER_BLOCK  = 4u;
+const uint BLOCKS_PER_WG    = 16u;
+const uint COEFFS_PER_BLOCK = 16u;
+
+layout(binding = 0) readonly buffer Coeffs {
+    int16_t coeffs[];    // 16 coefficients per block, column-major
+} u_coeffs;
+
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];       // destination bytes, addressed by byte offset
+} u_dst;
+
+layout(binding = 2) readonly buffer Meta {
+    uvec4 meta[];        // one entry per block; .x = byte offset into dst
+} u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;
+    uint dst_stride_u8;
+    uint _pad0, _pad1;
+} pc;
+
+// Per-workgroup scratch holding the row-pass output: 16 ints per block.
+shared int tmp_shared[BLOCKS_PER_WG * COEFFS_PER_BLOCK];
+
+// One 4-point butterfly (H.264 8.5.12.1): d.xyzw in, four outputs back.
+ivec4 idct4_1d(ivec4 d)
+{
+    int even_sum  = d.x + d.z;
+    int even_diff = d.x - d.z;
+    int odd_lo    = (d.y >> 1) - d.w;
+    int odd_hi    = d.y + (d.w >> 1);
+    return ivec4(even_sum  + odd_hi,
+                 even_diff + odd_lo,
+                 even_diff - odd_lo,
+                 even_sum  - odd_hi);
+}
+
+void main()
+{
+    // The global x index is 64 * workgroup + local index, so:
+    //   bits 0..1  -> lane within the block,
+    //   bits 2..5  -> block within the workgroup,
+    //   gid >> 2   -> global block index.
+    uint gid         = gl_GlobalInvocationID.x;
+    uint k           = gid & 3u;
+    uint block_idx   = gid >> 2;
+    uint block_local = block_idx & 15u;
+    bool in_range    = block_idx < pc.n_blocks;
+
+    // ---- Pass 1: lane k transforms row k ---------------------------
+    // Row k of a column-major block lives at k, k+4, k+8, k+12.
+    if (in_range) {
+        uint src = block_idx * COEFFS_PER_BLOCK + k;
+        ivec4 row_in = ivec4(int(u_coeffs.coeffs[src]),
+                             int(u_coeffs.coeffs[src + 4u]),
+                             int(u_coeffs.coeffs[src + 8u]),
+                             int(u_coeffs.coeffs[src + 12u]));
+        ivec4 row_out = idct4_1d(row_in);
+
+        // Store the four results contiguously as row k of the scratch tile.
+        uint slot = block_local * COEFFS_PER_BLOCK + k * LANES_PER_BLOCK;
+        tmp_shared[slot]      = row_out.x;
+        tmp_shared[slot + 1u] = row_out.y;
+        tmp_shared[slot + 2u] = row_out.z;
+        tmp_shared[slot + 3u] = row_out.w;
+    }
+
+    // Executed by every invocation, including out-of-range ones, so the
+    // whole workgroup reaches it before any lane reads the scratch tile.
+    barrier();
+
+    // ---- Pass 2: lane k transforms column k and updates dst --------
+    if (in_range) {
+        uint col = block_local * COEFFS_PER_BLOCK + k;
+        ivec4 col_in = ivec4(tmp_shared[col],
+                             tmp_shared[col + 4u],
+                             tmp_shared[col + 8u],
+                             tmp_shared[col + 12u]);
+
+        // Final rounding: (value + 32) >> 6, applied per component.
+        ivec4 resid = (idct4_1d(col_in) + 32) >> 6;
+
+        // Byte addresses of column k on rows 0..3 of this block.
+        uint origin = u_meta.meta[block_idx].x + k;
+        uint stride = pc.dst_stride_u8;
+        uvec4 addr  = uvec4(origin) + stride * uvec4(0u, 1u, 2u, 3u);
+
+        // Read all four pixels before writing any of them back.
+        ivec4 px = ivec4(int(u_dst.dst[addr.x]),
+                         int(u_dst.dst[addr.y]),
+                         int(u_dst.dst[addr.z]),
+                         int(u_dst.dst[addr.w]));
+        ivec4 sum = clamp(px + resid, 0, 255);
+
+        u_dst.dst[addr.x] = uint8_t(sum.x);
+        u_dst.dst[addr.y] = uint8_t(sum.y);
+        u_dst.dst[addr.z] = uint8_t(sum.z);
+        u_dst.dst[addr.w] = uint8_t(sum.w);
+    }
+}