diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c2faa7..d94a309 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -284,7 +284,29 @@ if (DAEDALUS_BUILD_VULKAN)
         VERBATIM
     )
 
-    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV})
+    set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
+    add_custom_command(
+        OUTPUT  ${H264_IDCT4_SPV}
+        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
+                -o ${H264_IDCT4_SPV}
+                ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
+        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
+        COMMENT "glslang: v3d_h264_idct4.comp -> v3d_h264_idct4.spv"
+        VERBATIM
+    )
+
+    set(H264_IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct8.spv)
+    add_custom_command(
+        OUTPUT  ${H264_IDCT8_SPV}
+        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
+                -o ${H264_IDCT8_SPV}
+                ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
+        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
+        COMMENT "glslang: v3d_h264_idct8.comp -> v3d_h264_idct8.spv"
+        VERBATIM
+    )
+
+    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV})
 
     # v3d_runner — reusable Vulkan plumbing.
     add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -412,6 +434,8 @@ if (DAEDALUS_BUILD_VULKAN)
             ${LPF8_SPV}
             ${CDEF_SPV}
             ${H264DEBLOCK_SPV}
+            ${H264_IDCT4_SPV}
+            ${H264_IDCT8_SPV}
         DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
     )
 endif()
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index fd7d73b..f9708d0 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -40,6 +40,10 @@ struct daedalus_ctx {
     v3d_pipeline cdef_pipe;
     int h264deblock_pipe_ready;
     v3d_pipeline h264deblock_pipe;
+    int h264_idct4_pipe_ready;
+    v3d_pipeline h264_idct4_pipe;
+    int h264_idct8_pipe_ready;
+    v3d_pipeline h264_idct8_pipe;
 };
 
 daedalus_ctx *daedalus_ctx_create(void)
@@ -53,6 +57,25 @@ daedalus_ctx *daedalus_ctx_create(void)
 
 daedalus_ctx *daedalus_ctx_create_no_qpu(void)
 {
+    /*
+     * Per the "QPU is default substrate" decree 2026-05-23:
+     * setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
+     * function to a full daedalus_ctx_create(), letting the libavcodec
+     * substitution shims (which call create_no_qpu via pthread_once)
+     * fire the V3D shaders that exist for cycles 1/2/4/5/8.  Without
+     * this hook each consumer process (firefox, mpv, daemon) would
+     * need its own shim build to opt into QPU.
+     *
+     * Default behaviour (env var unset / not "1") is unchanged: pure
+     * NEON ctx, no implicit Vulkan init.  Firefox / mpv consumers
+     * that dlopen libavcodec without opting in stay on the
+     * Vulkan-free path; the daemon explicitly sets
+     * DAEDALUS_FORCE_QPU=1 before loading libavcodec.
+     */
+    const char *force = getenv("DAEDALUS_FORCE_QPU");
+    if (force && force[0] == '1' && force[1] == 0)
+        return daedalus_ctx_create();
+
     daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
     if (!ctx) return NULL;
     ctx->has_qpu = 0;
@@ -75,6 +98,8 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
         if (ctx->mc8h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
         if (ctx->cdef_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
         if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
+        if (ctx->h264_idct4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
+        if (ctx->h264_idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
         v3d_runner_destroy(ctx->runner);
     }
     free(ctx);
@@ -84,16 +109,25 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
 
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
 {
+    /*
+     * Recipe table per the "QPU is default substrate" decree
+     * 2026-05-23.  Any kernel that has a V3D compute shader returns
+     * SUBSTRATE_QPU; CPU is the fallback for kernels without a
+     * shader (now only H.264 qpel mc20 — covered by follow-on
+     * task 165; H.264 IDCT 4x4 / 8x8 have shaders).  The dispatch
+     * wrappers already fall back to CPU automatically when the
+     * ctx doesn't have QPU available (daedalus_ctx_has_qpu == 0).
+     */
     switch (k) {
     case DAEDALUS_KERNEL_VP9_IDCT8:        return DAEDALUS_SUBSTRATE_QPU;
     case DAEDALUS_KERNEL_VP9_LPF4_INNER:   return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_VP9_MC_8H:        return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_VP9_MC_8H:        return DAEDALUS_SUBSTRATE_QPU; /* v3d_mc_8h.spv */
     case DAEDALUS_KERNEL_VP9_LPF8_INNER:   return DAEDALUS_SUBSTRATE_QPU;
-    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_CPU;
-    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_AV1_CDEF_8X8:     return DAEDALUS_SUBSTRATE_QPU; /* v3d_cdef.spv */
+    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct4.spv */
+    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU; /* TODO task #165 */
     }
     return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -715,6 +749,217 @@ fail:
     return -1;
 }
 
+/* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
+
+typedef struct {
+    uint32_t n_blocks;
+    uint32_t dst_stride_u8;
+    uint32_t _pad0;
+    uint32_t _pad1;
+} h264_idct4_pc;
+
+/*
+ * QPU path for the H.264 4x4 inverse transform + add over n_blocks blocks.
+ *
+ * Stages coeffs, the touched prefix of dst and one dst offset per block
+ * into runner buffers, dispatches v3d_h264_idct4.spv (16 blocks per
+ * workgroup), then copies dst back and zeroes coeffs.  Returns 0 on
+ * success (including an empty batch) and -1 if any runner step fails;
+ * on failure dst and coeffs are left untouched.
+ */
+static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
+                                   uint8_t *dst, size_t dst_stride,
+                                   int16_t *coeffs, size_t n_blocks,
+                                   const daedalus_h264_block_meta *meta)
+{
+    /* Empty batch: nothing to transform.  Bail out before staging,
+     * since coeff/dst/meta sizes would all be 0 and Vulkan does not
+     * allow zero-sized buffers. */
+    if (n_blocks == 0)
+        return 0;
+
+    if (!ctx->h264_idct4_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
+                                       3, sizeof(h264_idct4_pc),
+                                       &ctx->h264_idct4_pipe) != 0)
+            return -1;
+        ctx->h264_idct4_pipe_ready = 1;
+    }
+
+    size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
+    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t); /* uvec4 per block */
+    size_t dst_max = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
+        if (e > dst_max) dst_max = e;
+    }
+
+    v3d_buffer bc = {0}, bd = {0}, bm = {0};
+    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
+        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
+    }
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
+        v3d_runner_destroy_buffer(ctx->runner, &bd);
+        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
+    }
+
+    memcpy(bc.mapped, coeffs, coeff_bytes);
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        m[4*i+0] = meta[i].dst_off;
+        m[4*i+1] = 0;
+        m[4*i+2] = 0;
+        m[4*i+3] = 0;
+    }
+
+    v3d_buffer binds[3] = { bc, bd, bm };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
+        goto fail;
+
+    uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16); /* 16 blocks/WG */
+    h264_idct4_pc pc = {
+        .n_blocks = (uint32_t) n_blocks,
+        .dst_stride_u8 = (uint32_t) dst_stride,
+    };
+
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    vkBeginCommandBuffer(cb, &cbbi);
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                      ctx->h264_idct4_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->h264_idct4_pipe.layout, 0, 1,
+                            &ctx->h264_idct4_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
+                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    vkEndCommandBuffer(cb);
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
+
+    memcpy(dst, bd.mapped, dst_max);
+
+    /* H.264/FFmpeg convention: zero the coeffs block after the
+     * transform (matches the C ref + NEON .S behaviour). */
+    memset(coeffs, 0, coeff_bytes);
+
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bc);
+    return 0;
+fail:
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bc);
+    return -1;
+}
+
+/* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
+
+typedef struct {
+    uint32_t n_blocks;
+    uint32_t dst_stride_u8;
+    uint32_t _pad0;
+    uint32_t _pad1;
+} h264_idct8_pc;
+
+/*
+ * QPU path for the H.264 8x8 inverse transform + add over n_blocks blocks.
+ *
+ * Stages coeffs, the touched prefix of dst and one dst offset per block
+ * into runner buffers, dispatches v3d_h264_idct8.spv (8 blocks per
+ * workgroup), then copies dst back and zeroes coeffs.  Returns 0 on
+ * success (including an empty batch) and -1 if any runner step fails;
+ * on failure dst and coeffs are left untouched.
+ */
+static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
+                                   uint8_t *dst, size_t dst_stride,
+                                   int16_t *coeffs, size_t n_blocks,
+                                   const daedalus_h264_block_meta *meta)
+{
+    /* Empty batch: nothing to transform.  Bail out before staging,
+     * since coeff/dst/meta sizes would all be 0 and Vulkan does not
+     * allow zero-sized buffers. */
+    if (n_blocks == 0)
+        return 0;
+
+    if (!ctx->h264_idct8_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
+                                       3, sizeof(h264_idct8_pc),
+                                       &ctx->h264_idct8_pipe) != 0)
+            return -1;
+        ctx->h264_idct8_pipe_ready = 1;
+    }
+
+    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
+    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
+    size_t dst_max = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
+        if (e > dst_max) dst_max = e;
+    }
+
+    v3d_buffer bc = {0}, bd = {0}, bm = {0};
+    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
+        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
+    }
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
+        v3d_runner_destroy_buffer(ctx->runner, &bd);
+        v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
+    }
+
+    memcpy(bc.mapped, coeffs, coeff_bytes);
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        m[4*i+0] = meta[i].dst_off;
+        m[4*i+1] = 0;
+        m[4*i+2] = 0;
+        m[4*i+3] = 0;
+    }
+
+    v3d_buffer binds[3] = { bc, bd, bm };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
+        goto fail;
+
+    uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8); /* 8 blocks/WG */
+    h264_idct8_pc pc = {
+        .n_blocks = (uint32_t) n_blocks,
+        .dst_stride_u8 = (uint32_t) dst_stride,
+    };
+
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    vkBeginCommandBuffer(cb, &cbbi);
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                      ctx->h264_idct8_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->h264_idct8_pipe.layout, 0, 1,
+                            &ctx->h264_idct8_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
+                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    vkEndCommandBuffer(cb);
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
+
+    memcpy(dst, bd.mapped, dst_max);
+    memset(coeffs, 0, coeff_bytes);
+
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bc);
+    return 0;
+fail:
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bc);
+    return -1;
+}
+
 /* -------------------- Public dispatch entry points -------------- */
 
 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                          \
@@ -803,8 +1048,16 @@ int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
                                  int16_t *coeffs, size_t n_blocks,
                                  const daedalus_h264_block_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
-                   dst, dst_stride, coeffs, n_blocks, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
+                                       coeffs, n_blocks, meta);
+    return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
+                                   coeffs, n_blocks, meta);
 }
 
 int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -812,8 +1065,16 @@ int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
                                  int16_t *coeffs, size_t n_blocks,
                                  const daedalus_h264_block_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
-                   dst, dst_stride, coeffs, n_blocks, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
+                                       coeffs, n_blocks, meta);
+    return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
+                                   coeffs, n_blocks, meta);
 }
 
 int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
diff --git a/src/v3d_h264_idct4.comp b/src/v3d_h264_idct4.comp
new file mode 100644
index 0000000..dc66b77
--- /dev/null
+++ b/src/v3d_h264_idct4.comp
@@ -0,0 +1,129 @@
+// daedalus-fourier — H.264 4x4 inverse integer transform + add, V3D 7.1.
+//
+// H.264 spec §8.5.12.1. Pure integer arithmetic — no trig constants
+// (unlike VP9 IDCT 8x8). Row pass first, column pass second; round
+// (+32) >> 6, add to dst, clip to u8.
+//
+// Block memory layout: COLUMN-MAJOR. block[c*4 + r] = coefficient at
+// (row r, column c). Matches FFmpeg `ff_h264_idct_add_neon`.
+//
+// Workgroup layout: 64 invocations = 4 lanes/block × 16 blocks/WG.
+//   - row pass: lane k (0..3) reads row k of the block (4 coefficients,
+//               one from each column), runs the butterfly, writes 4
+//               outputs to one row of tmp_shared.
+//   - column pass: lane k reads column k of tmp_shared (4 rows),
+//                  runs the butterfly, writes 4 outputs to dst as
+//                  column k at rows 0..3.
+//
+// shared = 16 × 16 × 4 B = 1 KiB. Well under V3D's 16 KiB limit.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Coeffs {
+    int16_t coeffs[];  // N × 16 column-major
+} u_coeffs;
+
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];  // H × stride bytes (caller-provided base)
+} u_dst;
+
+layout(binding = 2) readonly buffer Meta {
+    uvec4 meta[];  // .x = dst_off (byte offset into u_dst.dst)
+} u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;
+    uint dst_stride_u8;
+    uint _pad0, _pad1;
+} pc;
+
+// 16 blocks per WG × 16 ints per block = 256 ints = 1 KiB shared.
+shared int tmp_shared[16 * 16];
+
+// 1D butterfly per H.264 §8.5.12.1. d[0..3] in, o[0..3] out.
+void idct4_1d(int d0, int d1, int d2, int d3,
+              out int o0, out int o1, out int o2, out int o3)
+{
+    int e = d0 + d2;
+    int f = d0 - d2;
+    int g = (d1 >> 1) - d3;
+    int h = d1 + (d3 >> 1);
+    o0 = e + h;
+    o1 = f + g;
+    o2 = f - g;
+    o3 = e - h;
+}
+
+void main()
+{
+    // Lane decomposition: local_size 64 = 16 blocks × 4 lanes/block.
+    uint gid = gl_GlobalInvocationID.x;
+    uint wg_id = gid / 64u;
+    uint lane_in_wg = gid & 63u;
+    uint block_local = lane_in_wg >> 2;  // 0..15
+    uint k = lane_in_wg & 3u;  // 0..3
+    uint block_idx = wg_id * 16u + block_local;
+
+    bool oob = (block_idx >= pc.n_blocks);
+
+    // ---- Row pass --------------------------------------------------
+    // lane k handles row r=k. Reads block[c*4 + k] for c=0..3 (one
+    // element from each column at fixed row).
+    if (!oob) {
+        uint base = block_idx * 16u;
+        int d0 = int(u_coeffs.coeffs[base + 0u * 4u + k]);
+        int d1 = int(u_coeffs.coeffs[base + 1u * 4u + k]);
+        int d2 = int(u_coeffs.coeffs[base + 2u * 4u + k]);
+        int d3 = int(u_coeffs.coeffs[base + 3u * 4u + k]);
+
+        int o0, o1, o2, o3;
+        idct4_1d(d0, d1, d2, d3, o0, o1, o2, o3);
+
+        // Write row k of tmp_shared[block_local].
+        uint tbase = block_local * 16u + k * 4u;
+        tmp_shared[tbase + 0u] = o0;
+        tmp_shared[tbase + 1u] = o1;
+        tmp_shared[tbase + 2u] = o2;
+        tmp_shared[tbase + 3u] = o3;
+    }
+
+    barrier();
+
+    // ---- Column pass ----------------------------------------------
+    // lane k handles column c=k. Reads tmp[r][k] for r=0..3.
+    if (!oob) {
+        uint tbase = block_local * 16u;
+        int s0 = tmp_shared[tbase + 0u * 4u + k];
+        int s1 = tmp_shared[tbase + 1u * 4u + k];
+        int s2 = tmp_shared[tbase + 2u * 4u + k];
+        int s3 = tmp_shared[tbase + 3u * 4u + k];
+
+        int o0, o1, o2, o3;
+        idct4_1d(s0, s1, s2, s3, o0, o1, o2, o3);
+
+        // Column k at rows 0..3 of dst, offset by meta.x (dst_off).
+        uint dst_off = u_meta.meta[block_idx].x;
+        uint stride = pc.dst_stride_u8;
+        uint a0 = dst_off + 0u * stride + k;
+        uint a1 = dst_off + 1u * stride + k;
+        uint a2 = dst_off + 2u * stride + k;
+        uint a3 = dst_off + 3u * stride + k;
+
+        int p0 = int(u_dst.dst[a0]);
+        int p1 = int(u_dst.dst[a1]);
+        int p2 = int(u_dst.dst[a2]);
+        int p3 = int(u_dst.dst[a3]);
+
+        u_dst.dst[a0] = uint8_t(clamp(p0 + ((o0 + 32) >> 6), 0, 255));
+        u_dst.dst[a1] = uint8_t(clamp(p1 + ((o1 + 32) >> 6), 0, 255));
+        u_dst.dst[a2] = uint8_t(clamp(p2 + ((o2 + 32) >> 6), 0, 255));
+        u_dst.dst[a3] = uint8_t(clamp(p3 + ((o3 + 32) >> 6), 0, 255));
+    }
+}
diff --git a/src/v3d_h264_idct8.comp b/src/v3d_h264_idct8.comp
new file mode 100644
index 0000000..ab3cec7
--- /dev/null
+++ b/src/v3d_h264_idct8.comp
@@ -0,0 +1,175 @@
+// daedalus-fourier — H.264 8x8 inverse integer transform + add, V3D 7.1.
+//
+// H.264 spec §8.5.13.2 (High profile 8x8 IT). Pure integer arithmetic
+// — different butterfly from VP9 IDCT 8x8 (cycle 1, uses cospi
+// multipliers). Row pass first, column pass second; round (+32) >> 6,
+// add to dst, clip to u8.
+//
+// Block layout: COLUMN-MAJOR. block[c*8 + r] = coefficient at
+// (row r, column c). Matches FFmpeg `ff_h264_idct8_add_neon`.
+//
+// Workgroup layout: 64 invocations = 8 lanes/block × 8 blocks/WG.
+//   - row pass: lane k (0..7) reads row k of the block (8 coefficients,
+//               one from each column), runs the butterfly, writes 8
+//               outputs to one row of tmp_shared.
+//   - column pass: lane k reads column k of tmp_shared (8 rows),
+//                  runs the butterfly, writes 8 outputs to dst as
+//                  column k at rows 0..7.
+//
+// shared = 8 × 64 × 4 B = 2 KiB. Well under V3D's 16 KiB limit.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Coeffs {
+    int16_t coeffs[];  // N × 64 column-major
+} u_coeffs;
+
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];  // H × stride bytes
+} u_dst;
+
+layout(binding = 2) readonly buffer Meta {
+    uvec4 meta[];  // .x = dst_off
+} u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;
+    uint dst_stride_u8;
+    uint _pad0, _pad1;
+} pc;
+
+// 8 blocks/WG × 64 ints/block × 4 B = 2 KiB shared.
+shared int tmp_shared[8 * 64];
+
+// 1D 8-element butterfly per H.264 §8.5.13.2.
+void idct8_1d(int d0, int d1, int d2, int d3,
+              int d4, int d5, int d6, int d7,
+              out int g0, out int g1, out int g2, out int g3,
+              out int g4, out int g5, out int g6, out int g7)
+{
+    int e0 = d0 + d4;
+    int e1 = -d3 + d5 - d7 - (d7 >> 1);
+    int e2 = d0 - d4;
+    int e3 = d1 + d7 - d3 - (d3 >> 1);
+    int e4 = (d2 >> 1) - d6;
+    int e5 = -d1 + d7 + d5 + (d5 >> 1);
+    int e6 = d2 + (d6 >> 1);
+    int e7 = d3 + d5 + d1 + (d1 >> 1);
+
+    int f0 = e0 + e6;
+    int f1 = e1 + (e7 >> 2);
+    int f2 = e2 + e4;
+    int f3 = e3 + (e5 >> 2);
+    int f4 = e2 - e4;
+    int f5 = (e3 >> 2) - e5;
+    int f6 = e0 - e6;
+    int f7 = e7 - (e1 >> 2);
+
+    g0 = f0 + f7;
+    g1 = f2 + f5;
+    g2 = f4 + f3;
+    g3 = f6 + f1;
+    g4 = f6 - f1;
+    g5 = f4 - f3;
+    g6 = f2 - f5;
+    g7 = f0 - f7;
+}
+
+void main()
+{
+    // local_size 64 = 8 blocks × 8 lanes/block.
+    uint gid = gl_GlobalInvocationID.x;
+    uint wg_id = gid / 64u;
+    uint lane_in_wg = gid & 63u;
+    uint block_local = lane_in_wg >> 3;  // 0..7
+    uint k = lane_in_wg & 7u;  // 0..7
+    uint block_idx = wg_id * 8u + block_local;
+
+    bool oob = (block_idx >= pc.n_blocks);
+
+    // ---- Row pass --------------------------------------------------
+    // lane k handles row r=k. Reads block[c*8 + k] for c=0..7.
+    if (!oob) {
+        uint base = block_idx * 64u;
+        int d0 = int(u_coeffs.coeffs[base + 0u * 8u + k]);
+        int d1 = int(u_coeffs.coeffs[base + 1u * 8u + k]);
+        int d2 = int(u_coeffs.coeffs[base + 2u * 8u + k]);
+        int d3 = int(u_coeffs.coeffs[base + 3u * 8u + k]);
+        int d4 = int(u_coeffs.coeffs[base + 4u * 8u + k]);
+        int d5 = int(u_coeffs.coeffs[base + 5u * 8u + k]);
+        int d6 = int(u_coeffs.coeffs[base + 6u * 8u + k]);
+        int d7 = int(u_coeffs.coeffs[base + 7u * 8u + k]);
+
+        int g0, g1, g2, g3, g4, g5, g6, g7;
+        idct8_1d(d0, d1, d2, d3, d4, d5, d6, d7,
+                 g0, g1, g2, g3, g4, g5, g6, g7);
+
+        // Write row k of tmp_shared[block_local].
+        uint tbase = block_local * 64u + k * 8u;
+        tmp_shared[tbase + 0u] = g0;
+        tmp_shared[tbase + 1u] = g1;
+        tmp_shared[tbase + 2u] = g2;
+        tmp_shared[tbase + 3u] = g3;
+        tmp_shared[tbase + 4u] = g4;
+        tmp_shared[tbase + 5u] = g5;
+        tmp_shared[tbase + 6u] = g6;
+        tmp_shared[tbase + 7u] = g7;
+    }
+
+    barrier();
+
+    // ---- Column pass ----------------------------------------------
+    // lane k handles column c=k. Reads tmp[r][k] for r=0..7.
+    if (!oob) {
+        uint tbase = block_local * 64u;
+        int s0 = tmp_shared[tbase + 0u * 8u + k];
+        int s1 = tmp_shared[tbase + 1u * 8u + k];
+        int s2 = tmp_shared[tbase + 2u * 8u + k];
+        int s3 = tmp_shared[tbase + 3u * 8u + k];
+        int s4 = tmp_shared[tbase + 4u * 8u + k];
+        int s5 = tmp_shared[tbase + 5u * 8u + k];
+        int s6 = tmp_shared[tbase + 6u * 8u + k];
+        int s7 = tmp_shared[tbase + 7u * 8u + k];
+
+        int g0, g1, g2, g3, g4, g5, g6, g7;
+        idct8_1d(s0, s1, s2, s3, s4, s5, s6, s7,
+                 g0, g1, g2, g3, g4, g5, g6, g7);
+
+        // Column k at rows 0..7 of dst, offset by meta.x.
+        uint dst_off = u_meta.meta[block_idx].x;
+        uint stride = pc.dst_stride_u8;
+        uint a0 = dst_off + 0u * stride + k;
+        uint a1 = dst_off + 1u * stride + k;
+        uint a2 = dst_off + 2u * stride + k;
+        uint a3 = dst_off + 3u * stride + k;
+        uint a4 = dst_off + 4u * stride + k;
+        uint a5 = dst_off + 5u * stride + k;
+        uint a6 = dst_off + 6u * stride + k;
+        uint a7 = dst_off + 7u * stride + k;
+
+        int p0 = int(u_dst.dst[a0]);
+        int p1 = int(u_dst.dst[a1]);
+        int p2 = int(u_dst.dst[a2]);
+        int p3 = int(u_dst.dst[a3]);
+        int p4 = int(u_dst.dst[a4]);
+        int p5 = int(u_dst.dst[a5]);
+        int p6 = int(u_dst.dst[a6]);
+        int p7 = int(u_dst.dst[a7]);
+
+        u_dst.dst[a0] = uint8_t(clamp(p0 + ((g0 + 32) >> 6), 0, 255));
+        u_dst.dst[a1] = uint8_t(clamp(p1 + ((g1 + 32) >> 6), 0, 255));
+        u_dst.dst[a2] = uint8_t(clamp(p2 + ((g2 + 32) >> 6), 0, 255));
+        u_dst.dst[a3] = uint8_t(clamp(p3 + ((g3 + 32) >> 6), 0, 255));
+        u_dst.dst[a4] = uint8_t(clamp(p4 + ((g4 + 32) >> 6), 0, 255));
+        u_dst.dst[a5] = uint8_t(clamp(p5 + ((g5 + 32) >> 6), 0, 255));
+        u_dst.dst[a6] = uint8_t(clamp(p6 + ((g6 + 32) >> 6), 0, 255));
+        u_dst.dst[a7] = uint8_t(clamp(p7 + ((g7 + 32) >> 6), 0, 255));
+    }
+}