QPU is default substrate: recipe table + ctx env-var override #7

Merged
marfrit merged 3 commits from noether/qpu-default-recipe-cycles-5-8 into main 2026-05-23 18:59:35 +00:00
4 changed files with 570 additions and 11 deletions
+25 -1
View File
@@ -284,7 +284,29 @@ if (DAEDALUS_BUILD_VULKAN)
VERBATIM
)
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV})
set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
add_custom_command(
OUTPUT ${H264_IDCT4_SPV}
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
-o ${H264_IDCT4_SPV}
${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct4.comp
COMMENT "glslang: v3d_h264_idct4.comp -> v3d_h264_idct4.spv"
VERBATIM
)
set(H264_IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct8.spv)
add_custom_command(
OUTPUT ${H264_IDCT8_SPV}
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
-o ${H264_IDCT8_SPV}
${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_idct8.comp
COMMENT "glslang: v3d_h264_idct8.comp -> v3d_h264_idct8.spv"
VERBATIM
)
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV})
# v3d_runner — reusable Vulkan plumbing.
add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -412,6 +434,8 @@ if (DAEDALUS_BUILD_VULKAN)
${LPF8_SPV}
${CDEF_SPV}
${H264DEBLOCK_SPV}
${H264_IDCT4_SPV}
${H264_IDCT8_SPV}
DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
)
endif()
+241 -10
View File
@@ -40,6 +40,10 @@ struct daedalus_ctx {
v3d_pipeline cdef_pipe;
int h264deblock_pipe_ready;
v3d_pipeline h264deblock_pipe;
int h264_idct4_pipe_ready;
v3d_pipeline h264_idct4_pipe;
int h264_idct8_pipe_ready;
v3d_pipeline h264_idct8_pipe;
};
daedalus_ctx *daedalus_ctx_create(void)
@@ -53,6 +57,25 @@ daedalus_ctx *daedalus_ctx_create(void)
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
{
/*
* Per the "QPU is default substrate" decree 2026-05-23:
* setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
* function to a full daedalus_ctx_create(), letting the libavcodec
* substitution shims (which call create_no_qpu via pthread_once)
* fire the V3D shaders that exist for cycles 1/2/4/5/8. Without
* this hook each consumer process (firefox, mpv, daemon) would
* need its own shim build to opt into QPU.
*
* Default behaviour (env var unset / not "1") is unchanged: pure
* NEON ctx, no implicit Vulkan init. Firefox / mpv consumers
* that dlopen libavcodec without opting in stay on the
* Vulkan-free path; the daemon explicitly sets
* DAEDALUS_FORCE_QPU=1 before loading libavcodec.
*/
const char *force = getenv("DAEDALUS_FORCE_QPU");
if (force && force[0] == '1' && force[1] == 0)
return daedalus_ctx_create();
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
if (!ctx) return NULL;
ctx->has_qpu = 0;
@@ -75,6 +98,8 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
if (ctx->mc8h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
if (ctx->cdef_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
if (ctx->h264_idct4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
if (ctx->h264_idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
v3d_runner_destroy(ctx->runner);
}
free(ctx);
@@ -84,16 +109,25 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
/*
* Recipe table per the "QPU is default substrate" decree
* 2026-05-23. Any kernel that has a V3D compute shader returns
* SUBSTRATE_QPU; CPU is the fallback for kernels without a
* shader (still the case for H.264 IDCT 4x4 / IDCT 8x8 / qpel
* mc20 — covered by follow-on task 165). The dispatch
* wrappers already fall back to CPU automatically when the
* ctx doesn't have QPU available (daedalus_ctx_has_qpu == 0).
*/
switch (k) {
case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU;
case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU;
case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_QPU; /* v3d_mc_8h.spv */
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_cdef.spv */
case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct4.spv */
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_CPU; /* TODO task #165 */
}
return DAEDALUS_SUBSTRATE_CPU;
}
@@ -715,6 +749,187 @@ fail:
return -1;
}
/* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
typedef struct {
uint32_t n_blocks;
uint32_t dst_stride_u8;
uint32_t _pad0;
uint32_t _pad1;
} h264_idct4_pc;
static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
int16_t *coeffs, size_t n_blocks,
const daedalus_h264_block_meta *meta)
{
if (!ctx->h264_idct4_pipe_ready) {
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
3, sizeof(h264_idct4_pc),
&ctx->h264_idct4_pipe) != 0)
return -1;
ctx->h264_idct4_pipe_ready = 1;
}
size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t); /* uvec4 per block */
size_t dst_max = 0;
for (size_t i = 0; i < n_blocks; i++) {
size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
if (e > dst_max) dst_max = e;
}
v3d_buffer bc = {0}, bd = {0}, bm = {0};
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
}
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
}
memcpy(bc.mapped, coeffs, coeff_bytes);
memcpy(bd.mapped, dst, dst_max);
uint32_t *m = bm.mapped;
for (size_t i = 0; i < n_blocks; i++) {
m[4*i+0] = meta[i].dst_off;
m[4*i+1] = 0;
m[4*i+2] = 0;
m[4*i+3] = 0;
}
v3d_buffer binds[3] = { bc, bd, bm };
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
goto fail;
uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16); /* 16 blocks/WG */
h264_idct4_pc pc = {
.n_blocks = (uint32_t) n_blocks,
.dst_stride_u8 = (uint32_t) dst_stride,
};
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
if (cb == VK_NULL_HANDLE) goto fail;
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_idct4_pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_idct4_pipe.layout, 0, 1,
&ctx->h264_idct4_pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
vkCmdDispatch(cb, wg_count, 1, 1);
vkEndCommandBuffer(cb);
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
memcpy(dst, bd.mapped, dst_max);
/* H.264/FFmpeg convention: zero the coeffs block after the
* transform (matches the C ref + NEON .S behaviour). */
memset(coeffs, 0, coeff_bytes);
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bc);
return 0;
fail:
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bc);
return -1;
}
/* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
typedef struct {
uint32_t n_blocks;
uint32_t dst_stride_u8;
uint32_t _pad0;
uint32_t _pad1;
} h264_idct8_pc;
static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
int16_t *coeffs, size_t n_blocks,
const daedalus_h264_block_meta *meta)
{
if (!ctx->h264_idct8_pipe_ready) {
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
3, sizeof(h264_idct8_pc),
&ctx->h264_idct8_pipe) != 0)
return -1;
ctx->h264_idct8_pipe_ready = 1;
}
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
size_t dst_max = 0;
for (size_t i = 0; i < n_blocks; i++) {
size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
if (e > dst_max) dst_max = e;
}
v3d_buffer bc = {0}, bd = {0}, bm = {0};
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
}
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
}
memcpy(bc.mapped, coeffs, coeff_bytes);
memcpy(bd.mapped, dst, dst_max);
uint32_t *m = bm.mapped;
for (size_t i = 0; i < n_blocks; i++) {
m[4*i+0] = meta[i].dst_off;
m[4*i+1] = 0;
m[4*i+2] = 0;
m[4*i+3] = 0;
}
v3d_buffer binds[3] = { bc, bd, bm };
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
goto fail;
uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8); /* 8 blocks/WG */
h264_idct8_pc pc = {
.n_blocks = (uint32_t) n_blocks,
.dst_stride_u8 = (uint32_t) dst_stride,
};
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
if (cb == VK_NULL_HANDLE) goto fail;
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_idct8_pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_idct8_pipe.layout, 0, 1,
&ctx->h264_idct8_pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
vkCmdDispatch(cb, wg_count, 1, 1);
vkEndCommandBuffer(cb);
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
memcpy(dst, bd.mapped, dst_max);
memset(coeffs, 0, coeff_bytes);
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bc);
return 0;
fail:
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bc);
return -1;
}
/* -------------------- Public dispatch entry points -------------- */
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
@@ -803,8 +1018,16 @@ int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
int16_t *coeffs, size_t n_blocks,
const daedalus_h264_block_meta *meta)
{
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
dst, dst_stride, coeffs, n_blocks, meta);
daedalus_substrate eff = sub;
if (eff == DAEDALUS_SUBSTRATE_AUTO)
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4);
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
eff = DAEDALUS_SUBSTRATE_CPU;
if (eff == DAEDALUS_SUBSTRATE_CPU)
return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
coeffs, n_blocks, meta);
return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
coeffs, n_blocks, meta);
}
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -812,8 +1035,16 @@ int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
int16_t *coeffs, size_t n_blocks,
const daedalus_h264_block_meta *meta)
{
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
dst, dst_stride, coeffs, n_blocks, meta);
daedalus_substrate eff = sub;
if (eff == DAEDALUS_SUBSTRATE_AUTO)
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
eff = DAEDALUS_SUBSTRATE_CPU;
if (eff == DAEDALUS_SUBSTRATE_CPU)
return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
coeffs, n_blocks, meta);
return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
coeffs, n_blocks, meta);
}
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
+129
View File
@@ -0,0 +1,129 @@
// daedalus-fourier — H.264 4x4 inverse integer transform + add, V3D 7.1.
//
// H.264 spec §8.5.12.1. Pure integer arithmetic — no trig constants
// (unlike VP9 IDCT 8x8). Row pass first, column pass second; round
// (+32) >> 6, add to dst, clip to u8.
//
// Block memory layout: COLUMN-MAJOR. block[c*4 + r] = coefficient at
// (row r, column c). Matches FFmpeg `ff_h264_idct_add_neon`.
//
// Workgroup layout: 64 invocations = 4 lanes/block × 16 blocks/WG.
// - row pass: lane k (0..3) reads row k of the block (4 coefficients,
// one from each column), runs the butterfly, writes 4
// outputs to one row of tmp_shared.
// - column pass: lane k reads column k of tmp_shared (4 rows),
// runs the butterfly, writes 4 outputs to dst as
// column k at rows 0..3.
//
// shared = 16 × 16 × 4 B = 1 KiB. Well under V3D's 16 KiB limit.
//
// License: BSD-2-Clause.
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(binding = 0) readonly buffer Coeffs {
int16_t coeffs[]; // N × 16 column-major
} u_coeffs;
layout(binding = 1) buffer Dst {
uint8_t dst[]; // H × stride bytes (caller-provided base)
} u_dst;
layout(binding = 2) readonly buffer Meta {
uvec4 meta[]; // .x = dst_off (byte offset into u_dst.dst)
} u_meta;
layout(push_constant) uniform PC {
uint n_blocks;
uint dst_stride_u8;
uint _pad0, _pad1;
} pc;
// 16 blocks per WG × 16 ints per block = 256 ints = 1 KiB shared.
shared int tmp_shared[16 * 16];
// 1D butterfly per H.264 §8.5.12.1. d[0..3] in, o[0..3] out.
void idct4_1d(int d0, int d1, int d2, int d3,
out int o0, out int o1, out int o2, out int o3)
{
int e = d0 + d2;
int f = d0 - d2;
int g = (d1 >> 1) - d3;
int h = d1 + (d3 >> 1);
o0 = e + h;
o1 = f + g;
o2 = f - g;
o3 = e - h;
}
void main()
{
// Lane decomposition: local_size 64 = 16 blocks × 4 lanes/block.
uint gid = gl_GlobalInvocationID.x;
uint wg_id = gid / 64u;
uint lane_in_wg = gid & 63u;
uint block_local = lane_in_wg >> 2; // 0..15
uint k = lane_in_wg & 3u; // 0..3
uint block_idx = wg_id * 16u + block_local;
bool oob = (block_idx >= pc.n_blocks);
// ---- Row pass --------------------------------------------------
// lane k handles row r=k. Reads block[c*4 + k] for c=0..3 (one
// element from each column at fixed row).
if (!oob) {
uint base = block_idx * 16u;
int d0 = int(u_coeffs.coeffs[base + 0u * 4u + k]);
int d1 = int(u_coeffs.coeffs[base + 1u * 4u + k]);
int d2 = int(u_coeffs.coeffs[base + 2u * 4u + k]);
int d3 = int(u_coeffs.coeffs[base + 3u * 4u + k]);
int o0, o1, o2, o3;
idct4_1d(d0, d1, d2, d3, o0, o1, o2, o3);
// Write row k of tmp_shared[block_local].
uint tbase = block_local * 16u + k * 4u;
tmp_shared[tbase + 0u] = o0;
tmp_shared[tbase + 1u] = o1;
tmp_shared[tbase + 2u] = o2;
tmp_shared[tbase + 3u] = o3;
}
barrier();
// ---- Column pass ----------------------------------------------
// lane k handles column c=k. Reads tmp[r][k] for r=0..3.
if (!oob) {
uint tbase = block_local * 16u;
int s0 = tmp_shared[tbase + 0u * 4u + k];
int s1 = tmp_shared[tbase + 1u * 4u + k];
int s2 = tmp_shared[tbase + 2u * 4u + k];
int s3 = tmp_shared[tbase + 3u * 4u + k];
int o0, o1, o2, o3;
idct4_1d(s0, s1, s2, s3, o0, o1, o2, o3);
// Column k at rows 0..3 of dst, offset by meta.x (dst_off).
uint dst_off = u_meta.meta[block_idx].x;
uint stride = pc.dst_stride_u8;
uint a0 = dst_off + 0u * stride + k;
uint a1 = dst_off + 1u * stride + k;
uint a2 = dst_off + 2u * stride + k;
uint a3 = dst_off + 3u * stride + k;
int p0 = int(u_dst.dst[a0]);
int p1 = int(u_dst.dst[a1]);
int p2 = int(u_dst.dst[a2]);
int p3 = int(u_dst.dst[a3]);
u_dst.dst[a0] = uint8_t(clamp(p0 + ((o0 + 32) >> 6), 0, 255));
u_dst.dst[a1] = uint8_t(clamp(p1 + ((o1 + 32) >> 6), 0, 255));
u_dst.dst[a2] = uint8_t(clamp(p2 + ((o2 + 32) >> 6), 0, 255));
u_dst.dst[a3] = uint8_t(clamp(p3 + ((o3 + 32) >> 6), 0, 255));
}
}
+175
View File
@@ -0,0 +1,175 @@
// daedalus-fourier — H.264 8x8 inverse integer transform + add, V3D 7.1.
//
// H.264 spec §8.5.13.2 (High profile 8x8 IT). Pure integer arithmetic
// — different butterfly from VP9 IDCT 8x8 (cycle 1, uses cospi
// multipliers). Row pass first, column pass second; round (+32) >> 6,
// add to dst, clip to u8.
//
// Block layout: COLUMN-MAJOR. block[c*8 + r] = coefficient at
// (row r, column c). Matches FFmpeg `ff_h264_idct8_add_neon`.
//
// Workgroup layout: 64 invocations = 8 lanes/block × 8 blocks/WG.
// - row pass: lane k (0..7) reads row k of the block (8 coefficients,
// one from each column), runs the butterfly, writes 8
// outputs to one row of tmp_shared.
// - column pass: lane k reads column k of tmp_shared (8 rows),
// runs the butterfly, writes 8 outputs to dst as
// column k at rows 0..7.
//
// shared = 8 × 64 × 4 B = 2 KiB. Well under V3D's 16 KiB limit.
//
// License: BSD-2-Clause.
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(binding = 0) readonly buffer Coeffs {
int16_t coeffs[]; // N × 64 column-major
} u_coeffs;
layout(binding = 1) buffer Dst {
uint8_t dst[]; // H × stride bytes
} u_dst;
layout(binding = 2) readonly buffer Meta {
uvec4 meta[]; // .x = dst_off
} u_meta;
layout(push_constant) uniform PC {
uint n_blocks;
uint dst_stride_u8;
uint _pad0, _pad1;
} pc;
// 8 blocks/WG × 64 ints/block × 4 B = 2 KiB shared.
shared int tmp_shared[8 * 64];
// 1D 8-element butterfly per H.264 §8.5.13.2.
void idct8_1d(int d0, int d1, int d2, int d3,
int d4, int d5, int d6, int d7,
out int g0, out int g1, out int g2, out int g3,
out int g4, out int g5, out int g6, out int g7)
{
int e0 = d0 + d4;
int e1 = -d3 + d5 - d7 - (d7 >> 1);
int e2 = d0 - d4;
int e3 = d1 + d7 - d3 - (d3 >> 1);
int e4 = (d2 >> 1) - d6;
int e5 = -d1 + d7 + d5 + (d5 >> 1);
int e6 = d2 + (d6 >> 1);
int e7 = d3 + d5 + d1 + (d1 >> 1);
int f0 = e0 + e6;
int f1 = e1 + (e7 >> 2);
int f2 = e2 + e4;
int f3 = e3 + (e5 >> 2);
int f4 = e2 - e4;
int f5 = (e3 >> 2) - e5;
int f6 = e0 - e6;
int f7 = e7 - (e1 >> 2);
g0 = f0 + f7;
g1 = f2 + f5;
g2 = f4 + f3;
g3 = f6 + f1;
g4 = f6 - f1;
g5 = f4 - f3;
g6 = f2 - f5;
g7 = f0 - f7;
}
void main()
{
// local_size 64 = 8 blocks × 8 lanes/block.
uint gid = gl_GlobalInvocationID.x;
uint wg_id = gid / 64u;
uint lane_in_wg = gid & 63u;
uint block_local = lane_in_wg >> 3; // 0..7
uint k = lane_in_wg & 7u; // 0..7
uint block_idx = wg_id * 8u + block_local;
bool oob = (block_idx >= pc.n_blocks);
// ---- Row pass --------------------------------------------------
// lane k handles row r=k. Reads block[c*8 + k] for c=0..7.
if (!oob) {
uint base = block_idx * 64u;
int d0 = int(u_coeffs.coeffs[base + 0u * 8u + k]);
int d1 = int(u_coeffs.coeffs[base + 1u * 8u + k]);
int d2 = int(u_coeffs.coeffs[base + 2u * 8u + k]);
int d3 = int(u_coeffs.coeffs[base + 3u * 8u + k]);
int d4 = int(u_coeffs.coeffs[base + 4u * 8u + k]);
int d5 = int(u_coeffs.coeffs[base + 5u * 8u + k]);
int d6 = int(u_coeffs.coeffs[base + 6u * 8u + k]);
int d7 = int(u_coeffs.coeffs[base + 7u * 8u + k]);
int g0, g1, g2, g3, g4, g5, g6, g7;
idct8_1d(d0, d1, d2, d3, d4, d5, d6, d7,
g0, g1, g2, g3, g4, g5, g6, g7);
// Write row k of tmp_shared[block_local].
uint tbase = block_local * 64u + k * 8u;
tmp_shared[tbase + 0u] = g0;
tmp_shared[tbase + 1u] = g1;
tmp_shared[tbase + 2u] = g2;
tmp_shared[tbase + 3u] = g3;
tmp_shared[tbase + 4u] = g4;
tmp_shared[tbase + 5u] = g5;
tmp_shared[tbase + 6u] = g6;
tmp_shared[tbase + 7u] = g7;
}
barrier();
// ---- Column pass ----------------------------------------------
// lane k handles column c=k. Reads tmp[r][k] for r=0..7.
if (!oob) {
uint tbase = block_local * 64u;
int s0 = tmp_shared[tbase + 0u * 8u + k];
int s1 = tmp_shared[tbase + 1u * 8u + k];
int s2 = tmp_shared[tbase + 2u * 8u + k];
int s3 = tmp_shared[tbase + 3u * 8u + k];
int s4 = tmp_shared[tbase + 4u * 8u + k];
int s5 = tmp_shared[tbase + 5u * 8u + k];
int s6 = tmp_shared[tbase + 6u * 8u + k];
int s7 = tmp_shared[tbase + 7u * 8u + k];
int g0, g1, g2, g3, g4, g5, g6, g7;
idct8_1d(s0, s1, s2, s3, s4, s5, s6, s7,
g0, g1, g2, g3, g4, g5, g6, g7);
// Column k at rows 0..7 of dst, offset by meta.x.
uint dst_off = u_meta.meta[block_idx].x;
uint stride = pc.dst_stride_u8;
uint a0 = dst_off + 0u * stride + k;
uint a1 = dst_off + 1u * stride + k;
uint a2 = dst_off + 2u * stride + k;
uint a3 = dst_off + 3u * stride + k;
uint a4 = dst_off + 4u * stride + k;
uint a5 = dst_off + 5u * stride + k;
uint a6 = dst_off + 6u * stride + k;
uint a7 = dst_off + 7u * stride + k;
int p0 = int(u_dst.dst[a0]);
int p1 = int(u_dst.dst[a1]);
int p2 = int(u_dst.dst[a2]);
int p3 = int(u_dst.dst[a3]);
int p4 = int(u_dst.dst[a4]);
int p5 = int(u_dst.dst[a5]);
int p6 = int(u_dst.dst[a6]);
int p7 = int(u_dst.dst[a7]);
u_dst.dst[a0] = uint8_t(clamp(p0 + ((g0 + 32) >> 6), 0, 255));
u_dst.dst[a1] = uint8_t(clamp(p1 + ((g1 + 32) >> 6), 0, 255));
u_dst.dst[a2] = uint8_t(clamp(p2 + ((g2 + 32) >> 6), 0, 255));
u_dst.dst[a3] = uint8_t(clamp(p3 + ((g3 + 32) >> 6), 0, 255));
u_dst.dst[a4] = uint8_t(clamp(p4 + ((g4 + 32) >> 6), 0, 255));
u_dst.dst[a5] = uint8_t(clamp(p5 + ((g5 + 32) >> 6), 0, 255));
u_dst.dst[a6] = uint8_t(clamp(p6 + ((g6 + 32) >> 6), 0, 255));
u_dst.dst[a7] = uint8_t(clamp(p7 + ((g7 + 32) >> 6), 0, 255));
}
}