diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73126c8..4a3481a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -295,6 +295,28 @@ if (DAEDALUS_BUILD_VULKAN)
         VERBATIM
     )
 
+    set(H264DEBLOCK_CHROMA_V_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_chroma_v.spv)
+    add_custom_command(
+        OUTPUT  ${H264DEBLOCK_CHROMA_V_SPV}
+        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
+                -o ${H264DEBLOCK_CHROMA_V_SPV}
+                ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_v.comp
+        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_v.comp
+        COMMENT "glslang: v3d_h264deblock_chroma_v.comp -> .spv"
+        VERBATIM
+    )
+
+    set(H264DEBLOCK_CHROMA_H_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_chroma_h.spv)
+    add_custom_command(
+        OUTPUT  ${H264DEBLOCK_CHROMA_H_SPV}
+        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
+                -o ${H264DEBLOCK_CHROMA_H_SPV}
+                ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_h.comp
+        DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_h.comp
+        COMMENT "glslang: v3d_h264deblock_chroma_h.comp -> .spv"
+        VERBATIM
+    )
+
     set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
     add_custom_command(
         OUTPUT  ${H264_IDCT4_SPV}
@@ -328,7 +350,7 @@ if (DAEDALUS_BUILD_VULKAN)
         VERBATIM
     )
 
-    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV})
+    add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV})
 
     # v3d_runner — reusable Vulkan plumbing.
     add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -462,6 +484,8 @@ if (DAEDALUS_BUILD_VULKAN)
             ${CDEF_SPV}
             ${H264DEBLOCK_SPV}
             ${H264DEBLOCK_H_SPV}
+            ${H264DEBLOCK_CHROMA_V_SPV}
+            ${H264DEBLOCK_CHROMA_H_SPV}
             ${H264_IDCT4_SPV}
             ${H264_IDCT8_SPV}
             ${H264_QPEL_MC20_SPV}
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 058f2b0..566c0be 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -42,6 +42,10 @@ struct daedalus_ctx {
     v3d_pipeline h264deblock_pipe;
     int          h264deblock_h_pipe_ready;
     v3d_pipeline h264deblock_h_pipe;
+    int          h264deblock_chroma_v_pipe_ready;
+    v3d_pipeline h264deblock_chroma_v_pipe;
+    int          h264deblock_chroma_h_pipe_ready;
+    v3d_pipeline h264deblock_chroma_h_pipe;
     int          h264_idct4_pipe_ready;
     v3d_pipeline h264_idct4_pipe;
     int          h264_idct8_pipe_ready;
@@ -103,6 +107,8 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
     if (ctx->cdef_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
     if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
     if (ctx->h264deblock_h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_h_pipe);
+    if (ctx->h264deblock_chroma_v_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_chroma_v_pipe);
+    if (ctx->h264deblock_chroma_h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_chroma_h_pipe);
     if (ctx->h264_idct4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
     if (ctx->h264_idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
     if (ctx->h264_qpel_mc20_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc20_pipe);
@@ -134,8 +140,8 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
     case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */
     case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */
     case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock_h.spv */
-    case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
-    case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock_chroma_v.spv */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock_chroma_h.spv */
     case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 luma QPU pending */
     case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
     case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
@@ -1084,6 +1090,102 @@ fail:
     return -1;
 }
 
+/* -------------------- H.264 chroma deblock QPU dispatch -------- */
+
+/* Generic chroma QPU dispatch (shared between V and H variants).
+ * Both shaders use 8 cells per edge; max-addressed-byte differs:
+ *   V: dst_off + 1*stride + 7    (-2..+1 rows, cols 0..7 of edge)
+ *   H: dst_off + 7*stride + 1    (-2..+1 cols, rows 0..7 of edge)
+ * The staged extent (max addressed byte + 1 over all edges) is
+ * computed here from meta[] and orient_h (0 = V, non-zero = H).
+ *
+ * Creates the pipeline on first use, stages dst[0 .. extent) in a
+ * GPU buffer, runs the shader and copies the result back to dst.
+ * Returns 0 on success (also when n_edges == 0), -1 on any failure.
+ */
+static int dispatch_h264_deblock_chroma_qpu(daedalus_ctx *ctx,
+        v3d_pipeline *pipe, int *pipe_ready, const char *spv_name,
+        uint8_t *dst, size_t dst_stride, size_t n_edges,
+        const daedalus_h264_deblock_meta *meta, int orient_h)
+{
+    /* Nothing to filter: avoid zero-sized buffers and an empty dispatch. */
+    if (n_edges == 0)
+        return 0;
+    if (!*pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, spv_name,
+                                       2, sizeof(h264deblock_pc), pipe) != 0)
+            return -1;
+        *pipe_ready = 1;
+    }
+    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
+    size_t dst_max = 0;
+    /* Extent = one past the highest byte any edge can touch. */
+    for (size_t i = 0; i < n_edges; i++) {
+        size_t e = orient_h ? meta[i].dst_off + 7 * dst_stride + 2
+                            : meta[i].dst_off + 1 * dst_stride + 8;
+        if (e > dst_max) dst_max = e;
+    }
+    v3d_buffer bm = {0}, bd = {0};
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    /* One uvec4 per edge: (dst_off, alpha | beta<<8, packed tc0 bytes, pad). */
+    for (size_t i = 0; i < n_edges; i++) {
+        m[4*i+0] = meta[i].dst_off;
+        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
+        m[4*i+2] =  ((uint32_t)(uint8_t) meta[i].tc0[0])
+                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
+                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
+                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
+        m[4*i+3] = 0;
+    }
+    v3d_buffer binds[2] = { bm, bd };
+    if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 2)) goto fail;
+    /* 16 edges per workgroup (256 lanes, 16 lanes per edge). */
+    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
+    h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
+                          .dst_stride_u8 = (uint32_t) dst_stride };
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, pipe)) goto fail;
+    VkCommandBuffer cb = pipe->cb;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto fail;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
+    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto fail;
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
+    memcpy(dst, bd.mapped, dst_max);
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
+    return 0;
+fail:
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
+    return -1;
+}
+
+static int dispatch_h264_deblock_chroma_v_qpu(daedalus_ctx *ctx,
+        uint8_t *dst, size_t dst_stride,
+        size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    return dispatch_h264_deblock_chroma_qpu(ctx,
+        &ctx->h264deblock_chroma_v_pipe, &ctx->h264deblock_chroma_v_pipe_ready,
+        "v3d_h264deblock_chroma_v.spv", dst, dst_stride, n_edges, meta, 0);
+}
+
+static int dispatch_h264_deblock_chroma_h_qpu(daedalus_ctx *ctx,
+        uint8_t *dst, size_t dst_stride,
+        size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    return dispatch_h264_deblock_chroma_qpu(ctx,
+        &ctx->h264deblock_chroma_h_pipe, &ctx->h264deblock_chroma_h_pipe_ready,
+        "v3d_h264deblock_chroma_h.spv", dst, dst_stride, n_edges, meta, 1);
+}
+
 /* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
 
 typedef struct {
@@ -1520,9 +1622,9 @@ int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrat
         eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV);
     if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
         eff = DAEDALUS_SUBSTRATE_CPU;
-    if (eff == DAEDALUS_SUBSTRATE_QPU)
-        return -1;  /* No chroma QPU shader yet. */
-    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
+    return dispatch_h264_deblock_chroma_v_qpu(ctx, dst, dst_stride, n_edges, meta);
 }
 
 int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -1534,9 +1636,9 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat
         eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH);
     if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
         eff = DAEDALUS_SUBSTRATE_CPU;
-    if (eff == DAEDALUS_SUBSTRATE_QPU)
-        return -1;
-    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
+    return dispatch_h264_deblock_chroma_h_qpu(ctx, dst, dst_stride, n_edges, meta);
 }
 
 #define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \
diff --git a/src/v3d_h264deblock_chroma_h.comp b/src/v3d_h264deblock_chroma_h.comp
new file mode 100644
index 0000000..1b179b8
--- /dev/null
+++ b/src/v3d_h264deblock_chroma_h.comp
@@ -0,0 +1,69 @@
+// daedalus-fourier — H.264 chroma 4:2:0 H loop filter (horizontal
+// filter across a vertical edge), non-intra bS<4 variant.
+//
+// Sibling of v3d_h264deblock_chroma_v.comp; same kernel transposed
+// to read pix[-2..+1] (cols) instead of pix[-2*stride..+1*stride]
+// (rows).  Same 8-cell × 4-segment geometry, same WG layout (lanes
+// 8..15 of each edge early-return — only 8 active per edge).
+//
+// 4:2:0-only: 4:2:2 chroma_h has a 16-row edge that this shader
+// doesn't address.  daedalus_dispatch_h264_deblock_chroma_h is
+// 4:2:0-only by design; caller (libavcodec init) gates accordingly.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+
+layout(push_constant) uniform PC {
+    uint n_edges;
+    uint dst_stride_u8;
+    uint _pad0;
+    uint _pad1;
+} pc;
+
+void main()
+{
+    uint lane_in_wg = gl_GlobalInvocationID.x & 255u;
+    uint edge_in_wg = lane_in_wg >> 4;          // 0..15
+    uint row_in_edge = lane_in_wg & 15u;        // 0..15 — only 0..7 active
+
+    uint edge_idx = gl_WorkGroupID.x * 16u + edge_in_wg;
+    if (edge_idx >= pc.n_edges) return;
+    if (row_in_edge >= 8u) return;
+
+    uvec4 m = u_meta.meta[edge_idx];
+    uint stride = pc.dst_stride_u8;
+    uint dst_off = m.x + row_in_edge * stride;
+    int alpha = int(m.y & 0xffu);
+    int beta = int((m.y >> 8) & 0xffu);
+
+    uint seg = row_in_edge >> 1;
+    uint tc0_byte = (m.z >> (seg * 8u)) & 0xffu;
+    int tc0_s = int(tc0_byte);
+    if (tc0_s >= 128) tc0_s -= 256;
+
+    if (alpha == 0 || beta == 0) return;
+    if (tc0_s < 0) return;
+
+    int p1 = int(u_dst.dst[dst_off - 2u]);
+    int p0 = int(u_dst.dst[dst_off - 1u]);
+    int q0 = int(u_dst.dst[dst_off     ]);
+    int q1 = int(u_dst.dst[dst_off + 1u]);
+
+    if (abs(p0 - q0) >= alpha) return;
+    if (abs(p1 - p0) >= beta) return;
+    if (abs(q1 - q0) >= beta) return;
+
+    int tc = tc0_s + 1;
+    int delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
+
+    u_dst.dst[dst_off - 1u] = uint8_t(clamp(p0 + delta, 0, 255));
+    u_dst.dst[dst_off     ] = uint8_t(clamp(q0 - delta, 0, 255));
+}
diff --git a/src/v3d_h264deblock_chroma_v.comp b/src/v3d_h264deblock_chroma_v.comp
new file mode 100644
index 0000000..86fac61
--- /dev/null
+++ b/src/v3d_h264deblock_chroma_v.comp
@@ -0,0 +1,76 @@
+// daedalus-fourier — H.264 chroma 4:2:0 V loop filter (vertical
+// filter across a horizontal edge), non-intra bS<4 variant.
+//
+// Per H.264 §8.7.2.4: chroma kernel is simpler than luma's bS<4 —
+// only p0 / q0 are updated (chroma never modifies p1, p2, q1, q2),
+// tC = tc0_seg + 1 (no luma-style ap/aq side bonus), and the edge
+// spans 8 cells (4 segments × 2 cells/seg).
+//
+// V3D 7.1 via Mesa v3dv compute.  WG geometry kept identical to the
+// luma shader (16 edges × 16 lanes/WG) for uniform dispatch math
+// across the deblock family; lanes 8..15 of each edge early-return.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Meta {
+    uvec4 meta[];   // per edge: (dst_off, alpha|beta<<8, packed_tc0, _pad)
+} u_meta;
+
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];
+} u_dst;
+
+layout(push_constant) uniform PC {
+    uint n_edges;
+    uint dst_stride_u8;
+    uint _pad0;
+    uint _pad1;
+} pc;
+
+void main()
+{
+    uint lane_in_wg = gl_GlobalInvocationID.x & 255u;
+    uint edge_in_wg = lane_in_wg >> 4;          // 0..15
+    uint col_in_edge = lane_in_wg & 15u;        // 0..15 — only 0..7 active
+
+    uint edge_idx = gl_WorkGroupID.x * 16u + edge_in_wg;
+    if (edge_idx >= pc.n_edges) return;
+    if (col_in_edge >= 8u) return;              // 8 cells per chroma edge
+
+    uvec4 m = u_meta.meta[edge_idx];
+    uint dst_off = m.x + col_in_edge;
+    uint stride = pc.dst_stride_u8;
+    int alpha = int(m.y & 0xffu);
+    int beta = int((m.y >> 8) & 0xffu);
+
+    // 8 cells / 4 segments = 2 cells per segment.
+    uint seg = col_in_edge >> 1;
+    uint tc0_byte = (m.z >> (seg * 8u)) & 0xffu;
+    int tc0_s = int(tc0_byte);
+    if (tc0_s >= 128) tc0_s -= 256;
+
+    if (alpha == 0 || beta == 0) return;
+    if (tc0_s < 0) return;
+
+    int p1 = int(u_dst.dst[dst_off - 2u * stride]);
+    int p0 = int(u_dst.dst[dst_off - 1u * stride]);
+    int q0 = int(u_dst.dst[dst_off]);
+    int q1 = int(u_dst.dst[dst_off + 1u * stride]);
+
+    if (abs(p0 - q0) >= alpha) return;
+    if (abs(p1 - p0) >= beta) return;
+    if (abs(q1 - q0) >= beta) return;
+
+    int tc = tc0_s + 1;
+    int delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
+
+    u_dst.dst[dst_off - 1u * stride] = uint8_t(clamp(p0 + delta, 0, 255));
+    u_dst.dst[dst_off              ] = uint8_t(clamp(q0 - delta, 0, 255));
+    // p1, q1 untouched — chroma kernel only updates p0/q0.
+}