/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * daedalus-decoder — public C API implementation.
 *
 * Scaffold only.  Most functions return success with no GPU work
 * performed; the bodies will fill in across Phases 1-4 per DESIGN.md
 * §8.  This file exists so the API surface compiles, links, and can
 * be smoke-tested end-to-end (ctx create / append / flush / destroy)
 * before any shader work begins.
 *
 * NOTE(review): daedalus_decoder_flush_frame() below already dispatches
 * IDCT and deblock work through daedalus-fourier, so the "scaffold only"
 * wording above looks out of date — confirm and refresh.
 */

#include "internal.h"

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Built via -D from CMakeLists. */
#ifndef DAEDALUS_DECODER_VERSION
#define DAEDALUS_DECODER_VERSION "0.0.1+scaffold"
#endif

/* Returns the version string this translation unit was compiled with. */
const char *daedalus_decoder_version(void)
{
    return DAEDALUS_DECODER_VERSION;
}

/*
 * Allocate a decoder for frames of `width` x `height` pixels.
 *
 * Both dimensions must be positive multiples of 16, and the pixel count
 * must fit in 32 bits.  Returns NULL on invalid dimensions, when the
 * daedalus-fourier context cannot be created, or on allocation failure.
 * Release the result with daedalus_decoder_destroy().
 */
daedalus_decoder *daedalus_decoder_create(int width, int height)
{
    if (width <= 0 || height <= 0)
        return NULL;
    if ((width & 15) || (height & 15))
        return NULL;                    /* must be multiple of 16 */

    /* Block and edge offsets into the frame-sized scratch planes are
     * narrowed to uint32_t when the dispatch metadata is built (see
     * flush_frame and dispatch_deblock_pass).  Bounding the pixel count
     * keeps those offsets, and the mb_width * mb_height product below,
     * in range instead of silently wrapping. */
    if ((uint64_t) width * (uint64_t) height > UINT32_MAX)
        return NULL;

    daedalus_decoder *dec = calloc(1, sizeof(*dec));
    if (!dec)
        return NULL;

    dec->width      = width;
    dec->height     = height;
    dec->mb_width   = width >> 4;
    dec->mb_height  = height >> 4;
    dec->n_mbs      = dec->mb_width * dec->mb_height;
    dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
    dec->substrate  = DAEDALUS_DECODER_SUBSTRATE_AUTO;

    /* daedalus-fourier ctx — required.  Phase 1 needs the QPU; if
     * Vulkan init fails the decoder is unusable.  Caller can check
     * via daedalus_decoder_has_qpu(). */
    dec->dctx = daedalus_ctx_create();
    if (!dec->dctx) {
        free(dec);
        return NULL;
    }

    dec->mb_descs = calloc((size_t) dec->n_mbs, sizeof(*dec->mb_descs));
    dec->coeffs   = calloc((size_t) dec->n_mbs * 384, sizeof(int16_t));

    /* Predicted-samples buffers — zero-initialised so a frame where
     * every append_mb gets NULL `predicted` decodes residual-only
     * (the Stage 1 scaffold contract).  flush_frame zeroes these at
     * end-of-frame to maintain that invariant for the next frame. */
    const size_t pred_y_size  = (size_t) width * (size_t) height;
    const size_t pred_uv_size = pred_y_size / 2;
    dec->predicted_y  = calloc(1, pred_y_size);
    dec->predicted_uv = calloc(1, pred_uv_size);

    /* Edge buffer sized for the typical worst case (see daedalus_decoder.h).
     * 16 edges/MB × n_mbs.  ~130k entries for 1080p; ~2 MB at sizeof(edge). */
    dec->edges_capacity = (size_t) dec->n_mbs * 16;
    dec->edges_count    = 0;
    dec->edges          = malloc(dec->edges_capacity * sizeof(*dec->edges));

    /* All allocations are checked together; destroy() copes with any
     * subset of them being NULL. */
    if (!dec->mb_descs || !dec->coeffs ||
        !dec->predicted_y || !dec->predicted_uv || !dec->edges) {
        daedalus_decoder_destroy(dec);
        return NULL;
    }
    return dec;
}

/* Release a decoder and everything it owns.  NULL is accepted. */
void daedalus_decoder_destroy(daedalus_decoder *dec)
{
    if (!dec)
        return;
    free(dec->edges);
    free(dec->predicted_uv);
    free(dec->predicted_y);
    free(dec->coeffs);
    free(dec->mb_descs);
    if (dec->dctx)
        daedalus_ctx_destroy(dec->dctx);
    free(dec);
}

/*
 * Select the output format.  Only allowed between frames (no macroblocks
 * appended since the last flush).  Returns 0 on success, -1 on a NULL
 * decoder, a mid-frame call, or an unknown format.
 */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                       daedalus_decoder_output_format fmt)
{
    if (!dec)
        return -1;
    if (dec->mbs_appended != 0)
        return -1;                      /* mid-frame change forbidden */
    if (fmt != DAEDALUS_DECODER_OUTPUT_NV12 &&
        fmt != DAEDALUS_DECODER_OUTPUT_RGBA)
        return -1;
    dec->output_fmt = fmt;
    return 0;
}

/*
 * Select the execution substrate.  Only allowed between frames.
 * Returns 0 on success, -1 on a NULL decoder, a mid-frame call, or an
 * unknown substrate value.
 */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
                                   daedalus_decoder_substrate sub)
{
    if (!dec)
        return -1;
    if (dec->mbs_appended != 0)
        return -1;
    if (sub != DAEDALUS_DECODER_SUBSTRATE_AUTO &&
        sub != DAEDALUS_DECODER_SUBSTRATE_CPU &&
        sub != DAEDALUS_DECODER_SUBSTRATE_QPU)
        return -1;
    dec->substrate = sub;
    return 0;
}

/* Map our public substrate enum onto daedalus-fourier's.  Same
 * ordering by intent — we duplicate the enum for ABI isolation.
 * Unknown values map to AUTO. */
static daedalus_substrate map_substrate(daedalus_decoder_substrate s)
{
    switch (s) {
    case DAEDALUS_DECODER_SUBSTRATE_CPU:
        return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_DECODER_SUBSTRATE_QPU:
        return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_DECODER_SUBSTRATE_AUTO:
    default:
        return DAEDALUS_SUBSTRATE_AUTO;
    }
}

/*
 * Append one macroblock to the frame being assembled.
 *
 * Macroblocks must arrive in raster order.  The descriptor fields and
 * the 384 coefficients are copied into frame-scoped storage; optional
 * predicted samples and deblock edges are copied as well.
 *
 * Returns 0 on success.  Returns -1 on NULL arguments, out-of-range or
 * out-of-order coordinates, or when the edge buffer would overflow.
 * On the edge-overflow path the descriptor/coeff/predicted data for this
 * MB has already been stored but mbs_appended is not advanced.
 */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
                               const struct daedalus_decoder_mb_input *mb)
{
    if (!dec || !mb || !mb->coeffs)
        return -1;
    if (mb->mb_x >= dec->mb_width || mb->mb_y >= dec->mb_height)
        return -1;

    /* Reject negative coordinates as well.  Without this, a negative
     * mb_x paired with a positive mb_y can alias a valid raster index,
     * pass the order check below, and then index the predicted planes
     * with a huge size_t offset.  (A no-op if the fields are unsigned.) */
    const int mb_x = (int) mb->mb_x;
    const int mb_y = (int) mb->mb_y;
    if (mb_x < 0 || mb_y < 0)
        return -1;

    /* Raster-order check — Phase 1's intra wavefront requires it.
     * Caller is libavcodec's slice loop which produces raster order
     * naturally, so this should never fire in practice. */
    const int expected = mb_y * dec->mb_width + mb_x;
    if (expected != dec->mbs_appended)
        return -1;

    struct daedalus_decoder_mb_desc *d = &dec->mb_descs[expected];
    d->mb_x     = mb->mb_x;
    d->mb_y     = mb->mb_y;
    d->mb_type  = mb->mb_type;
    d->mb_qp_y  = mb->mb_qp_y;
    d->mb_qp_uv = mb->mb_qp_uv;
    d->cbp      = mb->cbp;
    memcpy(d->intra_4x4_modes, mb->intra_4x4_modes, 16);
    d->intra_16x16_mode  = mb->intra_16x16_mode;
    d->intra_chroma_mode = mb->intra_chroma_mode;
    d->partition_mode    = mb->partition_mode;
    memcpy(d->ref_idx_l0, mb->ref_idx_l0, 4);
    memcpy(d->ref_idx_l1, mb->ref_idx_l1, 4);
    memcpy(d->mv_l0, mb->mv_l0, sizeof(d->mv_l0));
    memcpy(d->mv_l1, mb->mv_l1, sizeof(d->mv_l1));
    d->deblock_disable  = mb->deblock_disable;
    d->deblock_alpha_c0 = mb->deblock_alpha_c0;
    d->deblock_beta     = mb->deblock_beta;
    d->transform_8x8    = mb->transform_8x8;

    memcpy(&dec->coeffs[(size_t) expected * 384], mb->coeffs,
           384 * sizeof(int16_t));

    /* Splat predicted samples into frame-scoped planes at raster
     * (mb_y*16, mb_x*16) for luma, (mb_y*8, mb_x*8) for each chroma
     * component.  NULL → leave buffers as-is (zeroed at create + at
     * end of each flush_frame); that's the zero-predictor contract.
     * Source layout as read here: 256 luma bytes, then 64 Cb, then 64 Cr. */
    if (mb->predicted) {
        const size_t y_stride  = (size_t) dec->width;
        const size_t uv_stride = (size_t) dec->width / 2;
        const size_t uv_plane  = uv_stride * ((size_t) dec->height / 2);

        const uint8_t *p_y  = mb->predicted;
        const uint8_t *p_cb = mb->predicted + 256;
        const uint8_t *p_cr = mb->predicted + 256 + 64;

        uint8_t *dst_y  = &dec->predicted_y[
            (size_t) mb_y * 16 * y_stride + (size_t) mb_x * 16];
        uint8_t *dst_cb = &dec->predicted_uv[
            (size_t) mb_y * 8 * uv_stride + (size_t) mb_x * 8];
        uint8_t *dst_cr = &dec->predicted_uv[uv_plane +
            (size_t) mb_y * 8 * uv_stride + (size_t) mb_x * 8];

        for (int r = 0; r < 16; r++)
            memcpy(&dst_y[(size_t) r * y_stride], &p_y[r * 16], 16);
        for (int r = 0; r < 8; r++) {
            memcpy(&dst_cb[(size_t) r * uv_stride], &p_cb[r * 8], 8);
            memcpy(&dst_cr[(size_t) r * uv_stride], &p_cr[r * 8], 8);
        }
    }

    /* Append per-MB deblock edges into the frame-scoped flat buffer.
     * Frame-boundary edges (mx=0 V or my=0 H) MUST have bS=0 per the
     * kernel's p3-at-±4 contract; we don't validate here (caller is
     * derived from H.264 spec which already enforces this). */
    if (mb->edges && mb->n_edges > 0) {
        if (dec->edges_count + mb->n_edges > dec->edges_capacity)
            return -1;
        memcpy(&dec->edges[dec->edges_count], mb->edges,
               mb->n_edges * sizeof(*dec->edges));
        dec->edges_count += mb->n_edges;
    }

    dec->mbs_appended++;
    return 0;
}

/* --------------------------------------------------------------------
 * Deblock helper — walks dec->edges once for a given (plane, orient,
 * bS_band) selector, builds the corresponding daedalus-fourier
 * deblock-meta array, and dispatches it through the matching kernel.
 *
 * One call → one Vulkan submit, OR zero submits when the selector
 * matches no edges (a common case for B/P frames with most edges in
 * bS<4 and only MB-boundary edges in bS=4, or vice versa).
 *
 * Edge → dst_off math:
 *   luma:   px_x = mb_x*16, px_y = mb_y*16, edge step = 4 cells
 *   chroma: px_x = mb_x*8,  px_y = mb_y*8,  edge step = 4 cells
 *           Cb edges land at offset 0..cb_plane in scratch_uv;
 *           Cr edges land at offset cb_plane..2*cb_plane (planar
 *           layout matching the chroma IDCT scratch).
 *
 *   orient == 0 (vertical edge filtered horizontally across):
 *     dst_off = px_y * stride + px_x + edge_idx * 4
 *
 *   orient == 1 (horizontal edge filtered vertically across):
 *     dst_off = (px_y + edge_idx * 4) * stride + px_x
 *
 * Edges at frame boundaries (mb_x=0 V, mb_y=0 H with edge_idx=0) MUST
 * have bS=0 (the kernel reads p3 at four samples beyond the edge);
 * caller-side spec compliance is assumed, no validation here.
 *
 * `meta_scratch` must have room for dec->edges_count entries; it is
 * overwritten on every call.
 *
 * Returns the dispatch's rc (0 = success; <0 = failure).  No-op when
 * the selector matches no edges, returning 0.
 */
static int dispatch_deblock_pass(
    daedalus_decoder *dec,
    daedalus_substrate sub,
    int target_plane,       /* 0 = luma, 1 = chroma (Cb|Cr by plane field) */
    int target_orient,      /* 0 = V, 1 = H */
    int target_bS_intra,    /* 0 = bS<4 path, 1 = bS=4 intra path */
    uint8_t *scratch,
    size_t stride,
    size_t cb_plane_size,   /* chroma: bytes from scratch_uv start to Cr
                             * plane (0 for luma calls) */
    daedalus_h264_deblock_meta *meta_scratch)
{
    size_t n = 0;

    for (size_t i = 0; i < dec->edges_count; i++) {
        const struct daedalus_decoder_edge *e = &dec->edges[i];

        if (e->bS == 0)
            continue;
        int is_intra = (e->bS == 4) ? 1 : 0;
        if (is_intra != target_bS_intra)
            continue;
        if (e->orient != target_orient)
            continue;
        int is_luma = (e->plane == 0) ? 1 : 0;
        if (is_luma != (target_plane == 0))
            continue;

        uint32_t off;
        if (is_luma) {
            const size_t px_y = (size_t) e->mb_y * 16;
            const size_t px_x = (size_t) e->mb_x * 16;
            if (target_orient == 0)     /* V */
                off = (uint32_t)(px_y * stride + px_x
                                 + (size_t) e->edge_idx * 4);
            else                        /* H */
                off = (uint32_t)((px_y + (size_t) e->edge_idx * 4) * stride
                                 + px_x);
        } else {
            const size_t px_y = (size_t) e->mb_y * 8;
            const size_t px_x = (size_t) e->mb_x * 8;
            /* plane == 2 selects the Cr half of the planar scratch. */
            const size_t plane_base = (e->plane == 2) ? cb_plane_size : 0;
            if (target_orient == 0)
                off = (uint32_t)(plane_base + px_y * stride + px_x
                                 + (size_t) e->edge_idx * 4);
            else
                off = (uint32_t)(plane_base
                                 + (px_y + (size_t) e->edge_idx * 4) * stride
                                 + px_x);
        }

        meta_scratch[n].dst_off = off;
        meta_scratch[n].alpha   = e->alpha;
        meta_scratch[n].beta    = e->beta;
        memcpy(meta_scratch[n].tc0, e->tc0, 4);
        n++;
    }

    if (n == 0)
        return 0;

    typedef int (*deblock_dispatch_fn)(
        daedalus_ctx *, daedalus_substrate, uint8_t *, size_t, size_t,
        const daedalus_h264_deblock_meta *);

    /* daedalus-fourier kernel naming convention:
     *   _v = "v_loop_filter" — filter applied VERTICALLY across a
     *        HORIZONTAL edge.  Use for our orient=1 (H edge).
     *   _h = "h_loop_filter" — filter applied HORIZONTALLY across a
     *        VERTICAL edge.    Use for our orient=0 (V edge).
     * The names refer to the FILTER DIRECTION, not the edge direction. */
    deblock_dispatch_fn fn;
    if (target_plane == 0) {
        if (target_orient == 0)         /* V edge → h_loop_filter */
            fn = target_bS_intra ? daedalus_dispatch_h264_deblock_luma_h_intra
                                 : daedalus_dispatch_h264_deblock_luma_h;
        else                            /* H edge → v_loop_filter */
            fn = target_bS_intra ? daedalus_dispatch_h264_deblock_luma_v_intra
                                 : daedalus_dispatch_h264_deblock_luma_v;
    } else {
        if (target_orient == 0)
            fn = target_bS_intra ? daedalus_dispatch_h264_deblock_chroma_h_intra
                                 : daedalus_dispatch_h264_deblock_chroma_h;
        else
            fn = target_bS_intra ? daedalus_dispatch_h264_deblock_chroma_v_intra
                                 : daedalus_dispatch_h264_deblock_chroma_v;
    }

    return fn(dec->dctx, sub, scratch, stride, n, meta_scratch);
}

/* Run the four deblock passes for one plane class in the required
 * order: V bS<4, V bS=4, H bS<4, H bS=4.  Stops at the first failing
 * pass and returns its rc; returns 0 when all passes succeed. */
static int run_deblock_passes(daedalus_decoder *dec,
                              daedalus_substrate sub,
                              int target_plane,
                              uint8_t *scratch,
                              size_t stride,
                              size_t cb_plane_size,
                              daedalus_h264_deblock_meta *meta_scratch)
{
    for (int orient = 0; orient < 2; orient++) {
        for (int intra = 0; intra < 2; intra++) {
            int dr = dispatch_deblock_pass(dec, sub, target_plane, orient,
                                           intra, scratch, stride,
                                           cb_plane_size, meta_scratch);
            if (dr != 0)
                return dr;
        }
    }
    return 0;
}

/* Phase 1 stage 1 — frame-scaled IDCT dispatch (luma + chroma) with
 * deblock, then copy-out to the caller's planes.
 *
 * Brings up the GPU substrate by calling daedalus-fourier's IDCT
 * dispatch entry points at frame batch granularity, in contrast to the
 * substitution-arc shim that called them with n_blocks = 1 per call.
 * A handful of submits + waits per frame (one per non-empty luma
 * partition, one for chroma, plus one per non-empty deblock pass)
 * instead of millions of per-block dispatches.
 *
 * What's done in this stage:
 *   - Luma: build per-frame meta[] arrays in raster order, partitioned
 *     by each MB's transform_8x8 flag (16 4x4 blocks or 4 8x8 blocks
 *     per MB); flat-pack coeffs from each MB's first 256 int16;
 *     dispatch into a frame-sized Y scratch plane.
 *   - Chroma: build an interleaved Cb+Cr meta[] (n_blocks = N_MBs × 8,
 *     4 Cb + 4 Cr per MB); flat-pack coeffs from each MB's next 128
 *     int16 (64 Cb + 64 Cr); dispatch into a planar Cb||Cr scratch
 *     buffer (W*H/4 each, concatenated W*H/2 total); CPU-interleave
 *     into the caller's NV12 UV plane post-dispatch.
 *   - Both dispatches pre-fill the scratch from the per-frame
 *     predicted_y / predicted_uv buffers (accumulated by append_mb's
 *     per-MB predicted-samples splat).  The IDCT shader's
 *     `dst += idct(coeffs)` + clip255 then folds reconstruction into
 *     the IDCT pass — no separate Stage 3 dispatch needed.
 *   - Deblock: when any edges were appended, V passes then H passes
 *     run over the luma scratch and (if chroma is requested) over the
 *     chroma scratch before copy-out.
 *
 * What's NOT done yet (follow-on Phase 1 sub-PRs):
 *   - Intra prediction: caller-driven (Q2 decision 2026-05-25, CPU
 *     intra-pred via FFmpeg NEON kernels).  Caller writes the
 *     intra-predicted samples into mb_input.predicted; this dispatch
 *     consumes them as the IDCT-add starting state.  GPU wavefront
 *     intra-pred (DESIGN.md Stage 2a) is no longer planned.
 *   - Motion compensation (Stage 2b): inter MBs not handled.
 *   - Chroma DC / luma Intra16x16 DC Hadamard pre-pass (currently we
 *     treat all chroma blocks as plain 4×4 AC IDCT; real decode needs
 *     the chroma DC 2×2 Hadamard contribution folded in).
 *   - dmabuf export — still memcpy-out to caller-provided planes.
 *   - Stage 5 RGBA opt-in.
 *   - GPU-side NV12 interleave — currently a CPU memcpy loop after
 *     the chroma dispatch.  Trivial cost (~1 MB / frame at 1080p)
 *     vs the IDCT itself, but worth folding into a Stage-5 pass
 *     later for full-GPU residency.
 *
 * Parameters:
 *   out_y / y_stride    destination luma plane and its row pitch (bytes).
 *   out_uv / uv_stride  destination interleaved UV plane and row pitch;
 *                       out_uv may be NULL to skip chroma entirely.
 *
 * Returns 0 on success, -1 on invalid arguments, an incomplete frame,
 * or allocation failure, and -3 when a daedalus-fourier dispatch fails.
 * The early argument / incomplete-frame returns leave decoder state
 * untouched; every other path (success or failure) resets the
 * per-frame state so the next frame starts clean.
 */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                 uint8_t *out_y, size_t y_stride,
                                 uint8_t *out_uv, size_t uv_stride)
{
    if (!dec)
        return -1;
    if (dec->mbs_appended != dec->n_mbs)
        return -1;                      /* incomplete frame */
    if (!out_y)
        return -1;

    int rc = 0;

    /* Everything freed at `cleanup` is declared and NULL-initialised
     * here, ahead of the first `goto cleanup`.  A goto that jumps past
     * a declaration skips its initialiser, so declaring these later
     * would hand free() an indeterminate pointer on early-error paths. */
    uint8_t                    *scratch_y     = NULL;
    int16_t                    *coeffs4       = NULL;
    int16_t                    *coeffs8       = NULL;
    daedalus_h264_block_meta   *meta4         = NULL;
    daedalus_h264_block_meta   *meta8         = NULL;
    daedalus_h264_deblock_meta *dbk_meta      = NULL;
    int16_t                    *chroma_coeffs = NULL;
    daedalus_h264_block_meta   *chroma_meta   = NULL;
    uint8_t                    *scratch_uv    = NULL;

    const daedalus_substrate sub = map_substrate(dec->substrate);

    /* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */
    /* Two partitions of the per-MB luma section based on each MB's
     * transform_8x8 flag:
     *
     *   transform_8x8 == 0  →  16 4x4 blocks contribute to the 4x4
     *                          dispatch (16 coeffs each).
     *   transform_8x8 == 1  →  4 8x8 blocks contribute to the 8x8
     *                          dispatch (64 coeffs each).
     *
     * Both partitions can be non-empty in the same frame (FFmpeg sets
     * transform_8x8_size_flag per MB), so we allocate worst-case for
     * each and track actual counts. */
    const size_t y_stride_int = (size_t) dec->width;
    const size_t y_size       = y_stride_int * (size_t) dec->height;
    const size_t worst_4x4    = (size_t) dec->n_mbs * 16;
    const size_t worst_8x8    = (size_t) dec->n_mbs * 4;

    scratch_y = malloc(y_size);
    coeffs4   = malloc(worst_4x4 * 16 * sizeof(int16_t));
    coeffs8   = malloc(worst_8x8 * 64 * sizeof(int16_t));
    meta4     = malloc(worst_4x4 * sizeof(*meta4));
    meta8     = malloc(worst_8x8 * sizeof(*meta8));
    if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
        rc = -1;
        goto cleanup;
    }

    /* Pre-fill the dispatch scratch with the per-MB predicted samples
     * accumulated by append_mb.  daedalus-fourier's IDCT 4x4/8x8
     * shaders implement FFmpeg `idct_add` semantics — dst += idct(coeffs)
     * with clip255 — so a non-zero predicted dst becomes the
     * reconstruction step (residual + predicted → clip) "for free",
     * collapsing DESIGN.md's Stage 3 into Stage 1's existing dispatch. */
    memcpy(scratch_y, dec->predicted_y, y_size);

    /* Walk MBs in raster order, append each MB's luma blocks to the
     * partition selected by its transform_8x8 flag.
     *
     * NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
     * follows the z-scan from spec §6.4.3 / fig 6-10.  We're using
     * flat raster on the input side too (sb_y outer, sb_x inner) for
     * Phase 1 self-consistency; the z-scan permutation is the
     * libavcodec-intercept patch's responsibility. */
    size_t bi4 = 0, bi8 = 0;
    for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
        for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
            int mb_idx = mb_y * dec->mb_width + mb_x;
            const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
            const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];

            if (d->transform_8x8) {
                /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
                        meta8[bi8].dst_off =
                            (uint32_t) (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 2 + sb_x;
                        memcpy(&coeffs8[bi8 * 64],
                               &mb_coeffs[block_in_mb * 64],
                               64 * sizeof(int16_t));
                        bi8++;
                    }
                }
            } else {
                /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
                for (int sb_y = 0; sb_y < 4; sb_y++) {
                    for (int sb_x = 0; sb_x < 4; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
                        meta4[bi4].dst_off =
                            (uint32_t) (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 4 + sb_x;
                        memcpy(&coeffs4[bi4 * 16],
                               &mb_coeffs[block_in_mb * 16],
                               16 * sizeof(int16_t));
                        bi4++;
                    }
                }
            }
        }
    }
    /* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */

    /* ---- One Vulkan submit + wait per non-empty luma partition.
     * AUTO substrate picks QPU per the post-decree recipe table; falls
     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable.
     * Skipping the dispatch when the partition is empty avoids the
     * shader-pool warm-up cost on the common case (a typical Baseline
     * stream is all-4x4 → 8x8 dispatch is no-op). */
    if (bi4 > 0) {
        int dr = daedalus_dispatch_h264_idct4(dec->dctx, sub,
                                              scratch_y, y_stride_int,
                                              coeffs4, bi4, meta4);
        if (dr != 0) {
            rc = -3;
            goto cleanup;
        }
    }
    if (bi8 > 0) {
        int dr = daedalus_dispatch_h264_idct8(dec->dctx, sub,
                                              scratch_y, y_stride_int,
                                              coeffs8, bi8, meta8);
        if (dr != 0) {
            rc = -3;
            goto cleanup;
        }
    }

    /* ---- Luma deblock V then H ----
     * Per H.264 §8.7 deblock order is V edges first, then H edges,
     * within each MB.  At frame scale we hit the same dependency: a
     * row of V-filtered samples is the input to the H filter for
     * the row's H edges.  Order: V bS<4 + V bS=4 (independent edges,
     * either order), barrier (implicit at each dispatch's wait), then
     * H bS<4 + H bS=4.  The meta buffer is sized for every edge and
     * reused by the chroma passes further down. */
    if (dec->edges_count > 0) {
        dbk_meta = malloc(dec->edges_count * sizeof(*dbk_meta));
        if (!dbk_meta) {
            rc = -1;
            goto cleanup;
        }
        if (run_deblock_passes(dec, sub, 0, scratch_y, y_stride_int,
                               0, dbk_meta) != 0) {
            rc = -3;
            goto cleanup;
        }
    }

    /* ---- Copy Y out to caller's plane at the requested stride. ---- */
    for (int r = 0; r < dec->height; r++)
        memcpy(out_y + (size_t) r * y_stride,
               &scratch_y[(size_t) r * y_stride_int],
               (size_t) dec->width);

    /* ---- Build frame-scaled chroma 4×4 dispatch ---- */
    /*
     * 4:2:0 layout — chroma planes are (W/2) by (H/2), one Cb + one
     * Cr per pixel pair.  H.264 per-MB chroma is two 8×8 components,
     * each split into 4 4×4 blocks, so 8 chroma 4×4 blocks per MB.
     *
     * We dispatch BOTH components in a single shader call against a
     * planar scratch buffer:
     *   scratch_uv[0 .. cb_plane_size)          — Cb plane (W/2 × H/2)
     *   scratch_uv[cb_plane_size .. 2*size)     — Cr plane (W/2 × H/2)
     *
     * meta[i].dst_off is a flat offset into the scratch buffer (the
     * shader treats dst+dst_off as a contiguous 4×4 with row pitch =
     * stride), so Cr blocks just add cb_plane_size to their offset.
     * Stride is W/2 (the chroma row width); this works because Cb and
     * Cr planes share the same row pitch.
     *
     * Post-dispatch we interleave the two planes into NV12 UV layout
     * on the CPU.  Doing this on the GPU is a Stage-5 follow-up
     * (would need a small "copy + interleave" shader); CPU memcpy
     * loop is ~1 MB/frame at 1080p so it's not on the critical path.
     */
    if (out_uv) {
        const size_t n_chroma_blocks_per_mb = 8;    /* 4 Cb + 4 Cr */
        const size_t n_chroma_blocks =
            (size_t) dec->n_mbs * n_chroma_blocks_per_mb;
        const size_t chroma_w        = (size_t) dec->width / 2;
        const size_t chroma_h        = (size_t) dec->height / 2;
        const size_t cb_plane_size   = chroma_w * chroma_h;
        const size_t uv_scratch_size = 2 * cb_plane_size;

        scratch_uv    = malloc(uv_scratch_size);
        chroma_coeffs = malloc(n_chroma_blocks * 16 * sizeof(int16_t));
        chroma_meta   = malloc(n_chroma_blocks *
                               sizeof(daedalus_h264_block_meta));
        if (!scratch_uv || !chroma_coeffs || !chroma_meta) {
            rc = -1;
            goto cleanup;
        }
        memcpy(scratch_uv, dec->predicted_uv, uv_scratch_size);

        size_t cbi = 0;
        for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
            for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
                int mb_idx = mb_y * dec->mb_width + mb_x;
                const int16_t *mb_coeffs =
                    &dec->coeffs[(size_t) mb_idx * 384];

                /* Per-MB coeff layout (set by append_mb):
                 *   [  0 .. 256) — luma (16 4×4 or 4 8×8 blocks)
                 *   [256 .. 320) — 4 Cb 4×4 blocks (raster sb_y*2+sb_x)
                 *   [320 .. 384) — 4 Cr 4×4 blocks (raster sb_y*2+sb_x) */
                for (int comp = 0; comp < 2; comp++) {  /* 0=Cb 1=Cr */
                    size_t plane_base = (size_t) comp * cb_plane_size;
                    size_t coeff_base = 256u + (size_t) comp * 64u;
                    for (int sb_y = 0; sb_y < 2; sb_y++) {
                        for (int sb_x = 0; sb_x < 2; sb_x++) {
                            size_t px_y =
                                (size_t) mb_y * 8 + (size_t) sb_y * 4;
                            size_t px_x =
                                (size_t) mb_x * 8 + (size_t) sb_x * 4;
                            chroma_meta[cbi].dst_off = (uint32_t)
                                (plane_base + px_y * chroma_w + px_x);
                            int block_in_comp = sb_y * 2 + sb_x;
                            memcpy(&chroma_coeffs[cbi * 16],
                                   &mb_coeffs[coeff_base +
                                              (size_t) block_in_comp * 16],
                                   16 * sizeof(int16_t));
                            cbi++;
                        }
                    }
                }
            }
        }
        /* assert cbi == n_chroma_blocks; loop math guarantees it */

        int cr_rc = daedalus_dispatch_h264_idct4(dec->dctx, sub,
                                                 scratch_uv, chroma_w,
                                                 chroma_coeffs,
                                                 n_chroma_blocks,
                                                 chroma_meta);
        if (cr_rc != 0) {
            rc = -3;
            goto cleanup;
        }

        /* ---- Chroma deblock V then H ----
         * scratch_uv is PLANAR Cb||Cr with stride = chroma_w; both
         * planes filtered in the same dispatch via Cb's dst_off and
         * Cr's dst_off = cb_plane_size + (same). */
        if (dec->edges_count > 0 && dbk_meta) {
            if (run_deblock_passes(dec, sub, 1, scratch_uv, chroma_w,
                                   cb_plane_size, dbk_meta) != 0) {
                rc = -3;
                goto cleanup;
            }
        }

        /* CPU NV12 interleave: out_uv[r][2c+0] = Cb[r][c], [2c+1] = Cr. */
        const uint8_t *cb_plane = scratch_uv;
        const uint8_t *cr_plane = scratch_uv + cb_plane_size;
        for (size_t r = 0; r < chroma_h; r++) {
            uint8_t *dst_row = out_uv + r * uv_stride;
            const uint8_t *cb_row = cb_plane + r * chroma_w;
            const uint8_t *cr_row = cr_plane + r * chroma_w;
            for (size_t c = 0; c < chroma_w; c++) {
                dst_row[c * 2 + 0] = cb_row[c];
                dst_row[c * 2 + 1] = cr_row[c];
            }
        }
    }

cleanup:
    free(chroma_meta);
    free(chroma_coeffs);
    free(scratch_uv);
    free(dbk_meta);
    free(meta8);
    free(meta4);
    free(coeffs8);
    free(coeffs4);
    free(scratch_y);

    /* Zero the predicted-samples buffers so the next frame starts from
     * the all-zero-predictor baseline; MBs whose append_mb gets NULL
     * for `predicted` then decode residual-only. */
    if (dec->predicted_y)
        memset(dec->predicted_y, 0,
               (size_t) dec->width * (size_t) dec->height);
    if (dec->predicted_uv)
        memset(dec->predicted_uv, 0,
               (size_t) dec->width * (size_t) dec->height / 2);

    /* Reset edges_count for the next frame; capacity stays. */
    dec->edges_count  = 0;
    dec->mbs_appended = 0;
    return rc;
}

/* Not implemented yet: always returns -1. */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
{
    (void) dec;
    (void) plane;
    /* TODO Phase 1: vkGetMemoryFdKHR on the DPB slot's VkImage memory. */
    return -1;
}

/* Reports daedalus_ctx_has_qpu() for the decoder's context; returns 0
 * when `dec` or its context is NULL. */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec)
{
    if (!dec || !dec->dctx)
        return 0;
    return daedalus_ctx_has_qpu(dec->dctx);
}