Update DESIGN.md

2026-05-24 19:51:38 +00:00
9 changed files with 9 additions and 1089 deletions
@@ -1,15 +0,0 @@
 build/
 build-*/
 *.o
 *.a
 *.so
 *.so.*
 *.spv
 .vscode/
 .cache/
 compile_commands.json
 CMakeCache.txt
 CMakeFiles/
 cmake_install.cmake
 Makefile
 .ninja_*
@@ -1,132 +0,0 @@
 # SPDX-License-Identifier: BSD-2-Clause
 #
 # daedalus-decoder — frame-level GPU H.264 decoder for V3D7 (Pi 5).
 # Phase 1 scaffold; see DESIGN.md for architecture.
 #
 # Build dependencies:
 #   - daedalus-fourier ≥ 0.1.0 (kernel pack, V3D primitives + recipe layer)
 #     resolved via pkg-config; install via the daedalus-fourier upstream
 #     `cmake --install` rule (PR #5 made the .pc relocatable, so any
 #     install prefix works as long as $PKG_CONFIG_PATH is set).
 #   - Vulkan headers + libvulkan (pulled in transitively via
 #     daedalus-fourier, listed here explicitly for the link order).
 #
 # Build:
 #   cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release
 #   cmake --build build
 #   ctest --test-dir build
 cmake_minimum_required(VERSION 3.20)
 project(daedalus-decoder
    VERSION 0.0.1
    DESCRIPTION "Frame-level GPU H.264 decoder for Raspberry Pi 5 / V3D7"
    LANGUAGES C)
 # Project-wide C dialect: strict ISO C11, no GNU extensions.
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED ON)
 set(CMAKE_C_EXTENSIONS OFF)
 # Default to a Release build when the user did not pick a build type.
 # Only single-config generators (Makefiles, Ninja) read
 # CMAKE_BUILD_TYPE; multi-config generators choose the configuration
 # at build time, so setting it there would only be misleading.
 get_property(daedalus_decoder_is_multi_config
    GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 if(NOT daedalus_decoder_is_multi_config AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
 endif()
 # Pi 5 is the only supported target.  Other aarch64 SoCs (Pi 4 V3D4,
 # RK3588 Mali, …) might work but would need explicit substrate +
 # shader-pack validation per the daedalus-fourier architecture
 # backlog.  Don't pretend to support what we haven't validated.
 if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message(WARNING
        "daedalus-decoder is designed for aarch64 (Pi 5 BCM2712 / V3D7). "
        "Build will proceed but is unlikely to function.")
 endif()
 # Warning set shared by the library and the test executables defined
 # further down in this file.
 add_compile_options(-Wall -Wextra -Wno-unused-parameter)
 # ---- Dependencies --------------------------------------------------
 find_package(PkgConfig REQUIRED)
 # daedalus-fourier — located through pkg-config per the Phase 1
 # decision §9.6.  Minimum version 0.1.0 (the cycle 6-9 shaders + pool
 # + recipe-flip baseline).  PKG_CONFIG_PATH should point at the
 # directory holding daedalus-fourier.pc (e.g. /usr/local/lib/pkgconfig
 # or a custom install prefix).
 #
 # IMPORTED_TARGET makes pkg_check_modules() create the
 # PkgConfig::DAEDALUS_FOURIER target, which carries the include
 # directories, extra compile/link flags and libraries from the .pc
 # file as usage requirements.  Linking that target replaces the manual
 # *_INCLUDE_DIRS / *_LIBRARY_DIRS / *_LIBRARIES wiring and also picks
 # up CFLAGS_OTHER / LDFLAGS_OTHER, which the variable-based form
 # silently dropped.
 pkg_check_modules(DAEDALUS_FOURIER REQUIRED IMPORTED_TARGET
    daedalus-fourier>=0.1.0)
 # Vulkan — daedalus-fourier already depends on this; we add it
 # explicitly so the link order stays correct (daedalus-fourier static
 # archive contains undefined vk* symbols that the loader resolves).
 find_package(Vulkan REQUIRED)
 # ---- Version string baked into the library ------------------------
 # git rev tagged onto the version string for traceability; degrades
 # gracefully to bare semver if git isn't available (the command then
 # produces no output and the variable stays empty).
 execute_process(
    COMMAND git -C "${CMAKE_CURRENT_SOURCE_DIR}" rev-parse --short=7 HEAD
    OUTPUT_VARIABLE DAEDALUS_DECODER_GITREV
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)
 if(DAEDALUS_DECODER_GITREV)
    set(DAEDALUS_DECODER_VERSION "${PROJECT_VERSION}+g${DAEDALUS_DECODER_GITREV}")
 else()
    set(DAEDALUS_DECODER_VERSION "${PROJECT_VERSION}")
 endif()
 message(STATUS "daedalus-decoder version: ${DAEDALUS_DECODER_VERSION}")
 # ---- Library ------------------------------------------------------
 add_library(daedalus_decoder STATIC
    src/daedalus_decoder.c
 )
 # Public headers live in include/; src/ holds internal headers that
 # must not leak to consumers.
 target_include_directories(daedalus_decoder
    PUBLIC
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
        $<INSTALL_INTERFACE:include>
    PRIVATE
        src
 )
 target_link_libraries(daedalus_decoder
    PUBLIC
        # Order matters: daedalus-fourier static archive references
        # vulkan symbols; the loader needs daedalus-fourier first then
        # vulkan to resolve them.
        PkgConfig::DAEDALUS_FOURIER
        Vulkan::Vulkan
 )
 # Expose the computed version string to the C sources as a string
 # literal macro.
 target_compile_definitions(daedalus_decoder
    PRIVATE
        DAEDALUS_DECODER_VERSION="${DAEDALUS_DECODER_VERSION}"
 )
 target_compile_options(daedalus_decoder PRIVATE -O2)
 # ---- Tests --------------------------------------------------------
 #
 # DAEDALUS_DECODER_BUILD_TESTS (default ON) lets packagers and parent
 # projects skip building the test executables.  With the default the
 # same two tests are registered as before:
 #   smoke          -> test_smoke          (tests/test_smoke.c)
 #   idct_bitexact  -> test_idct_bitexact  (tests/test_idct_bitexact.c)
 option(DAEDALUS_DECODER_BUILD_TESTS
    "Build and register the daedalus-decoder test executables" ON)
 if(DAEDALUS_DECODER_BUILD_TESTS)
    enable_testing()
    # Each test is a single C file linked against the library; the
    # CTest name is the file name without its "test_" prefix.
    foreach(daedalus_decoder_test IN ITEMS smoke idct_bitexact)
        add_executable(test_${daedalus_decoder_test}
            tests/test_${daedalus_decoder_test}.c)
        target_link_libraries(test_${daedalus_decoder_test}
            PRIVATE daedalus_decoder)
        target_compile_options(test_${daedalus_decoder_test} PRIVATE -O2)
        add_test(NAME ${daedalus_decoder_test}
            COMMAND test_${daedalus_decoder_test})
    endforeach()
 endif()
 # ---- Install ------------------------------------------------------
 #
 # Library + public header.  Stage 2/3 will add a pkg-config file and
 # CMake config exports once the API stabilises; pre-0.1 the scaffold
 # install just gives the static archive a home.
 #
 # GNUInstallDirs supplies CMAKE_INSTALL_LIBDIR / CMAKE_INSTALL_INCLUDEDIR
 # so the destinations follow the platform's directory conventions.
 include(GNUInstallDirs)
 # Static archive only; no EXPORT set is generated yet.
 install(TARGETS daedalus_decoder
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
 # The single public header.
 install(FILES include/daedalus_decoder.h
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
@@ -261,31 +261,25 @@ That's a substantial shader inventory.  Each requires bit-exact M1 gate against
 **Phase 4 — Production-ready deblock + perf optimization + libva integration** (+4 weeks).  Real-world stream conformance.  Plug into daedalus-v4l2 daemon as the actual decode backend.
-**Total H.264 budget:** 4-6 months.
+**Total budget:** 4-6 months.
 **Phase 5+ (future codec scope, not committed):** VP9 and AV1 reuse the same frame-level dispatch architecture, daedalus-fourier kernel pack, and DPB plumbing.  Per §9.7, they are deferred but *not firmly out-of-scope*.  HEVC stays firmly out (Pi 5 has `rpi-hevc-dec` for that).
 ---
-## 9. Phase 1 decisions
+## 9. Open questions
-User-confirmed 2026-05-24.  All seven questions from the initial
+1. **Intra prediction strategy:** GPU wavefront (~187 dispatches, more complex) vs CPU speculative (simpler, slower).  Plan: wavefront in Phase 1; revisit if it's the perf bottleneck. [x]
 draft are now decided; this section preserves the original wording
 of each item for traceability.
-1. **Intra prediction strategy:** GPU wavefront (~187 dispatches, more complex) vs CPU speculative (simpler, slower).  **Decision: wavefront in Phase 1; revisit if it's the perf bottleneck.**
+2. **libavcodec intercept granularity:** macroblock-level (substitution-arc evolution) vs slice-level (cleaner rewrite).  Plan: macroblock-level for Phase 1; consider slice-level later if buffer accumulation overhead is non-trivial. [x]
-2. **libavcodec intercept granularity:** macroblock-level (substitution-arc evolution) vs slice-level (cleaner rewrite).  **Decision: macroblock-level for Phase 1; consider slice-level later if buffer accumulation overhead is non-trivial.**
+3. **Shader parameterization:** 16 qpel variants as 16 shaders, or one parameterized shader with switch on mc_position?  V3D's compiler might inline-optimize either; needs measurement. [x] Measurement it is.
-3. **Shader parameterization:** 16 qpel variants as 16 shaders, or one parameterized shader with switch on mc_position?  **Decision: measure both during Phase 2 (the MC phase) and pick the winner.  No commit ahead of measurement.**
+4. **DPB allocation:** Vulkan-native VkImage with dmabuf export, vs CPU-allocated dma_buf imported into Vulkan.  Affects V4L2 integration story.  Plan: Vulkan-native with `VK_KHR_external_memory_dma_buf` export; daedalus-v4l2 daemon imports. [x]
-4. **DPB allocation:** Vulkan-native VkImage with dmabuf export, vs CPU-allocated dma_buf imported into Vulkan.  **Decision: Vulkan-native with `VK_KHR_external_memory_dma_buf` export; daedalus-v4l2 daemon imports.**
+5. **Daemon integration shape:** does daedalus-decoder ship as a static library the daemon links, or as a separate process the daemon talks to?  Library, almost certainly — process boundary would multiply IPC cost. [x] library.
-5. **Daemon integration shape:** static library the daemon links, or separate process.  **Decision: library link.**
+6. **Build dependency on daedalus-fourier:** as a CMake `find_package`, or vendored?  `find_package`, pinned to a tagged release.  daedalus-fourier becomes the "kernel pack" upstream library. [x] Yes.
-6. **Build dependency on daedalus-fourier:** CMake `find_package`, or vendored?  **Decision: `find_package`, pinned to a tagged release.  daedalus-fourier becomes the "kernel pack" upstream library.**
+7. **Out-of-scope for daedalus-decoder (firmly):** HEVC (Pi 5 has rpi-hevc-dec for that), 10-bit, interlaced, FMO/ASO.
 7. **Codec scope.**  **Decision: firmly out-of-scope for daedalus-decoder are HEVC (Pi 5 has `rpi-hevc-dec` for that), 10-bit, interlaced, and FMO/ASO.**  VP9 and AV1 are *not* firmly out — they're future codec scope for the same framework after H.264 lands.  This is a scope expansion from the initial draft, which had grouped them with HEVC under "firmly out".
 ---
@@ -1,24 +0,0 @@
 BSD 2-Clause License
 Copyright (c) 2026, Markus Fritsche
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -1,182 +0,0 @@
 /* SPDX-License-Identifier: BSD-2-Clause */
 /*
 * daedalus-decoder — public C API.
 *
 * Frame-level GPU H.264 decoder targeting V3D7 (Raspberry Pi 5).  Built
 * on daedalus-fourier's V3D compute primitives at frame granularity —
 * one Vulkan submit per frame, one fence wait per frame, encoded
 * bitstream in (via libavcodec's per-MB intercept), NV12 frame out.
 *
 * Per the 2026-05-24 Phase 1 design decisions:
 *   - libavcodec intercept is at macroblock-level (substitution-arc
 *     evolution): the caller is expected to drive the per-MB CABAC /
 *     CAVLC entropy decode and feed each macroblock's descriptor +
 *     coefficients via daedalus_decoder_append_mb().  flush_frame()
 *     builds the per-frame VkCommandBuffer and submits.
 *   - DPB is Vulkan-native VkImage with VK_KHR_external_memory_dma_buf
 *     export.  The caller can obtain the output frame's dmabuf fd
 *     via daedalus_decoder_export_dmabuf().
 *   - Daemon integration shape: this library is statically linked into
 *     daedalus_v4l2_daemon.  No IPC.
 *
 * STATUS: scaffold.  No GPU pipeline implemented yet; all functions
 * are stubs that compile but do not decode anything.  See DESIGN.md
 * for the architecture.
 *
 * ABI: pre-0.1 — every signature here may change.  Don't rely on
 * stability yet.
 */
 #ifndef DAEDALUS_DECODER_H
 #define DAEDALUS_DECODER_H
 #include <stddef.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* -------------------------------------------------------------------
 * Opaque decoder context.  One per concurrent stream.
 * ----------------------------------------------------------------- */
 typedef struct daedalus_decoder daedalus_decoder;
 /* -------------------------------------------------------------------
 * Per-macroblock input.  Mirrors §3 of DESIGN.md.  The caller's
 * libavcodec intercept populates this from the H264SliceContext
 * fields after ff_h264_decode_mb_cabac/cavlc returns and before
 * ff_h264_hl_decode_mb is supposed to run (we replace the latter).
 * ----------------------------------------------------------------- */
 struct daedalus_decoder_mb_input {
    /* Frame coordinates (macroblock units).  daedalus_decoder_append_mb()
     * rejects coordinates outside the decoder's macroblock grid and
     * requires macroblocks to arrive in raster order. */
    uint16_t mb_x;
    uint16_t mb_y;
    /* Type + quantisation. */
    uint8_t  mb_type;            /* H.264 spec table 7-13/7-14/7-17/7-18 enum */
    uint8_t  mb_qp_y;
    uint8_t  mb_qp_uv;
    uint8_t  cbp;                /* coded block pattern, 0..47 */
    /* Intra prediction (used iff mb_type == I_NxN or I_16x16). */
    uint8_t  intra_4x4_modes[16];
    uint8_t  intra_16x16_mode;
    uint8_t  intra_chroma_mode;
    /* Inter motion / partitions (used iff P_* or B_*). */
    uint8_t  partition_mode;     /* P_16x16 / P_16x8 / P_8x16 / P_8x8 / etc. */
    int8_t   ref_idx_l0[4];      /* per partition; -1 = not used */
    int8_t   ref_idx_l1[4];      /* B only */
    int16_t  mv_l0[4][2];        /* qpel precision (1/4 sample); (x, y) */
    int16_t  mv_l1[4][2];
    /* Deblocking filter parameters. */
    uint8_t  deblock_disable;    /* 0 = enabled */
    int8_t   deblock_alpha_c0;
    int8_t   deblock_beta;
    /* Transform coefficients — 256 luma (4x4 x 16) + 64 cb + 64 cr,
     * column-major within each 4x4 block (matches FFmpeg convention).
     * Caller-owned; copied during append.  Must not be NULL:
     * daedalus_decoder_append_mb() fails on a NULL pointer and
     * otherwise reads all 384 values. */
    const int16_t *coeffs;       /* points at exactly 384 int16_t */
 };
 /* -------------------------------------------------------------------
 * Output frame format selector.
 * ----------------------------------------------------------------- */
 /* Selected per decoder with daedalus_decoder_set_output_format(), which
 * accepts exactly these two values. */
 typedef enum {
    DAEDALUS_DECODER_OUTPUT_NV12 = 0,   /* default; Stage 4 final */
    DAEDALUS_DECODER_OUTPUT_RGBA = 1,   /* Stage 5 opt-in */
 } daedalus_decoder_output_format;
 /* -------------------------------------------------------------------
 * Lifecycle
 * ----------------------------------------------------------------- */
 /* Create a decoder context for the given **coded** frame dimensions.
 *
 * width, height: pixels of the H.264 coded picture, NOT the displayed
 * picture.  Both must be multiples of 16 (macroblock granularity).
 * For displayed 1080p (1920×1080), the coded frame is 1920×1088 with
 * the SPS's `frame_cropping_*` offsets cropping the bottom 8 rows.
 * The caller is responsible for translating from SPS dims + crop
 * rectangle to the values passed here; we decode the coded frame.
 *
 * Returns NULL on bad dimensions or allocation failure.  Returns a
 * usable context with daedalus_decoder_has_qpu() == 0 when Vulkan
 * init fails — callers that need GPU work should check has_qpu
 * before relying on it.
 */
 daedalus_decoder *daedalus_decoder_create(int width, int height);
 /* Free all resources.  Safe with NULL. */
 void daedalus_decoder_destroy(daedalus_decoder *dec);
 /* Switch output format BEFORE the first append_mb call of a frame.
 * Default is NV12.  Returns 0 on success, -1 if called mid-frame
 * (caller must flush first). */
 int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                        daedalus_decoder_output_format fmt);
 /* -------------------------------------------------------------------
 * Per-frame submission
 * ----------------------------------------------------------------- */
 /* Append one macroblock's data to the current frame's descriptor SSBO
 * + coefficient SSBO.  No GPU dispatch yet — just CPU-side writes.
 *
 * Must be called in raster order (mb_y * mb_width + mb_x) for the
 * intra-prediction wavefront to work correctly in Phase 1.
 *
 * Returns 0 on success, negative on bounds violation or OOM.
 */
 int daedalus_decoder_append_mb(daedalus_decoder *dec,
                                const struct daedalus_decoder_mb_input *mb);
 /* End-of-frame flush: builds the per-frame VkCommandBuffer with all
 * pipeline stages, submits once, waits on a single fence, copies the
 * NV12 (or RGBA when opted in) output into the caller-provided
 * planes.
 *
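 * For NV12 (W, H = coded width / height):
 *   out_y / y_stride: Y plane — H rows of W bytes, consecutive rows
 *     y_stride bytes apart.  Requires y_stride >= W and a buffer of at
 *     least y_stride*(H-1) + W bytes.
 *   out_uv / uv_stride: interleaved UV plane — H/2 rows of W bytes,
 *     consecutive rows uv_stride bytes apart.  Requires uv_stride >= W
 *     and a buffer of at least uv_stride*(H/2-1) + W bytes.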
 *
 * For RGBA: out_y receives 4*W*H bytes at y_stride; out_uv ignored.
 *
 * Returns 0 on success, negative on Vulkan failure or undecodable
 * frame.  After return, the decoder is ready for the next frame's
 * append calls.
 */
 int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                  uint8_t *out_y,  size_t y_stride,
                                  uint8_t *out_uv, size_t uv_stride);
 /* Export the most-recently-decoded frame as a dma_buf fd.  The fd is
 * owned by the caller and must be closed when done.  Lets V4L2
 * consumers (daedalus_v4l2_daemon, libva-v4l2-request-fourier) attach
 * the GPU-decoded surface directly to a CAPTURE plane without a CPU
 * round-trip.
 *
 * Returns the dmabuf fd on success, -1 on failure.  Must be called
 * AFTER flush_frame returns for the relevant frame.
 */
 int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane);
 /* -------------------------------------------------------------------
 * Diagnostics
 * ----------------------------------------------------------------- */
 /* daedalus-decoder build version (semver string, e.g. "0.0.1+g0a1b2c3"). */
 const char *daedalus_decoder_version(void);
 /* Whether the underlying daedalus-fourier context picked up a working
 * V3D7 Vulkan instance.  Returns 0 if Vulkan init failed and the
 * decoder is operating in stub / failure mode. */
 int daedalus_decoder_has_qpu(const daedalus_decoder *dec);
 #ifdef __cplusplus
 }
 #endif
 #endif /* DAEDALUS_DECODER_H */
@@ -1,282 +0,0 @@
 /* SPDX-License-Identifier: BSD-2-Clause */
 /*
 * daedalus-decoder — public C API implementation.
 *
 * Scaffold only.  Most functions return success with no GPU work
 * performed; the bodies will fill in across Phases 1-4 per DESIGN.md
 * §8.  This file exists so the API surface compiles, links, and can
 * be smoke-tested end-to-end (ctx create / append / flush / destroy)
 * before any shader work begins.
 */
 #include "internal.h"
 #include <stdlib.h>
 #include <string.h>
 /* Built via -D from CMakeLists. */
 #ifndef DAEDALUS_DECODER_VERSION
 #define DAEDALUS_DECODER_VERSION "0.0.1+scaffold"
 #endif
 /* Return the build's version string: the DAEDALUS_DECODER_VERSION macro,
 * which is either injected by the build system or falls back to the
 * default defined earlier in this file.  The returned pointer refers to
 * a string literal and must not be freed. */
 const char *daedalus_decoder_version(void)
 {
    return DAEDALUS_DECODER_VERSION;
 }
 /* Allocate a decoder for a coded picture of width x height pixels.
 *
 * Returns NULL when:
 *   - either dimension is non-positive or not a multiple of 16,
 *   - the picture holds more than UINT32_MAX pixels,
 *   - the daedalus-fourier context cannot be created, or
 *   - a staging buffer allocation fails.
 * On success the caller owns the context and releases it with
 * daedalus_decoder_destroy(). */
 daedalus_decoder *daedalus_decoder_create(int width, int height)
 {
    if (width <= 0 || height <= 0)
        return NULL;
    if ((width & 15) || (height & 15))
        return NULL;  /* must be multiple of 16 */
    /* Bound the pixel count.  flush_frame stores per-block destination
     * offsets into the Y plane as uint32_t, so width*height must fit in
     * 32 bits.  The same bound keeps mb_width * mb_height (at most
     * 2^24) well inside int, avoiding signed overflow below. */
    if ((uint64_t) width * (uint64_t) height > UINT32_MAX)
        return NULL;
    daedalus_decoder *dec = calloc(1, sizeof(*dec));
    if (!dec)
        return NULL;
    dec->width      = width;
    dec->height     = height;
    dec->mb_width   = width >> 4;
    dec->mb_height  = height >> 4;
    dec->n_mbs      = dec->mb_width * dec->mb_height;
    dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
    /* daedalus-fourier ctx — required.  Phase 1 needs the QPU; if
     * Vulkan init fails the decoder is unusable.  Caller can check
     * via daedalus_decoder_has_qpu(). */
    dec->dctx = daedalus_ctx_create();
    if (!dec->dctx) {
        free(dec);
        return NULL;
    }
    /* CPU-side staging: one descriptor and 384 coefficients per MB. */
    dec->mb_descs = calloc((size_t) dec->n_mbs, sizeof(*dec->mb_descs));
    dec->coeffs   = calloc((size_t) dec->n_mbs * 384, sizeof(int16_t));
    if (!dec->mb_descs || !dec->coeffs) {
        /* destroy() copes with partially-initialised contexts. */
        daedalus_decoder_destroy(dec);
        return NULL;
    }
    return dec;
 }
 /* Release a decoder and everything it owns.  A NULL argument is a
 * no-op; a context whose buffers were never allocated is handled too,
 * because free(NULL) does nothing. */
 void daedalus_decoder_destroy(daedalus_decoder *dec)
 {
    if (dec != NULL) {
        /* Staging buffers first, then the daedalus-fourier context,
         * and finally the decoder struct itself. */
        free(dec->coeffs);
        free(dec->mb_descs);
        if (dec->dctx != NULL) {
            daedalus_ctx_destroy(dec->dctx);
        }
        free(dec);
    }
 }
 /* Select the output pixel format for subsequent frames.
 *
 * Returns 0 on success.  Returns -1 when dec is NULL, when macroblocks
 * have already been appended to the current frame, or when fmt is not
 * one of the known format values. */
 int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                        daedalus_decoder_output_format fmt)
 {
    /* No context, or a frame already in progress: refuse. */
    if (dec == NULL || dec->mbs_appended != 0)
        return -1;
    switch (fmt) {
    case DAEDALUS_DECODER_OUTPUT_NV12:
    case DAEDALUS_DECODER_OUTPUT_RGBA:
        dec->output_fmt = fmt;
        return 0;
    default:
        /* Unknown selector value. */
        return -1;
    }
 }
 /* Stage one macroblock's descriptor and coefficients for the current
 * frame.  CPU-side copies only.
 *
 * Returns 0 on success.  Returns -1 when any pointer (including
 * mb->coeffs) is NULL, when the coordinates fall outside the macroblock
 * grid, or when the macroblock is not the next one in raster order. */
 int daedalus_decoder_append_mb(daedalus_decoder *dec,
                                const struct daedalus_decoder_mb_input *mb)
 {
    if (dec == NULL || mb == NULL || mb->coeffs == NULL)
        return -1;
    if (!(mb->mb_x < dec->mb_width && mb->mb_y < dec->mb_height))
        return -1;
    /* The intra wavefront planned for Phase 1 depends on raster order,
     * so the raster index of this MB must equal the number of MBs
     * staged so far. */
    const int slot = mb->mb_y * dec->mb_width + mb->mb_x;
    if (slot != dec->mbs_appended)
        return -1;

    struct daedalus_decoder_mb_desc *desc = dec->mb_descs + slot;

    /* Position, type and quantisation. */
    desc->mb_x     = mb->mb_x;
    desc->mb_y     = mb->mb_y;
    desc->mb_type  = mb->mb_type;
    desc->mb_qp_y  = mb->mb_qp_y;
    desc->mb_qp_uv = mb->mb_qp_uv;
    desc->cbp      = mb->cbp;

    /* Intra prediction modes. */
    memcpy(desc->intra_4x4_modes, mb->intra_4x4_modes,
           sizeof(desc->intra_4x4_modes));
    desc->intra_16x16_mode  = mb->intra_16x16_mode;
    desc->intra_chroma_mode = mb->intra_chroma_mode;

    /* Inter partitions, reference indices and motion vectors. */
    desc->partition_mode = mb->partition_mode;
    memcpy(desc->ref_idx_l0, mb->ref_idx_l0, sizeof(desc->ref_idx_l0));
    memcpy(desc->ref_idx_l1, mb->ref_idx_l1, sizeof(desc->ref_idx_l1));
    memcpy(desc->mv_l0, mb->mv_l0, sizeof(desc->mv_l0));
    memcpy(desc->mv_l1, mb->mv_l1, sizeof(desc->mv_l1));

    /* Deblocking parameters. */
    desc->deblock_disable  = mb->deblock_disable;
    desc->deblock_alpha_c0 = mb->deblock_alpha_c0;
    desc->deblock_beta     = mb->deblock_beta;

    /* 384 coefficients per MB, stored back to back in raster order. */
    int16_t *coeff_dst = dec->coeffs + (size_t) slot * 384;
    memcpy(coeff_dst, mb->coeffs, 384 * sizeof(int16_t));

    dec->mbs_appended += 1;
    return 0;
 }
 /* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch.
 *
 * Brings up the GPU substrate by calling daedalus-fourier's existing
 * `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity
 * (n_blocks = N_MBs × 16 luma 4×4 blocks per frame), in contrast to
 * the substitution-arc shim that called it with n_blocks = 1 per call.
 * ONE Vulkan submit + wait round-trip per frame instead of millions.
 *
 * What's done in this stage:
 *   - Build a per-frame luma-4x4 meta[] in raster order across all MBs
 *   - Repack the per-MB coeffs[] (384 int16, first 256 are luma) into
 *     a flat block-major coeffs buffer (n_blocks × 16 int16)
 *   - Allocate a frame-sized scratch Y plane (zero-initialised — no
 *     intra prediction yet, so "predicted" = 0)
 *   - Dispatch once via the recipe layer; the shader does
 *     clip255(predicted + idct(coeffs)), i.e. with predicted=0 it's
 *     clip255(idct(coeffs))
 *   - Copy the scratch Y plane to the caller's out_y at the requested
 *     stride
 *
 * What's NOT done yet (follow-on Phase 1 sub-PRs):
 *   - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
 *     so output pixels are residual-only and not a valid frame decode.
 *     Sufficient for Vulkan round-trip validation, not for bit-exact
 *     against FFmpeg.
 *   - Motion compensation (Stage 2b): inter MBs not handled.
 *   - High-profile IDCT 8x8 (Stage 1 extension)
 *   - Deblock (Stage 4)
 *   - Chroma planes — the daedalus-fourier idct4 shader is luma-only
 *     in this revision; chroma blocks (4×4, 4 cb + 4 cr per MB) need a
 *     separate dispatch with different meta/dst layout.  out_uv is
 *     filled with neutral grey (128) as placeholder.
 *   - dmabuf export — still memcpy-out to caller-provided planes.
 *   - Stage 5 RGBA opt-in.
 */
 /* Return values:
 *    0  frame dispatched and copied out
 *   -1  invalid argument (NULL dec / out_y, a stride smaller than the
 *       coded width), incomplete frame, or allocation failure
 *   -3  the daedalus-fourier dispatch reported an error
 *
 * Argument errors are reported before any work is done and leave the
 * staged macroblocks in place.  Once the dispatch path has been
 * entered, the per-frame macroblock counter is reset on every exit. */
 int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                  uint8_t *out_y,  size_t y_stride,
                                  uint8_t *out_uv, size_t uv_stride)
 {
    if (!dec)
        return -1;
    if (dec->mbs_appended != dec->n_mbs)
        return -1;  /* incomplete frame */
    if (!out_y)
        return -1;
    /* Every output row receives dec->width bytes.  A stride shorter
     * than that would make consecutive row copies overlap and write
     * past the end of a buffer the caller sized from the stride. */
    if (y_stride < (size_t) dec->width)
        return -1;
    if (out_uv && uv_stride < (size_t) dec->width)
        return -1;
    int rc = 0;
    /* ---- Build frame-scaled luma-4x4 dispatch ---- */
    const size_t n_luma_blocks_per_mb = 16;
    const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb;
    /* Scratch Y plane — coded-size byte buffer.  Zero-initialised so
     * the IDCT-ADD-clip operation reduces to clip255(IDCT) per block
     * (predicted=0 because no intra/MC has run yet). */
    const size_t y_stride_int = (size_t) dec->width;
    const size_t y_size = y_stride_int * (size_t) dec->height;
    uint8_t *scratch_y = calloc(1, y_size);
    int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t));
    daedalus_h264_block_meta *meta = malloc(
        n_luma_blocks * sizeof(daedalus_h264_block_meta));
    if (!scratch_y || !flat_coeffs || !meta) {
        rc = -1;
        goto cleanup;
    }
    /* Raster-order layout: walk each MB, then each of its 16 luma 4×4
     * sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner).
     *
     * NB: H.264's actual per-MB 4×4 coefficient scan order is the
     * z-scan from spec §6.4.3 / fig 6-10.  We're using a flat raster
     * here because Phase 1 stage 1 only validates the dispatch
     * round-trip; bit-exact against an FFmpeg reference requires the
     * z-scan permutation and is a follow-on test.  The per-MB
     * coeffs[] field's first 256 entries are interpreted as 16
     * consecutive 4×4 blocks in the same raster order on the input
     * side, so this is self-consistent for the validation. */
    size_t bi = 0;
    for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
        for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
            int mb_idx = mb_y * dec->mb_width + mb_x;
            const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
            for (int sb_y = 0; sb_y < 4; sb_y++) {
                for (int sb_x = 0; sb_x < 4; sb_x++) {
                    /* Block top-left pixel in the coded Y plane. */
                    size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
                    size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
                    meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x);
                    /* Copy 16 coeffs for this block from the per-MB
                     * coeffs[] (luma offset = block_idx * 16). */
                    int block_in_mb = sb_y * 4 + sb_x;
                    memcpy(&flat_coeffs[bi * 16],
                           &mb_coeffs[block_in_mb * 16],
                           16 * sizeof(int16_t));
                    bi++;
                }
            }
        }
    }
    /* assert bi == n_luma_blocks; the loop math guarantees it */
    /* ---- One Vulkan submit + wait for the whole frame's luma IDCT.
     * AUTO substrate picks QPU per the post-decree recipe table; falls
     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */
    int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
                                                  scratch_y, y_stride_int,
                                                  flat_coeffs,
                                                  n_luma_blocks,
                                                  meta);
    if (dr != 0) {
        rc = -3;  /* GPU dispatch failure */
        goto cleanup;
    }
    /* ---- Copy out to caller's planes at the requested stride. ---- */
    for (int r = 0; r < dec->height; r++)
        memcpy(out_y + (size_t) r * y_stride,
               &scratch_y[(size_t) r * y_stride_int],
               (size_t) dec->width);
    /* Chroma placeholder: 128 = mid-grey (NV12 neutral).  Real chroma
     * IDCT dispatch is the next sub-PR. */
    if (out_uv) {
        for (int r = 0; r < dec->height / 2; r++)
            memset(out_uv + (size_t) r * uv_stride, 128, (size_t) dec->width);
    }
 cleanup:
    /* free(NULL) is a no-op, so partial allocation failure is fine. */
    free(meta);
    free(flat_coeffs);
    free(scratch_y);
    /* Start the next frame from an empty macroblock list. */
    dec->mbs_appended = 0;
    return rc;
 }
 /* Stub: dmabuf export is not implemented yet.  Both arguments are
 * ignored and the call always fails with -1. */
 int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
 {
    (void) dec; (void) plane;
    /* TODO Phase 1: vkGetMemoryFdKHR on the DPB slot's VkImage memory. */
    return -1;
 }
 /* Report whether the decoder's daedalus-fourier context has a QPU.
 * Yields 0 for a NULL decoder or a decoder without a context;
 * otherwise forwards the answer from daedalus_ctx_has_qpu(). */
 int daedalus_decoder_has_qpu(const daedalus_decoder *dec)
 {
    const int have_ctx = (dec != NULL) && (dec->dctx != NULL);
    return have_ctx ? daedalus_ctx_has_qpu(dec->dctx) : 0;
 }
@@ -1,68 +0,0 @@
 /* SPDX-License-Identifier: BSD-2-Clause */
 /*
 * daedalus-decoder — internal types shared across translation units.
 * Not installed; pure-internal.
 */
 #ifndef DAEDALUS_DECODER_INTERNAL_H
 #define DAEDALUS_DECODER_INTERNAL_H
 #include "daedalus_decoder.h"
 #include <stdint.h>
 #include <stddef.h>
 #include <daedalus.h>     /* daedalus-fourier public API */
 /* Per-MB descriptor as the GPU sees it.  Intended to be bit-laid-out
 * to match the shader's std430 layout.
 *
 * Size: the design target is 32 bytes, so that a 1080p frame's 8160
 * entries fit in a ~256 KiB SSBO.  The fields as declared below add up
 * to 72 bytes (each member already sits at its natural alignment), or
 * roughly 574 KiB for 8160 entries, so the layout still has to be
 * narrowed to reach that target.
 *
 * TODO once the shaders exist: nail down the exact std430 layout and
 * static_assert sizeof / alignof here. */
 struct daedalus_decoder_mb_desc {
    uint16_t mb_x;
    uint16_t mb_y;
    uint8_t  mb_type;
    uint8_t  mb_qp_y;
    uint8_t  mb_qp_uv;
    uint8_t  cbp;
    uint8_t  intra_4x4_modes[16];
    uint8_t  intra_16x16_mode;
    uint8_t  intra_chroma_mode;
    uint8_t  partition_mode;
    uint8_t  _pad0;              /* explicit padding after the three mode bytes */
    int8_t   ref_idx_l0[4];
    int8_t   ref_idx_l1[4];
    int16_t  mv_l0[4][2];
    int16_t  mv_l1[4][2];
    uint8_t  deblock_disable;
    int8_t   deblock_alpha_c0;
    int8_t   deblock_beta;
    uint8_t  _pad1;              /* explicit padding after the three deblock bytes */
 };
 /* Decoder context behind the opaque daedalus_decoder handle of the
 * public API. */
 struct daedalus_decoder {
    /* Geometry. */
    int  width;
    int  height;
    int  mb_width;       /* width / 16 */
    int  mb_height;      /* height / 16 */
    int  n_mbs;          /* mb_width * mb_height */
    /* daedalus-fourier context (Vulkan + V3D7 runner). */
    daedalus_ctx *dctx;
    /* Frame-shaped staging (CPU-side; will move to mapped SSBO once
     * Vulkan plumbing is in place). */
    struct daedalus_decoder_mb_desc *mb_descs;  /* n_mbs */
    int16_t                         *coeffs;    /* n_mbs * 384 */
    /* Incremented by each successful append_mb; flush_frame requires
     * it to equal n_mbs and resets it to 0. */
    int                              mbs_appended;  /* per-frame count */
    /* Output format. */
    daedalus_decoder_output_format   output_fmt;
 };
 #endif /* DAEDALUS_DECODER_INTERNAL_H */
@@ -1,210 +0,0 @@
 /* SPDX-License-Identifier: BSD-2-Clause */
 /*
 * test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-
 * scaled luma IDCT 4×4 dispatch.
 *
 * Generates a frame of random coefficients, runs daedalus_decoder
 * (with predicted=0 by the scaffold's flush_frame contract), and
 * compares every output byte against an inline C reference that
 * mirrors the H.264 §8.5.12.1 1D butterfly.
 *
 * Why "bit-exact": the GPU shader and the C reference apply the same
 * integer arithmetic.  Any rounding / sign / overflow disagreement is
 * a bug.  Pass = every output byte matches.
 *
 * Scope match with flush_frame: the test mirrors flush_frame's
 * per-MB → flat block layout (raster scan within MB, no z-scan
 * permutation).  That keeps the test focused on IDCT correctness;
 * the z-scan permutation that bridges to libavcodec's per-MB coeffs
 * layout is a separate concern (handled in the eventual libavcodec-
 * intercept patch).
 *
 * Not in scope (covered by other tests / future PRs):
 *   - chroma planes (Phase 1 stage 1 fills UV with grey 128)
 *   - IDCT 8×8 (Phase 1 follow-on)
 *   - bit-exactness against real H.264 streams (test-vector PR)
 *   - non-zero predicted pixels (intra prediction lands in Stage 2a)
 */
 #include "daedalus_decoder.h"
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 /* xorshift64 (shift triple 13 / 7 / 17) for deterministic random
 * coefficient generation.  This is the plain generator, not the
 * multiplicative "xorshift64*" variant: no output multiplier is
 * applied.
 *
 * xs64_state holds the generator state; callers seed the generator by
 * assigning to it before the first xs64() call. */
 static uint64_t xs64_state;
 /* Advance the generator by one step and return the new state.
 *
 * An all-zero state is a fixed point of the xorshift recurrence (every
 * subsequent output would be 0), so a zero state is replaced by a
 * fixed non-zero constant before stepping.  Sequences produced from
 * non-zero seeds are unchanged. */
 static uint64_t xs64(void)
 {
    uint64_t x = xs64_state;
    if (x == 0)
        x = 0x9e3779b97f4a7c15ULL;
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    xs64_state = x;
    return x;
 }
 /* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
 * then column pass; +32 rounding, >>6, add to predicted (=0 here),
 * clip to u8.  Bit-exact-equivalent transcription of daedalus-fourier
 * tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
 * fair-use for test purposes — same algorithm, no copy of code). */
 /* Saturate v to the unsigned 8-bit range [0, 255]. */
 static int clip_u8(int v)
 {
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
 }
 /* One 1-D pass of the 4-point inverse-transform butterfly.
 * Reads d[0..3] and writes out[0..3]; every input is read before any
 * output is stored. */
 static void h264_idct4_butterfly(const int d[4], int out[4])
 {
    const int even_sum  = d[0] + d[2];
    const int even_diff = d[0] - d[2];
    const int odd_lo    = (d[1] >> 1) - d[3];
    const int odd_hi    = d[1] + (d[3] >> 1);
    /* Outer pair uses the even sum, inner pair the even difference. */
    out[0] = even_sum  + odd_hi;
    out[3] = even_sum  - odd_hi;
    out[1] = even_diff + odd_lo;
    out[2] = even_diff - odd_lo;
 }
 /* Reference 4x4 inverse transform + add.
 *
 * block holds 16 coefficients in COLUMN-MAJOR order (matches FFmpeg +
 * daedalus-fourier): block[c*4 + r] is the coefficient at row r,
 * column c.  The butterfly is applied along each row, then along each
 * column; each result is rounded with +32, shifted right by 6, added
 * to the byte already stored at dst[r*stride + c] and clipped to u8.
 * block itself is not modified. */
 static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
 {
    int after_rows[4][4];   /* [r][c] after the row pass    */
    int after_cols[4][4];   /* [r][c] after the column pass */
    int in[4];
    int out[4];
    int r, c;
    /* Row pass: for a fixed row r, walk the columns of the
     * column-major input. */
    for (r = 0; r < 4; r++) {
        for (c = 0; c < 4; c++)
            in[c] = block[c * 4 + r];
        h264_idct4_butterfly(in, out);
        for (c = 0; c < 4; c++)
            after_rows[r][c] = out[c];
    }
    /* Column pass: for a fixed column c, walk the rows of the
     * intermediate result. */
    for (c = 0; c < 4; c++) {
        for (r = 0; r < 4; r++)
            in[r] = after_rows[r][c];
        h264_idct4_butterfly(in, out);
        for (r = 0; r < 4; r++)
            after_cols[r][c] = out[r];
    }
    /* Round, add to the existing destination pixel, clip. */
    for (r = 0; r < 4; r++) {
        uint8_t *line = dst + r * stride;
        for (c = 0; c < 4; c++) {
            int residual = (after_cols[r][c] + 32) >> 6;
            line[c] = (uint8_t) clip_u8(line[c] + residual);
        }
    }
 }
 /* Test driver.  Usage: test_idct_bitexact [width [height [seed]]]
 *
 * Feeds one frame of pseudo-random luma coefficients through
 * daedalus_decoder, rebuilds the same frame on the CPU with
 * ref_idct4_add(), and compares the two luma planes byte for byte.
 *
 * Exit status: 0 on a bit-exact match, or when the decoder context
 * cannot be created (reported as SKIP on stderr); 1 on any mismatch or
 * runtime failure; 2 when the requested dimensions are unusable. */
 int main(int argc, char **argv)
 {
    /* Generous sanity bound on each dimension: keeps n_mbs well inside
     * int and the MB coordinates inside uint16_t. */
    enum { MAX_DIM = 16384 };
    /* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
     * the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs). */
    int width  = argc > 1 ? atoi(argv[1]) : 320;
    int height = argc > 2 ? atoi(argv[2]) : 240;   /* 240 / 16 = 15 → coded 240 */
    uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
    int rc = 1;
    daedalus_decoder *dec = NULL;
    int16_t (*per_mb_coeffs)[384] = NULL;
    uint8_t *gpu_y = NULL;
    uint8_t *ref_y = NULL;
    struct daedalus_decoder_mb_input mb = {0};
    size_t y_size;
    size_t diffs = 0;
    size_t first_diff = 0;
    int mb_w, mb_h, n_mbs, frc;

    /* Coded dims must be mod-16; 320×240 is canonical QVGA.  Reject
     * bad arguments here: otherwise context creation fails on them and
     * the failure is misreported as "SKIP ... unavailable" with a
     * passing exit status. */
    if (width <= 0 || height <= 0 || width > MAX_DIM || height > MAX_DIM ||
        width % 16 != 0 || height % 16 != 0) {
        fprintf(stderr,
                "bad dimensions %dx%d: width and height must be positive "
                "multiples of 16 (max %d)\n", width, height, (int) MAX_DIM);
        return 2;
    }
    xs64_state = seed;
    mb_w  = width  / 16;
    mb_h  = height / 16;
    n_mbs = mb_w * mb_h;
    y_size = (size_t) width * (size_t) height;
    /* %llx: the seed is 64-bit, and unsigned long may be only 32 bits. */
    printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%llx\n",
           width, height, n_mbs, (unsigned long long) seed);

    dec = daedalus_decoder_create(width, height);
    if (!dec) {
        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
        return 0;
    }

    /* Build the per-MB inputs.  Each MB gets 16 luma 4×4 blocks of
     * random coeffs in [-512, 511] — same range as the daedalus-fourier
     * cycle-6 M1 gate uses. */
    per_mb_coeffs = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
    gpu_y = calloc(1, y_size);
    ref_y = calloc(1, y_size);
    if (!per_mb_coeffs || !gpu_y || !ref_y) {
        fprintf(stderr, "alloc fail\n");
        goto cleanup;
    }
    for (int i_mb = 0; i_mb < n_mbs; i_mb++) {
        for (int i = 0; i < 384; i++) {
            if (i < 256)
                per_mb_coeffs[i_mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
            else
                per_mb_coeffs[i_mb][i] = 0;  /* chroma — unused this stage */
        }
    }

    /* Append in raster order. */
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            mb.mb_x = (uint16_t) mx;
            mb.mb_y = (uint16_t) my;
            mb.coeffs = per_mb_coeffs[my * mb_w + mx];
            if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                fprintf(stderr, "append (%d,%d) failed\n", mx, my);
                goto cleanup;
            }
        }
    }

    /* Flush. */
    frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width, NULL, 0);
    if (frc != 0) {
        fprintf(stderr, "flush_frame rc=%d\n", frc);
        goto cleanup;
    }

    /* Compute the reference output: same per-MB → flat raster block
     * layout as flush_frame uses.  ref_idct4_add() takes the
     * coefficients as const, so each block is read in place; no
     * scratch copy is needed. */
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            const int16_t *mb_coeffs = per_mb_coeffs[my * mb_w + mx];
            for (int sb_y = 0; sb_y < 4; sb_y++) {
                for (int sb_x = 0; sb_x < 4; sb_x++) {
                    int block_in_mb = sb_y * 4 + sb_x;
                    size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
                    size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
                    ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
                                  width, &mb_coeffs[block_in_mb * 16]);
                }
            }
        }
    }

    /* Byte-by-byte compare. */
    for (size_t i = 0; i < y_size; i++) {
        if (gpu_y[i] != ref_y[i]) {
            if (diffs == 0) first_diff = i;
            diffs++;
        }
    }
    printf("Y bytes total:  %zu\n", y_size);
    printf("Y bytes diff:   %zu (%.4f%%)\n", diffs, 100.0 * diffs / y_size);
    if (diffs) {
        printf("first diff at offset %zu: gpu=%u ref=%u\n",
               first_diff, gpu_y[first_diff], ref_y[first_diff]);
        fprintf(stderr, "BIT-EXACT FAIL\n");
    } else {
        printf("BIT-EXACT PASS\n");
        rc = 0;
    }

 cleanup:
    /* free(NULL) is a no-op, and the smoke test exercises
     * daedalus_decoder_destroy(NULL) as a no-op too, so this path is
     * safe from every failure point above. */
    free(ref_y);
    free(gpu_y);
    free(per_mb_coeffs);
    daedalus_decoder_destroy(dec);
    return rc;
 }
@@ -1,161 +0,0 @@
 /* SPDX-License-Identifier: BSD-2-Clause */
 /*
 * Scaffold smoke test — verifies the daedalus-decoder library links
 * cleanly against daedalus-fourier and the lifecycle entry points
 * don't immediately crash.  No actual decoding work yet.
 *
 * Returns 0 on success, non-zero on any unexpected behaviour.
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "daedalus_decoder.h"
 /* EXPECT(cond, msg): test assertion.  When cond is false, prints
 * "EXPECT FAIL <file>:<line>: <msg>" to stderr and executes `return 1`
 * in the enclosing function, so it is only usable inside functions
 * that return int.  No cleanup runs on that early return.  The
 * do/while(0) wrapper makes the macro behave as a single statement. */
 #define EXPECT(cond, msg) do { \
    if (!(cond)) { \
        fprintf(stderr, "EXPECT FAIL %s:%d: %s\n", __FILE__, __LINE__, msg); \
        return 1; \
    } \
 } while (0)
 /* Smoke-test driver.
 *
 * Part 1 checks argument validation and per-frame state rules on a
 * fresh context; part 2 pushes a full frame of all-zero coefficients
 * through flush_frame and checks the output planes.
 *
 * Returns 0 on success, or when no decoder context can be created
 * (reported as SKIP on stderr); returns 1 on the first failed
 * expectation or on allocation failure. */
 int main(void)
 {
    /* Frame geometry used for every valid context in this test:
     * 1920x1088 is 120x68 macroblocks. */
    const int frame_w = 1920;
    const int frame_h = 1088;

    printf("daedalus-decoder version: %s\n", daedalus_decoder_version());
    /* Create / destroy null is a no-op. */
    daedalus_decoder_destroy(NULL);
    /* Bad dimensions rejected. */
    EXPECT(daedalus_decoder_create(0,    0   ) == NULL, "zero dims must reject");
    EXPECT(daedalus_decoder_create(1919, 1088) == NULL, "non-16-multiple width must reject");
    EXPECT(daedalus_decoder_create(1920, 1079) == NULL, "non-16-multiple height must reject");
    /* Valid 1088p create. */
    daedalus_decoder *dec = daedalus_decoder_create(frame_w, frame_h);
    if (!dec) {
        /* Vulkan init failure on this host — degrades to skip, not fail.
         * (CI runners without V3D7 will hit this; the smoke test
         * shouldn't gate on hardware presence.) */
        fprintf(stderr, "SKIP: daedalus_decoder_create returned NULL "
                "(Vulkan / V3D7 unavailable on this host)\n");
        return 0;
    }
    printf("ctx created: 1920x1088, has_qpu=%d\n",
           daedalus_decoder_has_qpu(dec));
    /* set_output_format before any MB has been appended
     * (mbs_appended == 0) is allowed. */
    EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_RGBA) == 0,
           "switch to RGBA on virgin ctx");
    EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_NV12) == 0,
           "switch back to NV12");
    /* Append rejects out-of-bounds + null inputs. */
    int16_t coeffs[384] = {0};
    struct daedalus_decoder_mb_input mb = {0};
    mb.coeffs = coeffs;
    mb.mb_x = 0; mb.mb_y = 0;
    EXPECT(daedalus_decoder_append_mb(dec, NULL) == -1, "null mb rejects");
    {
        struct daedalus_decoder_mb_input mb2 = mb;
        mb2.coeffs = NULL;
        EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1, "null coeffs rejects");
    }
    {
        struct daedalus_decoder_mb_input mb2 = mb;
        mb2.mb_x = 9999; mb2.mb_y = 9999;
        EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1, "OOB coords reject");
    }
    /* Append first MB at raster index 0 — should succeed. */
    EXPECT(daedalus_decoder_append_mb(dec, &mb) == 0, "append (0,0)");
    /* Coordinates are (mb_x, mb_y).  Skipping (1,0) and appending
     * (0,1) violates raster order — reject. */
    {
        struct daedalus_decoder_mb_input mb2 = mb;
        mb2.mb_x = 0; mb2.mb_y = 1;
        EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1,
               "out-of-raster-order rejects");
    }
    /* In-order: (1,0). */
    mb.mb_x = 1; mb.mb_y = 0;
    EXPECT(daedalus_decoder_append_mb(dec, &mb) == 0, "append (1,0)");
    /* Flush an incomplete frame: should fail because mbs_appended != n_mbs. */
    EXPECT(daedalus_decoder_flush_frame(dec, NULL, 0, NULL, 0) == -1,
           "incomplete-frame flush rejects");
    /* set_output_format mid-frame (mbs_appended > 0) must reject. */
    EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_RGBA) == -1,
           "mid-frame format change rejects");
    daedalus_decoder_destroy(dec);

    /* ---- Full-frame round-trip with all-zero coefficients.
     * Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT
     * dispatch and a successful GPU round-trip returns 0.  IDCT of
     * all-zero coefficients with zero-initialised predicted = all-zero
     * output pixels. */
    dec = daedalus_decoder_create(frame_w, frame_h);
    if (!dec) {
        fprintf(stderr, "SKIP roundtrip: ctx create failed\n");
        return 0;
    }
    static int16_t zero_coeffs[384] = {0};
    struct daedalus_decoder_mb_input zmb = {0};
    zmb.coeffs = zero_coeffs;
    int mb_width = frame_w / 16;   /* 120 */
    int mb_height = frame_h / 16;  /* 68 */
    int n_mbs = mb_width * mb_height;
    for (int mby = 0; mby < mb_height; mby++) {
        for (int mbx = 0; mbx < mb_width; mbx++) {
            zmb.mb_x = (uint16_t) mbx;
            zmb.mb_y = (uint16_t) mby;
            if (daedalus_decoder_append_mb(dec, &zmb) != 0) {
                fprintf(stderr, "append (%d, %d) failed\n", mbx, mby);
                return 1;
            }
        }
    }
    printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height);
    size_t y_size = (size_t) frame_w * (size_t) frame_h;
    size_t uv_size = y_size / 2;
    uint8_t *out_y = malloc(y_size);
    uint8_t *out_uv = malloc(uv_size);
    /* Check both allocations before the memset calls below touch
     * them. */
    if (!out_y || !out_uv) {
        fprintf(stderr, "alloc fail\n");
        free(out_y);
        free(out_uv);
        daedalus_decoder_destroy(dec);
        return 1;
    }
    /* Pre-fill with sentinel so any read-then-write bug becomes visible. */
    memset(out_y, 0xab, y_size);
    memset(out_uv, 0xcd, uv_size);
    int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) frame_w,
                                           out_uv, (size_t) frame_w);
    printf("flush_frame rc=%d\n", frc);
    EXPECT(frc == 0, "flush succeeds on full frame");
    /* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */
    int y_nz = 0;
    for (size_t i = 0; i < y_size; i++)
        if (out_y[i] != 0) y_nz++;
    printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
    EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");
    /* UV plane should be neutral grey (128) per Phase 1 placeholder. */
    int uv_wrong = 0;
    for (size_t i = 0; i < uv_size; i++)
        if (out_uv[i] != 128) uv_wrong++;
    printf("UV non-128 bytes: %d / %zu\n", uv_wrong, uv_size);
    EXPECT(uv_wrong == 0, "UV plane is grey (128) Phase 1 placeholder");
    free(out_y);
    free(out_uv);
    daedalus_decoder_destroy(dec);
    printf("smoke OK\n");
    return 0;
 }