1 Commits

Author SHA1 Message Date
marfrit cfeb998ff8 Update DESIGN.md 2026-05-24 19:51:38 +00:00
10 changed files with 9 additions and 1745 deletions
-15
View File
@@ -1,15 +0,0 @@
build/
build-*/
*.o
*.a
*.so
*.so.*
*.spv
.vscode/
.cache/
compile_commands.json
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile
.ninja_*
-160
View File
@@ -1,160 +0,0 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# daedalus-decoder — frame-level GPU H.264 decoder for V3D7 (Pi 5).
# Phase 1 scaffold; see DESIGN.md for architecture.
#
# Build dependencies:
# - daedalus-fourier ≥ 0.1.0 (kernel pack, V3D primitives + recipe layer)
# resolved via pkg-config; install via the daedalus-fourier upstream
# `cmake --install` rule (PR #5 made the .pc relocatable, so any
# install prefix works as long as $PKG_CONFIG_PATH is set).
# - Vulkan headers + libvulkan (pulled in transitively via
# daedalus-fourier, listed here explicitly for the link order).
#
# Build:
# cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release
# cmake --build build
# ctest --test-dir build
cmake_minimum_required(VERSION 3.20)
project(daedalus-decoder
VERSION 0.0.1
DESCRIPTION "Frame-level GPU H.264 decoder for Raspberry Pi 5 / V3D7"
LANGUAGES C)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_C_EXTENSIONS OFF)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# Pi 5 is the only supported target. Other aarch64 SoCs (Pi 4 V3D4,
# RK3588 Mali, …) might work but would need explicit substrate +
# shader-pack validation per the daedalus-fourier architecture
# backlog. Don't pretend to support what we haven't validated.
if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
message(WARNING
"daedalus-decoder is designed for aarch64 (Pi 5 BCM2712 / V3D7). "
"Build will proceed but is unlikely to function.")
endif()
add_compile_options(-Wall -Wextra -Wno-unused-parameter)
# ---- Dependencies --------------------------------------------------
find_package(PkgConfig REQUIRED)
# daedalus-fourier — find_package via pkg-config per the Phase 1
# decision §9.6. Minimum version 0.1.0 (the cycle 6-9 shaders + pool
# + recipe-flip baseline). PKG_CONFIG_PATH should point at the
# directory holding daedalus-fourier.pc (e.g. /usr/local/lib/pkgconfig
# or a custom install prefix).
pkg_check_modules(DAEDALUS_FOURIER REQUIRED daedalus-fourier>=0.1.0)
# Vulkan — daedalus-fourier already depends on this; we add it
# explicitly so the link order stays correct (daedalus-fourier static
# archive contains undefined vk* symbols that the loader resolves).
find_package(Vulkan REQUIRED)
# ---- Version string baked into the library ------------------------
# git rev tagged onto the version string for traceability; degrades
# gracefully to bare semver if git isn't available.
execute_process(
COMMAND git -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse --short=7 HEAD
OUTPUT_VARIABLE DAEDALUS_DECODER_GITREV
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET)
if(DAEDALUS_DECODER_GITREV)
set(DAEDALUS_DECODER_VERSION "${PROJECT_VERSION}+g${DAEDALUS_DECODER_GITREV}")
else()
set(DAEDALUS_DECODER_VERSION "${PROJECT_VERSION}")
endif()
message(STATUS "daedalus-decoder version: ${DAEDALUS_DECODER_VERSION}")
# ---- Library ------------------------------------------------------
add_library(daedalus_decoder STATIC
src/daedalus_decoder.c
)
target_include_directories(daedalus_decoder
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
PRIVATE
src
${DAEDALUS_FOURIER_INCLUDE_DIRS}
)
target_link_directories(daedalus_decoder
PUBLIC
${DAEDALUS_FOURIER_LIBRARY_DIRS}
)
target_link_libraries(daedalus_decoder
PUBLIC
# Order matters: daedalus-fourier static archive references
# vulkan symbols; the loader needs daedalus-fourier first then
# vulkan to resolve them.
${DAEDALUS_FOURIER_LIBRARIES}
Vulkan::Vulkan
)
target_compile_definitions(daedalus_decoder
PRIVATE
DAEDALUS_DECODER_VERSION="${DAEDALUS_DECODER_VERSION}"
)
target_compile_options(daedalus_decoder PRIVATE -O2)
# ---- Smoke test ---------------------------------------------------
enable_testing()
add_executable(test_smoke tests/test_smoke.c)
target_link_libraries(test_smoke PRIVATE daedalus_decoder)
target_compile_options(test_smoke PRIVATE -O2)
add_test(NAME smoke COMMAND test_smoke)
add_executable(test_idct_bitexact tests/test_idct_bitexact.c)
target_link_libraries(test_idct_bitexact PRIVATE daedalus_decoder)
target_compile_options(test_idct_bitexact PRIVATE -O2)
# 320x240 QVGA — fast inner-loop test (300 MBs, sub-second).
add_test(NAME idct_bitexact COMMAND test_idct_bitexact)
# Same QVGA test re-run on the CPU NEON path (forces fallback even on
# V3D7 hosts). Catches silent drift between the V3D shader and the
# NEON reference path — both must produce identical output for the
# same coefficient input. Also keeps the bit-exact gate alive on
# hosts without V3D7 (CI runners, x86 dev boxes).
add_test(NAME idct_bitexact_cpu COMMAND test_idct_bitexact 320 240
0xfeedface5a5a5a5a cpu)
# 1920x1088 1080p — deployment-scale test (8160 MBs, ~0.25 s on hertz).
# Validates the per-MB block index + pixel offset math at full coded
# height (1088, not 1080 — see daedalus_decoder.h on H.264 coded vs
# displayed dims). Cheap enough to run unconditionally; if it ever
# gets slow we'll split into a CTest LABEL for opt-in.
add_test(NAME idct_bitexact_1080p COMMAND test_idct_bitexact 1920 1088)
# ---- Benchmarks (not gated by ctest) ------------------------------
#
# Build-time only; user runs them by hand when checking perf. Adding
# them as ctest would make every CI run slow and the numbers would
# get drowned in pass/fail noise. See the header of each .c for what
# they measure.
add_executable(bench_flush_frame tests/bench_flush_frame.c)
target_link_libraries(bench_flush_frame PRIVATE daedalus_decoder)
target_compile_options(bench_flush_frame PRIVATE -O2)
# ---- Install ------------------------------------------------------
#
# Library + public header. Stage 2/3 will add a pkg-config file and
# CMake config exports once the API stabilises; pre-0.1 the scaffold
# install just gives the static archive a home.
include(GNUInstallDirs)
install(TARGETS daedalus_decoder
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
install(FILES include/daedalus_decoder.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+9 -15
View File
@@ -261,31 +261,25 @@ That's a substantial shader inventory. Each requires bit-exact M1 gate against
**Phase 4 — Production-ready deblock + perf optimization + libva integration** (+4 weeks). Real-world stream conformance. Plug into daedalus-v4l2 daemon as the actual decode backend.
**Total H.264 budget:** 4-6 months.
**Phase 5+ (future codec scope, not committed):** VP9 and AV1 reuse the same frame-level dispatch architecture, daedalus-fourier kernel pack, and DPB plumbing. Per §9.7, they are deferred but *not firmly out-of-scope*. HEVC stays firmly out (Pi 5 has `rpi-hevc-dec` for that).
**Total budget:** 4-6 months.
---
## 9. Phase 1 decisions
## 9. Open questions
User-confirmed 2026-05-24. All seven questions from the initial
draft are now decided; this section preserves the original wording
of each item for traceability.
1. **Intra prediction strategy:** GPU wavefront (~187 dispatches, more complex) vs CPU speculative (simpler, slower). Plan: wavefront in Phase 1; revisit if it's the perf bottleneck. [x]
1. **Intra prediction strategy:** GPU wavefront (~187 dispatches, more complex) vs CPU speculative (simpler, slower). **Decision: wavefront in Phase 1; revisit if it's the perf bottleneck.**
2. **libavcodec intercept granularity:** macroblock-level (substitution-arc evolution) vs slice-level (cleaner rewrite). Plan: macroblock-level for Phase 1; consider slice-level later if buffer accumulation overhead is non-trivial. [x]
2. **libavcodec intercept granularity:** macroblock-level (substitution-arc evolution) vs slice-level (cleaner rewrite). **Decision: macroblock-level for Phase 1; consider slice-level later if buffer accumulation overhead is non-trivial.**
3. **Shader parameterization:** 16 qpel variants as 16 shaders, or one parameterized shader with switch on mc_position? V3D's compiler might inline-optimize either; needs measurement. [x] Measurement it is.
3. **Shader parameterization:** 16 qpel variants as 16 shaders, or one parameterized shader with switch on mc_position? **Decision: measure both during Phase 2 (the MC phase) and pick the winner. No commit ahead of measurement.**
4. **DPB allocation:** Vulkan-native VkImage with dmabuf export, vs CPU-allocated dma_buf imported into Vulkan. Affects V4L2 integration story. Plan: Vulkan-native with `VK_KHR_external_memory_dma_buf` export; daedalus-v4l2 daemon imports. [x]
4. **DPB allocation:** Vulkan-native VkImage with dmabuf export, vs CPU-allocated dma_buf imported into Vulkan. **Decision: Vulkan-native with `VK_KHR_external_memory_dma_buf` export; daedalus-v4l2 daemon imports.**
5. **Daemon integration shape:** does daedalus-decoder ship as a static library the daemon links, or as a separate process the daemon talks to? Library, almost certainly — process boundary would multiply IPC cost. [x] library.
5. **Daemon integration shape:** static library the daemon links, or separate process. **Decision: library link.**
6. **Build dependency on daedalus-fourier:** as a CMake `find_package`, or vendored? `find_package`, pinned to a tagged release. daedalus-fourier becomes the "kernel pack" upstream library. [x] Yes.
6. **Build dependency on daedalus-fourier:** CMake `find_package`, or vendored? **Decision: `find_package`, pinned to a tagged release. daedalus-fourier becomes the "kernel pack" upstream library.**
7. **Codec scope.** **Decision: firmly out-of-scope for daedalus-decoder are HEVC (Pi 5 has `rpi-hevc-dec` for that), 10-bit, interlaced, and FMO/ASO.** VP9 and AV1 are *not* firmly out — they're future codec scope for the same framework after H.264 lands. This is a scope expansion from the initial draft, which had grouped them with HEVC under "firmly out".
7. **Out-of-scope for daedalus-decoder (firmly):** HEVC (Pi 5 has rpi-hevc-dec for that), 10-bit, interlaced, FMO/ASO.
---
-24
View File
@@ -1,24 +0,0 @@
BSD 2-Clause License
Copyright (c) 2026, Markus Fritsche
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-225
View File
@@ -1,225 +0,0 @@
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* daedalus-decoder — public C API.
*
* Frame-level GPU H.264 decoder targeting V3D7 (Raspberry Pi 5). Built
* on daedalus-fourier's V3D compute primitives at frame granularity —
* one Vulkan submit per frame, one fence wait per frame, encoded
* bitstream in (via libavcodec's per-MB intercept), NV12 frame out.
*
* Per the 2026-05-24 Phase 1 design decisions:
* - libavcodec intercept is at macroblock-level (substitution-arc
* evolution): the caller is expected to drive the per-MB CABAC /
* CAVLC entropy decode and feed each macroblock's descriptor +
* coefficients via daedalus_decoder_append_mb(). flush_frame()
* builds the per-frame VkCommandBuffer and submits.
* - DPB is Vulkan-native VkImage with VK_KHR_external_memory_dma_buf
* export. The caller can obtain the output frame's dmabuf fd
* via daedalus_decoder_export_dmabuf().
* - Daemon integration shape: this library is statically linked into
* daedalus_v4l2_daemon. No IPC.
*
* STATUS: scaffold. No GPU pipeline implemented yet; all functions
* are stubs that compile but do not decode anything. See DESIGN.md
* for the architecture.
*
* ABI: pre-0.1 — every signature here may change. Don't rely on
* stability yet.
*/
#ifndef DAEDALUS_DECODER_H
#define DAEDALUS_DECODER_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/* -------------------------------------------------------------------
* Opaque decoder context. One per concurrent stream.
* ----------------------------------------------------------------- */
typedef struct daedalus_decoder daedalus_decoder;
/* -------------------------------------------------------------------
* Per-macroblock input. Mirrors §3 of DESIGN.md. The caller's
* libavcodec intercept populates this from the H264SliceContext
* fields after ff_h264_decode_mb_cabac/cavlc returns and before
* ff_h264_hl_decode_mb is supposed to run (we replace the latter).
* ----------------------------------------------------------------- */
struct daedalus_decoder_mb_input {
/* Frame coordinates (macroblock units). */
uint16_t mb_x;
uint16_t mb_y;
/* Type + quantisation. */
uint8_t mb_type; /* H.264 spec table 7-13/7-14/7-17/7-18 enum */
uint8_t mb_qp_y;
uint8_t mb_qp_uv;
uint8_t cbp; /* coded block pattern, 0..47 */
/* Intra prediction (used iff mb_type == I_NxN or I_16x16). */
uint8_t intra_4x4_modes[16];
uint8_t intra_16x16_mode;
uint8_t intra_chroma_mode;
/* Inter motion / partitions (used iff P_* or B_*). */
uint8_t partition_mode; /* P_16x16 / P_16x8 / P_8x16 / P_8x8 / etc. */
int8_t ref_idx_l0[4]; /* per partition; -1 = not used */
int8_t ref_idx_l1[4]; /* B only */
int16_t mv_l0[4][2]; /* qpel precision (1/4 sample); (x, y) */
int16_t mv_l1[4][2];
/* Deblocking filter parameters. */
uint8_t deblock_disable; /* 0 = enabled */
int8_t deblock_alpha_c0;
int8_t deblock_beta;
/* High-profile 8x8 transform selector.
* 0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks
* (16 coeffs each, raster sb_y*4+sb_x); the chroma section is
* always 4x4.
* 1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs
* each, raster sb_y*2+sb_x). Set per H.264's
* transform_8x8_size_flag. Chroma remains 4x4 (4:2:0).
*/
uint8_t transform_8x8;
/* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all
* column-major within each 4x4 or 8x8 block (matches FFmpeg
* convention). Caller-owned; copied during append. */
const int16_t *coeffs; /* points at exactly 384 int16_t */
};
/* -------------------------------------------------------------------
* Output frame format selector.
* ----------------------------------------------------------------- */
typedef enum {
DAEDALUS_DECODER_OUTPUT_NV12 = 0, /* default; Stage 4 final */
DAEDALUS_DECODER_OUTPUT_RGBA = 1, /* Stage 5 opt-in */
} daedalus_decoder_output_format;
/* -------------------------------------------------------------------
* Substrate selector. Determines which backend daedalus-fourier
* dispatches the per-frame compute through.
*
* AUTO is the only sensible choice for production — it picks per the
* recipe table baked into daedalus-fourier (post 2026-05-23 decree:
* QPU when a V3D shader exists, CPU NEON otherwise). The explicit
* options exist for testing:
*
* - CPU forces the dispatch onto the NEON path even when V3D7 is
* available. Lets the bit-exact ctests run on hosts without a
* working Vulkan/V3D stack (CI runners, dev x86 boxes via
* cross-build), and lets us cross-check the V3D shader output
* against the NEON reference path on hosts that DO have V3D.
* - QPU is the dual — force QPU even on a CPU-preferred kernel.
* Useful for benchmarking specific QPU paths in isolation.
*
* A non-AUTO selection on a host that can't satisfy it
* (DAEDALUS_DECODER_SUBSTRATE_QPU on an x86 dev box) propagates a
* dispatch failure back through flush_frame as -3.
* ----------------------------------------------------------------- */
typedef enum {
DAEDALUS_DECODER_SUBSTRATE_AUTO = 0,
DAEDALUS_DECODER_SUBSTRATE_CPU = 1,
DAEDALUS_DECODER_SUBSTRATE_QPU = 2,
} daedalus_decoder_substrate;
/* -------------------------------------------------------------------
* Lifecycle
* ----------------------------------------------------------------- */
/* Create a decoder context for the given **coded** frame dimensions.
*
* width, height: pixels of the H.264 coded picture, NOT the displayed
* picture. Both must be multiples of 16 (macroblock granularity).
* For displayed 1080p (1920×1080), the coded frame is 1920×1088 with
* the SPS's `frame_cropping_*` offsets cropping the bottom 8 rows.
* The caller is responsible for translating from SPS dims + crop
* rectangle to the values passed here; we decode the coded frame.
*
* Returns NULL on bad dimensions or allocation failure. Returns a
* usable context with daedalus_decoder_has_qpu() == 0 when Vulkan
* init fails — callers that need GPU work should check has_qpu
* before relying on it.
*/
daedalus_decoder *daedalus_decoder_create(int width, int height);
/* Free all resources. Safe with NULL. */
void daedalus_decoder_destroy(daedalus_decoder *dec);
/* Switch output format BEFORE the first append_mb call of a frame.
* Default is NV12. Returns 0 on success, -1 if called mid-frame
* (caller must flush first). */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
daedalus_decoder_output_format fmt);
/* Override the dispatch substrate for subsequent flush_frame calls.
* Default is AUTO. Same mid-frame-change restriction as
* set_output_format. */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
daedalus_decoder_substrate sub);
/* -------------------------------------------------------------------
* Per-frame submission
* ----------------------------------------------------------------- */
/* Append one macroblock's data to the current frame's descriptor SSBO
* + coefficient SSBO. No GPU dispatch yet — just CPU-side writes.
*
* Must be called in raster order (mb_y * mb_width + mb_x) for the
* intra-prediction wavefront to work correctly in Phase 1.
*
* Returns 0 on success, negative on bounds violation or OOM.
*/
int daedalus_decoder_append_mb(daedalus_decoder *dec,
const struct daedalus_decoder_mb_input *mb);
/* End-of-frame flush: builds the per-frame VkCommandBuffer with all
* pipeline stages, submits once, waits on a single fence, copies the
* NV12 (or RGBA when opted in) output into the caller-provided
* planes.
*
* For NV12:
* out_y / y_stride: Y plane (W*H bytes minimum, at the given stride)
* out_uv / uv_stride: interleaved UV plane (W*(H/2) bytes minimum)
*
* For RGBA: out_y receives 4*W*H bytes at y_stride; out_uv ignored.
*
* Returns 0 on success, negative on Vulkan failure or undecodable
* frame. After return, the decoder is ready for the next frame's
* append calls.
*/
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
uint8_t *out_y, size_t y_stride,
uint8_t *out_uv, size_t uv_stride);
/* Export the most-recently-decoded frame as a dma_buf fd. The fd is
* owned by the caller and must be closed when done. Lets V4L2
* consumers (daedalus_v4l2_daemon, libva-v4l2-request-fourier) attach
* the GPU-decoded surface directly to a CAPTURE plane without a CPU
* round-trip.
*
* Returns the dmabuf fd on success, -1 on failure. Must be called
* AFTER flush_frame returns for the relevant frame.
*/
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane);
/* -------------------------------------------------------------------
* Diagnostics
* ----------------------------------------------------------------- */
/* daedalus-decoder build version (semver string, e.g. "0.0.1+g0a1b2c3"). */
const char *daedalus_decoder_version(void);
/* Whether the underlying daedalus-fourier context picked up a working
* V3D7 Vulkan instance. Returns 0 if Vulkan init failed and the
* decoder is operating in stub / failure mode. */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec);
#ifdef __cplusplus
}
#endif
#endif /* DAEDALUS_DECODER_H */
-446
View File
@@ -1,446 +0,0 @@
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* daedalus-decoder — public C API implementation.
*
* Scaffold only. Most functions return success with no GPU work
* performed; the bodies will fill in across Phases 1-4 per DESIGN.md
* §8. This file exists so the API surface compiles, links, and can
* be smoke-tested end-to-end (ctx create / append / flush / destroy)
* before any shader work begins.
*/
#include "internal.h"
#include <stdlib.h>
#include <string.h>
/* Built via -D from CMakeLists. */
#ifndef DAEDALUS_DECODER_VERSION
#define DAEDALUS_DECODER_VERSION "0.0.1+scaffold"
#endif
const char *daedalus_decoder_version(void)
{
return DAEDALUS_DECODER_VERSION;
}
daedalus_decoder *daedalus_decoder_create(int width, int height)
{
if (width <= 0 || height <= 0)
return NULL;
if ((width & 15) || (height & 15))
return NULL; /* must be multiple of 16 */
daedalus_decoder *dec = calloc(1, sizeof(*dec));
if (!dec)
return NULL;
dec->width = width;
dec->height = height;
dec->mb_width = width >> 4;
dec->mb_height = height >> 4;
dec->n_mbs = dec->mb_width * dec->mb_height;
dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
dec->substrate = DAEDALUS_DECODER_SUBSTRATE_AUTO;
/* daedalus-fourier ctx — required. Phase 1 needs the QPU; if
* Vulkan init fails the decoder is unusable. Caller can check
* via daedalus_decoder_has_qpu(). */
dec->dctx = daedalus_ctx_create();
if (!dec->dctx) {
free(dec);
return NULL;
}
dec->mb_descs = calloc((size_t) dec->n_mbs, sizeof(*dec->mb_descs));
dec->coeffs = calloc((size_t) dec->n_mbs * 384, sizeof(int16_t));
if (!dec->mb_descs || !dec->coeffs) {
daedalus_decoder_destroy(dec);
return NULL;
}
return dec;
}
void daedalus_decoder_destroy(daedalus_decoder *dec)
{
if (!dec)
return;
free(dec->coeffs);
free(dec->mb_descs);
if (dec->dctx)
daedalus_ctx_destroy(dec->dctx);
free(dec);
}
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
daedalus_decoder_output_format fmt)
{
if (!dec)
return -1;
if (dec->mbs_appended != 0)
return -1; /* mid-frame change forbidden */
if (fmt != DAEDALUS_DECODER_OUTPUT_NV12 &&
fmt != DAEDALUS_DECODER_OUTPUT_RGBA)
return -1;
dec->output_fmt = fmt;
return 0;
}
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
daedalus_decoder_substrate sub)
{
if (!dec)
return -1;
if (dec->mbs_appended != 0)
return -1;
if (sub != DAEDALUS_DECODER_SUBSTRATE_AUTO &&
sub != DAEDALUS_DECODER_SUBSTRATE_CPU &&
sub != DAEDALUS_DECODER_SUBSTRATE_QPU)
return -1;
dec->substrate = sub;
return 0;
}
/* Map our public substrate enum onto daedalus-fourier's. Same
* ordering by intent — we duplicate the enum for ABI isolation. */
static daedalus_substrate map_substrate(daedalus_decoder_substrate s)
{
switch (s) {
case DAEDALUS_DECODER_SUBSTRATE_CPU: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_DECODER_SUBSTRATE_QPU: return DAEDALUS_SUBSTRATE_QPU;
case DAEDALUS_DECODER_SUBSTRATE_AUTO:
default: return DAEDALUS_SUBSTRATE_AUTO;
}
}
int daedalus_decoder_append_mb(daedalus_decoder *dec,
const struct daedalus_decoder_mb_input *mb)
{
if (!dec || !mb || !mb->coeffs)
return -1;
if (mb->mb_x >= dec->mb_width || mb->mb_y >= dec->mb_height)
return -1;
/* Raster-order check — Phase 1's intra wavefront requires it.
* Caller is libavcodec's slice loop which produces raster order
* naturally, so this should never fire in practice. */
int expected = mb->mb_y * dec->mb_width + mb->mb_x;
if (expected != dec->mbs_appended)
return -1;
struct daedalus_decoder_mb_desc *d = &dec->mb_descs[expected];
d->mb_x = mb->mb_x;
d->mb_y = mb->mb_y;
d->mb_type = mb->mb_type;
d->mb_qp_y = mb->mb_qp_y;
d->mb_qp_uv = mb->mb_qp_uv;
d->cbp = mb->cbp;
memcpy(d->intra_4x4_modes, mb->intra_4x4_modes, 16);
d->intra_16x16_mode = mb->intra_16x16_mode;
d->intra_chroma_mode = mb->intra_chroma_mode;
d->partition_mode = mb->partition_mode;
memcpy(d->ref_idx_l0, mb->ref_idx_l0, 4);
memcpy(d->ref_idx_l1, mb->ref_idx_l1, 4);
memcpy(d->mv_l0, mb->mv_l0, sizeof(d->mv_l0));
memcpy(d->mv_l1, mb->mv_l1, sizeof(d->mv_l1));
d->deblock_disable = mb->deblock_disable;
d->deblock_alpha_c0 = mb->deblock_alpha_c0;
d->deblock_beta = mb->deblock_beta;
d->transform_8x8 = mb->transform_8x8;
memcpy(&dec->coeffs[(size_t) expected * 384],
mb->coeffs,
384 * sizeof(int16_t));
dec->mbs_appended++;
return 0;
}
/* Phase 1 stage 1 — frame-scaled IDCT 4x4 dispatch (luma + chroma).
*
* Brings up the GPU substrate by calling daedalus-fourier's existing
* `daedalus_recipe_dispatch_h264_idct4` at frame batch granularity in
* contrast to the substitution-arc shim that called it with
* n_blocks = 1 per call. Two Vulkan submits + waits per frame (one
* luma, one chroma) instead of millions of per-block dispatches.
*
* What's done in this stage:
* - Luma: build a per-frame meta[] in raster order (n_blocks =
* N_MBs × 16); flat-pack coeffs from each MB's first 256 int16;
* dispatch into a frame-sized zero-initialised Y scratch plane.
* - Chroma: build an interleaved Cb+Cr meta[] (n_blocks = N_MBs × 8,
* 4 Cb + 4 Cr per MB); flat-pack coeffs from each MB's next 128
* int16 (64 Cb + 64 Cr); dispatch into a planar Cb||Cr scratch
* buffer (W*H/4 each, concatenated W*H/2 total); CPU-interleave
* into the caller's NV12 UV plane post-dispatch.
* - Both dispatches use predicted=0 (the scratch buffers are
* calloc'd); the shader does clip255(predicted + idct(coeffs)).
*
* What's NOT done yet (follow-on Phase 1 sub-PRs):
* - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
* so output pixels are residual-only and not a valid frame decode.
* Sufficient for Vulkan round-trip validation, not for bit-exact
* against FFmpeg.
* - Motion compensation (Stage 2b): inter MBs not handled.
* - High-profile IDCT 8x8 (Stage 1 extension).
* - Chroma DC / luma Intra16x16 DC Hadamard pre-pass (currently we
* treat all chroma blocks as plain 4×4 AC IDCT; real decode needs
* the chroma DC 2×2 Hadamard contribution folded in).
* - Deblock (Stage 4).
* - dmabuf export — still memcpy-out to caller-provided planes.
* - Stage 5 RGBA opt-in.
* - GPU-side NV12 interleave — currently a CPU memcpy loop after
* the chroma dispatch. Trivial cost (~1 MB / frame at 1080p)
* vs the IDCT itself, but worth folding into a Stage-5 pass
* later for full-GPU residency.
*/
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
uint8_t *out_y, size_t y_stride,
uint8_t *out_uv, size_t uv_stride)
{
if (!dec)
return -1;
if (dec->mbs_appended != dec->n_mbs)
return -1; /* incomplete frame */
if (!out_y)
return -1;
int rc = 0;
/* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */
/* Two partitions of the per-MB luma section based on each MB's
* transform_8x8 flag:
*
* transform_8x8 == 0 → 16 4x4 blocks contribute to the 4x4
* dispatch (16 coeffs each).
* transform_8x8 == 1 → 4 8x8 blocks contribute to the 8x8
* dispatch (64 coeffs each).
*
* Both partitions can be non-empty in the same frame (FFmpeg sets
* transform_8x8_size_flag per MB), so we allocate worst-case for
* each and track actual counts.
*/
const size_t y_stride_int = (size_t) dec->width;
const size_t y_size = y_stride_int * (size_t) dec->height;
uint8_t *scratch_y = calloc(1, y_size);
const size_t worst_4x4 = (size_t) dec->n_mbs * 16;
const size_t worst_8x8 = (size_t) dec->n_mbs * 4;
int16_t *coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t));
int16_t *coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t));
daedalus_h264_block_meta *meta4 = malloc(worst_4x4 * sizeof(*meta4));
daedalus_h264_block_meta *meta8 = malloc(worst_8x8 * sizeof(*meta8));
if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
rc = -1;
goto cleanup;
}
/* Walk MBs in raster order, append each MB's luma blocks to the
* partition selected by its transform_8x8 flag.
*
* NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
* follows the z-scan from spec §6.4.3 / fig 6-10. We're using
* flat raster on the input side too (sb_y outer, sb_x inner) for
* Phase 1 self-consistency; the z-scan permutation is the
* libavcodec-intercept patch's responsibility.
*/
size_t bi4 = 0, bi8 = 0;
for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
int mb_idx = mb_y * dec->mb_width + mb_x;
const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
if (d->transform_8x8) {
/* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
for (int sb_y = 0; sb_y < 2; sb_y++) {
for (int sb_x = 0; sb_x < 2; sb_x++) {
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
meta8[bi8].dst_off = (uint32_t)
(px_y * y_stride_int + px_x);
int block_in_mb = sb_y * 2 + sb_x;
memcpy(&coeffs8[bi8 * 64],
&mb_coeffs[block_in_mb * 64],
64 * sizeof(int16_t));
bi8++;
}
}
} else {
/* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
for (int sb_y = 0; sb_y < 4; sb_y++) {
for (int sb_x = 0; sb_x < 4; sb_x++) {
size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
meta4[bi4].dst_off = (uint32_t)
(px_y * y_stride_int + px_x);
int block_in_mb = sb_y * 4 + sb_x;
memcpy(&coeffs4[bi4 * 16],
&mb_coeffs[block_in_mb * 16],
16 * sizeof(int16_t));
bi4++;
}
}
}
}
}
/* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */
/* ---- One Vulkan submit + wait per non-empty luma partition.
* AUTO substrate picks QPU per the post-decree recipe table; falls
* back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable.
* Skipping the dispatch when the partition is empty avoids the
* shader-pool warm-up cost on the common case (a typical Baseline
* stream is all-4x4 → 8x8 dispatch is no-op). */
const daedalus_substrate sub = map_substrate(dec->substrate);
if (bi4 > 0) {
int dr = daedalus_dispatch_h264_idct4(dec->dctx, sub,
scratch_y, y_stride_int,
coeffs4, bi4, meta4);
if (dr != 0) { rc = -3; goto cleanup; }
}
if (bi8 > 0) {
int dr = daedalus_dispatch_h264_idct8(dec->dctx, sub,
scratch_y, y_stride_int,
coeffs8, bi8, meta8);
if (dr != 0) { rc = -3; goto cleanup; }
}
/* ---- Copy Y out to caller's plane at the requested stride. ---- */
for (int r = 0; r < dec->height; r++)
memcpy(out_y + (size_t) r * y_stride,
&scratch_y[(size_t) r * y_stride_int],
(size_t) dec->width);
/* ---- Build frame-scaled chroma 4×4 dispatch ---- */
/*
* 4:2:0 layout — chroma planes are (W/2) by (H/2), one Cb + one
* Cr per pixel pair. H.264 per-MB chroma is two 8×8 components,
* each split into 4 4×4 blocks, so 8 chroma 4×4 blocks per MB.
*
* We dispatch BOTH components in a single shader call against a
* planar scratch buffer:
* scratch_uv[0 .. cb_plane_size) — Cb plane (W/2 × H/2)
* scratch_uv[cb_plane_size .. 2*size) — Cr plane (W/2 × H/2)
*
* meta[i].dst_off is a flat offset into the scratch buffer (the
* shader treats dst+dst_off as a contiguous 4×4 with row pitch =
* stride), so Cr blocks just add cb_plane_size to their offset.
* Stride is W/2 (the chroma row width); this works because Cb and
* Cr planes share the same row pitch.
*
* Post-dispatch we interleave the two planes into NV12 UV layout
* on the CPU. Doing this on the GPU is a Stage-5 follow-up
* (would need a small "copy + interleave" shader); CPU memcpy
* loop is ~1 MB/frame at 1080p so it's not on the critical path.
*/
int16_t *chroma_coeffs = NULL;
daedalus_h264_block_meta *chroma_meta = NULL;
uint8_t *scratch_uv = NULL;
if (out_uv) {
const size_t n_chroma_blocks_per_mb = 8; /* 4 Cb + 4 Cr */
const size_t n_chroma_blocks =
(size_t) dec->n_mbs * n_chroma_blocks_per_mb;
const size_t chroma_w = (size_t) dec->width / 2;
const size_t chroma_h = (size_t) dec->height / 2;
const size_t cb_plane_size = chroma_w * chroma_h;
const size_t uv_scratch_size = 2 * cb_plane_size;
scratch_uv = calloc(1, uv_scratch_size);
chroma_coeffs = malloc(n_chroma_blocks * 16 * sizeof(int16_t));
chroma_meta = malloc(n_chroma_blocks *
sizeof(daedalus_h264_block_meta));
if (!scratch_uv || !chroma_coeffs || !chroma_meta) {
rc = -1;
goto chroma_cleanup;
}
size_t cbi = 0;
for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
int mb_idx = mb_y * dec->mb_width + mb_x;
const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
/* Per-MB coeff layout (set by append_mb):
* [ 0 .. 256) — 16 luma 4×4 blocks
* [256 .. 320) — 4 Cb 4×4 blocks (raster sb_y*2+sb_x)
* [320 .. 384) — 4 Cr 4×4 blocks (raster sb_y*2+sb_x)
*/
for (int comp = 0; comp < 2; comp++) { /* 0=Cb 1=Cr */
size_t plane_base = (size_t) comp * cb_plane_size;
size_t coeff_base = 256u + (size_t) comp * 64u;
for (int sb_y = 0; sb_y < 2; sb_y++) {
for (int sb_x = 0; sb_x < 2; sb_x++) {
size_t px_y = (size_t) mb_y * 8 + (size_t) sb_y * 4;
size_t px_x = (size_t) mb_x * 8 + (size_t) sb_x * 4;
chroma_meta[cbi].dst_off = (uint32_t)
(plane_base + px_y * chroma_w + px_x);
int block_in_comp = sb_y * 2 + sb_x;
memcpy(&chroma_coeffs[cbi * 16],
&mb_coeffs[coeff_base + (size_t) block_in_comp * 16],
16 * sizeof(int16_t));
cbi++;
}
}
}
}
}
/* assert cbi == n_chroma_blocks; loop math guarantees it */
int cr_rc = daedalus_dispatch_h264_idct4(dec->dctx, sub,
scratch_uv, chroma_w,
chroma_coeffs,
n_chroma_blocks,
chroma_meta);
if (cr_rc != 0) {
rc = -3;
goto chroma_cleanup;
}
/* CPU NV12 interleave: out_uv[r][2c+0] = Cb[r][c], [2c+1] = Cr. */
const uint8_t *cb_plane = scratch_uv;
const uint8_t *cr_plane = scratch_uv + cb_plane_size;
for (size_t r = 0; r < chroma_h; r++) {
uint8_t *dst_row = out_uv + r * uv_stride;
const uint8_t *cb_row = cb_plane + r * chroma_w;
const uint8_t *cr_row = cr_plane + r * chroma_w;
for (size_t c = 0; c < chroma_w; c++) {
dst_row[c * 2 + 0] = cb_row[c];
dst_row[c * 2 + 1] = cr_row[c];
}
}
chroma_cleanup:
free(chroma_meta);
free(chroma_coeffs);
free(scratch_uv);
if (rc != 0)
goto cleanup;
}
cleanup:
free(meta8);
free(meta4);
free(coeffs8);
free(coeffs4);
free(scratch_y);
dec->mbs_appended = 0;
return rc;
}
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
{
(void) dec; (void) plane;
/* TODO Phase 1: vkGetMemoryFdKHR on the DPB slot's VkImage memory. */
return -1;
}
int daedalus_decoder_has_qpu(const daedalus_decoder *dec)
{
if (!dec || !dec->dctx)
return 0;
return daedalus_ctx_has_qpu(dec->dctx);
}
-72
View File
@@ -1,72 +0,0 @@
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* daedalus-decoder internal types shared across translation units.
* Not installed; pure-internal.
*/
#ifndef DAEDALUS_DECODER_INTERNAL_H
#define DAEDALUS_DECODER_INTERNAL_H
#include "daedalus_decoder.h"
#include <stdint.h>
#include <stddef.h>
#include <daedalus.h> /* daedalus-fourier public API */
/* Per-MB descriptor as the GPU sees it. Bit-laid-out to match the
* shader's std430 layout. Kept narrow (32 bytes target) so a 1080p
* frame's 8160 entries fit in ~256 KiB SSBO.
*
* TODO once the shaders exist: nail down the exact std430 layout and
* static_assert sizeof / alignof here. */
struct daedalus_decoder_mb_desc {
uint16_t mb_x;
uint16_t mb_y;
uint8_t mb_type;
uint8_t mb_qp_y;
uint8_t mb_qp_uv;
uint8_t cbp;
uint8_t intra_4x4_modes[16];
uint8_t intra_16x16_mode;
uint8_t intra_chroma_mode;
uint8_t partition_mode;
uint8_t _pad0;
int8_t ref_idx_l0[4];
int8_t ref_idx_l1[4];
int16_t mv_l0[4][2];
int16_t mv_l1[4][2];
uint8_t deblock_disable;
int8_t deblock_alpha_c0;
int8_t deblock_beta;
uint8_t transform_8x8; /* 0 = 4 luma blocks of 4x4 (16 total),
* 1 = 4 luma blocks of 8x8. */
};
struct daedalus_decoder {
/* Geometry. */
int width;
int height;
int mb_width; /* width / 16 */
int mb_height; /* height / 16 */
int n_mbs;
/* daedalus-fourier context (Vulkan + V3D7 runner). */
daedalus_ctx *dctx;
/* Frame-shaped staging (CPU-side; will move to mapped SSBO once
* Vulkan plumbing is in place). */
struct daedalus_decoder_mb_desc *mb_descs; /* n_mbs */
int16_t *coeffs; /* n_mbs * 384 */
int mbs_appended; /* per-frame count */
/* Output format. */
daedalus_decoder_output_format output_fmt;
/* Dispatch substrate (AUTO by default — recipe-table-driven). */
daedalus_decoder_substrate substrate;
};
#endif /* DAEDALUS_DECODER_INTERNAL_H */
-215
View File
@@ -1,215 +0,0 @@
/* SPDX-License-Identifier: BSD-2-Clause */
/* Needed for CLOCK_MONOTONIC under -std=c11 -CMAKE_C_EXTENSIONS=OFF. */
#define _POSIX_C_SOURCE 200809L
/*
* bench_flush_frame IDCT-layer throughput baseline.
*
* Times daedalus_decoder_flush_frame at a configurable coded
* resolution with random coefficients (the dispatch path doesn't
* care if the residuals are meaningful, only the layout / counts /
* bit-exactness; perf is independent of coefficient content).
*
* NOT a ctest produces wall-time numbers, doesn't pass/fail.
* Invoke manually after a build:
*
* ./build/bench_flush_frame [width] [height] [iters] [warmup] [substrate]
*
* Defaults: 1920 1088 100 5 auto
*
* The [substrate] argument selects the dispatch path:
* auto recipe table picks (V3D7 when available, else NEON)
* cpu force NEON path
* qpu force V3D7 path (fails on hosts without it)
*
* Run both to quantify the substrate gap. The "QPU is default
* substrate" decree (2026-05-23, feedback_qpu_is_default_substrate.md)
* is a policy claim; this bench is how we measure whether the policy
* pays off for the IDCT layer specifically.
*
* The first `warmup` iterations are excluded from the timing
* average because the daedalus-fourier shader pool needs to
* materialise pipelines + buffer pool entries on the first few
* calls (cycle 8b buffer-pool work amortises this; this bench is
* how we'd notice if that ever regresses).
*
* Output gives:
* - per-frame mean / median / p99 latency
* - frames per second steady-state
* - vs. the 30 fps @ 1080p target from the user's
* project_30fps_floor_is_fine.md memory
*
* NB: this is IDCT-only (luma 4x4 + 8x8 + chroma 4x4). It does
* NOT include intra prediction, MC, or deblock those land in
* Stage 2+ / 4. A 30 fps number here is necessary-but-not-sufficient
* for the final decoder hitting the same.
*/
#include "daedalus_decoder.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
static uint64_t xs64_state;
static uint64_t xs64(void)
{
uint64_t x = xs64_state;
x ^= x << 13; x ^= x >> 7; x ^= x << 17;
return xs64_state = x;
}
static int cmp_double(const void *a, const void *b)
{
double da = *(const double *)a, db = *(const double *)b;
return (da > db) - (da < db);
}
static double now_ms(void)
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec * 1000.0 + ts.tv_nsec / 1.0e6;
}
int main(int argc, char **argv)
{
int width = argc > 1 ? atoi(argv[1]) : 1920;
int height = argc > 2 ? atoi(argv[2]) : 1088;
int iters = argc > 3 ? atoi(argv[3]) : 100;
int warmup = argc > 4 ? atoi(argv[4]) : 5;
daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
const char *sub_name = "auto";
if (argc > 5) {
if (!strcmp(argv[5], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; }
else if (!strcmp(argv[5], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; }
else if (!strcmp(argv[5], "auto")) { /* default */ }
else {
fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[5]);
return 1;
}
}
if (warmup >= iters) {
fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters);
return 1;
}
int mb_w = width / 16;
int mb_h = height / 16;
int n_mbs = mb_w * mb_h;
printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup), substrate=%s\n",
width, height, n_mbs, iters, warmup, sub_name);
daedalus_decoder *dec = daedalus_decoder_create(width, height);
if (!dec) {
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
return 0;
}
if (daedalus_decoder_set_substrate(dec, sub) != 0) {
fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
return 1;
}
printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec));
/* Pre-generate per-MB random coeffs once. We re-append the same
* per-MB buffer across iterations the dispatch path doesn't
* cache anything per-MB across frames, so this is representative. */
xs64_state = 0xfeedface5a5a5a5aULL;
int16_t (*per_mb)[384] = malloc((size_t) n_mbs * sizeof(*per_mb));
uint8_t *mb_8x8 = malloc((size_t) n_mbs);
if (!per_mb || !mb_8x8) {
fprintf(stderr, "alloc fail\n");
return 1;
}
for (int mb = 0; mb < n_mbs; mb++) {
for (int i = 0; i < 384; i++)
per_mb[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
mb_8x8[mb] = (mb & 1) ? 1 : 0; /* same 50/50 mix as bit-exact test */
}
size_t y_size = (size_t) width * height;
size_t uv_size = (size_t) width * height / 2;
uint8_t *out_y = malloc(y_size);
uint8_t *out_uv = malloc(uv_size);
if (!out_y || !out_uv) {
fprintf(stderr, "alloc fail\n");
return 1;
}
/* Sample buffer for per-iteration timings (post-warmup). */
int sample_count = iters - warmup;
double *samples = malloc((size_t) sample_count * sizeof(double));
if (!samples) return 1;
for (int it = 0; it < iters; it++) {
/* Re-append all MBs for the frame. flush_frame resets
* mbs_appended to 0 internally on completion, so this loop
* is exactly the cost we'd pay per real frame. */
struct daedalus_decoder_mb_input mb = {0};
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int idx = my * mb_w + mx;
mb.mb_x = (uint16_t) mx;
mb.mb_y = (uint16_t) my;
mb.coeffs = per_mb[idx];
mb.transform_8x8 = mb_8x8[idx];
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
fprintf(stderr, "append fail iter=%d idx=%d\n", it, idx);
return 1;
}
}
}
double t0 = now_ms();
int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) width,
out_uv, (size_t) width);
double t1 = now_ms();
if (frc != 0) {
fprintf(stderr, "flush_frame rc=%d iter=%d\n", frc, it);
return 1;
}
if (it >= warmup) samples[it - warmup] = t1 - t0;
}
/* Stats. */
qsort(samples, (size_t) sample_count, sizeof(double), cmp_double);
double sum = 0;
for (int i = 0; i < sample_count; i++) sum += samples[i];
double mean = sum / sample_count;
double median = samples[sample_count / 2];
double p99 = samples[(sample_count * 99) / 100];
double min_ = samples[0];
double max_ = samples[sample_count - 1];
printf("\nflush_frame (post-warmup, %d samples):\n", sample_count);
printf(" min = %7.3f ms\n", min_);
printf(" median = %7.3f ms\n", median);
printf(" mean = %7.3f ms\n", mean);
printf(" p99 = %7.3f ms\n", p99);
printf(" max = %7.3f ms\n", max_);
double fps_mean = 1000.0 / mean;
double fps_median = 1000.0 / median;
printf("\nthroughput (steady-state, IDCT only — NO intra/MC/deblock):\n");
printf(" mean = %.1f fps\n", fps_mean);
printf(" median = %.1f fps\n", fps_median);
printf(" target = 30.0 fps (project_30fps_floor_is_fine.md)\n");
if (fps_median >= 30.0)
printf(" status = MEETS target (with %.1fx headroom for "
"intra/MC/deblock)\n", fps_median / 30.0);
else
printf(" status = BELOW target (need %.1fx speedup just at IDCT)\n",
30.0 / fps_median);
free(samples);
free(out_uv);
free(out_y);
free(mb_8x8);
free(per_mb);
daedalus_decoder_destroy(dec);
return 0;
}
-398
View File
@@ -1,398 +0,0 @@
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* test_idct_bitexact phase1 stage1 bit-exact gate for the frame-
* scaled luma IDCT 4×4 dispatch.
*
* Generates a frame of random coefficients, runs daedalus_decoder
* (with predicted=0 by the scaffold's flush_frame contract), and
* compares every output byte against an inline C reference that
* mirrors the H.264 §8.5.12.1 1D butterfly.
*
* Why "bit-exact": the GPU shader and the C reference apply the same
* integer arithmetic. Any rounding / sign / overflow disagreement is
* a bug. Pass = every output byte matches.
*
* Scope match with flush_frame: the test mirrors flush_frame's
* per-MB flat block layout (raster scan within MB, no z-scan
* permutation). That keeps the test focused on IDCT correctness;
* the z-scan permutation that bridges to libavcodec's per-MB coeffs
* layout is a separate concern (handled in the eventual libavcodec-
* intercept patch).
*
* Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved).
* Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use
* transform_8x8=0 (16 luma 4x4 blocks); both partitions are
* exercised in the same frame so the flush_frame partitioning logic
* is also under test, not just the underlying shaders. Random coeffs
* for all components; reference IDCT applied per block. The chroma
* compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
*
* Not in scope (covered by other tests / future PRs):
* - Chroma DC / Intra16x16 DC Hadamard pre-pass
* - bit-exactness against real H.264 streams (test-vector PR)
* - non-zero predicted pixels (intra prediction lands in Stage 2a)
*/
#include "daedalus_decoder.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* xorshift64* for deterministic random coefficient generation. */
static uint64_t xs64_state;
static uint64_t xs64(void)
{
uint64_t x = xs64_state;
x ^= x << 13; x ^= x >> 7; x ^= x << 17;
return xs64_state = x;
}
/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
* then column pass; +32 rounding, >>6, add to predicted (=0 here),
* clip to u8. Bit-exact-equivalent transcription of daedalus-fourier
* tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
* fair-use for test purposes same algorithm, no copy of code). */
static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
static void h264_idct4_butterfly(const int d[4], int out[4])
{
int e = d[0] + d[2];
int f = d[0] - d[2];
int g = (d[1] >> 1) - d[3];
int h = d[1] + (d[3] >> 1);
out[0] = e + h;
out[1] = f + g;
out[2] = f - g;
out[3] = e - h;
}
/* 1D 8-point butterfly per H.264 §8.5.13.2. Transcribed from
* daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+ in the original
* algorithm reproduced here for test purposes, no copy of code). */
static void h264_idct8_butterfly(const int d[8], int g[8])
{
int e[8], f[8];
e[0] = d[0] + d[4];
e[1] = -d[3] + d[5] - d[7] - (d[7] >> 1);
e[2] = d[0] - d[4];
e[3] = d[1] + d[7] - d[3] - (d[3] >> 1);
e[4] = (d[2] >> 1) - d[6];
e[5] = -d[1] + d[7] + d[5] + (d[5] >> 1);
e[6] = d[2] + (d[6] >> 1);
e[7] = d[3] + d[5] + d[1] + (d[1] >> 1);
f[0] = e[0] + e[6];
f[1] = e[1] + (e[7] >> 2);
f[2] = e[2] + e[4];
f[3] = e[3] + (e[5] >> 2);
f[4] = e[2] - e[4];
f[5] = (e[3] >> 2) - e[5];
f[6] = e[0] - e[6];
f[7] = e[7] - (e[1] >> 2);
g[0] = f[0] + f[7];
g[1] = f[2] + f[5];
g[2] = f[4] + f[3];
g[3] = f[6] + f[1];
g[4] = f[6] - f[1];
g[5] = f[4] - f[3];
g[6] = f[2] - f[5];
g[7] = f[0] - f[7];
}
static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
/* block layout COLUMN-MAJOR: block[c*8 + r] = coef at (row=r, col=c). */
int tmp[8][8];
for (int r = 0; r < 8; r++) {
int d[8];
for (int c = 0; c < 8; c++) d[c] = block[c * 8 + r];
int g[8];
h264_idct8_butterfly(d, g);
for (int c = 0; c < 8; c++) tmp[r][c] = g[c];
}
int col_out[8][8];
for (int c = 0; c < 8; c++) {
int d[8];
for (int r = 0; r < 8; r++) d[r] = tmp[r][c];
int g[8];
h264_idct8_butterfly(d, g);
for (int r = 0; r < 8; r++) col_out[r][c] = g[r];
}
for (int r = 0; r < 8; r++)
for (int c = 0; c < 8; c++)
dst[r * stride + c] = (uint8_t) clip_u8(
dst[r * stride + c] + ((col_out[r][c] + 32) >> 6));
}
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
/* block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
* block[c*4 + r] = coeff at (row=r, col=c).
* Row pass first: gather d[c] = block[c*4 + r] for fixed r. */
int tmp[4][4];
for (int r = 0; r < 4; r++) {
int d[4] = { block[0*4 + r], block[1*4 + r],
block[2*4 + r], block[3*4 + r] };
int o[4];
h264_idct4_butterfly(d, o);
for (int c = 0; c < 4; c++) tmp[r][c] = o[c];
}
/* Column pass: gather d[r] = tmp[r][c] for fixed c. */
int col_out[4][4];
for (int c = 0; c < 4; c++) {
int d[4] = { tmp[0][c], tmp[1][c], tmp[2][c], tmp[3][c] };
int o[4];
h264_idct4_butterfly(d, o);
for (int r = 0; r < 4; r++) col_out[r][c] = o[r];
}
/* Add (predicted=dst, here 0) + clip. */
for (int r = 0; r < 4; r++)
for (int c = 0; c < 4; c++)
dst[r * stride + c] = (uint8_t) clip_u8(
dst[r * stride + c] + ((col_out[r][c] + 32) >> 6));
}
int main(int argc, char **argv)
{
/* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
* the dispatch covers multiple workgroups (16 blocks/WG 4 WGs). */
int width = argc > 1 ? atoi(argv[1]) : 320;
int height = argc > 2 ? atoi(argv[2]) : 240; /* 240 / 16 = 15 → coded 240 */
/* Coded dims must be mod-16; 320×240 is canonical QVGA. */
uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
xs64_state = seed;
/* Optional 4th argv: "auto" (default) / "cpu" / "qpu" to pin the
* dispatch substrate. Both substrates must produce IDENTICAL
* output (the V3D shaders are bit-exact gates against the same
* spec the NEON path implements); the ctest suite runs the QVGA
* test once per substrate to catch any silent drift. */
daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
const char *sub_name = "auto";
if (argc > 4) {
if (!strcmp(argv[4], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; }
else if (!strcmp(argv[4], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; }
else if (!strcmp(argv[4], "auto")) { /* default */ }
else {
fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[4]);
return 1;
}
}
int mb_w = width / 16;
int mb_h = height / 16;
int n_mbs = mb_w * mb_h;
printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%lx\n",
width, height, n_mbs, (unsigned long) seed);
daedalus_decoder *dec = daedalus_decoder_create(width, height);
if (!dec) {
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
return 0;
}
if (daedalus_decoder_set_substrate(dec, sub) != 0) {
fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
return 1;
}
printf("substrate: %s\n", sub_name);
/* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of
* random coeffs in [-512, 511] same range as the daedalus-fourier
* cycle-6 M1 gate uses. */
int16_t (*per_mb_coeffs)[384] = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
if (!per_mb_coeffs) { fprintf(stderr, "alloc fail\n"); return 1; }
for (int mb = 0; mb < n_mbs; mb++) {
for (int i = 0; i < 384; i++) {
/* Random coeffs in [-512, 511] for all of luma + Cb + Cr.
* Same range as the daedalus-fourier cycle-6 M1 gate. */
per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
}
}
/* Per-MB transform mode (deterministic split: every odd raster MB
* is 8x8, every even is 4x4 exercises BOTH partitions in the
* same frame so the flush_frame partitioning logic is under test). */
uint8_t *mb_8x8 = malloc((size_t) n_mbs);
if (!mb_8x8) { fprintf(stderr, "alloc fail\n"); return 1; }
for (int i = 0; i < n_mbs; i++) mb_8x8[i] = (i & 1) ? 1 : 0;
/* Append in raster order. */
struct daedalus_decoder_mb_input mb = {0};
int n_8x8_mbs = 0, n_4x4_mbs = 0;
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int idx = my * mb_w + mx;
mb.mb_x = (uint16_t) mx;
mb.mb_y = (uint16_t) my;
mb.coeffs = per_mb_coeffs[idx];
mb.transform_8x8 = mb_8x8[idx];
if (mb_8x8[idx]) n_8x8_mbs++; else n_4x4_mbs++;
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
fprintf(stderr, "append (%d,%d) failed\n", mx, my);
return 1;
}
}
}
printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs);
/* Flush — exercise BOTH the luma path (out_y) and the chroma path
* (out_uv set to non-NULL so flush_frame runs the chroma dispatch
* + NV12 interleave). */
size_t y_size = (size_t) width * height;
size_t uv_size = (size_t) width * height / 2;
uint8_t *gpu_y = calloc(1, y_size);
uint8_t *gpu_uv = calloc(1, uv_size);
if (!gpu_y || !gpu_uv) return 1;
int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
gpu_uv, (size_t) width);
if (frc != 0) {
fprintf(stderr, "flush_frame rc=%d\n", frc);
return 1;
}
/* Compute the reference output: same per-MB → flat raster block
* layout as flush_frame uses. Branch per MB on transform_8x8. */
uint8_t *ref_y = calloc(1, y_size);
if (!ref_y) return 1;
int16_t block_scratch[64]; /* large enough for 8x8 */
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int mb_idx = my * mb_w + mx;
if (mb_8x8[mb_idx]) {
/* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
for (int sb_y = 0; sb_y < 2; sb_y++) {
for (int sb_x = 0; sb_x < 2; sb_x++) {
int block_in_mb = sb_y * 2 + sb_x;
memcpy(block_scratch,
&per_mb_coeffs[mb_idx][block_in_mb * 64],
64 * sizeof(int16_t));
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 8;
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 8;
ref_idct8_add(&ref_y[px_y * (size_t) width + px_x],
width, block_scratch);
}
}
} else {
/* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
for (int sb_y = 0; sb_y < 4; sb_y++) {
for (int sb_x = 0; sb_x < 4; sb_x++) {
int block_in_mb = sb_y * 4 + sb_x;
memcpy(block_scratch,
&per_mb_coeffs[mb_idx][block_in_mb * 16],
16 * sizeof(int16_t));
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
width, block_scratch);
}
}
}
}
}
/* Build the chroma reference: separate planar Cb and Cr (W/2 by
* H/2), each block IDCT'd into its plane. Chroma per-MB layout
* matches flush_frame: 4 Cb blocks then 4 Cr blocks, raster order
* within each component (sb_y * 2 + sb_x). */
size_t chroma_w = (size_t) width / 2;
size_t chroma_h = (size_t) height / 2;
size_t chroma_plane_size = chroma_w * chroma_h;
uint8_t *ref_cb = calloc(1, chroma_plane_size);
uint8_t *ref_cr = calloc(1, chroma_plane_size);
if (!ref_cb || !ref_cr) return 1;
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int mb_idx = my * mb_w + mx;
for (int comp = 0; comp < 2; comp++) {
uint8_t *plane = (comp == 0) ? ref_cb : ref_cr;
size_t coeff_base = 256u + (size_t) comp * 64u;
for (int sb_y = 0; sb_y < 2; sb_y++) {
for (int sb_x = 0; sb_x < 2; sb_x++) {
int block_in_comp = sb_y * 2 + sb_x;
memcpy(block_scratch,
&per_mb_coeffs[mb_idx][coeff_base +
(size_t) block_in_comp * 16],
16 * sizeof(int16_t));
size_t px_y = (size_t) my * 8 + (size_t) sb_y * 4;
size_t px_x = (size_t) mx * 8 + (size_t) sb_x * 4;
ref_idct4_add(&plane[px_y * chroma_w + px_x],
(ptrdiff_t) chroma_w, block_scratch);
}
}
}
}
}
/* Y compare. */
size_t y_diffs = 0, y_first_diff = 0;
for (size_t i = 0; i < y_size; i++) {
if (gpu_y[i] != ref_y[i]) {
if (y_diffs == 0) y_first_diff = i;
y_diffs++;
}
}
printf("Y bytes total: %zu\n", y_size);
printf("Y bytes diff: %zu (%.4f%%)\n", y_diffs, 100.0 * y_diffs / y_size);
if (y_diffs) {
printf("Y first diff at offset %zu: gpu=%u ref=%u\n",
y_first_diff, gpu_y[y_first_diff], ref_y[y_first_diff]);
}
/* UV compare — deinterleave NV12 back into Cb/Cr and compare. */
size_t cb_diffs = 0, cr_diffs = 0;
size_t cb_first = 0, cr_first = 0;
for (size_t r = 0; r < chroma_h; r++) {
const uint8_t *gpu_row = gpu_uv + r * (size_t) width;
const uint8_t *cb_row = ref_cb + r * chroma_w;
const uint8_t *cr_row = ref_cr + r * chroma_w;
for (size_t c = 0; c < chroma_w; c++) {
uint8_t gpu_cb = gpu_row[c * 2 + 0];
uint8_t gpu_cr = gpu_row[c * 2 + 1];
if (gpu_cb != cb_row[c]) {
if (cb_diffs == 0) cb_first = r * chroma_w + c;
cb_diffs++;
}
if (gpu_cr != cr_row[c]) {
if (cr_diffs == 0) cr_first = r * chroma_w + c;
cr_diffs++;
}
}
}
printf("Cb bytes total: %zu diff: %zu (%.4f%%)\n",
chroma_plane_size, cb_diffs,
100.0 * cb_diffs / chroma_plane_size);
printf("Cr bytes total: %zu diff: %zu (%.4f%%)\n",
chroma_plane_size, cr_diffs,
100.0 * cr_diffs / chroma_plane_size);
if (cb_diffs) {
size_t r = cb_first / chroma_w, c = cb_first % chroma_w;
printf("Cb first diff at (%zu,%zu): gpu=%u ref=%u\n",
r, c, gpu_uv[r * (size_t) width + c * 2 + 0], ref_cb[cb_first]);
}
if (cr_diffs) {
size_t r = cr_first / chroma_w, c = cr_first % chroma_w;
printf("Cr first diff at (%zu,%zu): gpu=%u ref=%u\n",
r, c, gpu_uv[r * (size_t) width + c * 2 + 1], ref_cr[cr_first]);
}
free(ref_cr);
free(ref_cb);
free(ref_y);
free(gpu_uv);
free(gpu_y);
free(mb_8x8);
free(per_mb_coeffs);
daedalus_decoder_destroy(dec);
if (y_diffs == 0 && cb_diffs == 0 && cr_diffs == 0) {
printf("BIT-EXACT PASS (Y + Cb + Cr)\n");
return 0;
}
fprintf(stderr, "BIT-EXACT FAIL\n");
return 1;
}
-175
View File
@@ -1,175 +0,0 @@
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* Scaffold smoke test verifies the daedalus-decoder library links
* cleanly against daedalus-fourier and the lifecycle entry points
* don't immediately crash. No actual decoding work yet.
*
* Returns 0 on success, non-zero on any unexpected behaviour.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "daedalus_decoder.h"
#define EXPECT(cond, msg) do { \
if (!(cond)) { \
fprintf(stderr, "EXPECT FAIL %s:%d: %s\n", __FILE__, __LINE__, msg); \
return 1; \
} \
} while (0)
int main(void)
{
printf("daedalus-decoder version: %s\n", daedalus_decoder_version());
/* Create / destroy null is a no-op. */
daedalus_decoder_destroy(NULL);
/* Bad dimensions rejected. */
EXPECT(daedalus_decoder_create(0, 0 ) == NULL, "zero dims must reject");
EXPECT(daedalus_decoder_create(1919, 1088) == NULL, "non-16-multiple width must reject");
EXPECT(daedalus_decoder_create(1920, 1079) == NULL, "non-16-multiple height must reject");
/* Valid 1088p create. */
daedalus_decoder *dec = daedalus_decoder_create(1920, 1088);
if (!dec) {
/* Vulkan init failure on this host — degrades to skip, not fail.
* (CI runners without V3D7 will hit this; the smoke test
* shouldn't gate on hardware presence.) */
fprintf(stderr, "SKIP: daedalus_decoder_create returned NULL "
"(Vulkan / V3D7 unavailable on this host)\n");
return 0;
}
printf("ctx created: 1920x1088, has_qpu=%d\n",
daedalus_decoder_has_qpu(dec));
/* set_output_format mid-frame on virgin ctx is allowed
* (mbs_appended == 0). */
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_RGBA) == 0,
"switch to RGBA on virgin ctx");
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_NV12) == 0,
"switch back to NV12");
/* Substrate setter — same lifecycle rules. */
EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_CPU) == 0,
"force CPU substrate on virgin ctx");
EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_QPU) == 0,
"force QPU substrate on virgin ctx");
EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_AUTO) == 0,
"back to AUTO");
EXPECT(daedalus_decoder_set_substrate(dec, (daedalus_decoder_substrate) 99) == -1,
"bogus substrate rejects");
/* Append rejects out-of-bounds + null inputs. */
int16_t coeffs[384] = {0};
struct daedalus_decoder_mb_input mb = {0};
mb.coeffs = coeffs;
mb.mb_x = 0; mb.mb_y = 0;
EXPECT(daedalus_decoder_append_mb(dec, NULL) == -1, "null mb rejects");
{
struct daedalus_decoder_mb_input mb2 = mb;
mb2.coeffs = NULL;
EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1, "null coeffs rejects");
}
{
struct daedalus_decoder_mb_input mb2 = mb;
mb2.mb_x = 9999; mb2.mb_y = 9999;
EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1, "OOB coords reject");
}
/* Append first MB at raster index 0 — should succeed. */
EXPECT(daedalus_decoder_append_mb(dec, &mb) == 0, "append (0,0)");
/* Skipping (0,1) and appending (1,0) violates raster order — reject. */
{
struct daedalus_decoder_mb_input mb2 = mb;
mb2.mb_x = 0; mb2.mb_y = 1;
EXPECT(daedalus_decoder_append_mb(dec, &mb2) == -1,
"out-of-raster-order rejects");
}
/* In-order: (1,0). */
mb.mb_x = 1; mb.mb_y = 0;
EXPECT(daedalus_decoder_append_mb(dec, &mb) == 0, "append (1,0)");
/* Flush an incomplete frame: should fail because mbs_appended != n_mbs. */
EXPECT(daedalus_decoder_flush_frame(dec, NULL, 0, NULL, 0) == -1,
"incomplete-frame flush rejects");
/* set_output_format mid-frame (mbs_appended > 0) must reject. */
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_RGBA) == -1,
"mid-frame format change rejects");
daedalus_decoder_destroy(dec);
/* ---- Full-frame round-trip with all-zero coefficients.
* Phase 1 stage 1 validation: flush_frame builds the per-frame IDCT
* dispatch and a successful GPU round-trip returns 0. IDCT of
* all-zero coefficients with zero-initialised predicted = all-zero
* output pixels. */
dec = daedalus_decoder_create(1920, 1088);
if (!dec) {
fprintf(stderr, "SKIP roundtrip: ctx create failed\n");
return 0;
}
static int16_t zero_coeffs[384] = {0};
struct daedalus_decoder_mb_input zmb = {0};
zmb.coeffs = zero_coeffs;
int mb_width = 1920 / 16; /* 120 */
int mb_height = 1088 / 16; /* 68 */
int n_mbs = mb_width * mb_height;
for (int mby = 0; mby < mb_height; mby++) {
for (int mbx = 0; mbx < mb_width; mbx++) {
zmb.mb_x = (uint16_t) mbx;
zmb.mb_y = (uint16_t) mby;
if (daedalus_decoder_append_mb(dec, &zmb) != 0) {
fprintf(stderr, "append (%d, %d) failed\n", mbx, mby);
return 1;
}
}
}
printf("appended %d MBs (%dx%d)\n", n_mbs, mb_width, mb_height);
size_t y_size = (size_t) 1920 * 1088;
size_t uv_size = (size_t) 1920 * 1088 / 2;
uint8_t *out_y = malloc(y_size);
uint8_t *out_uv = malloc(uv_size);
/* Pre-fill with sentinel so any read-then-write bug becomes visible. */
memset(out_y, 0xab, y_size);
memset(out_uv, 0xcd, uv_size);
int frc = daedalus_decoder_flush_frame(dec, out_y, 1920, out_uv, 1920);
printf("flush_frame rc=%d\n", frc);
EXPECT(frc == 0, "flush succeeds on full frame");
/* Y plane should be all zero (clip255(IDCT(zeros)) = 0). */
int y_nz = 0;
for (size_t i = 0; i < y_size; i++)
if (out_y[i] != 0) y_nz++;
printf("Y non-zero bytes: %d / %zu\n", y_nz, y_size);
EXPECT(y_nz == 0, "Y plane all zero for zero-coeff frame");
/* UV plane should be all zero now (real chroma IDCT runs with
* zero coeffs zero residual clip255(0+0) = 0). Previously a
* 128 placeholder when chroma was a memset stub; this PR replaced
* that with the real dispatch. Sentinel 0xcd above guarantees we
* are observing post-dispatch writes, not the leftover memset. */
int uv_nz = 0;
for (size_t i = 0; i < uv_size; i++)
if (out_uv[i] != 0) uv_nz++;
printf("UV non-zero bytes: %d / %zu\n", uv_nz, uv_size);
EXPECT(uv_nz == 0, "UV plane all zero for zero-coeff frame");
free(out_y);
free(out_uv);
daedalus_decoder_destroy(dec);
printf("smoke OK\n");
return 0;
}