From 44ca4e550fbd45fafef783af7d2b8b1e1eadb023 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sun, 24 May 2026 23:07:45 +0200 Subject: [PATCH] phase1: substrate selector API + cross-substrate bit-exact ctest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces daedalus-fourier's substrate-override capability at the decoder boundary. Lets tests run on CPU-only hosts (CI runners, x86 dev boxes) AND cross-checks V3D shader output against NEON reference on hosts that have both. API additions (pre-0.1 ABI, additive): - daedalus_decoder_substrate enum { AUTO, CPU, QPU } (mirrors daedalus_substrate; isolated for ABI reasons). - daedalus_decoder_set_substrate(dec, sub) setter, same mid-frame-change restrictions as set_output_format. - Default remains AUTO — the only sensible choice for production. Internal: - flush_frame now calls daedalus_dispatch_h264_idct{4,8} with an explicit substrate instead of daedalus_recipe_dispatch_*. Mapped via a small map_substrate() helper. No perf delta on AUTO (recipe layer was just doing the same dispatch under the hood). Test changes: - test_smoke: new EXPECTs for set_substrate (valid + bogus). - test_idct_bitexact: new argv[4] takes "auto" (default), "cpu", or "qpu" to force the substrate. - CMakeLists.txt: new ctest entry `idct_bitexact_cpu` re-runs the QVGA case forcing the CPU path. Catches silent drift between the V3D shader and the NEON reference; both must produce identical output for the same coefficient input (and they do — see ctest log below). Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0): $ ctest --test-dir build --output-on-failure Start 1: smoke 1/4 Test #1: smoke ............................ Passed 0.10 sec Start 2: idct_bitexact 2/4 Test #2: idct_bitexact .................... Passed 0.03 sec Start 3: idct_bitexact_cpu 3/4 Test #3: idct_bitexact_cpu ................ Passed 0.03 sec Start 4: idct_bitexact_1080p 4/4 Test #4: idct_bitexact_1080p .............. Passed 0.06 sec 100% tests passed, 0 tests failed out of 4 CPU substrate produces byte-identical Y + Cb + Cr planes against the same C reference that the AUTO/QPU path matches — confirming the V3D shaders and the daedalus-fourier NEON path agree at the spec level. Why we plumbed the lower-level dispatch instead of leaving recipe in place: recipe is just a thin wrapper that calls dispatch with AUTO. Once we needed substrate control, the wrapper became a liability (would have required adding a parallel recipe API for each substrate); going direct is simpler and the AUTO path is unchanged. Coverage note: idct_bitexact_cpu runs at QVGA (300 MBs); not also at 1080p because the CPU path's wall time scales linearly with block count and a 1080p CPU run is ~0.5s on hertz — fine standalone but slows ctest enough that it would tempt opt-in gating. The bit-exact content is the same regardless of frame size; the 1080p variant only exists to gate index-arithmetic bugs that surface above small int boundaries. --- CMakeLists.txt | 8 ++++++ include/daedalus_decoder.h | 33 ++++++++++++++++++++++++ src/daedalus_decoder.c | 51 ++++++++++++++++++++++++++++++-------- src/internal.h | 3 +++ tests/test_idct_bitexact.c | 22 ++++++++++++++++ tests/test_smoke.c | 10 ++++++++ 6 files changed, 116 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index db9e726..1e5e080 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,14 @@ target_compile_options(test_idct_bitexact PRIVATE -O2) # 320x240 QVGA — fast inner-loop test (300 MBs, sub-second). add_test(NAME idct_bitexact COMMAND test_idct_bitexact) +# Same QVGA test re-run on the CPU NEON path (forces fallback even on +# V3D7 hosts). Catches silent drift between the V3D shader and the +# NEON reference path — both must produce identical output for the +# same coefficient input. Also keeps the bit-exact gate alive on +# hosts without V3D7 (CI runners, x86 dev boxes). +add_test(NAME idct_bitexact_cpu COMMAND test_idct_bitexact 320 240 + 0xfeedface5a5a5a5a cpu) + # 1920x1088 1080p — deployment-scale test (8160 MBs, ~0.25 s on hertz). # Validates the per-MB block index + pixel offset math at full coded # height (1088, not 1080 — see daedalus_decoder.h on H.264 coded vs diff --git a/include/daedalus_decoder.h b/include/daedalus_decoder.h index 925a953..639aefe 100644 --- a/include/daedalus_decoder.h +++ b/include/daedalus_decoder.h @@ -99,6 +99,33 @@ typedef enum { DAEDALUS_DECODER_OUTPUT_RGBA = 1, /* Stage 5 opt-in */ } daedalus_decoder_output_format; +/* ------------------------------------------------------------------- + * Substrate selector. Determines which backend daedalus-fourier + * dispatches the per-frame compute through. + * + * AUTO is the only sensible choice for production — it picks per the + * recipe table baked into daedalus-fourier (post 2026-05-23 decree: + * QPU when a V3D shader exists, CPU NEON otherwise). The explicit + * options exist for testing: + * + * - CPU forces the dispatch onto the NEON path even when V3D7 is + * available. Lets the bit-exact ctests run on hosts without a + * working Vulkan/V3D stack (CI runners, dev x86 boxes via + * cross-build), and lets us cross-check the V3D shader output + * against the NEON reference path on hosts that DO have V3D. + * - QPU is the dual — force QPU even on a CPU-preferred kernel. + * Useful for benchmarking specific QPU paths in isolation. + * + * A non-AUTO selection on a host that can't satisfy it + * (DAEDALUS_DECODER_SUBSTRATE_QPU on an x86 dev box) propagates a + * dispatch failure back through flush_frame as -3. + * ----------------------------------------------------------------- */ +typedef enum { + DAEDALUS_DECODER_SUBSTRATE_AUTO = 0, + DAEDALUS_DECODER_SUBSTRATE_CPU = 1, + DAEDALUS_DECODER_SUBSTRATE_QPU = 2, +} daedalus_decoder_substrate; + /* ------------------------------------------------------------------- * Lifecycle * ----------------------------------------------------------------- */ @@ -128,6 +155,12 @@ void daedalus_decoder_destroy(daedalus_decoder *dec); int daedalus_decoder_set_output_format(daedalus_decoder *dec, daedalus_decoder_output_format fmt); +/* Override the dispatch substrate for subsequent flush_frame calls. + * Default is AUTO. Same mid-frame-change restriction as + * set_output_format. */ +int daedalus_decoder_set_substrate(daedalus_decoder *dec, + daedalus_decoder_substrate sub); + /* ------------------------------------------------------------------- * Per-frame submission * ----------------------------------------------------------------- */ diff --git a/src/daedalus_decoder.c b/src/daedalus_decoder.c index 74934f0..a48c0b2 100644 --- a/src/daedalus_decoder.c +++ b/src/daedalus_decoder.c @@ -41,6 +41,7 @@ daedalus_decoder *daedalus_decoder_create(int width, int height) dec->mb_height = height >> 4; dec->n_mbs = dec->mb_width * dec->mb_height; dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12; + dec->substrate = DAEDALUS_DECODER_SUBSTRATE_AUTO; /* daedalus-fourier ctx — required. Phase 1 needs the QPU; if * Vulkan init fails the decoder is unusable. Caller can check @@ -86,6 +87,33 @@ int daedalus_decoder_set_output_format(daedalus_decoder *dec, return 0; } +int daedalus_decoder_set_substrate(daedalus_decoder *dec, + daedalus_decoder_substrate sub) +{ + if (!dec) + return -1; + if (dec->mbs_appended != 0) + return -1; + if (sub != DAEDALUS_DECODER_SUBSTRATE_AUTO && + sub != DAEDALUS_DECODER_SUBSTRATE_CPU && + sub != DAEDALUS_DECODER_SUBSTRATE_QPU) + return -1; + dec->substrate = sub; + return 0; +} + +/* Map our public substrate enum onto daedalus-fourier's. Same + * ordering by intent — we duplicate the enum for ABI isolation. */ +static daedalus_substrate map_substrate(daedalus_decoder_substrate s) +{ + switch (s) { + case DAEDALUS_DECODER_SUBSTRATE_CPU: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_DECODER_SUBSTRATE_QPU: return DAEDALUS_SUBSTRATE_QPU; + case DAEDALUS_DECODER_SUBSTRATE_AUTO: + default: return DAEDALUS_SUBSTRATE_AUTO; + } +} + int daedalus_decoder_append_mb(daedalus_decoder *dec, const struct daedalus_decoder_mb_input *mb) { @@ -267,16 +295,17 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec, * Skipping the dispatch when the partition is empty avoids the * shader-pool warm-up cost on the common case (a typical Baseline * stream is all-4x4 → 8x8 dispatch is no-op). */ + const daedalus_substrate sub = map_substrate(dec->substrate); if (bi4 > 0) { - int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx, - scratch_y, y_stride_int, - coeffs4, bi4, meta4); + int dr = daedalus_dispatch_h264_idct4(dec->dctx, sub, + scratch_y, y_stride_int, + coeffs4, bi4, meta4); if (dr != 0) { rc = -3; goto cleanup; } } if (bi8 > 0) { - int dr = daedalus_recipe_dispatch_h264_idct8(dec->dctx, - scratch_y, y_stride_int, - coeffs8, bi8, meta8); + int dr = daedalus_dispatch_h264_idct8(dec->dctx, sub, + scratch_y, y_stride_int, + coeffs8, bi8, meta8); if (dr != 0) { rc = -3; goto cleanup; } } @@ -361,11 +390,11 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec, } /* assert cbi == n_chroma_blocks; loop math guarantees it */ - int cr_rc = daedalus_recipe_dispatch_h264_idct4(dec->dctx, - scratch_uv, chroma_w, - chroma_coeffs, - n_chroma_blocks, - chroma_meta); + int cr_rc = daedalus_dispatch_h264_idct4(dec->dctx, sub, + scratch_uv, chroma_w, + chroma_coeffs, + n_chroma_blocks, + chroma_meta); if (cr_rc != 0) { rc = -3; goto chroma_cleanup; diff --git a/src/internal.h b/src/internal.h index 637f1e0..87845c7 100644 --- a/src/internal.h +++ b/src/internal.h @@ -64,6 +64,9 @@ struct daedalus_decoder { /* Output format. */ daedalus_decoder_output_format output_fmt; + + /* Dispatch substrate (AUTO by default — recipe-table-driven). */ + daedalus_decoder_substrate substrate; }; #endif /* DAEDALUS_DECODER_INTERNAL_H */ diff --git a/tests/test_idct_bitexact.c b/tests/test_idct_bitexact.c index 8b5a92f..6545ac5 100644 --- a/tests/test_idct_bitexact.c +++ b/tests/test_idct_bitexact.c @@ -166,6 +166,23 @@ int main(int argc, char **argv) uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL; xs64_state = seed; + /* Optional 4th argv: "auto" (default) / "cpu" / "qpu" to pin the + * dispatch substrate. Both substrates must produce IDENTICAL + * output (the V3D shaders are bit-exact gates against the same + * spec the NEON path implements); the ctest suite runs the QVGA + * test once per substrate to catch any silent drift. */ + daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO; + const char *sub_name = "auto"; + if (argc > 4) { + if (!strcmp(argv[4], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; } + else if (!strcmp(argv[4], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; } + else if (!strcmp(argv[4], "auto")) { /* default */ } + else { + fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[4]); + return 1; + } + } + int mb_w = width / 16; int mb_h = height / 16; int n_mbs = mb_w * mb_h; @@ -177,6 +194,11 @@ int main(int argc, char **argv) fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n"); return 0; } + if (daedalus_decoder_set_substrate(dec, sub) != 0) { + fprintf(stderr, "set_substrate(%s) failed\n", sub_name); + return 1; + } + printf("substrate: %s\n", sub_name); /* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of * random coeffs in [-512, 511] — same range as the daedalus-fourier diff --git a/tests/test_smoke.c b/tests/test_smoke.c index f7217f9..3629441 100644 --- a/tests/test_smoke.c +++ b/tests/test_smoke.c @@ -52,6 +52,16 @@ int main(void) EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_NV12) == 0, "switch back to NV12"); + /* Substrate setter — same lifecycle rules. */ + EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_CPU) == 0, + "force CPU substrate on virgin ctx"); + EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_QPU) == 0, + "force QPU substrate on virgin ctx"); + EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_AUTO) == 0, + "back to AUTO"); + EXPECT(daedalus_decoder_set_substrate(dec, (daedalus_decoder_substrate) 99) == -1, + "bogus substrate rejects"); + /* Append rejects out-of-bounds + null inputs. */ int16_t coeffs[384] = {0}; struct daedalus_decoder_mb_input mb = {0}; -- 2.47.3