Merge pull request 'phase1: substrate selector API + cross-substrate bit-exact ctest' (#9) from noether/phase1-substrate-select into main
Reviewed-on: #9
This commit was merged in pull request #9.
This commit is contained in:
@@ -121,6 +121,14 @@ target_compile_options(test_idct_bitexact PRIVATE -O2)
|
|||||||
# 320x240 QVGA — fast inner-loop test (300 MBs, sub-second).
|
# 320x240 QVGA — fast inner-loop test (300 MBs, sub-second).
|
||||||
add_test(NAME idct_bitexact COMMAND test_idct_bitexact)
|
add_test(NAME idct_bitexact COMMAND test_idct_bitexact)
|
||||||
|
|
||||||
|
# Same QVGA test re-run on the CPU NEON path (forces fallback even on
|
||||||
|
# V3D7 hosts). Catches silent drift between the V3D shader and the
|
||||||
|
# NEON reference path — both must produce identical output for the
|
||||||
|
# same coefficient input. Also keeps the bit-exact gate alive on
|
||||||
|
# hosts without V3D7 (CI runners, x86 dev boxes).
|
||||||
|
add_test(NAME idct_bitexact_cpu COMMAND test_idct_bitexact 320 240
|
||||||
|
0xfeedface5a5a5a5a cpu)
|
||||||
|
|
||||||
# 1920x1088 1080p — deployment-scale test (8160 MBs, ~0.25 s on hertz).
|
# 1920x1088 1080p — deployment-scale test (8160 MBs, ~0.25 s on hertz).
|
||||||
# Validates the per-MB block index + pixel offset math at full coded
|
# Validates the per-MB block index + pixel offset math at full coded
|
||||||
# height (1088, not 1080 — see daedalus_decoder.h on H.264 coded vs
|
# height (1088, not 1080 — see daedalus_decoder.h on H.264 coded vs
|
||||||
|
|||||||
@@ -99,6 +99,33 @@ typedef enum {
|
|||||||
DAEDALUS_DECODER_OUTPUT_RGBA = 1, /* Stage 5 opt-in */
|
DAEDALUS_DECODER_OUTPUT_RGBA = 1, /* Stage 5 opt-in */
|
||||||
} daedalus_decoder_output_format;
|
} daedalus_decoder_output_format;
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* Substrate selector. Determines which backend daedalus-fourier
|
||||||
|
* dispatches the per-frame compute through.
|
||||||
|
*
|
||||||
|
* AUTO is the only sensible choice for production — it picks per the
|
||||||
|
* recipe table baked into daedalus-fourier (post 2026-05-23 decree:
|
||||||
|
* QPU when a V3D shader exists, CPU NEON otherwise). The explicit
|
||||||
|
* options exist for testing:
|
||||||
|
*
|
||||||
|
* - CPU forces the dispatch onto the NEON path even when V3D7 is
|
||||||
|
* available. Lets the bit-exact ctests run on hosts without a
|
||||||
|
* working Vulkan/V3D stack (CI runners, dev x86 boxes via
|
||||||
|
* cross-build), and lets us cross-check the V3D shader output
|
||||||
|
* against the NEON reference path on hosts that DO have V3D.
|
||||||
|
* - QPU is the dual — force QPU even on a CPU-preferred kernel.
|
||||||
|
* Useful for benchmarking specific QPU paths in isolation.
|
||||||
|
*
|
||||||
|
* A non-AUTO selection on a host that can't satisfy it
|
||||||
|
* (e.g. DAEDALUS_DECODER_SUBSTRATE_QPU on an x86 dev box) propagates a
|
||||||
|
* dispatch failure back through flush_frame as -3.
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
typedef enum { /* backend selector accepted by daedalus_decoder_set_substrate() */
|
||||||
|
DAEDALUS_DECODER_SUBSTRATE_AUTO = 0, /* defer to daedalus-fourier's recipe table; the value a new decoder starts with */
|
||||||
|
DAEDALUS_DECODER_SUBSTRATE_CPU = 1, /* force the NEON path, even when V3D7 is available */
|
||||||
|
DAEDALUS_DECODER_SUBSTRATE_QPU = 2, /* force the QPU path, even on a CPU-preferred kernel */
|
||||||
|
} daedalus_decoder_substrate;
|
||||||
|
|
||||||
/* -------------------------------------------------------------------
|
/* -------------------------------------------------------------------
|
||||||
* Lifecycle
|
* Lifecycle
|
||||||
* ----------------------------------------------------------------- */
|
* ----------------------------------------------------------------- */
|
||||||
@@ -128,6 +155,12 @@ void daedalus_decoder_destroy(daedalus_decoder *dec);
|
|||||||
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
|
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
|
||||||
daedalus_decoder_output_format fmt);
|
daedalus_decoder_output_format fmt);
|
||||||
|
|
||||||
|
/* Override the dispatch substrate for subsequent flush_frame calls.
|
||||||
|
* Default is AUTO. Same mid-frame-change restriction as
|
||||||
|
* set_output_format.
*
* Returns 0 on success. Returns -1, leaving the current setting
* untouched, when dec is NULL, when the change is rejected as
* mid-frame (the implementation tests mbs_appended != 0), or when
* sub is not one of the three daedalus_decoder_substrate values. */
|
||||||
|
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
|
||||||
|
daedalus_decoder_substrate sub);
|
||||||
|
|
||||||
/* -------------------------------------------------------------------
|
/* -------------------------------------------------------------------
|
||||||
* Per-frame submission
|
* Per-frame submission
|
||||||
* ----------------------------------------------------------------- */
|
* ----------------------------------------------------------------- */
|
||||||
|
|||||||
+40
-11
@@ -41,6 +41,7 @@ daedalus_decoder *daedalus_decoder_create(int width, int height)
|
|||||||
dec->mb_height = height >> 4;
|
dec->mb_height = height >> 4;
|
||||||
dec->n_mbs = dec->mb_width * dec->mb_height;
|
dec->n_mbs = dec->mb_width * dec->mb_height;
|
||||||
dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
|
dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
|
||||||
|
dec->substrate = DAEDALUS_DECODER_SUBSTRATE_AUTO;
|
||||||
|
|
||||||
/* daedalus-fourier ctx — required. Phase 1 needs the QPU; if
|
/* daedalus-fourier ctx — required. Phase 1 needs the QPU; if
|
||||||
* Vulkan init fails the decoder is unusable. Caller can check
|
* Vulkan init fails the decoder is unusable. Caller can check
|
||||||
@@ -86,6 +87,33 @@ int daedalus_decoder_set_output_format(daedalus_decoder *dec,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Select the substrate that flush_frame passes to the daedalus-fourier
* dispatch calls.
*
* Returns 0 after storing sub in dec->substrate. Returns -1 without
* modifying the decoder when dec is NULL, when dec->mbs_appended is
* non-zero, or when sub is not AUTO, CPU or QPU. */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
|
||||||
|
daedalus_decoder_substrate sub)
|
||||||
|
{
|
||||||
|
if (!dec) /* nothing to configure */
|
||||||
|
return -1;
|
||||||
|
if (dec->mbs_appended != 0) /* NOTE(review): presumably "a frame is in progress" — confirm where mbs_appended is reset */
|
||||||
|
return -1;
|
||||||
|
if (sub != DAEDALUS_DECODER_SUBSTRATE_AUTO &&
|
||||||
|
sub != DAEDALUS_DECODER_SUBSTRATE_CPU &&
|
||||||
|
sub != DAEDALUS_DECODER_SUBSTRATE_QPU) /* reject values outside the enum, e.g. an integer cast to the enum type */
|
||||||
|
return -1;
|
||||||
|
dec->substrate = sub; /* only reached once every check above has passed */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Map our public substrate enum onto daedalus-fourier's. Same
|
||||||
|
* ordering by intent — we duplicate the enum for ABI isolation.
*
* The switch translates by enumerator name rather than by casting, so
* it does not rely on the two enums sharing numeric values. AUTO and
* any value not listed here both map to DAEDALUS_SUBSTRATE_AUTO. */
|
||||||
|
static daedalus_substrate map_substrate(daedalus_decoder_substrate s)
|
||||||
|
{
|
||||||
|
switch (s) {
|
||||||
|
case DAEDALUS_DECODER_SUBSTRATE_CPU: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
|
case DAEDALUS_DECODER_SUBSTRATE_QPU: return DAEDALUS_SUBSTRATE_QPU;
|
||||||
|
case DAEDALUS_DECODER_SUBSTRATE_AUTO: /* shares the default arm below */
|
||||||
|
default: return DAEDALUS_SUBSTRATE_AUTO; /* unrecognised values fall back to AUTO instead of failing */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int daedalus_decoder_append_mb(daedalus_decoder *dec,
|
int daedalus_decoder_append_mb(daedalus_decoder *dec,
|
||||||
const struct daedalus_decoder_mb_input *mb)
|
const struct daedalus_decoder_mb_input *mb)
|
||||||
{
|
{
|
||||||
@@ -267,16 +295,17 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
|||||||
* Skipping the dispatch when the partition is empty avoids the
|
* Skipping the dispatch when the partition is empty avoids the
|
||||||
* shader-pool warm-up cost on the common case (a typical Baseline
|
* shader-pool warm-up cost on the common case (a typical Baseline
|
||||||
* stream is all-4x4 → 8x8 dispatch is no-op). */
|
* stream is all-4x4 → 8x8 dispatch is no-op). */
|
||||||
|
const daedalus_substrate sub = map_substrate(dec->substrate);
|
||||||
if (bi4 > 0) {
|
if (bi4 > 0) {
|
||||||
int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
|
int dr = daedalus_dispatch_h264_idct4(dec->dctx, sub,
|
||||||
scratch_y, y_stride_int,
|
scratch_y, y_stride_int,
|
||||||
coeffs4, bi4, meta4);
|
coeffs4, bi4, meta4);
|
||||||
if (dr != 0) { rc = -3; goto cleanup; }
|
if (dr != 0) { rc = -3; goto cleanup; }
|
||||||
}
|
}
|
||||||
if (bi8 > 0) {
|
if (bi8 > 0) {
|
||||||
int dr = daedalus_recipe_dispatch_h264_idct8(dec->dctx,
|
int dr = daedalus_dispatch_h264_idct8(dec->dctx, sub,
|
||||||
scratch_y, y_stride_int,
|
scratch_y, y_stride_int,
|
||||||
coeffs8, bi8, meta8);
|
coeffs8, bi8, meta8);
|
||||||
if (dr != 0) { rc = -3; goto cleanup; }
|
if (dr != 0) { rc = -3; goto cleanup; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -361,11 +390,11 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
|
|||||||
}
|
}
|
||||||
/* assert cbi == n_chroma_blocks; loop math guarantees it */
|
/* assert cbi == n_chroma_blocks; loop math guarantees it */
|
||||||
|
|
||||||
int cr_rc = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
|
int cr_rc = daedalus_dispatch_h264_idct4(dec->dctx, sub,
|
||||||
scratch_uv, chroma_w,
|
scratch_uv, chroma_w,
|
||||||
chroma_coeffs,
|
chroma_coeffs,
|
||||||
n_chroma_blocks,
|
n_chroma_blocks,
|
||||||
chroma_meta);
|
chroma_meta);
|
||||||
if (cr_rc != 0) {
|
if (cr_rc != 0) {
|
||||||
rc = -3;
|
rc = -3;
|
||||||
goto chroma_cleanup;
|
goto chroma_cleanup;
|
||||||
|
|||||||
@@ -64,6 +64,9 @@ struct daedalus_decoder {
|
|||||||
|
|
||||||
/* Output format. */
|
/* Output format. */
|
||||||
daedalus_decoder_output_format output_fmt;
|
daedalus_decoder_output_format output_fmt;
|
||||||
|
|
||||||
|
/* Dispatch substrate (AUTO by default — recipe-table-driven). */
|
||||||
|
daedalus_decoder_substrate substrate;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* DAEDALUS_DECODER_INTERNAL_H */
|
#endif /* DAEDALUS_DECODER_INTERNAL_H */
|
||||||
|
|||||||
@@ -166,6 +166,23 @@ int main(int argc, char **argv)
|
|||||||
uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
|
uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
|
||||||
xs64_state = seed;
|
xs64_state = seed;
|
||||||
|
|
||||||
|
/* Optional 4th argv: "auto" (default) / "cpu" / "qpu" to pin the
|
||||||
|
* dispatch substrate. Both substrates must produce IDENTICAL
|
||||||
|
* output (the V3D shaders are bit-exact gates against the same
|
||||||
|
* spec the NEON path implements); the ctest suite runs the QVGA
|
||||||
|
* test once per substrate to catch any silent drift. */
|
||||||
|
daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
|
||||||
|
const char *sub_name = "auto";
|
||||||
|
if (argc > 4) {
|
||||||
|
if (!strcmp(argv[4], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; }
|
||||||
|
else if (!strcmp(argv[4], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; }
|
||||||
|
else if (!strcmp(argv[4], "auto")) { /* default */ }
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[4]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int mb_w = width / 16;
|
int mb_w = width / 16;
|
||||||
int mb_h = height / 16;
|
int mb_h = height / 16;
|
||||||
int n_mbs = mb_w * mb_h;
|
int n_mbs = mb_w * mb_h;
|
||||||
@@ -177,6 +194,11 @@ int main(int argc, char **argv)
|
|||||||
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
|
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
if (daedalus_decoder_set_substrate(dec, sub) != 0) {
|
||||||
|
fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
printf("substrate: %s\n", sub_name);
|
||||||
|
|
||||||
/* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of
|
/* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of
|
||||||
* random coeffs in [-512, 511] — same range as the daedalus-fourier
|
* random coeffs in [-512, 511] — same range as the daedalus-fourier
|
||||||
|
|||||||
@@ -52,6 +52,16 @@ int main(void)
|
|||||||
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_NV12) == 0,
|
EXPECT(daedalus_decoder_set_output_format(dec, DAEDALUS_DECODER_OUTPUT_NV12) == 0,
|
||||||
"switch back to NV12");
|
"switch back to NV12");
|
||||||
|
|
||||||
|
/* Substrate setter — same lifecycle rules. */
|
||||||
|
EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_CPU) == 0,
|
||||||
|
"force CPU substrate on virgin ctx");
|
||||||
|
EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_QPU) == 0,
|
||||||
|
"force QPU substrate on virgin ctx");
|
||||||
|
EXPECT(daedalus_decoder_set_substrate(dec, DAEDALUS_DECODER_SUBSTRATE_AUTO) == 0,
|
||||||
|
"back to AUTO");
|
||||||
|
EXPECT(daedalus_decoder_set_substrate(dec, (daedalus_decoder_substrate) 99) == -1,
|
||||||
|
"bogus substrate rejects");
|
||||||
|
|
||||||
/* Append rejects out-of-bounds + null inputs. */
|
/* Append rejects out-of-bounds + null inputs. */
|
||||||
int16_t coeffs[384] = {0};
|
int16_t coeffs[384] = {0};
|
||||||
struct daedalus_decoder_mb_input mb = {0};
|
struct daedalus_decoder_mb_input mb = {0};
|
||||||
|
|||||||
Reference in New Issue
Block a user