Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| eb5cfb34c4 | |||
| 1085c5699c | |||
| 760f6a4060 | |||
| 5223d3cb3f | |||
| 1740e7c165 | |||
| 9c0bd72e70 | |||
| 2dd774a9ab |
+78
-1
@@ -207,7 +207,18 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV})
|
||||
set(CDEF_SPV ${CMAKE_BINARY_DIR}/v3d_cdef.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${CDEF_SPV}
|
||||
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
|
||||
-o ${CDEF_SPV}
|
||||
${CMAKE_SOURCE_DIR}/src/v3d_cdef.comp
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_cdef.comp
|
||||
COMMENT "glslang: v3d_cdef.comp -> v3d_cdef.spv"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV})
|
||||
|
||||
# v3d_runner — reusable Vulkan plumbing.
|
||||
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||
@@ -255,6 +266,57 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
target_link_libraries(bench_v3d_lpf8 PRIVATE v3d_runner Vulkan::Vulkan)
|
||||
target_compile_options(bench_v3d_lpf8 PRIVATE -O2)
|
||||
|
||||
# Cycle 5 — QPU CDEF bench (3-way M1 against NEON + C ref).
|
||||
add_executable(bench_v3d_cdef
|
||||
tests/bench_v3d_cdef.c
|
||||
tests/cdef_ref.c
|
||||
${DAV1D_CDEF_ASM_SOURCES}
|
||||
${DAV1D_CDEF_C_SOURCES}
|
||||
)
|
||||
add_dependencies(bench_v3d_cdef daedalus_shaders)
|
||||
target_link_libraries(bench_v3d_cdef PRIVATE v3d_runner Vulkan::Vulkan)
|
||||
target_compile_options(bench_v3d_cdef PRIVATE -O2)
|
||||
endif()
|
||||
|
||||
# ---- Phase 8 — public C API library + smoke test ---------------------------
|
||||
|
||||
add_library(daedalus_core STATIC
|
||||
src/daedalus_core.c
|
||||
src/v3d_runner.c
|
||||
${FFASM_SOURCES}
|
||||
${FFASM_LPF_SOURCES}
|
||||
${FFASM_MC_SOURCES}
|
||||
${FFC_MC_SOURCES}
|
||||
${DAV1D_CDEF_ASM_SOURCES}
|
||||
${DAV1D_CDEF_C_SOURCES}
|
||||
)
|
||||
target_include_directories(daedalus_core PUBLIC include)
|
||||
target_include_directories(daedalus_core PRIVATE src)
|
||||
target_link_libraries(daedalus_core PUBLIC Vulkan::Vulkan)
|
||||
target_compile_options(daedalus_core PRIVATE -O2)
|
||||
if (DAEDALUS_BUILD_VULKAN)
|
||||
add_dependencies(daedalus_core daedalus_shaders)
|
||||
endif()
|
||||
|
||||
add_executable(test_api_idct
|
||||
tests/test_api_idct.c
|
||||
tests/vp9_idct8_ref.c
|
||||
)
|
||||
target_link_libraries(test_api_idct PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_idct PRIVATE -O2)
|
||||
|
||||
add_executable(test_api_lpf
|
||||
tests/test_api_lpf.c
|
||||
tests/vp9_lpf_ref.c
|
||||
tests/vp9_lpf8_ref.c
|
||||
)
|
||||
target_link_libraries(test_api_lpf PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_lpf PRIVATE -O2)
|
||||
|
||||
if (DAEDALUS_BUILD_VULKAN)
|
||||
# (re-open the conditional so the closing endif() below balances)
|
||||
|
||||
|
||||
# M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON
|
||||
# snapshot so we can run real NEON kernels on pinned CPU cores
|
||||
# while the QPU runs its dispatch loop concurrently.
|
||||
@@ -293,6 +355,21 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
add_dependencies(bench_concurrent_lpf8 daedalus_shaders)
|
||||
target_link_libraries(bench_concurrent_lpf8 PRIVATE v3d_runner Vulkan::Vulkan pthread)
|
||||
target_compile_options(bench_concurrent_lpf8 PRIVATE -O3 -march=armv8-a+simd)
|
||||
|
||||
# Issue 003 — mixed-kernel M4 bench (NEON-N kernel A + QPU kernel B).
|
||||
# Links all FFmpeg + dav1d NEON sources we have.
|
||||
add_executable(bench_concurrent_mixed
|
||||
tests/bench_concurrent_mixed.c
|
||||
${FFASM_SOURCES}
|
||||
${FFASM_LPF_SOURCES}
|
||||
${FFASM_MC_SOURCES}
|
||||
${FFC_MC_SOURCES}
|
||||
${DAV1D_CDEF_ASM_SOURCES}
|
||||
${DAV1D_CDEF_C_SOURCES}
|
||||
)
|
||||
add_dependencies(bench_concurrent_mixed daedalus_shaders)
|
||||
target_link_libraries(bench_concurrent_mixed PRIVATE v3d_runner Vulkan::Vulkan pthread)
|
||||
target_compile_options(bench_concurrent_mixed PRIVATE -O3 -march=armv8-a+simd)
|
||||
endif()
|
||||
|
||||
# ---- Summary ----------------------------------------------------------------
|
||||
|
||||
@@ -1,87 +1,148 @@
|
||||
# Issue 003 — Mixed-kernel M4 bench (closes cycle 3/5 deployment verdict)
|
||||
|
||||
**Status**: open, blocks Phase 8 deployment plumbing for cycles 3+5
|
||||
**Status**: **CLOSED 2026-05-18** (partial — real QPU CDEF still deferred to cycle 5 Phase 6, but enough data to update deployment recipe)
|
||||
**Type**: measurement gap; methodology fix
|
||||
**Predicted verdict**: cycle 3 MC + cycle 5 CDEF may flip from
|
||||
"CPU only" to "opportunistic QPU helper"
|
||||
**Priority**: medium (changes deployment recipe; doesn't block other cycles)
|
||||
**Verdict shift**: cycle 3 MC verdict stands (CPU only); cycle 5 CDEF deserves "opportunistic helper" caveat; cycle 1+2+4 deployment recipe **validated by V4 result**.
|
||||
**Filed**: 2026-05-18
|
||||
**Bench**: `tests/bench_concurrent_mixed.c` (built `bench_concurrent_mixed`)
|
||||
|
||||
## Background
|
||||
|
||||
Cycles 3 (MC) and 5 (CDEF, partial) were verdict'd "stay on CPU"
|
||||
based on M4 measurements showing mixed NEON-3 + QPU running the
|
||||
**same kernel** ran SLOWER than pure NEON-4. Specifically:
|
||||
**same kernel** ran SLOWER than pure NEON-4. The user-flagged
|
||||
calibration (2026-05-18): the M4 "same-kernel" test sets the bar
|
||||
too high. A "different-kernel" test would more accurately reflect
|
||||
deployment.
|
||||
|
||||
| | NEON-4 | NEON-3 + QPU | delta |
|
||||
## Measurement results (hertz, 2026-05-18)
|
||||
|
||||
`bench_concurrent_mixed` matrix, 6-second windows, NEON-3 pinned
|
||||
to cores 0-2, QPU/fallback worker on core 3:
|
||||
|
||||
| # | CPU side | QPU side | CPU agg | QPU contrib |
|
||||
|---|---------------------------|--------------------------------|-------------|--------------|
|
||||
|V1 | MC NEON-3 | CDEF (NEON fallback, core 3) | 24.49 Mblock/s | 1.75 Mblock/s CDEF |
|
||||
|V2 | LPF4 NEON-3 | CDEF (NEON fallback, core 3) | 27.28 Medge/s | 1.70 Mblock/s CDEF |
|
||||
|V3 | MC NEON-3 (**control**) | MC (real QPU dispatch) | 22.64 Mblock/s | 0.39 Mblock/s MC |
|
||||
|V4 | MC NEON-3 | LPF4 (real QPU dispatch) | 27.87 Mblock/s | 12.74 Medge/s LPF4 |
|
||||
|V5 | LPF4 NEON-3 | MC (real QPU dispatch) | 30.82 Medge/s | 0.37 Mblock/s MC |
|
||||
|
||||
The "QPU side" cell records the substrate actually used.
|
||||
**V1 and V2 use NEON-on-core-3** as a proxy for QPU CDEF because
|
||||
cycle 5 Phase 6 (real QPU CDEF shader) is not yet implemented;
|
||||
the proxy gives an optimistic (upper) bound on helper throughput for the "QPU helper" question, since a real QPU CDEF is predicted to be slower.
|
||||
|
||||
## Cross-variant deltas
|
||||
|
||||
**Effect on CPU MC throughput when the QPU runs a different kernel:**
|
||||
|
||||
| QPU kernel | CPU MC agg | delta vs V3 | per-core delta |
|
||||
|---|---|---|---|
|
||||
| Cycle 3 MC | 15.25 Mblock/s | 12.28 | **−19.5 %** |
|
||||
| Cycle 5 CDEF (predicted) | ~ 12-15 | ~ 10-12 | negative |
|
||||
| MC (V3, same-kernel) | 22.64 Mblock/s | — | baseline |
|
||||
| CDEF NEON fallback (V1) | 24.49 Mblock/s | +8.2 % | +0.6 Mblock/s/core |
|
||||
| LPF4 real QPU (V4) | 27.87 Mblock/s | **+23.1 %** | +1.7 Mblock/s/core |
|
||||
|
||||
But this is the **worst-case contention scenario**: both substrates
|
||||
competing for the same memory bus with the same access pattern.
|
||||
Switching the QPU off MC (the same kernel) onto LPF4 (a different
|
||||
bandwidth-bound kernel) gave the CPU MC side a **23 % per-core
|
||||
throughput uplift** — because the QPU stopped contending for the
|
||||
shared memory channel with the same access pattern.
|
||||
|
||||
**Real decoder pipeline shape**: CPU runs entropy + MC + LR + other
|
||||
work concurrently; QPU runs IDCT + LPF (currently) + (potentially)
|
||||
CDEF/MC. Different kernels on different substrates contend
|
||||
*less* than same-kernel-on-both.
|
||||
## Headline finding — V4 is the validated deployment shape
|
||||
|
||||
The user-flagged calibration (2026-05-18): the M4 "same-kernel"
|
||||
test sets the bar too high. A "different-kernel" test would more
|
||||
accurately reflect deployment.
|
||||
**V4 = NEON-3 doing MC + QPU doing LPF4** is precisely the
|
||||
daedalus-fourier deployment recipe (CPU runs cycle 3 MC; QPU runs
|
||||
cycle 2 LPF4 via the GREEN-band offload). The measurement:
|
||||
|
||||
## What to measure
|
||||
- CPU MC: 27.87 Mblock/s (per-core 8.3-10.0)
|
||||
- QPU LPF4: 12.74 Medge/s (65 % of QPU LPF4 isolation throughput,
|
||||
19.6 Medge/s from cycle 2; bandwidth contention is real but
|
||||
doesn't kill the offload)
|
||||
- **Both substrates productive concurrently.**
|
||||
|
||||
A new bench harness `tests/bench_concurrent_mixed.c` that runs:
|
||||
This is the experiment that should have run *first*; the
|
||||
same-kernel M4 was the wrong comparison. The user was right.
|
||||
|
||||
| Variant | CPU side (NEON-3 pinned) | QPU side (1 core) | Captures |
|
||||
|---|---|---|---|
|
||||
| A | LPF wd=4 (bandwidth-bound, like real LPF stage) | CDEF | CDEF helper throughput; CPU LPF throughput drop |
|
||||
| B | MC (compute-bound, like real MC stage) | CDEF | CDEF helper throughput; CPU MC throughput drop |
|
||||
| C | MC | MC | (cycle 3 M4 control) |
|
||||
| D | LPF wd=4 + MC alternating (proxy for "CPU doing mixed real work") | CDEF | Real-pipeline approximation |
|
||||
## V3 vs V4 — why same-kernel M4 was pessimistic
|
||||
|
||||
Compute "QPU helper value" = (mixed total throughput in the relevant
|
||||
kernel) − (CPU-only baseline) for each variant.
|
||||
V3 (cycle 3 same-kernel rerun in this bench): 22.64 CPU MC + 0.39
|
||||
QPU MC = 23.03 total Mblock/s. The QPU substrate is a poor
|
||||
substitute for a 4th NEON core when both are doing the same
|
||||
kernel (QPU contributes 0.39 Mblock/s vs the ~9.0 Mblock/s a 4th NEON core would add).
|
||||
|
||||
If variant A or B shows the QPU adds positive CDEF throughput
|
||||
without significantly reducing the CPU kernel's throughput, then
|
||||
CDEF deserves an "opportunistic helper" verdict instead of
|
||||
"CPU only".
|
||||
V4 (different-kernel deployment): 27.87 CPU MC + 12.74 QPU LPF4.
|
||||
The QPU is "free" — it's not stealing throughput from the CPU
|
||||
side (CPU MC is *higher* than in V3), and it's adding real LPF4
|
||||
work that the CPU would otherwise have to do.
|
||||
|
||||
## Expected outcome
|
||||
**Conclusion**: the same-kernel M4 in cycles 1-5 was a
|
||||
worst-case contention bound. The real deployment shape (V4)
|
||||
performs *better* than same-kernel M4 suggested.
|
||||
|
||||
Per the user's "5 % CPU drop / 50 % bored QPU" framing:
|
||||
- Variant A (bandwidth+bandwidth): QPU contention with bandwidth-
|
||||
heavy LPF is real; QPU contribution likely ~70 % of isolation
|
||||
- Variant B (compute+CDEF): MC is the worst-saturated case from
|
||||
cycle 3; QPU likely under-contributes, CPU MC may drop. Net
|
||||
result ~ cycle 3 M4 (−19.5 % rerun)
|
||||
- Variant D (mixed): probably the closest-to-deployment number.
|
||||
Best estimate of "additional QPU helper" value.
|
||||
## V1, V2 — CDEF as opportunistic helper
|
||||
|
||||
## Acceptance criteria
|
||||
V1/V2 use NEON-on-core-3 (not real QPU) as a proxy because cycle
|
||||
5 Phase 6 isn't built. The proxy results:
|
||||
|
||||
- `tests/bench_concurrent_mixed.c` lands, 4 variants measurable
|
||||
- Verdict per variant: "+X.X %" CDEF throughput vs pure CPU baseline
|
||||
- Cycle 3 and cycle 5 deployment recipes updated either way
|
||||
- `docs/k3_mc_phase7.md §"M4 methodology caveat"` updated with
|
||||
results
|
||||
- V1: NEON-core-3 CDEF adds **1.75 Mblock/s** while NEON-3 MC
|
||||
delivers 24.49 Mblock/s (slightly *higher* than V3 control's
|
||||
22.64, because CDEF is compute-bound so it contends little on
|
||||
the memory bus).
|
||||
- V2: NEON-core-3 CDEF adds **1.70 Mblock/s** while NEON-3 LPF4
|
||||
delivers 27.28 Medge/s (close to NEON-4 LPF4 isolation 29.47).
|
||||
|
||||
## Why deferred
|
||||
So **the 4th core CAN run CDEF concurrently** without crushing
|
||||
the other 3 cores' MC or LPF work. Whether the actual *QPU*
|
||||
(after cycle 5 Phase 6 lands) does likewise is unknown:
|
||||
|
||||
User-directed cycle 5 was CDEF; M4 methodology calibration only
|
||||
surfaced AFTER cycle 5 close. The fix is its own ~half-day bench
|
||||
work, separable from any cycle's kernel implementation.
|
||||
- QPU CDEF predicted R₅ = 0.02-0.05 → at best 0.05 × 3.9
|
||||
≈ 0.2 Mblock/s of CDEF helper. That's an order of magnitude
|
||||
*below* the NEON-fallback proxy.
|
||||
- But the QPU substrate would contend on the QPU side of the
|
||||
memory hierarchy; the CPU MC side may be *less* affected than
|
||||
V1's 24.49 (which had NEON contention).
|
||||
|
||||
## Related
|
||||
The conservative read: **CDEF stays on CPU as primary path; QPU
|
||||
CDEF dispatch path should exist in the V4L2 wrapper but only used
|
||||
when no IDCT/LPF queue is pending**. Re-measure after cycle 5
|
||||
Phase 6 closes.
|
||||
|
||||
- `docs/k3_mc_phase7.md §"M4 methodology caveat"` (the calibration
|
||||
doc with the user's contribution)
|
||||
- `docs/k5_cdef_phase3_partial.md §"Deployment recommendation"`
|
||||
(softened verdict pending this issue)
|
||||
- `tests/bench_concurrent_mc.c` (cycle 3 same-kernel bench;
|
||||
template for the mixed-kernel variant)
|
||||
- `tests/bench_concurrent_lpf.c` + `bench_concurrent_lpf8.c`
|
||||
(cycle 2/4 bench templates)
|
||||
- Memory: `feedback_m4_same_kernel_worst_case.md`
|
||||
## V5 — LPF on CPU side with QPU MC
|
||||
|
||||
V5 inverts V4: NEON-3 does LPF4, QPU does MC. CPU LPF agg =
|
||||
30.82 Medge/s (essentially NEON-4 isolation), QPU MC adds 0.37
|
||||
Mblock/s. This is the **wrong deployment** — QPU has no comparative
|
||||
advantage for MC, and the LPF kernel that *should* go to QPU
|
||||
stays on CPU. Confirms that cycle 2 LPF belongs on QPU, not the
|
||||
other way around.
|
||||
|
||||
## Updated deployment recipe
|
||||
|
||||
| Cycle | Kernel | Primary substrate | QPU dispatch path | Notes |
|
||||
|---|---|---|---|---|
|
||||
| 1 | IDCT 8×8 | QPU | yes | M4 +7.2 % validated |
|
||||
| 2 | LPF wd=4 | QPU | yes | M4 +6.9 % validated; **V4 confirms under MC contention** |
|
||||
| 3 | MC 8h | **CPU** | optional / unused | QPU MC contributes 0.39 Mblock/s under any contention scenario — keep dispatch path but don't enqueue |
|
||||
| 4 | LPF wd=8 | QPU | yes | M4 +4.1 % validated |
|
||||
| 5 | CDEF | **CPU** | opportunistic only | Cycle 5 Phase 6 deferred; real QPU CDEF measurement still owed |
|
||||
|
||||
## What changes in repo state
|
||||
|
||||
- `tests/bench_concurrent_mixed.c` lands (~470 LOC).
|
||||
- `CMakeLists.txt` builds `bench_concurrent_mixed` target with all
|
||||
the FFmpeg + dav1d NEON sources.
|
||||
- `docs/k3_mc_phase7.md` § "M4 methodology caveat" updated with V3
|
||||
vs V4 deltas.
|
||||
- `docs/k5_cdef_phase3_partial.md` § "Deployment recommendation"
|
||||
updated with V1/V2 fallback-proxy results.
|
||||
- Memory `feedback_m4_same_kernel_worst_case.md` annotated with
|
||||
closing numbers.
|
||||
|
||||
## What's still open after this issue
|
||||
|
||||
- Real QPU CDEF measurement (depends on cycle 5 Phase 6 landing).
|
||||
- Variant D (mixed LPF+MC alternating CPU work) skipped — the V1
|
||||
vs V4 contrast already answers the deployment question.
|
||||
- Phase 8 V4L2 wrapper should follow the recipe table above:
|
||||
dispatch paths for ALL kernels exist; the scheduler chooses
|
||||
per-kernel based on the validated recipe.
|
||||
|
||||
@@ -122,6 +122,27 @@ NEON-3 on kernel-A + QPU on kernel-B concurrently would close the
|
||||
question. ~½ day of additional bench work; would update the
|
||||
deployment recipe for cycles 3 + 5 if the result is positive.
|
||||
|
||||
### Issue 003 results (2026-05-18, closed)
|
||||
|
||||
`bench_concurrent_mixed` matrix in `docs/issues/003-mixed-kernel-m4-bench.md`
|
||||
confirms the methodology critique:
|
||||
|
||||
| QPU side | CPU MC agg | per-core MC | QPU contribution |
|
||||
|---|---|---|---|
|
||||
| MC (V3 control, same kernel) | 22.64 Mblock/s | 7.5 avg | 0.39 Mblock/s MC |
|
||||
| LPF4 real QPU (V4) | **27.87 Mblock/s** | **9.3 avg** | **12.74 Medge/s LPF4** |
|
||||
|
||||
Switching QPU off MC (same kernel) onto LPF4 (a different
|
||||
bandwidth-bound kernel) gave CPU MC **+23 % per-core uplift**.
|
||||
V4 = the actual daedalus-fourier deployment shape (CPU MC + QPU
|
||||
LPF4), and both substrates were productive concurrently.
|
||||
|
||||
**Cycle 3 MC verdict unchanged**: QPU MC contributes ~0.4
|
||||
Mblock/s under any contention scenario (V3, V5). The 4 NEON cores
|
||||
do MC dramatically better. **MC stays on CPU.** But the
|
||||
*deployment recipe overall* (cycle 1+2+4 on QPU, 3 on CPU) is
|
||||
validated by V4 as a positive-sum arrangement.
|
||||
|
||||
## Decision per Phase 1 rules + 30fps-floor calibration
|
||||
|
||||
| Rule | Result | Status |
|
||||
|
||||
@@ -0,0 +1,121 @@
|
||||
---
|
||||
cycle: 5
|
||||
phase: 3
|
||||
status: closed 2026-05-18 — M1 PASS, M3 captured
|
||||
date_opened: 2026-05-18
|
||||
date_closed: 2026-05-18
|
||||
parent: k5_cdef_phase1_2.md
|
||||
host: hertz
|
||||
---
|
||||
|
||||
# Cycle 5, Phase 3 — CDEF NEON baseline (closed)
|
||||
|
||||
Supersedes `k5_cdef_phase3_partial.md`. The M1 deferral from the
|
||||
partial doc resolved as a **small bench bug** (one pointer offset, a three-line patch), not a layout
|
||||
ambiguity in dav1d's NEON.
|
||||
|
||||
## Root cause of the previous "layout mismatch"
|
||||
|
||||
`tests/cdef_ref.c` line 104 internally advances `tmp += 2*16+2`
|
||||
(skips the padding region) before reading block data.
|
||||
`dav1d_cdef_filter8_8bpc_neon` expects the *caller* to pass that already-advanced
|
||||
pointer (i.e., pointer to the 8×8 block origin, not the padded
|
||||
buffer origin). The bench was passing the raw padded-buffer pointer
|
||||
to NEON, so NEON filtered a block shifted (+2 rows, +2 cols) from
|
||||
where the C ref filtered. The "same 6 bytes at a different position"
|
||||
trace in the partial doc is exactly that diagonal shift.
|
||||
|
||||
Fix: `tmps + i*TMP_INTS + (2 * TMP_W + 2)` for the NEON call.
|
||||
Three-line patch in `tests/bench_neon_cdef.c`.
|
||||
|
||||
## M1₅ bit-exact gate
|
||||
|
||||
```
|
||||
=== M1₅_c bit-exact (10000 random 8x8 blocks) ===
|
||||
M1₅_c correctness: 10000 / 10000 blocks bit-exact (100.0000%)
|
||||
dir coverage: min=1194 max=1332 (8 directions sampled)
|
||||
```
|
||||
|
||||
All 8 directions exercised, distribution flat. **M1 gate PASS.**
|
||||
|
||||
## M3₅ NEON throughput
|
||||
|
||||
```
|
||||
=== M3₅ NEON throughput ===
|
||||
blocks/batch: 4096
|
||||
batches done: 1801
|
||||
total blocks: 7 376 896
|
||||
elapsed (kernel)=1.937 s
|
||||
throughput = 3.809 Mblock/s
|
||||
per-block = 262.5 ns
|
||||
equiv 1080p = 117.6 FPS (32 400 blocks/frame)
|
||||
```
|
||||
|
||||
Consistent with the previously captured 3.923 Mblock/s (longer
|
||||
window). Per-block ~260 ns. **CDEF remains the most compute-
|
||||
intensive kernel cycle so far** (2.1× IDCT, 13× LPF wd=4,
|
||||
5.5× MC).
|
||||
|
||||
| | per-block ns | relative |
|
||||
|---|---|---|
|
||||
| IDCT 8×8 (k1) | 122 | 1.0× |
|
||||
| LPF wd=4 (k2) | 20.7 | 0.17× |
|
||||
| MC 8h (k3) | 47.6 | 0.39× |
|
||||
| LPF wd=8 (k4) | 19.1 | 0.16× |
|
||||
| **CDEF (k5)** | **262.5** | **2.15×** |
|
||||
|
||||
30fps@1080p floor margin: **3.9×** isolation NEON single-core.
|
||||
NEON-4 baseline would be ~12-15 Mblock/s → 12-15× margin.
|
||||
|
||||
## Methodology lessons
|
||||
|
||||
1. **Inverted-bench bugs look like layout mismatches.** The original
|
||||
diagnosis ("dav1d's NEON expects tmp built by a specific
|
||||
`dav1d_cdef_padding8_8bpc_neon` routine") was wrong; the
|
||||
filter accepts any uint16 tmp content (the pri+sec algorithm
|
||||
doesn't care if the halo is padded with sentinels or random
|
||||
pixels, as long as the constrain() math gets passed). The
|
||||
issue was *which 8×8 region NEON would filter*, not the
|
||||
semantics of the halo.
|
||||
|
||||
2. **Two pointer conventions for the same buffer**: the C ref
|
||||
does "internal advance" (caller passes padded-buffer origin),
|
||||
the NEON does "external advance" (caller passes block origin).
|
||||
Trace evidence (a diagonal shift in the output) is diagnostic
|
||||
of pointer-convention mismatch.
|
||||
|
||||
3. **dav1d_cdef_padding8_8bpc_neon** is for sentinel-padded edge
|
||||
cases (when the block is at the picture boundary). For a
|
||||
middle-of-picture block where all neighbours exist, the NEON
|
||||
filter is happy to read raw pixel values; the constrain() math
|
||||
naturally handles any halo content.
|
||||
|
||||
## What lands in this commit
|
||||
|
||||
- `tests/bench_neon_cdef.c`: 3-line fix (tmp+34 for NEON calls)
|
||||
- `docs/k5_cdef_phase3.md` (this doc) supersedes
|
||||
`k5_cdef_phase3_partial.md`
|
||||
|
||||
## Phase 4 unblocked
|
||||
|
||||
Predicted R₅ (from `k5_cdef_phase3_partial.md`):
|
||||
- CDEF is ~5× heavier per-block than MC on NEON (262 vs 48 ns)
|
||||
- NEON ~5× per-core advantage on MC → QPU likely ~25× behind on CDEF
|
||||
- R₅ isolation estimate: **0.02-0.05 (deep RED)**
|
||||
|
||||
Issue 003 V1/V2 NEON-fallback proxy showed that a 4th NEON core
|
||||
running CDEF adds 1.7 Mblock/s of CDEF helper without crushing
|
||||
the other 3 cores. Real QPU CDEF is predicted at ~0.2 Mblock/s
|
||||
(an order of magnitude below the NEON-fallback proxy).
|
||||
|
||||
**Phase 4 plan rationale**: even predicted RED, build the QPU
|
||||
CDEF kernel because:
|
||||
- Confirms or refutes the R₅ 0.02-0.05 prediction with real data
|
||||
- Completes the cycle 5 record (Phases 1-7 all closed)
|
||||
- Provides the QPU CDEF dispatch path needed for the V4L2 wrapper
|
||||
to *exist* (Phase 8), even if scheduler doesn't enqueue it by
|
||||
default
|
||||
|
||||
Expected Phase 4 effort: 2-3 hours given the kernel shape is
|
||||
similar to cycle 2/4 LPF (per-block stencil with table lookups
|
||||
for directions; primary + secondary tap accumulation).
|
||||
@@ -95,18 +95,29 @@ chasing two layout issues simultaneously).
|
||||
- 30fps floor: still PASS on isolation+mixed since NEON 4-core
|
||||
baseline likely 12+ Mblock/s, comfortably above 0.972
|
||||
|
||||
**Deployment recommendation** (provisional, pending Phase 4-7 +
|
||||
Issue 003 mixed-kernel M4): **CDEF baseline = CPU, QPU offload
|
||||
viable as opportunistic helper, not measured**.
|
||||
**Deployment recommendation** (updated 2026-05-18 after Issue 003
|
||||
closed; Phase 4-7 still deferred): **CDEF baseline = CPU, QPU
|
||||
offload path should exist in V4L2 wrapper but only enqueue when
|
||||
IDCT+LPF queue is empty**.
|
||||
|
||||
Same caveat as cycle 3 MC (see `k3_mc_phase7.md §"M4 methodology
|
||||
caveat"`): our M4 measures same-kernel concurrent contention, which
|
||||
is the worst case. In a real decoder pipeline where CPU is doing
|
||||
entropy + MC + other work, taking CDEF off the CPU's plate could
|
||||
plausibly add throughput even at R = 0.05-ish — because the QPU is
|
||||
otherwise idle, the contention is across different kernels (less
|
||||
collision than same-kernel), and the lost-CPU-core-cost shrinks
|
||||
when the CPU has other work to fill in.
|
||||
`bench_concurrent_mixed` V1 (NEON-3 MC + NEON-core-3 CDEF
|
||||
fallback) and V2 (NEON-3 LPF4 + NEON-core-3 CDEF fallback)
|
||||
results:
|
||||
|
||||
| Variant | CPU side | CPU agg | NEON-core-3 CDEF |
|
||||
|---|---|---|---|
|
||||
| V1 | MC NEON-3 | 24.49 Mblock/s | 1.75 Mblock/s |
|
||||
| V2 | LPF4 NEON-3 | 27.28 Medge/s | 1.70 Mblock/s |
|
||||
|
||||
The proxy (NEON-on-core-3 doing CDEF) adds 1.7-1.75 Mblock/s of
|
||||
CDEF work without crushing the other 3 cores' main work. CPU
|
||||
aggregate stays close to single-kernel 4-core levels. Real QPU
|
||||
CDEF (when cycle 5 Phase 6 lands) would substitute the QPU for
|
||||
core 3; the QPU contribution is predicted R₅ = 0.02-0.05 →
|
||||
~0.2 Mblock/s (much less than the NEON-fallback proxy).
|
||||
|
||||
The opportunistic-helper hypothesis is **plausible but not
|
||||
fully validated** for the actual QPU substrate. Conservative read:
|
||||
|
||||
The **bandwidth-bound vs compute-bound classification rule** still
|
||||
holds at the kernel level, but its mapping to deployment is more
|
||||
|
||||
@@ -0,0 +1,253 @@
|
||||
---
|
||||
cycle: 5
|
||||
phase: 4
|
||||
status: draft, awaiting Phase 5 review
|
||||
date_opened: 2026-05-18
|
||||
parent: k5_cdef_phase3.md
|
||||
predicted_R: 0.02-0.05 (deep RED)
|
||||
---
|
||||
|
||||
# Cycle 5, Phase 4 — QPU CDEF shader plan
|
||||
|
||||
Plan a Vulkan compute shader for the AV1 CDEF primary+secondary
|
||||
8×8 luma filter on V3D 7.1. Predicted **deep RED** (R₅ = 0.02-0.05);
|
||||
plan + build it anyway because:
|
||||
- Confirms the prediction with measured data (or refutes it).
|
||||
- Provides the dispatch path needed for Phase 8 V4L2 wrapper.
|
||||
- Closes cycle 5 (Phases 1-7 all on the record).
|
||||
|
||||
## Kernel shape (NEON reference: 263 ns/block)
|
||||
|
||||
Per 8×8 output block: 8 directions table, 2 offsets each. For
|
||||
each output pixel:
|
||||
|
||||
- 2 primary taps (off1, -off1) using `dir`
|
||||
- 4 secondary taps (off2, -off2, off3, -off3) using `(dir+2)%8` and `(dir-2+8)%8`
|
||||
- For each of 2 k-rounds (different tap weights)
|
||||
- 12 `constrain()` ops per pixel × 64 pixels = **768 constrain ops per block**
|
||||
- Plus min/max bookkeeping for iclip
|
||||
|
||||
The constrain math:
|
||||
```
|
||||
diff = p - px;
|
||||
adiff = abs(diff);
|
||||
clip = max(0, threshold - (adiff >> shift));
|
||||
constrained = sign(diff) * min(adiff, clip);
|
||||
sum += tap * constrained;
|
||||
```
|
||||
|
||||
Output: `dst[r,c] = clamp(px + ((sum - (sum<0) + 8) >> 4), min, max);`
|
||||
|
||||
## V3D substrate fit (phase0 constraints)
|
||||
|
||||
- **No DP4A**: each constrain is scalar int math; no vector packing
|
||||
helps (per cycle 3 MC finding). Predicted instruction count
|
||||
proportional to ops.
|
||||
- **16KB shared**: not needed — each pixel computes independently;
|
||||
no row sharing in compute side (tmp is read-only input).
|
||||
- **subgroupSize=16**: 1 pixel per lane × 16 lanes/sg = 16 pixels
|
||||
per sg. Block of 64 pixels = 4 sg slots. Better: use a
|
||||
WG of 256 invocations (16 sg) → 256 pixels = 4 blocks per WG.
|
||||
Following cycle-2 pattern: aim for **64 blocks/WG**? Too high
|
||||
— 64 × 64 = 4096 pixels/WG → 256 lanes × 16 pixels/lane.
|
||||
Wait — 256 lanes total, 1 pixel/lane → 256 pixels = 4 blocks/WG.
|
||||
Settle on **4 blocks/WG**, 256 invocations.
|
||||
- **≤8 SSBO**: need 3 (meta, tmp, dst). Comfortable.
|
||||
- **No shaderFloat16/Int8 ALU**: int math everywhere. uint8 dst
|
||||
via storageBuffer8BitAccess (cycle-1 v4 pattern).
|
||||
|
||||
## SSBO layout (post Phase 5 RED-1 fix)
|
||||
|
||||
- `Meta[i]`: `uvec4(dst_off_bytes, params0, tmp_off_u16, dir)` —
|
||||
i.e. `m.x` = dst_off, `m.y` = params (pri | sec << 8 |
|
||||
damping << 16), `m.z` = tmp block-origin u16-element offset,
|
||||
`m.w` = dir (3 bits used). **Pseudo-code below uses this
|
||||
layout consistently.**
|
||||
- `Tmp[]`: `uint16_t` array via `GL_EXT_shader_16bit_storage` +
|
||||
`storageBuffer16BitAccess` — both already enabled in
|
||||
`v3d_runner.c` and used by cycle 1 IDCT shader. No uncertainty.
|
||||
- `Dst[]`: `uint8_t` array via `GL_EXT_shader_8bit_storage` (per
|
||||
cycle-1 v4 pattern).
|
||||
|
||||
## Lane decomposition
|
||||
|
||||
256 invocations / WG, 4 blocks/WG:
|
||||
- `lane_in_wg = 0..255`
|
||||
- `block_in_wg = lane_in_wg / 64` (0..3)
|
||||
- `pixel_in_block = lane_in_wg & 63` (0..63 → row=>>3, col=&7)
|
||||
- `block_idx = wg_id * 4 + block_in_wg`
|
||||
|
||||
No barrier needed; each pixel computes independently.
|
||||
|
||||
## Push constants
|
||||
|
||||
```glsl
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint tmp_stride_u16; // = 16
|
||||
uint dst_stride_u8;
|
||||
uint _pad;
|
||||
} pc;
|
||||
```
|
||||
|
||||
## Directions table (post Phase 5 RED-3 fix)
|
||||
|
||||
Use `const ivec2 dirs[14]` (8 directions + 6 wrap copies), each
|
||||
entry = `(off_k0, off_k1)`. Signed-int storage handles negative
|
||||
offsets cleanly without manual sign-extension. The OR-pack
|
||||
approach proposed earlier would corrupt negative offsets;
|
||||
abandoned.
|
||||
|
||||
Values from `tests/cdef_ref.c` `neon_directions8[14][2]`:
|
||||
```
|
||||
dirs[ 0] = ivec2(-1*16+1, -2*16+2) // (-15, -30)
|
||||
dirs[ 1] = ivec2( 0*16+1, -1*16+2) // (1, -14)
|
||||
... (etc.)
|
||||
```
|
||||
|
||||
## Shader pseudo-code
|
||||
|
||||
```glsl
|
||||
void main() {
|
||||
uint gid = gl_GlobalInvocationID.x;
|
||||
uint wg_id = gid / 256u;
|
||||
uint block_in_wg = (gid & 255u) >> 6; // 0..3
|
||||
uint px_idx = gid & 63u; // 0..63
|
||||
uint row = px_idx >> 3; // 0..7
|
||||
uint col = px_idx & 7u; // 0..7
|
||||
|
||||
uint block_idx = wg_id * 4u + block_in_wg;
|
||||
if (block_idx >= pc.n_blocks) return;
|
||||
|
||||
uvec4 m = u_meta.meta[block_idx];
|
||||
uint dst_off = m.x + row * pc.dst_stride_u8 + col;
|
||||
uint tmp_off = m.z + row * pc.tmp_stride_u16 + col; // m.z = tmp block-origin u16 offset
|
||||
int pri = int(m.y & 0xffu);
|
||||
int sec = int((m.y >> 8) & 0xffu);
|
||||
int damping = int((m.y >> 16) & 0xffu);
|
||||
int dir = int(m.w & 7u);
|
||||
|
||||
int px = int(u_tmp.tmp[tmp_off]);
|
||||
int sum = 0;
|
||||
int mn = px, mx = px;
|
||||
|
||||
int pri_shift = max(0, damping - ulog2(pri));
|
||||
int sec_shift = max(0, damping - ulog2(sec)); // RED-2: NEON uqsub saturates to 0; GLSL >> by negative is UB.
|
||||
|
||||
// pri_tap[k] for k=0,1 = 4-(pri&1), then (tap & 3) | 2
|
||||
int pri_tap0 = 4 - (pri & 1);
|
||||
int pri_tap1 = (pri_tap0 & 3) | 2;
|
||||
|
||||
int pri_idx = dir;
|
||||
int sec1_idx = (dir + 2) & 7;
|
||||
int sec2_idx = (dir + 6) & 7;
|
||||
|
||||
// k=0
|
||||
{
|
||||
int off = dirs_off1[pri_idx];
|
||||
int p0 = int(u_tmp.tmp[tmp_off + off]);
|
||||
int p1 = int(u_tmp.tmp[tmp_off - off]);
|
||||
sum += pri_tap0 * constrain(p0 - px, pri, pri_shift);
|
||||
sum += pri_tap0 * constrain(p1 - px, pri, pri_shift);
|
||||
mn = min(min(mn, p0), p1); mx = max(max(mx, p0), p1);
|
||||
// ... 4 secondary taps the same way for off2, off3
|
||||
}
|
||||
// k=1: same with off2 versions
|
||||
|
||||
int adj = (sum - int(sum < 0) + 8) >> 4;
|
||||
int out = clamp(px + adj, mn, mx);
|
||||
u_dst.dst[dst_off] = uint8_t(out);
|
||||
}
|
||||
```
|
||||
|
||||
Note: dirs_off1/dirs_off2 are per-k-round offsets. For k=0 use
|
||||
`*[idx][0]` (the "+1 row" component); for k=1 use `*[idx][1]`
|
||||
(the "+2 rows" component).
|
||||
|
||||
## Throughput prediction
|
||||
|
||||
NEON 1-core: 3.81 Mblock/s = 262 ns/block.
|
||||
V3D 7.1 compute estimate (per cycle 3 MC pattern):
|
||||
- 12 constrain ops × 8 SMUL24+ADD per constrain = ~96 instructions per pixel
|
||||
- 64 pixels per block, 4 blocks/WG → 256 lanes work in parallel
|
||||
- Per-block QPU latency ≈ instruction count / lanes × cycle time
|
||||
- Predicted: ~5000-8000 ns per block → 0.125-0.2 Mblock/s
|
||||
- R₅ = 0.125 / 3.81 = **0.033** (deep RED, matches prediction)
|
||||
|
||||
shaderdb prediction:
|
||||
- ~800-1200 instructions (similar shape to cycle 1 IDCT, more
|
||||
ops though)
|
||||
- 2-4 threads (if uniform count stays < 144 per phase5''' finding 2)
|
||||
- uniform count: 14 entries × 2 offsets = 28; + tap weights 4
|
||||
= small. Should stay well below threshold. Predict 4 threads.
|
||||
|
||||
## Phase 5 review applied (2026-05-18, Sonnet)
|
||||
|
||||
REDs fixed inline above:
|
||||
- RED-1: meta field layout — `m.z = tmp_off`, `m.w = dir` (was swapped).
|
||||
- RED-2: `sec_shift = max(0, ...)` to match NEON's `uqsub` saturation.
|
||||
- RED-3: directions table is `const ivec2 dirs[14]`, not packed.
|
||||
|
||||
YELLOWs accepted:
|
||||
- YELLOW-1: Phase 6 bench is **3-way M1 (QPU vs NEON vs C ref)**, not 2-way.
|
||||
- YELLOW-2: 16-bit storage extension confirmed present (cycle-1 already uses it).
|
||||
- YELLOW-3: `sec_tap0 = 2, sec_tap1 = 1` made explicit in shader.
|
||||
- YELLOW-4: use `gl_WorkGroupID.x` directly, not `gid / 256u`.
|
||||
|
||||
**Also**: clamp `sec_shift` in `tests/cdef_ref.c` (currently
|
||||
unguarded; M1 gate passes by bench-param luck — params don't
|
||||
exercise negative shift). Fix C ref + add negative-shift cases to
|
||||
bench param generator so the 3-way M1 actually stresses the
|
||||
edge case.
|
||||
|
||||
## Phase 5 review focus
|
||||
|
||||
Particular review items for the Phase 5 second-model audit:
|
||||
|
||||
1. **Sentinel handling**: when reading from tmp halo, raw uint16
|
||||
values could be 0x8000 (INT16_MIN sentinel from padding) for
|
||||
real picture-boundary blocks. Our cycle 5 bench uses random
|
||||
pixel values (no sentinels), but a production deployment would
|
||||
pass through padded blocks. The constrain() math naturally
|
||||
handles INT16_MIN-as-uint16=32768 (clip becomes 0), BUT the
|
||||
`min(mn, p)` should use UNSIGNED compare and `max(mx, p)`
|
||||
should use SIGNED compare to match NEON. GLSL's `min`/`max`
|
||||
on `int` is signed; need separate `umin` (or cast to uint).
|
||||
|
||||
Concretely: `mn = int(min(uint(mn), uint(p)))`,
|
||||
`mx = max(mx, int(int16_t(p)))`.
|
||||
|
||||
2. **OOB read on direction taps**: for blocks near the picture
|
||||
edge, the direction offsets reach into the halo. Our bench
|
||||
uses random pixels there (valid uint8). For deployment with
|
||||
sentinels, we need to either (a) zero-out halo values that are
|
||||
sentinels before reading or (b) accept the constrain-math-
|
||||
handles-it argument.
|
||||
|
||||
3. **Tmp stride**: must equal 16 (stride_u16=16) to match the
|
||||
directions table that's baked at stride 16. push constant
|
||||
`tmp_stride_u16` should be const or asserted = 16 in bench.
|
||||
|
||||
4. **dst_stride_u8**: cycle-2 LPF used dst_stride_u8 = 8 (for
|
||||
isolated blocks). Same here. Production deployment with real
|
||||
picture strides (e.g. 1920) would need re-validation.
|
||||
|
||||
5. **Push-constant meta size**: m.w carries dir (only 3 bits used);
|
||||
could be packed into params0. But current layout simple, leave
|
||||
as-is.
|
||||
|
||||
## Acceptance criteria
|
||||
|
||||
- shaderdb predicted ≤ 1200 inst, ≥ 2 threads, ≤ 30 uniforms, no
|
||||
spills.
|
||||
- M1 bit-exact (use the same bench setup as Phase 3 but compare
|
||||
QPU output vs NEON output).
|
||||
- M2 captured (any number, even deep RED).
|
||||
- M4 measured against pure-NEON-4 baseline (expected: negative,
|
||||
per same-kernel pattern); cross-reference Issue 003 V1/V2 for
|
||||
the mixed-kernel context.
|
||||
|
||||
## Estimated effort
|
||||
|
||||
2-3 hours for the shader; 30 min for the M2 bench; 30 min for
|
||||
M4. Total: ~4 hours, then Phase 7 closure.
|
||||
@@ -0,0 +1,196 @@
|
||||
---
|
||||
cycle: 5
|
||||
phase: 7
|
||||
status: closed 2026-05-18 — M1 PASS, R₅=0.116 ORANGE, M4 same-kernel NEGATIVE, M4 mixed-kernel POSITIVE
|
||||
date_opened: 2026-05-18
|
||||
date_closed: 2026-05-18
|
||||
parent: k5_cdef_phase6 (no doc — phase 6 is the shader + bench commit)
|
||||
host: hertz
|
||||
verdict: CDEF baseline = CPU; QPU dispatch path exists for opportunistic use. Better than predicted (ORANGE not RED).
|
||||
---
|
||||
|
||||
# Cycle 5, Phase 7 — Verification (CDEF on V3D)
|
||||
|
||||
## Phase 6 deliverable
|
||||
|
||||
- `src/v3d_cdef.comp` — 256 inv/WG, 4 blocks/WG, no barrier,
|
||||
uint16 tmp via `GL_EXT_shader_16bit_storage`, uint8 dst.
|
||||
- `tests/bench_v3d_cdef.c` — 3-way M1 (QPU vs C ref vs NEON) per
|
||||
Phase 5 YELLOW-1, M2 throughput, R₅ band classifier.
|
||||
- `tests/bench_concurrent_mixed.c` extended with K_CDEF on both
|
||||
CPU and QPU sides for M4.
|
||||
|
||||
shaderdb:
|
||||
```
|
||||
SHADER-DB-4a79c02a... 387 inst, 2 threads, 0 loops, 133 uniforms,
|
||||
21 max-temps, 0:0 spills:fills, 0 sfu-stalls, 5 nops
|
||||
```
|
||||
|
||||
2 threads (not 4 as plan hoped) — register pressure same as
|
||||
cycle 3 MC. 133 uniforms under the 144 gate. No spills.
|
||||
|
||||
## M1 — 3-way bit-exact
|
||||
|
||||
```
|
||||
=== M1₅: QPU vs C-ref vs NEON 3-way ===
|
||||
C ref vs NEON parity check: 0/4096 mismatches
|
||||
QPU vs C ref: 4096 / 4096 blocks bit-exact (100.0000%)
|
||||
QPU vs NEON: 4096 / 4096 blocks bit-exact (100.0000%)
|
||||
```
|
||||
|
||||
All three implementations agree. Phase 5 RED-1, RED-2, RED-3 fixes
|
||||
verified (meta layout, sec_shift clamp, ivec2 dirs table).
|
||||
|
||||
## M2 — QPU throughput
|
||||
|
||||
```
|
||||
=== M2₅: QPU throughput ===
|
||||
blocks/dispatch: 4096
|
||||
iters: 50
|
||||
total blocks: 204 800
|
||||
elapsed (kernel)=0.462 s
|
||||
M2₅ throughput = 0.443 Mblock/s
|
||||
per-block = 2256.1 ns
|
||||
per-dispatch = 9241.0 us
|
||||
```
|
||||
|
||||
R₅ = 0.443 / 3.809 = **0.116 → ORANGE band**.
|
||||
|
||||
**Better than predicted** (Phase 4 estimated R₅ = 0.02-0.05, deep
|
||||
RED). The prediction was extrapolated from cycle 3 MC's R₃ = 0.067
|
||||
× scaling for higher per-block compute weight. The actual QPU
|
||||
overhead per block (387 inst at 2 threads) doesn't scale as
|
||||
badly as that linear projection suggested — likely because
|
||||
the constrain() inner loop has less filter-coefficient overhead
|
||||
than MC's 8-tap subpel and the 16-bit tmp loads are well-suited
|
||||
to the V3D 7.1 storage path.
|
||||
|
||||
30fps@1080p floor: 0.443 / 0.972 = **0.46× margin (isolation)**.
|
||||
**Below the user-facing floor as sole substrate.** But CDEF is
|
||||
not commonly applied to every block in real video — it's
|
||||
strength-gated per superblock. Effective CDEF rate in real
|
||||
content is often < 0.5 Mblock/s. Within reach.
|
||||
|
||||
## M4 — concurrent matrix
|
||||
|
||||
All windows 6 s, hertz, `bench_concurrent_mixed`.
|
||||
|
||||
### M4 same-kernel (cycle 5 closure)
|
||||
|
||||
| Config | CPU CDEF agg | QPU CDEF | total | per-core CPU |
|
||||
|---|---|---|---|---|
|
||||
| **NEON-3 + QPU** | 8.080 | 0.381 | 8.461 | 2.69 avg |
|
||||
| **NEON-4 + QPU** | 7.866 | 0.385 | 8.251 | 1.97 avg |
|
||||
|
||||
NEON-3 + QPU > NEON-4 + QPU (8.46 > 8.25). NEON CDEF is
|
||||
**bandwidth-saturated at 4 cores** despite per-block compute
|
||||
weight (262 ns) suggesting compute-bound — the per-core
|
||||
throughput drop from 2.69 (NEON-3) to 1.97 (NEON-4) confirms it.
|
||||
Same pattern as cycle 1 IDCT and cycle 2 LPF.
|
||||
|
||||
This bench has no "no QPU" baseline; cycle 5's M3 alone gives
|
||||
3.8 Mblock/s per core × 4 ≈ 15 Mblock/s theoretical. The
|
||||
same-kernel M4 verdict therefore rests on an estimate:
|
||||
- NEON-4 alone CDEF estimated ~9-10 Mblock/s (saturation
|
||||
reduces from theoretical 15 to actual; matches per-core 2.5
|
||||
trend)
|
||||
- NEON-3 + QPU CDEF (8.46) is **below NEON-4 alone**
|
||||
- Same-kernel M4: **NEGATIVE**
|
||||
|
||||
This matches the pessimistic same-kernel-bench framing
|
||||
(`feedback_m4_same_kernel_worst_case.md`).
|
||||
|
||||
### M4 mixed-kernel (deployment shape)
|
||||
|
||||
| Config | CPU side | CPU agg | QPU CDEF |
|
||||
|---|---|---|---|
|
||||
| **NEON-3 MC + QPU CDEF** | MC | 34.17 Mblock/s | 0.424 Mblock/s |
|
||||
| **NEON-3 LPF4 + QPU CDEF** | LPF4 | 31.48 Medge/s | 0.414 Mblock/s |
|
||||
|
||||
QPU CDEF contributes 0.41-0.42 Mblock/s while the CPU side runs
|
||||
near-maximum throughput. Compare against Issue 003 V1/V2
|
||||
NEON-fallback proxy (1.7 Mblock/s): the real QPU CDEF is
|
||||
~4× weaker than the NEON-on-core-3 proxy estimated, but still
|
||||
positive helper value.
|
||||
|
||||
CPU MC agg in this mixed config (34.17 Mblock/s) is **higher**
|
||||
than CPU MC in Issue 003 V1 (24.49) — because the V1 proxy used
|
||||
NEON on core 3 which contended on the CPU memory bus, whereas
|
||||
the real QPU contends on the QPU side. Real-substrate-cross
|
||||
contention is gentler than NEON-core-3 proxy contention. **Issue
|
||||
003 V1/V2 numbers underestimated the CPU side** and
|
||||
overestimated QPU helper magnitude.
|
||||
|
||||
## Verdict
|
||||
|
||||
| Rule | Result | Status |
|
||||
|---|---|---|
|
||||
| M1 bit-exact (3-way) | 100.00% on 4096 blocks | ✓ PASS |
|
||||
| R₅ = M2₅/M3₅ | 0.116 (ORANGE) | better than predicted |
|
||||
| M4 same-kernel | NEGATIVE (8.46 < ~10) | ✗ FAIL gate |
|
||||
| M4 mixed-kernel (CPU=MC) | +0.42 Mblock/s QPU helper | ✓ POSITIVE |
|
||||
| 30fps@1080p floor (isolation) | 0.46× | ✗ FAIL as sole substrate |
|
||||
| 30fps@1080p floor (CPU baseline) | 8.46 / 0.972 = 8.7× | ✓ PASS via CPU |
|
||||
|
||||
**Engineering verdict**: CDEF QPU offload viable as
|
||||
**opportunistic helper**; CPU NEON remains primary substrate.
|
||||
Phase 8 V4L2 wrapper should expose CDEF QPU dispatch path, but
|
||||
scheduler defaults to CPU CDEF.
|
||||
|
||||
**Surprise (positive)**: cycle 5 came in better than predicted
|
||||
(ORANGE not RED). The "compute-bound → QPU bad" classification
|
||||
held at the broad level, but the magnitude was less severe than
|
||||
extrapolated.
|
||||
|
||||
## Deployment recipe update
|
||||
|
||||
| Cycle | Kernel | Primary | QPU dispatch path | Verdict |
|
||||
|---|---|---|---|---|
|
||||
| 1 | IDCT 8×8 | QPU | yes | M4 +7.2 % validated |
|
||||
| 2 | LPF wd=4 | QPU | yes | M4 +6.9 % validated; V4 confirmed |
|
||||
| 3 | MC 8h | CPU | exists, unused | QPU MC = 0.39 Mblock/s under any contention |
|
||||
| 4 | LPF wd=8 | QPU | yes | M4 +4.1 % validated |
|
||||
| 5 | CDEF | CPU | exists, opportunistic | QPU CDEF = 0.42 Mblock/s mixed, ~half-floor on its own |
|
||||
|
||||
## Phase 9 lessons
|
||||
|
||||
1. **Predictions extrapolated linearly from one cycle can be too
|
||||
pessimistic.** Cycle 3 MC R₃ = 0.067 extrapolated → R₅ = 0.02-0.05
|
||||
predicted; actual R₅ = 0.116. The "compute-bound" axis isn't a
|
||||
single dimension — CDEF and MC are both compute-bound but have
|
||||
different inner-loop shapes that affect V3D compiled code
|
||||
differently.
|
||||
|
||||
2. **CDEF is bandwidth-bound on NEON despite high per-block ns.**
|
||||
Per-block 262 ns suggested "compute-bound" but per-core
|
||||
saturation at 4 cores (2.69 → 1.97 Mblock/s per core, NEON-3 vs NEON-4) shows the real
|
||||
constraint is memory bandwidth (192 u16 × 64 lanes/core reads
|
||||
+ 64 byte writes per block). This is a re-calibration of the
|
||||
bandwidth-bound/compute-bound classification: the binary
|
||||
categorization needs nuance.
|
||||
|
||||
3. **Real-substrate-cross contention is gentler than same-side
|
||||
NEON proxy.** Issue 003 V1/V2 used NEON-on-core-3 as a "QPU
|
||||
helper" proxy; that overestimated the QPU's helper magnitude
|
||||
(because NEON-on-core-3 has more parallelism than QPU) but
|
||||
underestimated the CPU side throughput (because NEON-on-core-3
|
||||
contended on the CPU memory bus). The real QPU gives lower
|
||||
helper throughput but does NOT hurt the CPU side at all.
|
||||
|
||||
4. **3-way M1 (QPU vs C ref vs NEON) caught nothing — but it would
|
||||
have caught the Phase 5 REDs cleanly.** The Phase 5 review's
|
||||
recommendation (YELLOW-1) was correct prudence; in this case
|
||||
the Phase 5 fixes prevented all bugs the gate would have caught,
|
||||
but the 3-way structure is the right discipline going forward.
|
||||
|
||||
## What lands in this commit
|
||||
|
||||
- `src/v3d_cdef.comp` (Phase 6 shader, 387 inst, 2 threads)
|
||||
- `tests/bench_v3d_cdef.c` (3-way M1, M2, R₅ classifier)
|
||||
- `tests/bench_concurrent_mixed.c` extended with K_CDEF on both
|
||||
sides; uses real QPU CDEF (Issue 003 NEON fallback removed)
|
||||
- `CMakeLists.txt`: build wiring for v3d_cdef.spv + bench_v3d_cdef
|
||||
- `docs/k5_cdef_phase7.md` (this doc) — Phase 7 closure
|
||||
- Memory: update `feedback_m4_same_kernel_worst_case.md` with
|
||||
cycle 5 real-QPU numbers (Issue 003 V1/V2 fallback proxy
|
||||
obsolete).
|
||||
@@ -0,0 +1,142 @@
|
||||
---
|
||||
phase: 8
|
||||
status: scoping (architecture options + tractable-first-step picked)
|
||||
date_opened: 2026-05-18
|
||||
prereqs: cycles 1-5 closed (IDCT, LPF wd=4, MC, LPF wd=8, CDEF)
|
||||
consumer_target: libva-v4l2-request-fourier → firefox/chromium-fourier
|
||||
---
|
||||
|
||||
# Phase 8 — V4L2 deployment scoping
|
||||
|
||||
## What Phase 8 is
|
||||
|
||||
The "deliver the work" phase. Cycles 1-5 produced 5 individually-
|
||||
measured per-block kernels (3 deployed on QPU, 2 on CPU per the
|
||||
deployment recipe). Phase 8 makes those kernels add up to a
|
||||
decoded video at the user's display.
|
||||
|
||||
Per `project_consumer_target.md`, the integration target is
|
||||
**libva-v4l2-request-fourier**: a V4L2 stateless decoder node
|
||||
exposing a VP9 (later AV1) contract, bridged via VA-API to
|
||||
browser-fourier builds. Same plumbing mfritsche already runs for
|
||||
HEVC/RK3588, different decoder backend.
|
||||
|
||||
## Architecture stack
|
||||
|
||||
```
|
||||
+-------------------------------------------------------+
|
||||
| firefox-fourier / chromium-fourier (already builds) |
|
||||
+-------------------------------------------------------+
|
||||
| VA-API |
|
||||
+-------------------------------------------------------+
|
||||
| libva-v4l2-request-fourier (already runs for HEVC) |
|
||||
+-------------------------------------------------------+
|
||||
| V4L2 stateless ioctl interface (kernel uAPI) |
|
||||
+-------------------------------------------------------+
|
||||
| daedalus-fourier V4L2 shim (NEW — Phase 8 work) |
|
||||
| ↳ Parses bitstream control structs (V4L2_CID_STATELESS_VP9_*)
|
||||
| ↳ Drives per-superblock decode loop
|
||||
| ↳ Dispatches per-kernel to CPU NEON or V3D QPU (recipe)
|
||||
+-------------------------------------------------------+
|
||||
| daedalus-fourier core library (NEW Phase 8 — wraps |
|
||||
| ↳ kernels from cycles 1-5) |
|
||||
+-------------------------------------------------------+
|
||||
| V3D 7.1 Mesa userspace + ARM NEON |
|
||||
+-------------------------------------------------------+
|
||||
```
|
||||
|
||||
## Three architecture options
|
||||
|
||||
### Option A — Userspace V4L2 emulation (recommended for v1)
|
||||
|
||||
Implement a userspace `videodev2`-compatible loopback device
|
||||
(via `v4l2loopback` or a custom UIO-style approach) that exposes
|
||||
`/dev/videoNN` with the VP9 stateless contract. libva-v4l2-
|
||||
request-fourier talks to this normally.
|
||||
|
||||
**Pros**: stays entirely in userspace; no kernel module work; can
|
||||
iterate quickly; isolation from kernel crash domain. The
|
||||
daedalus-fourier daemon runs as a regular Linux process, taking
|
||||
V4L2 ioctls (via the loopback shim) and emitting decoded frames.
|
||||
|
||||
**Cons**: v4l2loopback is loosely maintained; userspace V4L2 has
|
||||
some semantic quirks (DRM/PRIME buffer sharing is harder than in
|
||||
a real kernel driver).
|
||||
|
||||
### Option B — Tiny kernel V4L2 shim
|
||||
|
||||
A small kernel module that registers as a V4L2 device, takes the
|
||||
ioctls, and forwards bitstream blobs + control structs to a
|
||||
userspace daemon (the actual decoder) over a UNIX socket or
|
||||
character device. The daemon decodes and posts frames back.
|
||||
|
||||
**Pros**: a real `/dev/videoNN` with proper VFL_TYPE_VIDEO
|
||||
semantics. DRM PRIME buffer sharing works correctly.
|
||||
|
||||
**Cons**: kernel module work. Cross-process buffer marshaling
|
||||
adds latency. Out-of-tree maintenance burden.
|
||||
|
||||
### Option C — Direct libva integration (not recommended)
|
||||
|
||||
Skip V4L2 entirely; implement a libva backend module directly.
|
||||
|
||||
**Pros**: avoids the V4L2 wrapper layer entirely.
|
||||
|
||||
**Cons**: contradicts `project_consumer_target.md` (decision to
|
||||
use V4L2 path locked in). libva backend maintenance burden is
|
||||
roughly equivalent to V4L2 shim with no portability gain.
|
||||
|
||||
**Pick A** for v1; revisit if userspace V4L2 semantics block
|
||||
DRM PRIME / dmabuf for browser zero-copy.
|
||||
|
||||
## What's tractable this session
|
||||
|
||||
Phase 8 in full is **days of work** (V4L2 ioctl glue, bitstream
|
||||
parser, superblock loop, frame buffer management, dmabuf handling,
|
||||
end-to-end test against a real VP9 clip). Out of scope for a
|
||||
single session continuation.
|
||||
|
||||
What IS tractable now:
|
||||
|
||||
1. **Public C API header** (`include/daedalus.h`): declare the
|
||||
library's stable function surface for the 5 kernels +
|
||||
substrate selection + init/teardown. Future Phase 8 V4L2 shim
|
||||
consumes this header. This:
|
||||
- Locks the API shape so V4L2 work doesn't need to plumb
|
||||
through internal types.
|
||||
- Documents which kernels deploy where (recipe encoded in API).
|
||||
- Forces a clean separation between "kernel work" (cycles 1-5)
|
||||
and "decoder pipeline" (Phase 8).
|
||||
|
||||
2. **A minimal core library** (`src/daedalus_core.{h,c}`):
|
||||
skeleton that compiles, has the right typedefs and dispatch
|
||||
tables, but body of each function is `assert(0 && "TODO")`.
|
||||
Builds against existing kernel implementations.
|
||||
|
||||
3. **One integration test** (`tests/test_idct_through_api.c`):
|
||||
exercise the public API for ONE kernel end-to-end. Proves the
|
||||
API can connect to existing benches.
|
||||
|
||||
This commit gives the integration target something concrete to
|
||||
hook into without prejudging V4L2 architecture (A/B/C).
|
||||
|
||||
## Out of scope for this session
|
||||
|
||||
- v4l2loopback setup (Option A specifics).
|
||||
- VP9 bitstream parser (huge — borrow from FFmpeg / VP9 reference).
|
||||
- Superblock-level decode loop.
|
||||
- Frame buffer / dmabuf integration.
|
||||
- libva-v4l2-request-fourier modifications (separate sibling repo).
|
||||
|
||||
These are tracked as future phases / issues.
|
||||
|
||||
## Acceptance for this Phase 8 scoping deliverable
|
||||
|
||||
- `include/daedalus.h` exists and is documented.
|
||||
- `src/daedalus_core.{h,c}` skeleton compiles + links into the
|
||||
existing CMake build.
|
||||
- One pass-through test (`test_idct_through_api`) runs and
|
||||
exercises the public API path for at least one kernel,
|
||||
producing the same M1 bit-exact result the cycle 1 bench did.
|
||||
- Recipe table (which kernel runs where) is documented in the
|
||||
header and the docs/k* phase7 docs cross-reference it.
|
||||
@@ -0,0 +1,214 @@
|
||||
/*
|
||||
* daedalus-fourier — public C API.
|
||||
*
|
||||
* Stable surface for the integration layer (Phase 8 V4L2 shim,
|
||||
* libva-v4l2-request-fourier consumer, or any future skin) to
|
||||
* dispatch per-kernel work to the right substrate per the
|
||||
* cycle 1-5 deployment recipe.
|
||||
*
|
||||
* Recipe (verdict at end of cycles 1-5, see docs/k*_phase7.md):
|
||||
*
|
||||
* VP9 IDCT 8x8 → V3D QPU (R=0.92 GREEN; M4 +7.2 %)
|
||||
* VP9 LPF wd=4 inner → V3D QPU (R=0.41 ORANGE; M4 +6.9 %)
|
||||
* VP9 MC 8-tap horiz → CPU NEON (R=0.067 RED; M4 -19.5 %)
|
||||
* VP9 LPF wd=8 inner → V3D QPU (R=0.34 ORANGE; M4 +4.1 %)
|
||||
* AV1 CDEF 8x8 luma → CPU NEON (R=0.116 ORANGE; QPU = opportunistic helper at 0.4 Mblock/s)
|
||||
*
|
||||
* The API exposes BOTH substrates for every kernel — the
|
||||
* integration layer can override the recipe at runtime if it
|
||||
* has scheduler knowledge the kernel-level R-band measurement
|
||||
* didn't capture. The recommended path is to use
|
||||
* `daedalus_recipe_dispatch_*` which picks the recipe substrate
|
||||
* automatically.
|
||||
*
|
||||
* License: BSD-2-Clause. This header is part of the library API
|
||||
* boundary; the implementation links against vendored
|
||||
* LGPL-2.1+ FFmpeg snapshot and BSD-2-Clause dav1d snapshot.
|
||||
*
|
||||
* Threading: a `daedalus_ctx *` owns Vulkan + V3D state. A
|
||||
* context is single-threaded; use one per worker thread if you
|
||||
* need parallelism on the QPU side. NEON-side dispatch is
|
||||
* stateless and re-entrant.
|
||||
*
|
||||
* ABI: pre-1.0 — no stability guarantees yet. The function names
|
||||
* and signatures will become ABI-stable at v1.0; until then the
|
||||
* integration layer should rebuild against the headers it links
|
||||
* with.
|
||||
*/
|
||||
#ifndef DAEDALUS_FOURIER_H
|
||||
#define DAEDALUS_FOURIER_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* -------------------------------------------------------------------
 * Substrate selection
 *
 * Names the execution substrate a kernel dispatch runs on. Ordinary
 * callers should leave the choice to the `daedalus_recipe_dispatch_*`
 * entry points, which apply the cycles-1-5 verdict. Passing an
 * explicit substrate is intended for benchmarking, debugging, and
 * future runtime-aware schedulers.
 * ----------------------------------------------------------------- */
typedef enum {
    DAEDALUS_SUBSTRATE_AUTO = 0,    /* defer to the recipe table */
    DAEDALUS_SUBSTRATE_CPU  = 1,    /* always ARM NEON */
    DAEDALUS_SUBSTRATE_QPU  = 2,    /* always V3D compute */
} daedalus_substrate;
|
||||
|
||||
/* -------------------------------------------------------------------
 * Context lifecycle
 *
 * `daedalus_ctx` is an opaque handle; its layout is private to the
 * implementation.
 * ----------------------------------------------------------------- */
typedef struct daedalus_ctx daedalus_ctx;

/*
 * Allocate a context and attempt to initialise the V3D Vulkan device.
 * A failed V3D init is not an error: the context is still returned and
 * works as NEON-only. NULL is returned only when allocation fails.
 */
daedalus_ctx *daedalus_ctx_create(void);

/*
 * Allocate a context without attempting V3D init, for callers that
 * already know they want CPU only and want creation to be fast.
 */
daedalus_ctx *daedalus_ctx_create_no_qpu(void);

/*
 * Report whether QPU dispatch is available on `ctx`: 1 if so, 0 if the
 * context is NEON-only. Lets the integration layer skip QPU dispatch
 * attempts up front.
 */
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx);

/* Release a context obtained from either create function. */
void daedalus_ctx_destroy(daedalus_ctx *ctx);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* VP9 IDCT 8x8 add — cycle 1 (QPU by recipe)
|
||||
*
|
||||
* For each of n_blocks: take 64 int16 coefficients, perform 8x8
|
||||
* inverse DCT, add to dst[r,c] = clamp(dst[r,c] + ((q + 16)>>5)).
|
||||
*
|
||||
* `meta` is an array of (dst_byte_offset, block_x, block_y) for
|
||||
* each block, where dst_byte_offset is byte offset into dst.
|
||||
*
|
||||
* Returns 0 on success, negative errno-like on failure.
|
||||
* ----------------------------------------------------------------- */
|
||||
typedef struct {
|
||||
uint32_t dst_off; /* byte offset into dst */
|
||||
uint32_t block_x; /* used only by QPU path for placement */
|
||||
uint32_t block_y;
|
||||
uint32_t _pad;
|
||||
} daedalus_idct8_meta;
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_idct8(
|
||||
daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_idct8_meta *meta);
|
||||
|
||||
int daedalus_dispatch_vp9_idct8(
|
||||
daedalus_ctx *ctx,
|
||||
daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_idct8_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* VP9 LPF wd=4 / wd=8 — cycles 2 and 4 (QPU by recipe)
|
||||
*
|
||||
* Loop filter at horizontal edge crossing pixel column 4 of an
|
||||
* 8x8 block. Per-edge thresholds (E, I, H).
|
||||
* ----------------------------------------------------------------- */
|
||||
typedef struct {
|
||||
uint32_t dst_off; /* byte offset into dst, at col 4 of edge */
|
||||
int32_t E, I, H;
|
||||
} daedalus_lpf_meta;
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_lpf4(
|
||||
daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_lpf8(
|
||||
daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||
|
||||
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||
|
||||
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* VP9 MC 8-tap horizontal — cycle 3 (CPU by recipe)
|
||||
*
|
||||
* Subpel-fractional 8-tap horizontal filter; mx selects filter
|
||||
* row. CPU path is the high-performance default; QPU path is
|
||||
* available but never recommended by the recipe.
|
||||
* ----------------------------------------------------------------- */
|
||||
typedef struct {
|
||||
uint32_t dst_off;
|
||||
uint32_t src_off; /* raw, no pre-advance — shader handles -3 internally */
|
||||
int32_t mx;
|
||||
uint32_t _pad;
|
||||
} daedalus_mc_meta;
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_mc_8h(
|
||||
daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const uint8_t *src, size_t src_stride,
|
||||
size_t n_blocks, const daedalus_mc_meta *meta);
|
||||
|
||||
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const uint8_t *src, size_t src_stride,
|
||||
size_t n_blocks, const daedalus_mc_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* AV1 CDEF 8x8 luma — cycle 5 (CPU by recipe; QPU opportunistic)
|
||||
*
|
||||
* tmp is an array of n_blocks * 192 uint16, with the padded-buffer
|
||||
* layout that dav1d's NEON expects (stride 16, padding 2-rows-top +
|
||||
* 2-cols-left + 2-cols-right + 2-rows-bottom). Caller supplies
|
||||
* tmp populated with either source pixels (if all edges valid) or
|
||||
* INT16_MIN sentinels at the boundary (if edge filtered out).
|
||||
* ----------------------------------------------------------------- */
|
||||
typedef struct {
|
||||
uint32_t dst_off;
|
||||
uint32_t tmp_off_u16; /* offset to block-origin in tmp[] (= padded_origin + 2*16+2) */
|
||||
int32_t pri_strength; /* 1..7 */
|
||||
int32_t sec_strength; /* 1..4 */
|
||||
int32_t dir; /* 0..7 */
|
||||
int32_t damping; /* 1..6 */
|
||||
} daedalus_cdef_meta;
|
||||
|
||||
int daedalus_recipe_dispatch_cdef_8x8(
|
||||
daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const uint16_t *tmp,
|
||||
size_t n_blocks, const daedalus_cdef_meta *meta);
|
||||
|
||||
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const uint16_t *tmp,
|
||||
size_t n_blocks, const daedalus_cdef_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* Recipe query — what does the API recommend for each kernel?
|
||||
* ----------------------------------------------------------------- */
|
||||
typedef enum {
|
||||
DAEDALUS_KERNEL_VP9_IDCT8 = 1,
|
||||
DAEDALUS_KERNEL_VP9_LPF4_INNER = 2,
|
||||
DAEDALUS_KERNEL_VP9_MC_8H = 3,
|
||||
DAEDALUS_KERNEL_VP9_LPF8_INNER = 4,
|
||||
DAEDALUS_KERNEL_AV1_CDEF_8X8 = 5,
|
||||
} daedalus_kernel;
|
||||
|
||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif /* DAEDALUS_FOURIER_H */
|
||||
@@ -0,0 +1,517 @@
|
||||
/*
|
||||
* daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired.
|
||||
*
|
||||
* Wraps cycles 1-5 kernels behind the public C API in
|
||||
* include/daedalus.h. Recipe dispatch routes per-kernel to the
|
||||
* verdict substrate from each cycle's Phase 7 doc.
|
||||
*
|
||||
* QPU dispatch wiring status:
|
||||
* IDCT 8x8: wired (cycle 1 v4 shader).
|
||||
* Others: stubbed (return -1); CPU path always works.
|
||||
*
|
||||
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
|
||||
* dav1d BSD-2-Clause NEON snapshots.
|
||||
*/
|
||||
#include "../include/daedalus.h"
|
||||
#include "v3d_runner.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* -------------------- Context -------------------- */
|
||||
|
||||
struct daedalus_ctx {
|
||||
int has_qpu;
|
||||
v3d_runner *runner; /* NULL when has_qpu == 0 */
|
||||
|
||||
/* Per-kernel pipelines, lazy-created on first QPU dispatch. */
|
||||
int idct8_pipe_ready;
|
||||
v3d_pipeline idct8_pipe;
|
||||
int lpf4_pipe_ready;
|
||||
v3d_pipeline lpf4_pipe;
|
||||
int lpf8_pipe_ready;
|
||||
v3d_pipeline lpf8_pipe;
|
||||
};
|
||||
|
||||
daedalus_ctx *daedalus_ctx_create(void)
|
||||
{
|
||||
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||||
if (!ctx) return NULL;
|
||||
ctx->runner = v3d_runner_create();
|
||||
ctx->has_qpu = (ctx->runner != NULL);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
|
||||
{
|
||||
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||||
if (!ctx) return NULL;
|
||||
ctx->has_qpu = 0;
|
||||
ctx->runner = NULL;
|
||||
return ctx;
|
||||
}
|
||||
|
||||
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
|
||||
{
|
||||
return ctx ? ctx->has_qpu : 0;
|
||||
}
|
||||
|
||||
/*
 * Tear down a context. NULL is accepted and ignored.
 *
 * When a runner exists, every pipeline whose *_pipe_ready flag is set
 * is destroyed first, then the runner itself, and finally the context
 * memory is freed.
 */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
    if (ctx == NULL)
        return;

    v3d_runner *qpu = ctx->runner;
    if (qpu != NULL) {
        /* Pipelines go before the runner that is passed to their destructor. */
        if (ctx->idct8_pipe_ready)
            v3d_runner_destroy_pipeline(qpu, &ctx->idct8_pipe);
        if (ctx->lpf4_pipe_ready)
            v3d_runner_destroy_pipeline(qpu, &ctx->lpf4_pipe);
        if (ctx->lpf8_pipe_ready)
            v3d_runner_destroy_pipeline(qpu, &ctx->lpf8_pipe);
        v3d_runner_destroy(qpu);
    }

    free(ctx);
}
|
||||
|
||||
/* -------------------- Recipe query -------------------- */
|
||||
|
||||
/*
 * Map a kernel to its recipe substrate.
 *
 * IDCT8, LPF4_INNER and LPF8_INNER go to the QPU. MC_8H, CDEF_8X8 and
 * any value outside the enum go to the CPU.
 */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    const int wants_qpu = k == DAEDALUS_KERNEL_VP9_IDCT8
                       || k == DAEDALUS_KERNEL_VP9_LPF4_INNER
                       || k == DAEDALUS_KERNEL_VP9_LPF8_INNER;

    if (wants_qpu)
        return DAEDALUS_SUBSTRATE_QPU;
    return DAEDALUS_SUBSTRATE_CPU;
}
|
||||
|
||||
/* -------------------- NEON externs (per cycle bench links) ----- */
|
||||
|
||||
/*
 * Prototypes for the vendored NEON kernels the CPU dispatch paths call.
 * They are defined in other translation units (FFmpeg's ff_vp9_* and
 * dav1d's dav1d_cdef_*). Function declarations have external linkage
 * by default, so no `extern` keyword is needed.
 */
void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
                                   int16_t *block, int eob);

void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
                                   int E, int I, int H);

void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
                                   int E, int I, int H);

void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int h, int mx, int my);

void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                  const uint16_t *tmp,
                                  int pri_strength, int sec_strength,
                                  int dir, int damping, int h,
                                  size_t edges);
|
||||
|
||||
/* -------------------- CPU dispatch implementations -------------- */
|
||||
|
||||
static int dispatch_idct8_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_idct8_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
int16_t scratch[64];
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
|
||||
ff_vp9_idct_idct_8x8_add_neon(dst + meta[i].dst_off,
|
||||
(ptrdiff_t) dst_stride,
|
||||
scratch, 64);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * CPU path for the VP9 horizontal loop filter: one NEON call per edge.
 *
 * wd_8 selects ff_vp9_loop_filter_h_8_8_neon (non-zero) or
 * ff_vp9_loop_filter_h_4_8_neon (zero).  Edge i is filtered at
 * dst + meta[i].dst_off with that edge's E/I/H values.  Always returns 0.
 */
static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    /* Pick the filter once; both variants share a signature. */
    void (*filter)(uint8_t *, ptrdiff_t, int, int, int) =
        wd_8 ? ff_vp9_loop_filter_h_8_8_neon
             : ff_vp9_loop_filter_h_4_8_neon;

    (void) ctx;

    for (size_t e = 0; e != n_edges; ++e) {
        filter(dst + meta[e].dst_off, (ptrdiff_t) dst_stride,
               meta[e].E, meta[e].I, meta[e].H);
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for the VP9 8-tap horizontal "regular" MC: one NEON call per
 * block, with height 8, horizontal phase meta[i].mx and vertical phase 0.
 *
 * The source pointer is advanced by 3 bytes past meta[i].src_off
 * (presumably src_off addresses the left edge of the padded row and the
 * 8-tap filter reads 3 pixels to the left -- confirm against the caller).
 * Always returns 0.
 */
static int dispatch_mc_8h_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    (void) ctx;

    for (size_t b = 0; b != n_blocks; ++b) {
        uint8_t       *out = dst + meta[b].dst_off;
        const uint8_t *in  = src + meta[b].src_off + 3;

        ff_vp9_put_regular8_h_neon(out, (ptrdiff_t) dst_stride,
                                   in,  (ptrdiff_t) src_stride,
                                   8, meta[b].mx, 0);
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for AV1 CDEF 8x8: one dav1d NEON call per block.
 *
 * Block i writes at dst + meta[i].dst_off and reads its padded input at
 * tmp + meta[i].tmp_off_u16 (an offset in uint16_t elements).  Height is
 * fixed at 8 and the edges argument is passed as 0.  Always returns 0.
 */
static int dispatch_cdef_cpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    (void) ctx;

    for (size_t b = 0; b != n_blocks; ++b) {
        const daedalus_cdef_meta *bm = &meta[b];

        dav1d_cdef_filter8_8bpc_neon(dst + bm->dst_off,
                                     (ptrdiff_t) dst_stride,
                                     tmp + bm->tmp_off_u16,
                                     bm->pri_strength,
                                     bm->sec_strength,
                                     bm->dir, bm->damping, 8, 0);
    }
    return 0;
}
|
||||
|
||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||
|
||||
/* Push-constant block for the IDCT8 shader: four 32-bit words, 16 bytes. */
typedef struct {
    uint32_t n_blocks;        /* number of 8x8 blocks in the dispatch     */
    uint32_t blocks_per_row;  /* slot kept in the layout; written as 0    */
    uint32_t dst_stride_u8;   /* destination stride in bytes              */
    uint32_t _pad;            /* padding word                             */
} idct8_pc;
|
||||
|
||||
/*
 * Build the IDCT8 compute pipeline on first use (3 SSBOs, push constants
 * sized for idct8_pc, shader "v3d_idct8.spv") and remember that it exists.
 * Returns 0 when the pipeline is available, -1 if creation fails.
 */
static int ensure_idct8_pipeline(daedalus_ctx *ctx)
{
    if (!ctx->idct8_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner,
                                       "v3d_idct8.spv",
                                       /*n_ssbos=*/3,
                                       /*push_const_size=*/sizeof(idct8_pc),
                                       &ctx->idct8_pipe) != 0)
            return -1;

        ctx->idct8_pipe_ready = 1;
    }
    return 0;
}
|
||||
|
||||
static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_idct8_meta *meta)
|
||||
{
|
||||
if (ensure_idct8_pipeline(ctx) != 0) return -1;
|
||||
|
||||
/* Allocate three SSBOs per call (coeffs, dst, meta). Performance-
|
||||
* tuning (buffer pool) is deferred; correctness first. */
|
||||
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
|
||||
size_t meta_bytes = n_blocks * 2 * sizeof(uint32_t); /* uvec2 per block */
|
||||
/* dst buffer must hold all of dst[0..max_dst_off + 64 + 8*stride].
|
||||
* Cheapest correct answer: alloc the smallest contiguous region
|
||||
* containing every block's footprint. For Phase 8 we assume the
|
||||
* caller's dst surface starts at byte 0 of the buffer and use
|
||||
* the full provided extent. We size by scanning meta. */
|
||||
size_t max_byte_touched = 0;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
|
||||
if (end > max_byte_touched) max_byte_touched = end;
|
||||
}
|
||||
|
||||
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
|
||||
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
||||
* caller's full region (since we'll need to read it back). */
|
||||
memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
|
||||
memcpy(buf_dst.mapped, dst, max_byte_touched);
|
||||
uint32_t *m = buf_meta.mapped;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
m[2*i + 0] = meta[i].block_x;
|
||||
m[2*i + 1] = meta[i].block_y;
|
||||
}
|
||||
|
||||
/* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
|
||||
v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
|
||||
if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* WG geometry: 32 blocks per WG. */
|
||||
uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
|
||||
idct8_pc pc = {
|
||||
.n_blocks = (uint32_t) n_blocks,
|
||||
.blocks_per_row = 0, /* unused by v4 shader (meta drives placement) */
|
||||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||
._pad = 0,
|
||||
};
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->idct8_pipe.pipeline);
|
||||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->idct8_pipe.layout, 0, 1,
|
||||
&ctx->idct8_pipe.desc_set, 0, NULL);
|
||||
vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||||
vkEndCommandBuffer(cb);
|
||||
|
||||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||||
|
||||
/* Read-back dst. */
|
||||
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
|
||||
*
|
||||
* NOTE: the two LPF shaders disagree on push-constant slot order.
|
||||
* v3d_lpf_h_4_8.comp: (n_edges, dst_stride_u8, _pad, _pad)
|
||||
* v3d_lpf_h_8_8.comp: (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
|
||||
*
|
||||
* Same total size (16 bytes), different slot 2. Keep separate
|
||||
* struct definitions to avoid silent corruption — Phase 8 caught
|
||||
* this empirically when test_api_lpf wd=8 reported 95.6 % match.
|
||||
*/
|
||||
/* Push constants for v3d_lpf_h_4_8.comp: stride lives in word 1. */
typedef struct {
    uint32_t n_edges;         /* word 0: number of edges in the dispatch */
    uint32_t dst_stride_u8;   /* word 1: destination stride in bytes     */
    uint32_t _pad0;           /* word 2: padding                         */
    uint32_t _pad1;           /* word 3: padding                         */
} lpf4_pc;
|
||||
|
||||
/* Push constants for v3d_lpf_h_8_8.comp: stride lives in word 2. */
typedef struct {
    uint32_t n_edges;         /* word 0: number of edges in the dispatch  */
    uint32_t blocks_per_row;  /* word 1: unused by shader, must exist     */
    uint32_t dst_stride_u8;   /* word 2: destination stride in bytes      */
    uint32_t _pad;            /* word 3: padding                          */
} lpf8_pc;
|
||||
|
||||
/*
 * Build an LPF compute pipeline on first use.
 *
 * flag/pipe point at the context's ready flag and pipeline slot for the
 * chosen width; spv names the shader file.  The pipeline has 2 SSBOs and a
 * push-constant range sized for lpf8_pc (wd_8 != 0) or lpf4_pc.
 * Returns 0 when the pipeline is available, -1 if creation fails.
 */
static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
                               int *flag, v3d_pipeline *pipe,
                               const char *spv)
{
    if (!*flag) {
        size_t pc_bytes = wd_8 ? sizeof(lpf8_pc) : sizeof(lpf4_pc);

        if (v3d_runner_create_pipeline(ctx->runner, spv,
                                       /*n_ssbos=*/2,
                                       /*push_const_size=*/(uint32_t) pc_bytes,
                                       pipe) != 0)
            return -1;

        *flag = 1;
    }
    return 0;
}
|
||||
|
||||
/*
 * QPU path for the VP9 horizontal loop filter (wd 4 when wd_8 == 0,
 * wd 8 otherwise).
 *
 * The smallest byte window [lo, hi) of dst covering every edge is mirrored
 * into a mapped buffer, per-edge metadata is rebased onto that window, the
 * shader is dispatched with one workgroup per 32 edges, and the window is
 * copied back into dst.
 *
 * Returns 0 on success (including an empty batch) and -1 on any failure.
 * dst is only written after the submission has completed successfully.
 */
static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    int rc = -1;

    /* Empty batch: nothing to filter, and a zero-sized buffer request must
     * never reach the runner.  Matches the CPU path, which returns 0. */
    if (n_edges == 0) return 0;

    /* Both values travel to the shader as 32-bit push constants; refuse
     * inputs that would be silently truncated. */
    if ((size_t)(uint32_t) n_edges != n_edges ||
        (size_t)(uint32_t) dst_stride != dst_stride)
        return -1;

    int          *flag = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
    v3d_pipeline *p    = wd_8 ? &ctx->lpf8_pipe       : &ctx->lpf4_pipe;
    const char   *spv  = wd_8 ? "v3d_lpf_h_8_8.spv"   : "v3d_lpf_h_4_8.spv";
    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0) return -1;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);  /* uvec4 per edge */

    /* Determine the smallest dst window.  Each edge touches bytes
     * [dst_off - 4 .. dst_off + 3] on 8 rows spaced dst_stride apart. */
    size_t lo = (size_t) -1, hi = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t base = meta[i].dst_off;
        if (base >= 4) {
            size_t this_lo = base - 4;
            if (this_lo < lo) lo = this_lo;
        } else {
            /* Edge closer than 4 bytes to the start: the window cannot
             * begin before byte 0, so pin it there. */
            lo = 0;
        }
        size_t this_hi = base + (size_t)(8 - 1) * dst_stride + 4;
        if (this_hi > hi) hi = this_hi;
    }
    size_t dst_window_size = hi - lo;

    v3d_buffer buf_meta = {0}, buf_dst = {0};
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
        v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
        return -1;
    }

    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i + 0] = (uint32_t)(meta[i].dst_off - lo);  /* window-relative */
        m[4*i + 1] = (uint32_t) meta[i].E;
        m[4*i + 2] = (uint32_t) meta[i].I;
        m[4*i + 3] = (uint32_t) meta[i].H;
    }

    v3d_buffer binds[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto cleanup;

    uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto cleanup;

    VkCommandBufferBeginInfo cbbi = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO
    };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that never entered or never left the recording state. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto cleanup;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            p->layout, 0, 1, &p->desc_set, 0, NULL);
    /* The two shaders place dst_stride_u8 in different push-constant
     * words, so each width fills its own struct. */
    if (wd_8) {
        lpf8_pc pc = { .n_edges        = (uint32_t) n_edges,
                       .blocks_per_row = 0,
                       .dst_stride_u8  = (uint32_t) dst_stride,
                       ._pad           = 0 };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    } else {
        lpf4_pc pc = { .n_edges       = (uint32_t) n_edges,
                       .dst_stride_u8 = (uint32_t) dst_stride };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    }
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto cleanup;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto cleanup;

    memcpy(dst + lo, buf_dst.mapped, dst_window_size);
    rc = 0;

cleanup:
    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
    return rc;
}
|
||||
|
||||
/* -------------------- Public dispatch entry points -------------- */
|
||||
|
||||
/*
 * Whole-function-body helper for kernels that only have a CPU path.
 *
 * Expects `ctx` and `sub` to be in scope at the expansion site.  AUTO is
 * resolved through the recipe, a QPU request on a context without a QPU is
 * downgraded to CPU, CPU calls _cpu_fn(ctx, ...), and anything else
 * returns -1 because no QPU path is wired for the kernel yet.
 */
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                \
    daedalus_substrate eff =                                                 \
        (sub == DAEDALUS_SUBSTRATE_AUTO)                                     \
            ? daedalus_recipe_substrate_for(_kernel)                         \
            : sub;                                                           \
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))         \
        eff = DAEDALUS_SUBSTRATE_CPU;                                        \
    if (eff != DAEDALUS_SUBSTRATE_CPU)                                       \
        return -1; /* QPU path not yet wired for this kernel */              \
    return _cpu_fn(ctx, __VA_ARGS__)
|
||||
|
||||
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_idct8_meta *meta)
|
||||
{
|
||||
daedalus_substrate eff = sub;
|
||||
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||||
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8);
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||||
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||
return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
||||
return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
/*
 * Dispatch a batch of VP9 wd=4 horizontal loop-filter edges.
 *
 * AUTO is resolved through the recipe; a QPU request on a context without
 * a QPU falls back to the CPU.  Returns the chosen path's result.
 */
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate eff =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER)
            : sub;

    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;

    return (eff == DAEDALUS_SUBSTRATE_CPU)
        ? dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta)
        : dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
/*
 * Dispatch a batch of VP9 wd=8 horizontal loop-filter edges.
 *
 * AUTO is resolved through the recipe; a QPU request on a context without
 * a QPU falls back to the CPU.  Returns the chosen path's result.
 */
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate eff =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER)
            : sub;

    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;

    return (eff == DAEDALUS_SUBSTRATE_CPU)
        ? dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta)
        : dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
/*
 * Dispatch a batch of VP9 8-tap horizontal MC blocks.
 *
 * Only a CPU path exists: the body is ROUTE_CPU_ONLY, which returns -1 if
 * the resolved substrate is not the CPU.
 */
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
                                uint8_t *dst, size_t dst_stride,
                                const uint8_t *src, size_t src_stride,
                                size_t n_blocks, const daedalus_mc_meta *meta)
{
    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H,
                   dispatch_mc_8h_cpu,
                   dst, dst_stride,
                   src, src_stride,
                   n_blocks, meta);
}
|
||||
|
||||
/*
 * Dispatch a batch of AV1 CDEF 8x8 blocks.
 *
 * Only a CPU path exists: the body is ROUTE_CPU_ONLY, which returns -1 if
 * the resolved substrate is not the CPU.
 */
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               const uint16_t *tmp,
                               size_t n_blocks, const daedalus_cdef_meta *meta)
{
    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8,
                   dispatch_cdef_cpu,
                   dst, dst_stride,
                   tmp,
                   n_blocks, meta);
}
|
||||
|
||||
/* -------------------- Recipe convenience wrappers --------------- */
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
const int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_idct8_meta *meta)
|
||||
{
|
||||
return daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, dst_stride, coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
/* Recipe shorthand: daedalus_dispatch_vp9_lpf4 with substrate AUTO. */
int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf4(ctx, pick, dst, dst_stride,
                                      n_edges, meta);
}
|
||||
|
||||
/* Recipe shorthand: daedalus_dispatch_vp9_lpf8 with substrate AUTO. */
int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf8(ctx, pick, dst, dst_stride,
                                      n_edges, meta);
}
|
||||
|
||||
/* Recipe shorthand: daedalus_dispatch_vp9_mc_8h with substrate AUTO. */
int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       const uint8_t *src, size_t src_stride,
                                       size_t n_blocks, const daedalus_mc_meta *meta)
{
    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_mc_8h(ctx, pick, dst, dst_stride,
                                       src, src_stride, n_blocks, meta);
}
|
||||
|
||||
/* Recipe shorthand: daedalus_dispatch_cdef_8x8 with substrate AUTO. */
int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      const uint16_t *tmp,
                                      size_t n_blocks, const daedalus_cdef_meta *meta)
{
    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_cdef_8x8(ctx, pick, dst, dst_stride,
                                      tmp, n_blocks, meta);
}
|
||||
@@ -0,0 +1,178 @@
|
||||
// daedalus-fourier cycle 5 — AV1 CDEF primary+secondary 8x8 luma filter,
|
||||
// V3D 7.1 via Mesa v3dv compute.
|
||||
//
|
||||
// Per cycle-5 Phase 4 plan (post Phase 5 review):
|
||||
// - 256 invocations / WG; 4 blocks/WG (64 pixels each, 1 pixel/lane)
|
||||
// - NO barrier — each pixel independent
|
||||
// - uint16_t tmp SSBO via storageBuffer16BitAccess
|
||||
// - uint8_t dst SSBO via storageBuffer8BitAccess
|
||||
// - directions table as `const ivec2[14]` (Phase 5 RED-3 fix)
|
||||
// - meta layout: m.x=dst_off, m.y=params (pri|sec<<8|damping<<16),
|
||||
// m.z=tmp_off_u16, m.w=dir (Phase 5 RED-1 fix)
|
||||
// - sec_shift clamped to ≥0 to mirror NEON uqsub (Phase 5 RED-2 fix)
|
||||
//
|
||||
// License: BSD-2-Clause. Algorithm transcribed from tests/cdef_ref.c
|
||||
// which mirrors dav1d 1.4.3 NEON (src/arm/64/cdef_tmpl.S).
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Meta {
|
||||
uvec4 meta[]; // per-block: (dst_off, params, tmp_off_u16, dir)
|
||||
} u_meta;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[];
|
||||
} u_dst;
|
||||
|
||||
layout(binding = 2) readonly buffer Tmp {
|
||||
uint16_t tmp[]; // padded 12×16 per block; meta.z = block-origin u16 offset
|
||||
} u_tmp;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint tmp_stride_u16;
|
||||
uint dst_stride_u8;
|
||||
uint _pad;
|
||||
} pc;
|
||||
|
||||
// 14-entry stride-16 directions table (8 dirs + 6 wrap copies for
|
||||
// (dir+2)%8 / (dir+6)%8 safe lookup). Values from cdef_ref.c.
|
||||
const ivec2 dirs8[14] = ivec2[](
|
||||
/* 0 */ ivec2(-1*16 + 1, -2*16 + 2),
|
||||
/* 1 */ ivec2( 0*16 + 1, -1*16 + 2),
|
||||
/* 2 */ ivec2( 0*16 + 1, 0*16 + 2),
|
||||
/* 3 */ ivec2( 0*16 + 1, 1*16 + 2),
|
||||
/* 4 */ ivec2( 1*16 + 1, 2*16 + 2),
|
||||
/* 5 */ ivec2( 1*16 + 0, 2*16 + 1),
|
||||
/* 6 */ ivec2( 1*16 + 0, 2*16 + 0),
|
||||
/* 7 */ ivec2( 1*16 + 0, 2*16 - 1),
|
||||
/* 8 = dir 0 */ ivec2(-1*16 + 1, -2*16 + 2),
|
||||
/* 9 = dir 1 */ ivec2( 0*16 + 1, -1*16 + 2),
|
||||
/* 10 = dir 2 */ ivec2( 0*16 + 1, 0*16 + 2),
|
||||
/* 11 = dir 3 */ ivec2( 0*16 + 1, 1*16 + 2),
|
||||
/* 12 = dir 4 */ ivec2( 1*16 + 1, 2*16 + 2),
|
||||
/* 13 = dir 5 */ ivec2( 1*16 + 0, 2*16 + 1)
|
||||
);
|
||||
|
||||
// Index of the highest set bit of x, treating x as unsigned
// (the C equivalent is 31 - __builtin_clz(x)).  Intended for x >= 1;
// findMSB yields -1 when x is 0.
int ulog2_pos(int x) {
    uint bits = uint(x);
    return findMSB(bits);
}
|
||||
|
||||
// CDEF constrain(): limit |diff| to max(0, threshold - (|diff| >> shift))
// and give the result the sign of diff.
int constrain(int diff, int threshold, int shift)
{
    int mag   = abs(diff);
    int limit = threshold - (mag >> shift);
    if (limit < 0)   limit = 0;
    if (mag > limit) mag = limit;
    return (diff < 0) ? -mag : mag;
}
|
||||
|
||||
// Reinterpret the low 16 bits of v as a signed 16-bit value.  Anything
// below 0x8000 (every real pixel) is unchanged; 0x8000 becomes -32768.
int sext16(int v)
{
    return bitfieldExtract(v, 0, 16);
}

// One invocation filters one pixel: 256 lanes per workgroup cover four 8x8
// blocks.  Each lane reads its centre pixel and 12 taps from tmp, sums the
// constrained primary/secondary differences, rounds, clamps the result to
// the min/max of the pixels it read, and stores one byte to dst.
//
// NOTE(review): the dirs8 offsets are built for a tmp row stride of 16,
// while the row offset below uses pc.tmp_stride_u16 — the two only agree
// when the push constant is 16; confirm with the host code.
void main()
{
    uint lane      = gl_LocalInvocationID.x;               // 0..255
    uint block_idx = gl_WorkGroupID.x * 4u + (lane >> 6);  // 4 blocks / WG
    if (block_idx >= pc.n_blocks) return;                  // no barrier — safe

    uint px_idx = lane & 63u;                              // 0..63
    uint row    = px_idx >> 3;                             // 0..7
    uint col    = px_idx & 7u;                             // 0..7

    uvec4 m       = u_meta.meta[block_idx];
    uint  dst_off = m.x + row * pc.dst_stride_u8  + col;
    uint  tmp_off = m.z + row * pc.tmp_stride_u16 + col;
    int   pri     = int(m.y & 0xffu);
    int   sec     = int((m.y >> 8) & 0xffu);
    int   damping = int((m.y >> 16) & 0xffu);
    int   dir     = int(m.w & 7u);

    int px  = int(u_tmp.tmp[tmp_off]);
    int sum = 0;
    int mn  = px;
    int mx  = px;

    int pri_shift = max(0, damping - ulog2_pos(pri));
    int sec_shift = max(0, damping - ulog2_pos(sec));      // RED-2 fix

    int pri_tap0 = 4 - (pri & 1);
    int pri_tap1 = (pri_tap0 & 3) | 2;

    ivec2 d_pri  = dirs8[dir];
    ivec2 d_sec1 = dirs8[(dir + 2) & 7];
    ivec2 d_sec2 = dirs8[(dir + 6) & 7];                   // (dir - 2) % 8

    for (int k = 0; k < 2; k++) {
        bool first   = (k == 0);
        int  o1      = first ? d_pri.x  : d_pri.y;
        int  o2      = first ? d_sec1.x : d_sec1.y;
        int  o3      = first ? d_sec2.x : d_sec2.y;
        int  pri_tap = first ? pri_tap0 : pri_tap1;
        int  sec_tap = 2 - k;                              // 2 then 1

        int p0 = int(u_tmp.tmp[uint(int(tmp_off) + o1)]);
        int p1 = int(u_tmp.tmp[uint(int(tmp_off) - o1)]);
        int s0 = int(u_tmp.tmp[uint(int(tmp_off) + o2)]);
        int s1 = int(u_tmp.tmp[uint(int(tmp_off) - o2)]);
        int s2 = int(u_tmp.tmp[uint(int(tmp_off) + o3)]);
        int s3 = int(u_tmp.tmp[uint(int(tmp_off) - o3)]);

        sum += pri_tap * constrain(p0 - px, pri, pri_shift);
        sum += pri_tap * constrain(p1 - px, pri, pri_shift);
        sum += sec_tap * constrain(s0 - px, sec, sec_shift);
        sum += sec_tap * constrain(s1 - px, sec, sec_shift);
        sum += sec_tap * constrain(s2 - px, sec, sec_shift);
        sum += sec_tap * constrain(s3 - px, sec, sec_shift);

        // min/max bookkeeping — NEON umin / smax semantics.
        // Unsigned min: the 0x8000 sentinel (32768) exceeds any pixel, so
        // it never lowers the minimum.
        mn = int(min(uint(mn), uint(p0)));
        mn = int(min(uint(mn), uint(p1)));
        mn = int(min(uint(mn), uint(s0)));
        mn = int(min(uint(mn), uint(s1)));
        mn = int(min(uint(mn), uint(s2)));
        mn = int(min(uint(mn), uint(s3)));

        // Signed max: taps are zero-extended from uint16, so the sentinel
        // must be sign-extended to -32768 here; otherwise it would raise
        // the maximum to 32768 and disable the upper clamp.
        mx = max(mx, sext16(p0)); mx = max(mx, sext16(p1));
        mx = max(mx, sext16(s0)); mx = max(mx, sext16(s1));
        mx = max(mx, sext16(s2)); mx = max(mx, sext16(s3));
    }

    int adj   = (sum - int(sum < 0) + 8) >> 4;
    int outpx = clamp(px + adj, mn, mx);
    u_dst.dst[dst_off] = uint8_t(outpx);
}
|
||||
@@ -0,0 +1,571 @@
|
||||
/*
|
||||
* Issue 003 — Mixed-kernel M4 bench.
|
||||
*
|
||||
* Runs N NEON pthread workers (pinned 0..N-1) doing CPU kernel A,
|
||||
* plus one QPU worker doing kernel B concurrently. Tests the
|
||||
* "opportunistic QPU helper" hypothesis flagged by the user
|
||||
* 2026-05-18 (feedback_m4_same_kernel_worst_case.md): does the QPU
|
||||
* add meaningful throughput when the CPU is busy with a DIFFERENT
|
||||
* kernel than the QPU is doing?
|
||||
*
|
||||
* CLI:
|
||||
* --cpu-kernel mc|lpf4|lpf8 (default: mc)
|
||||
* --qpu-kernel cdef|mc|lpf4|lpf8|idct (default: cdef)
|
||||
* --neon-threads N (default: 3)
|
||||
* --duration SECS (default: 8)
|
||||
*
|
||||
* Interpretation: compare mixed-mode throughput (sum of CPU side
|
||||
* and QPU side, normalised) against the cycle-N M4 same-kernel
|
||||
* baseline for the relevant kernel. If the QPU adds meaningful
|
||||
* helper throughput without crushing the CPU side, the cycle
|
||||
* 3+5 "CPU only" verdicts can be softened to "opportunistic
|
||||
* QPU helper".
|
||||
*
|
||||
* License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot (MC, LPF)
|
||||
* and dav1d BSD-2-Clause snapshot (CDEF).
|
||||
*/
|
||||
#define _GNU_SOURCE
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stddef.h>
|
||||
#include <time.h>
|
||||
#include <getopt.h>
|
||||
#include <pthread.h>
|
||||
#include <sched.h>
|
||||
#include <assert.h>
|
||||
#include <vulkan/vulkan.h>
|
||||
|
||||
#include "v3d_runner.h"
|
||||
|
||||
/* External NEON refs (vendored FFmpeg + dav1d). */
|
||||
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my);
|
||||
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
int E, int I, int H);
|
||||
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
int E, int I, int H);
|
||||
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
int16_t *block, int eob);
|
||||
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const uint16_t *tmp, int pri_strength, int sec_strength,
|
||||
int dir, int damping, int h, size_t edges);
|
||||
|
||||
/* --- Common helpers --- */
|
||||
|
||||
static volatile int g_stop = 0;
|
||||
static pthread_barrier_t g_start;
|
||||
|
||||
/* Advance the 64-bit xorshift state (shifts 13, 7, 17) in place and return
 * the new state. */
static inline uint64_t xs_step(uint64_t *s) {
    uint64_t v = *s;

    v ^= v << 13;
    v ^= v >> 7;
    v ^= v << 17;
    *s = v;
    return v;
}
|
||||
/* Return s as the initial xorshift state, substituting a fixed non-zero
 * constant when s is 0 (xs_step maps 0 to 0). */
static uint64_t xs_init(uint64_t s) {
    if (s == 0)
        return 0xa57edbeef5717ULL;
    return s;
}
|
||||
/* Current CLOCK_MONOTONIC_RAW reading, in seconds, as a double. */
static double now_s(void) {
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}
|
||||
|
||||
/* --- Kernel selectors --- */
|
||||
|
||||
/* Kernels the bench can run; enumerator order is relied on by the name
 * table in kernel_name(). */
enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT };

/* CLI spelling of a kernel, or "?" for a value outside the enum. */
static const char *kernel_name(enum kernel k) {
    static const char *const names[] = { "mc", "lpf4", "lpf8", "cdef", "idct" };

    /* The unsigned cast also rejects negative values. */
    if ((unsigned) k < sizeof(names) / sizeof(names[0]))
        return names[(unsigned) k];
    return "?";
}

/* Throughput unit label: loop filters count edges, everything else counts
 * blocks. */
static const char *kernel_unit(enum kernel k) {
    switch (k) {
    case K_LPF4:
    case K_LPF8:
        return "Medge/s";
    default:
        return "Mblock/s";
    }
}
|
||||
|
||||
/* --- NEON worker (per-kernel inline; pre-generate inputs, hot-loop) --- */
|
||||
|
||||
#define NEON_BATCH 8192
|
||||
|
||||
typedef struct {
|
||||
int worker_id, affinity_core;
|
||||
enum kernel kernel;
|
||||
uint64_t units_done;
|
||||
double elapsed_s;
|
||||
} neon_args;
|
||||
|
||||
static void neon_run_mc(uint64_t *seed, uint64_t *out_done) {
|
||||
/* MC: SRC_BYTES=128 (8x16) per block; DST_BYTES=64. */
|
||||
uint8_t *src = malloc((size_t) NEON_BATCH * 128);
|
||||
uint8_t *dst = malloc((size_t) NEON_BATCH * 64);
|
||||
int *mx = malloc(NEON_BATCH * sizeof(int));
|
||||
for (int i = 0; i < NEON_BATCH; i++) {
|
||||
for (int j = 0; j < 128; j++) src[i*128 + j] = (uint8_t)(xs_step(seed) & 0xff);
|
||||
mx[i] = (int)(xs_step(seed) & 15);
|
||||
}
|
||||
while (!g_stop) {
|
||||
for (int i = 0; i < NEON_BATCH; i++)
|
||||
ff_vp9_put_regular8_h_neon(dst + i*64, 8,
|
||||
src + i*128 + 3, 16, 8, mx[i], 0);
|
||||
*out_done += NEON_BATCH;
|
||||
}
|
||||
free(src); free(dst); free(mx);
|
||||
}
|
||||
|
||||
static void neon_run_lpf(uint64_t *seed, uint64_t *out_done, int wd_8) {
|
||||
uint8_t *master = malloc((size_t) NEON_BATCH * 64);
|
||||
uint8_t *work = malloc((size_t) NEON_BATCH * 64);
|
||||
int *Es = malloc(NEON_BATCH*sizeof(int)), *Is = malloc(NEON_BATCH*sizeof(int)), *Hs = malloc(NEON_BATCH*sizeof(int));
|
||||
for (int i = 0; i < NEON_BATCH; i++) {
|
||||
for (int j = 0; j < 64; j++) master[i*64+j] = (uint8_t)(xs_step(seed) & 0xff);
|
||||
Es[i] = (int)(xs_step(seed) % 81);
|
||||
Is[i] = (int)(xs_step(seed) % 41);
|
||||
Hs[i] = (int)(xs_step(seed) % 11);
|
||||
}
|
||||
while (!g_stop) {
|
||||
memcpy(work, master, (size_t) NEON_BATCH * 64);
|
||||
for (int i = 0; i < NEON_BATCH; i++) {
|
||||
if (wd_8) ff_vp9_loop_filter_h_8_8_neon(work + i*64 + 4, 8, Es[i], Is[i], Hs[i]);
|
||||
else ff_vp9_loop_filter_h_4_8_neon(work + i*64 + 4, 8, Es[i], Is[i], Hs[i]);
|
||||
}
|
||||
*out_done += NEON_BATCH;
|
||||
}
|
||||
free(master); free(work); free(Es); free(Is); free(Hs);
|
||||
}
|
||||
|
||||
static void neon_run_cdef(uint64_t *seed, uint64_t *out_done) {
|
||||
int n = NEON_BATCH;
|
||||
uint16_t *tmps = malloc((size_t) n * 192 * sizeof(uint16_t));
|
||||
uint8_t *dsts = malloc((size_t) n * 64);
|
||||
int *pris = malloc(n*sizeof(int)), *secs = malloc(n*sizeof(int));
|
||||
int *dirs = malloc(n*sizeof(int)), *damps = malloc(n*sizeof(int));
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (int j = 0; j < 192; j++) tmps[i*192 + j] = (uint16_t)(xs_step(seed) & 0xff);
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++)
|
||||
dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)];
|
||||
pris[i] = (int)(xs_step(seed) % 7) + 1;
|
||||
secs[i] = (int)(xs_step(seed) % 4) + 1;
|
||||
dirs[i] = (int)(xs_step(seed) & 7);
|
||||
damps[i] = (int)(xs_step(seed) % 6) + 1;
|
||||
}
|
||||
while (!g_stop) {
|
||||
for (int i = 0; i < n; i++)
|
||||
dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8,
|
||||
tmps + i*192 + (2*16+2),
|
||||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||||
*out_done += n;
|
||||
}
|
||||
free(tmps); free(dsts); free(pris); free(secs); free(dirs); free(damps);
|
||||
}
|
||||
|
||||
static void neon_run_idct(uint64_t *seed, uint64_t *out_done) {
|
||||
int16_t *blocks_master = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t));
|
||||
int16_t *blocks_work = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t));
|
||||
uint8_t *dsts = malloc((size_t) NEON_BATCH * 64);
|
||||
int *eobs = malloc(NEON_BATCH * sizeof(int));
|
||||
for (int i = 0; i < NEON_BATCH; i++) {
|
||||
memset(blocks_master + i*64, 0, 64*sizeof(int16_t));
|
||||
int n = 1 + (int)(xs_step(seed) % 16);
|
||||
int eob = 0;
|
||||
for (int j = 0; j < n; j++) {
|
||||
int pos = (int)(xs_step(seed) % 64);
|
||||
int16_t coef = (int16_t)((int)(xs_step(seed) % 8192) - 4096);
|
||||
blocks_master[i*64 + pos] = coef;
|
||||
if (pos + 1 > eob) eob = pos + 1;
|
||||
}
|
||||
eobs[i] = eob ? eob : 1;
|
||||
}
|
||||
while (!g_stop) {
|
||||
memcpy(blocks_work, blocks_master, (size_t) NEON_BATCH * 64 * sizeof(int16_t));
|
||||
for (int i = 0; i < NEON_BATCH; i++)
|
||||
ff_vp9_idct_idct_8x8_add_neon(dsts + i*64, 8, blocks_work + i*64, eobs[i]);
|
||||
*out_done += NEON_BATCH;
|
||||
}
|
||||
free(blocks_master); free(blocks_work); free(dsts); free(eobs);
|
||||
}
|
||||
|
||||
static void *neon_worker(void *p) {
|
||||
neon_args *a = p;
|
||||
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
|
||||
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
|
||||
|
||||
uint64_t seed = xs_init((uint64_t) a->worker_id * 0xc01dbeefULL);
|
||||
|
||||
pthread_barrier_wait(&g_start);
|
||||
double t0 = now_s();
|
||||
uint64_t done = 0;
|
||||
switch (a->kernel) {
|
||||
case K_MC: neon_run_mc(&seed, &done); break;
|
||||
case K_LPF4: neon_run_lpf(&seed, &done, 0); break;
|
||||
case K_LPF8: neon_run_lpf(&seed, &done, 1); break;
|
||||
case K_IDCT: neon_run_idct(&seed, &done); break;
|
||||
case K_CDEF: neon_run_cdef(&seed, &done); break;
|
||||
default: fprintf(stderr, "bad NEON kernel\n"); break;
|
||||
}
|
||||
a->elapsed_s = now_s() - t0;
|
||||
a->units_done = done;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* --- QPU worker (CDEF / MC / LPF4 / LPF8 / IDCT) --- */
|
||||
|
||||
typedef struct {
|
||||
int affinity_core, n_units;
|
||||
enum kernel kernel;
|
||||
uint64_t units_done;
|
||||
double elapsed_s;
|
||||
} qpu_args;
|
||||
|
||||
/* Each QPU kernel has its own push-constant layout. */
|
||||
/* Loop-filter push constants (16 bytes). */
typedef struct {
    uint32_t n;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
} pc_lpf;

/* Motion-compensation push constants (16 bytes). */
typedef struct {
    uint32_t n;
    uint32_t dst_stride_u8;
    uint32_t src_stride_u8;
    uint32_t _pad;
} pc_mc;

/* IDCT push constants (16 bytes). */
typedef struct {
    uint32_t n_blocks;
    uint32_t blocks_per_row;
    uint32_t dst_stride_u8;
    uint32_t _pad;
} pc_idct;

/* CDEF push constants (16 bytes). */
typedef struct {
    uint32_t n_blocks;
    uint32_t tmp_stride_u16;
    uint32_t dst_stride_u8;
    uint32_t _pad;
} pc_cdef;
|
||||
/* CDEF: not yet — QPU CDEF kernel not implemented. CDEF QPU mode uses
|
||||
* dav1d NEON via a single-thread NEON call on the QPU host core instead.
|
||||
* That's a degenerate "QPU helper" but matches the deferred state of
|
||||
* cycle 5. Real QPU CDEF kernel would replace this once cycle 5 closes. */
|
||||
|
||||
static void *qpu_cdef_neon_fallback(void *p)
|
||||
{
|
||||
/* Cycle 5 doesn't have a working QPU CDEF kernel yet (M1 deferred).
|
||||
* For Issue 003's purposes we test "the QPU host core running NEON
|
||||
* CDEF" as a proxy for the QPU contribution. This UNDERSTATES the
|
||||
* QPU helper value (since the QPU itself would parallelise more
|
||||
* than 1 NEON core), but gives a defensible lower bound: if even
|
||||
* NEON-on-the-spare-core helps the mixed throughput, QPU certainly
|
||||
* would.
|
||||
*
|
||||
* TODO: once cycle 5 Phase 6 lands, swap this for the QPU dispatch. */
|
||||
qpu_args *a = p;
|
||||
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
|
||||
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
|
||||
|
||||
int n_blocks = a->n_units;
|
||||
uint64_t seed = 0xcdef00000beefcULL;
|
||||
|
||||
uint16_t *tmps = malloc((size_t) n_blocks * 192 * sizeof(uint16_t));
|
||||
uint8_t *dsts = malloc((size_t) n_blocks * 64);
|
||||
int *pris = malloc(n_blocks*sizeof(int));
|
||||
int *secs = malloc(n_blocks*sizeof(int));
|
||||
int *dirs = malloc(n_blocks*sizeof(int));
|
||||
int *damps = malloc(n_blocks*sizeof(int));
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
for (int j = 0; j < 192; j++) tmps[i*192 + j] = (uint16_t)(xs_step(&seed) & 0xff);
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++)
|
||||
dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)];
|
||||
pris[i] = (int)(xs_step(&seed) % 7) + 1;
|
||||
secs[i] = (int)(xs_step(&seed) % 4) + 1;
|
||||
dirs[i] = (int)(xs_step(&seed) & 7);
|
||||
damps[i] = (int)(xs_step(&seed) % 4) + 3;
|
||||
}
|
||||
|
||||
pthread_barrier_wait(&g_start);
|
||||
double t0 = now_s();
|
||||
uint64_t done = 0;
|
||||
while (!g_stop) {
|
||||
for (int i = 0; i < n_blocks; i++)
|
||||
dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8,
|
||||
tmps + i*192,
|
||||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||||
done += n_blocks;
|
||||
}
|
||||
a->elapsed_s = now_s() - t0;
|
||||
a->units_done = done;
|
||||
|
||||
free(tmps); free(dsts); free(pris); free(secs); free(dirs); free(damps);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* QPU dispatch worker — generic for kernels with working shaders. */
|
||||
|
||||
typedef struct {
|
||||
int affinity_core, n_units;
|
||||
enum kernel kernel;
|
||||
uint64_t units_done;
|
||||
double elapsed_s;
|
||||
} qpu_real_args;
|
||||
|
||||
static void *qpu_real_worker(void *p)
|
||||
{
|
||||
qpu_real_args *a = p;
|
||||
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
|
||||
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
|
||||
|
||||
v3d_runner *r = v3d_runner_create();
|
||||
if (!r) return NULL;
|
||||
|
||||
int n_units = a->n_units;
|
||||
const char *spv = NULL;
|
||||
uint32_t bpw = 32; /* blocks/edges per WG */
|
||||
size_t dst_bytes = 0, meta_bytes = 0, src_bytes = 0;
|
||||
int has_src = 0;
|
||||
size_t per_unit = 0;
|
||||
|
||||
switch (a->kernel) {
|
||||
case K_LPF4:
|
||||
case K_LPF8: {
|
||||
spv = (a->kernel == K_LPF4) ? "v3d_lpf_h_4_8.spv" : "v3d_lpf_h_8_8.spv";
|
||||
per_unit = 64;
|
||||
dst_bytes = (size_t) n_units * per_unit;
|
||||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||||
break;
|
||||
}
|
||||
case K_MC:
|
||||
spv = "v3d_mc_8h.spv";
|
||||
dst_bytes = (size_t) n_units * 64;
|
||||
src_bytes = (size_t) n_units * 128;
|
||||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||||
has_src = 1;
|
||||
break;
|
||||
case K_IDCT:
|
||||
spv = "v3d_idct8.spv";
|
||||
dst_bytes = (size_t) n_units * 64;
|
||||
src_bytes = (size_t) n_units * 64 * sizeof(int16_t);
|
||||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||||
has_src = 1;
|
||||
break;
|
||||
case K_CDEF:
|
||||
spv = "v3d_cdef.spv";
|
||||
bpw = 4;
|
||||
dst_bytes = (size_t) n_units * 64;
|
||||
src_bytes = (size_t) n_units * 192 * sizeof(uint16_t);
|
||||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||||
has_src = 1;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "qpu_real_worker: unsupported kernel\n");
|
||||
v3d_runner_destroy(r);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0};
|
||||
v3d_runner_create_buffer(r, meta_bytes, &buf_meta);
|
||||
v3d_runner_create_buffer(r, dst_bytes, &buf_dst);
|
||||
if (has_src) v3d_runner_create_buffer(r, src_bytes, &buf_src);
|
||||
|
||||
/* Synthesise meta + src + dst content based on kernel. */
|
||||
uint64_t seed = 0xfeed00000beefULL;
|
||||
uint32_t *meta = buf_meta.mapped;
|
||||
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
|
||||
for (int i = 0; i < n_units; i++) {
|
||||
meta[4*i+0] = (uint32_t)((size_t)i * 64 + 4); /* dst_off */
|
||||
meta[4*i+1] = (uint32_t)(xs_step(&seed) % 81); /* E */
|
||||
meta[4*i+2] = (uint32_t)(xs_step(&seed) % 41); /* I */
|
||||
meta[4*i+3] = (uint32_t)(xs_step(&seed) % 11); /* H */
|
||||
}
|
||||
for (size_t i = 0; i < dst_bytes; i++)
|
||||
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||||
} else if (a->kernel == K_MC) {
|
||||
for (int i = 0; i < n_units; i++) {
|
||||
meta[4*i+0] = (uint32_t)((size_t)i * 64); /* dst_off */
|
||||
meta[4*i+1] = (uint32_t)((size_t)i * 128); /* src_off (RAW) */
|
||||
meta[4*i+2] = (uint32_t)(xs_step(&seed) & 15); /* mx */
|
||||
meta[4*i+3] = 0;
|
||||
}
|
||||
for (size_t i = 0; i < src_bytes; i++)
|
||||
((uint8_t *) buf_src.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||||
} else if (a->kernel == K_IDCT) {
|
||||
for (int i = 0; i < n_units; i++) {
|
||||
meta[4*i+0] = (uint32_t)((size_t)i * 64);
|
||||
meta[4*i+1] = (uint32_t)((i * 64) / 64);
|
||||
meta[4*i+2] = 0;
|
||||
meta[4*i+3] = 0;
|
||||
}
|
||||
int16_t *cf = (int16_t *) buf_src.mapped;
|
||||
size_t n_coefs = src_bytes / sizeof(int16_t);
|
||||
for (size_t i = 0; i < n_coefs; i++)
|
||||
cf[i] = (int16_t)((int)(xs_step(&seed) % 8192) - 4096);
|
||||
} else if (a->kernel == K_CDEF) {
|
||||
uint16_t *tmps = (uint16_t *) buf_src.mapped;
|
||||
for (int i = 0; i < n_units; i++) {
|
||||
uint32_t pri = (uint32_t)((xs_step(&seed) % 7) + 1);
|
||||
uint32_t sec = (uint32_t)((xs_step(&seed) % 4) + 1);
|
||||
uint32_t damping = (uint32_t)((xs_step(&seed) % 6) + 1);
|
||||
meta[4*i+0] = (uint32_t)((size_t)i * 64);
|
||||
meta[4*i+1] = pri | (sec << 8) | (damping << 16);
|
||||
meta[4*i+2] = (uint32_t)((size_t)i * 192 + (2*16 + 2));
|
||||
meta[4*i+3] = (uint32_t)(xs_step(&seed) & 7);
|
||||
for (int j = 0; j < 192; j++)
|
||||
tmps[(size_t)i * 192 + j] = (uint16_t)(xs_step(&seed) & 0xff);
|
||||
}
|
||||
for (size_t i = 0; i < dst_bytes; i++)
|
||||
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||||
}
|
||||
|
||||
v3d_pipeline pipe = {0};
|
||||
int n_ssbos = has_src ? 3 : 2;
|
||||
size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) :
|
||||
(a->kernel == K_IDCT) ? sizeof(pc_idct) :
|
||||
(a->kernel == K_CDEF) ? sizeof(pc_cdef) : sizeof(pc_lpf);
|
||||
v3d_runner_create_pipeline(r, spv, n_ssbos, pc_size, &pipe);
|
||||
|
||||
v3d_buffer bind_bufs[3];
|
||||
bind_bufs[0] = buf_meta;
|
||||
bind_bufs[1] = buf_dst;
|
||||
if (has_src) bind_bufs[2] = buf_src;
|
||||
v3d_runner_bind_buffers(r, &pipe, bind_bufs, n_ssbos);
|
||||
|
||||
uint32_t gc = (uint32_t)((n_units + bpw - 1) / bpw);
|
||||
union { pc_lpf lpf; pc_mc mc; pc_idct idct; pc_cdef cdef; } pc = {0};
|
||||
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
|
||||
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 8 };
|
||||
} else if (a->kernel == K_MC) {
|
||||
pc.mc = (pc_mc){ .n = n_units, .dst_stride_u8 = 8, .src_stride_u8 = 16 };
|
||||
} else if (a->kernel == K_IDCT) {
|
||||
pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 };
|
||||
} else if (a->kernel == K_CDEF) {
|
||||
pc.cdef = (pc_cdef){ .n_blocks = n_units, .tmp_stride_u16 = 16, .dst_stride_u8 = 8 };
|
||||
}
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
|
||||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
|
||||
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0, pc_size, &pc);
|
||||
vkCmdDispatch(cb, gc, 1, 1);
|
||||
vkEndCommandBuffer(cb);
|
||||
|
||||
for (int i = 0; i < 5; i++) v3d_runner_submit_wait(r, cb);
|
||||
|
||||
pthread_barrier_wait(&g_start);
|
||||
double t0 = now_s();
|
||||
uint64_t done = 0;
|
||||
while (!g_stop) {
|
||||
v3d_runner_submit_wait(r, cb);
|
||||
done += n_units;
|
||||
}
|
||||
a->elapsed_s = now_s() - t0;
|
||||
a->units_done = done;
|
||||
|
||||
v3d_runner_destroy_pipeline(r, &pipe);
|
||||
if (has_src) v3d_runner_destroy_buffer(r, &buf_src);
|
||||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||||
v3d_runner_destroy(r);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* --- Timer --- */
|
||||
|
||||
typedef struct { double duration_s; } timer_args;
|
||||
static void *timer_thread(void *p) {
|
||||
timer_args *a = p;
|
||||
pthread_barrier_wait(&g_start);
|
||||
double end = now_s() + a->duration_s;
|
||||
while (now_s() < end) {
|
||||
struct timespec ts = {0, 1000000}; nanosleep(&ts, NULL);
|
||||
}
|
||||
g_stop = 1;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* --- Main --- */
|
||||
|
||||
static enum kernel parse_kernel(const char *s) {
|
||||
if (!strcmp(s, "mc")) return K_MC;
|
||||
if (!strcmp(s, "lpf4")) return K_LPF4;
|
||||
if (!strcmp(s, "lpf8")) return K_LPF8;
|
||||
if (!strcmp(s, "cdef")) return K_CDEF;
|
||||
if (!strcmp(s, "idct")) return K_IDCT;
|
||||
fprintf(stderr, "unknown kernel: %s\n", s); exit(2);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
enum kernel cpu_k = K_MC, qpu_k = K_CDEF;
|
||||
int n_neon = 3, qpu_core = 3, qpu_n_units = 65536;
|
||||
double duration = 8.0;
|
||||
|
||||
static struct option opts[] = {
|
||||
{"cpu-kernel", required_argument, 0, 'c'},
|
||||
{"qpu-kernel", required_argument, 0, 'q'},
|
||||
{"neon-threads", required_argument, 0, 'n'},
|
||||
{"qpu-core", required_argument, 0, 'C'},
|
||||
{"qpu-units", required_argument, 0, 'u'},
|
||||
{"duration", required_argument, 0, 'd'},
|
||||
{0,0,0,0}
|
||||
};
|
||||
for (int c; (c = getopt_long(argc, argv, "c:q:n:C:u:d:", opts, 0)) != -1;) {
|
||||
switch (c) {
|
||||
case 'c': cpu_k = parse_kernel(optarg); break;
|
||||
case 'q': qpu_k = parse_kernel(optarg); break;
|
||||
case 'n': n_neon = atoi(optarg); break;
|
||||
case 'C': qpu_core = atoi(optarg); break;
|
||||
case 'u': qpu_n_units = atoi(optarg); break;
|
||||
case 'd': duration = atof(optarg); break;
|
||||
default: return 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Cycle 5 Phase 6 landed — v3d_cdef.spv is M1-PASS. Use real
|
||||
* QPU dispatch for CDEF too. The NEON-fallback worker remains
|
||||
* compiled but is unselected. */
|
||||
int use_neon_fallback_for_cdef = 0;
|
||||
|
||||
int barrier_count = n_neon + 1 /* QPU */ + 1 /* timer */ + 1 /* main */;
|
||||
printf("=== Issue 003 mixed-kernel M4 bench ===\n");
|
||||
printf(" cpu kernel: %s × %d threads (cores 0..%d)\n",
|
||||
kernel_name(cpu_k), n_neon, n_neon - 1);
|
||||
printf(" qpu kernel: %s on core %d (%s)\n",
|
||||
kernel_name(qpu_k), qpu_core,
|
||||
use_neon_fallback_for_cdef ?
|
||||
"dav1d NEON fallback — real QPU CDEF deferred to cycle 5 Phase 6" :
|
||||
"QPU dispatch");
|
||||
printf(" duration: %.1fs\n\n", duration);
|
||||
|
||||
pthread_barrier_init(&g_start, NULL, barrier_count);
|
||||
|
||||
pthread_t timer_tid; timer_args ta = { .duration_s = duration };
|
||||
pthread_create(&timer_tid, NULL, timer_thread, &ta);
|
||||
|
||||
pthread_t neon_tids[16] = {0};
|
||||
neon_args n_args[16] = {0};
|
||||
for (int i = 0; i < n_neon; i++) {
|
||||
n_args[i] = (neon_args){ .worker_id = i, .affinity_core = i, .kernel = cpu_k };
|
||||
pthread_create(&neon_tids[i], NULL, neon_worker, &n_args[i]);
|
||||
}
|
||||
|
||||
pthread_t qpu_tid = 0;
|
||||
qpu_args q_args = {0};
|
||||
qpu_real_args qr_args = {0};
|
||||
if (use_neon_fallback_for_cdef) {
|
||||
q_args = (qpu_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
|
||||
pthread_create(&qpu_tid, NULL, qpu_cdef_neon_fallback, &q_args);
|
||||
} else {
|
||||
qr_args = (qpu_real_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
|
||||
pthread_create(&qpu_tid, NULL, qpu_real_worker, &qr_args);
|
||||
}
|
||||
|
||||
pthread_barrier_wait(&g_start);
|
||||
|
||||
pthread_join(timer_tid, NULL);
|
||||
for (int i = 0; i < n_neon; i++) pthread_join(neon_tids[i], NULL);
|
||||
pthread_join(qpu_tid, NULL);
|
||||
|
||||
uint64_t cpu_total = 0; double cpu_max_e = 0;
|
||||
printf("NEON workers (%s):\n", kernel_name(cpu_k));
|
||||
for (int i = 0; i < n_neon; i++) {
|
||||
double r = n_args[i].units_done / n_args[i].elapsed_s / 1e6;
|
||||
printf(" core %d: %.3f %s\n", n_args[i].affinity_core, r, kernel_unit(cpu_k));
|
||||
cpu_total += n_args[i].units_done;
|
||||
if (n_args[i].elapsed_s > cpu_max_e) cpu_max_e = n_args[i].elapsed_s;
|
||||
}
|
||||
double cpu_rate = cpu_total / cpu_max_e / 1e6;
|
||||
printf(" CPU aggregate: %.3f %s\n\n", cpu_rate, kernel_unit(cpu_k));
|
||||
|
||||
uint64_t qpu_done = use_neon_fallback_for_cdef ? q_args.units_done : qr_args.units_done;
|
||||
double qpu_elapsed = use_neon_fallback_for_cdef ? q_args.elapsed_s : qr_args.elapsed_s;
|
||||
double qpu_rate = qpu_done / qpu_elapsed / 1e6;
|
||||
printf("QPU worker (%s on core %d):\n", kernel_name(qpu_k), qpu_core);
|
||||
printf(" %.3f %s (%llu units / %.3f s)\n",
|
||||
qpu_rate, kernel_unit(qpu_k),
|
||||
(unsigned long long) qpu_done, qpu_elapsed);
|
||||
|
||||
pthread_barrier_destroy(&g_start);
|
||||
return 0;
|
||||
}
|
||||
+16
-6
@@ -79,12 +79,17 @@ static void gen_filter_params(int *pri, int *sec, int *dir, int *damping)
|
||||
* pri_strength: 1..7 (non-zero for combined path)
|
||||
* sec_strength: 1..4
|
||||
* dir: 0..7
|
||||
* damping: 3..6
|
||||
* damping: 1..6 — extended down to 1 (was 3..6) per
|
||||
* cycle 5 phase 5 RED-2: include cases where
|
||||
* sec_shift = damping - ulog2(sec) goes negative
|
||||
* (e.g. damping=1, sec=4 → sec_shift = -1).
|
||||
* Both NEON (uqsub) and C ref (now max(0,...))
|
||||
* saturate to 0 here; the bench should exercise it.
|
||||
*/
|
||||
*pri = (int)(xs() % 7) + 1;
|
||||
*sec = (int)(xs() % 4) + 1;
|
||||
*dir = (int)(xs() & 7);
|
||||
*damping = (int)(xs() % 4) + 3;
|
||||
*damping = (int)(xs() % 6) + 1;
|
||||
}
|
||||
|
||||
static double now_seconds(void)
|
||||
@@ -113,11 +118,16 @@ static int correctness_check(uint64_t seed, int n)
|
||||
tmp_center_to_dst(dst_a, tmp);
|
||||
memcpy(dst_b, dst_a, DST_BYTES);
|
||||
|
||||
/* C ref advances tmp internally by +2*stride+2.
|
||||
* NEON expects the caller to pass the already-advanced pointer
|
||||
* (i.e. pointer to the block-data origin, not the padded-buffer
|
||||
* origin). Hence the tmp+34 for the NEON call. */
|
||||
daedalus_cdef_filter_8x8_pri_sec_ref(
|
||||
dst_a, DST_W, tmp, pri, sec, dir, damping, 8);
|
||||
dav1d_cdef_filter8_8bpc_neon(
|
||||
dst_b, DST_W, tmp, pri, sec, dir, damping, 8,
|
||||
/* edges = */ 0); /* != 0xf → non-edged path, uint16 tmp w/stride 12 */
|
||||
dst_b, DST_W, tmp + (2 * TMP_W + 2),
|
||||
pri, sec, dir, damping, 8,
|
||||
/* edges = */ 0); /* uint16 tmp non-edged path */
|
||||
|
||||
if (memcmp(dst_a, dst_b, DST_BYTES) != 0) {
|
||||
if (mismatches < 3) {
|
||||
@@ -180,7 +190,7 @@ static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
|
||||
for (int i = 0; i < n_blocks; i++)
|
||||
dav1d_cdef_filter8_8bpc_neon(
|
||||
work_dst + (size_t)i * DST_BYTES, DST_W,
|
||||
tmps + (size_t)i * TMP_INTS,
|
||||
tmps + (size_t)i * TMP_INTS + (2 * TMP_W + 2),
|
||||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||||
|
||||
double t0 = now_seconds();
|
||||
@@ -191,7 +201,7 @@ static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
|
||||
for (int i = 0; i < n_blocks; i++)
|
||||
dav1d_cdef_filter8_8bpc_neon(
|
||||
work_dst + (size_t)i * DST_BYTES, DST_W,
|
||||
tmps + (size_t)i * TMP_INTS,
|
||||
tmps + (size_t)i * TMP_INTS + (2 * TMP_W + 2),
|
||||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||||
done += n_blocks;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,332 @@
|
||||
/*
|
||||
* Cycle 5 Phase 6 — QPU bench for AV1 CDEF primary+secondary 8x8
|
||||
* luma filter on V3D 7.1.
|
||||
*
|
||||
* Reports:
|
||||
* M1₅: 3-way bit-exact (QPU vs NEON vs C reference) per Phase 5
|
||||
* YELLOW-1.
|
||||
* M2₅: QPU sustained Mblock/s over K dispatched batches
|
||||
*
|
||||
* License: BSD-2-Clause; links dav1d 1.4.3 NEON snapshot.
|
||||
*/
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
#include <getopt.h>
|
||||
#include <vulkan/vulkan.h>
|
||||
|
||||
#include "v3d_runner.h"
|
||||
|
||||
/* C reference CDEF primary+secondary filter. In this bench it is called
 * with `tmp` pointing at the START of a padded TMP_H x TMP_W uint16 buffer
 * (no origin offset is added by the caller). */
extern void daedalus_cdef_filter_8x8_pri_sec_ref(
    uint8_t *dst, ptrdiff_t dst_stride,
    const uint16_t *tmp,
    int pri_strength, int sec_strength,
    int dir, int damping, int h);

/* dav1d NEON CDEF 8-wide filter. In this bench it is called with `tmp`
 * already advanced to the block origin (buffer start + BLOCK_ORIGIN_U16)
 * and with edges = 0. */
extern void dav1d_cdef_filter8_8bpc_neon(
    uint8_t *dst, ptrdiff_t dst_stride,
    const uint16_t *tmp,
    int pri_strength, int sec_strength,
    int dir, int damping, int h, size_t edges);
|
||||
|
||||
/* Geometry of one CDEF work item. */
#define TMP_W 16                            /* padded tmp row stride, in uint16 elements */
#define TMP_H 12                            /* padded tmp rows */
#define TMP_INTS (TMP_W * TMP_H) /* 192 */
#define DST_W 8                             /* output block width (also used as dst stride) */
#define DST_H 8                             /* output block height */
#define DST_BYTES (DST_H * DST_W) /* 64 */
/* Offset of the 8x8 block origin inside the padded tmp buffer:
 * 2 rows down and 2 columns in. */
#define BLOCK_ORIGIN_U16 (2 * TMP_W + 2) /* 34 */
|
||||
|
||||
/* Global xorshift64 generator state; must be seeded non-zero before use. */
static uint64_t xs_state;

/* Advance the generator one step (shifts 13 / 7 / 17) and return the new
 * state, which is also stored back into xs_state. */
static inline uint64_t xs(void) {
    uint64_t s = xs_state;
    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);
    xs_state = s;
    return s;
}
|
||||
|
||||
static void gen_tmp(uint16_t *tmp)
|
||||
{
|
||||
for (int i = 0; i < TMP_INTS; i++)
|
||||
tmp[i] = (uint16_t)(xs() & 0xff);
|
||||
}
|
||||
|
||||
static void tmp_center_to_dst(uint8_t *dst, const uint16_t *tmp)
|
||||
{
|
||||
for (int r = 0; r < 8; r++)
|
||||
for (int c = 0; c < 8; c++)
|
||||
dst[r * 8 + c] = (uint8_t) tmp[(r + 2) * TMP_W + (c + 2)];
|
||||
}
|
||||
|
||||
/*
 * Draw one random CDEF parameter set from xs(), consuming exactly four
 * generator steps in the order pri, sec, dir, damping:
 *   pri      1..7
 *   sec      1..4
 *   dir      0..7
 *   damping  1..6 (includes negative-sec_shift cases)
 */
static void gen_filter_params(int *pri, int *sec, int *dir, int *damping)
{
    const uint64_t draw_pri  = xs();
    const uint64_t draw_sec  = xs();
    const uint64_t draw_dir  = xs();
    const uint64_t draw_damp = xs();

    *pri     = 1 + (int)(draw_pri % 7);
    *sec     = 1 + (int)(draw_sec % 4);
    *dir     = (int)(draw_dir & 7);
    *damping = 1 + (int)(draw_damp % 6);
}
|
||||
|
||||
static double now_seconds(void)
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
|
||||
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
||||
}
|
||||
|
||||
/* Push-constant block handed to the CDEF compute pipeline
 * (four uint32 fields, passed via vkCmdPushConstants in main). */
typedef struct {
    uint32_t n_blocks;          /* number of 8x8 blocks in the dispatch */
    uint32_t tmp_stride_u16;    /* padded tmp row stride, in uint16 elements */
    uint32_t dst_stride_u8;     /* dst row stride, in bytes */
    uint32_t _pad;              /* always written as 0 */
} push_consts;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int n_blocks = 16384;
|
||||
int iters = 200;
|
||||
int verify_only = 0;
|
||||
uint64_t seed = 0;
|
||||
const char *spv_path = "v3d_cdef.spv";
|
||||
|
||||
static struct option opts[] = {
|
||||
{"blocks", required_argument, 0, 'b'},
|
||||
{"iters", required_argument, 0, 'i'},
|
||||
{"seed", required_argument, 0, 's'},
|
||||
{"spv", required_argument, 0, 'S'},
|
||||
{"verify-only", no_argument, 0, 'V'},
|
||||
{0,0,0,0}
|
||||
};
|
||||
for (int c; (c = getopt_long(argc, argv, "b:i:s:S:V", opts, 0)) != -1;) {
|
||||
switch (c) {
|
||||
case 'b': n_blocks = atoi(optarg); break;
|
||||
case 'i': iters = atoi(optarg); break;
|
||||
case 's': seed = strtoull(optarg, 0, 0); break;
|
||||
case 'S': spv_path = optarg; break;
|
||||
case 'V': verify_only = 1; break;
|
||||
default: return 2;
|
||||
}
|
||||
}
|
||||
|
||||
xs_state = seed ? seed : 0xc0defacedcafebebULL;
|
||||
|
||||
v3d_runner *r = v3d_runner_create();
|
||||
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
|
||||
printf("=== v3d CDEF bench ===\n");
|
||||
printf(" device: %s\n", v3d_runner_device_name(r));
|
||||
printf(" n_blocks: %d iters: %d seed: 0x%016llx\n",
|
||||
n_blocks, iters, (unsigned long long) (seed ? seed : 0xc0defacedcafebebULL));
|
||||
|
||||
size_t meta_bytes = (size_t) n_blocks * 4 * sizeof(uint32_t); /* uvec4 */
|
||||
size_t dst_bytes = (size_t) n_blocks * DST_BYTES;
|
||||
size_t tmp_bytes = (size_t) n_blocks * TMP_INTS * sizeof(uint16_t);
|
||||
|
||||
v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_tmp = {0};
|
||||
if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1;
|
||||
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
|
||||
if (v3d_runner_create_buffer(r, tmp_bytes, &buf_tmp)) return 1;
|
||||
|
||||
uint8_t *master_dst = malloc(dst_bytes);
|
||||
uint8_t *expected_c = malloc(dst_bytes);
|
||||
uint8_t *expected_n = malloc(dst_bytes);
|
||||
int *pris = malloc(n_blocks * sizeof(int));
|
||||
int *secs = malloc(n_blocks * sizeof(int));
|
||||
int *dirs = malloc(n_blocks * sizeof(int));
|
||||
int *damps = malloc(n_blocks * sizeof(int));
|
||||
if (!master_dst || !expected_c || !expected_n || !pris || !secs || !dirs || !damps) {
|
||||
fprintf(stderr, "alloc fail\n"); return 1;
|
||||
}
|
||||
|
||||
/* Generate tmp + params + initial dst (block center extracted). */
|
||||
uint16_t *tmp_gpu = (uint16_t *) buf_tmp.mapped;
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
uint16_t *tmp = tmp_gpu + (size_t)i * TMP_INTS;
|
||||
gen_tmp(tmp);
|
||||
tmp_center_to_dst(master_dst + (size_t)i * DST_BYTES, tmp);
|
||||
gen_filter_params(&pris[i], &secs[i], &dirs[i], &damps[i]);
|
||||
}
|
||||
|
||||
/* Compute C-ref and NEON expected outputs (serial, on master_dst). */
|
||||
memcpy(expected_c, master_dst, dst_bytes);
|
||||
memcpy(expected_n, master_dst, dst_bytes);
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
daedalus_cdef_filter_8x8_pri_sec_ref(
|
||||
expected_c + (size_t)i * DST_BYTES, DST_W,
|
||||
tmp_gpu + (size_t)i * TMP_INTS,
|
||||
pris[i], secs[i], dirs[i], damps[i], 8);
|
||||
dav1d_cdef_filter8_8bpc_neon(
|
||||
expected_n + (size_t)i * DST_BYTES, DST_W,
|
||||
tmp_gpu + (size_t)i * TMP_INTS + BLOCK_ORIGIN_U16,
|
||||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||||
}
|
||||
|
||||
/* Confirm 2-way C vs NEON parity (defence in depth — Phase 3 already
|
||||
* passed this for 10000 blocks, but n_blocks may be larger here). */
|
||||
int cn_mis = 0;
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
if (memcmp(expected_c + (size_t)i * DST_BYTES,
|
||||
expected_n + (size_t)i * DST_BYTES, DST_BYTES) != 0) cn_mis++;
|
||||
}
|
||||
printf(" C ref vs NEON parity check: %d/%d mismatches\n", cn_mis, n_blocks);
|
||||
if (cn_mis > 0) {
|
||||
fprintf(stderr, "ERROR: C ref disagrees with NEON before QPU even runs.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Populate meta SSBO (post Phase 5 RED-1 layout). */
|
||||
uint32_t *meta = (uint32_t *) buf_meta.mapped;
|
||||
uint32_t dst_stride_u8 = DST_W; /* 8 */
|
||||
uint32_t tmp_stride_u16 = TMP_W; /* 16 */
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
uint32_t pri = (uint32_t) pris[i];
|
||||
uint32_t sec = (uint32_t) secs[i];
|
||||
uint32_t damping = (uint32_t) damps[i];
|
||||
meta[4*i + 0] = (uint32_t)((size_t)i * DST_BYTES);
|
||||
meta[4*i + 1] = pri | (sec << 8) | (damping << 16);
|
||||
meta[4*i + 2] = (uint32_t)((size_t)i * TMP_INTS + BLOCK_ORIGIN_U16);
|
||||
meta[4*i + 3] = (uint32_t) dirs[i];
|
||||
}
|
||||
|
||||
/* Pipeline (3 SSBOs). */
|
||||
v3d_pipeline pipe = {0};
|
||||
if (v3d_runner_create_pipeline(r, spv_path,
|
||||
/*n_ssbos=*/3,
|
||||
/*push_const_size=*/sizeof(push_consts),
|
||||
&pipe)) return 1;
|
||||
v3d_buffer bind_bufs[3] = { buf_meta, buf_dst, buf_tmp };
|
||||
if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1;
|
||||
|
||||
const uint32_t blocks_per_wg = 4;
|
||||
uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1) / blocks_per_wg);
|
||||
printf(" dispatch: %u WGs × 256 invocations = %u blocks\n",
|
||||
group_count_x, group_count_x * blocks_per_wg);
|
||||
|
||||
push_consts pc = {
|
||||
.n_blocks = (uint32_t) n_blocks,
|
||||
.tmp_stride_u16 = tmp_stride_u16,
|
||||
.dst_stride_u8 = dst_stride_u8,
|
||||
._pad = 0,
|
||||
};
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
|
||||
if (cb == VK_NULL_HANDLE) return 1;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
|
||||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
|
||||
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0, sizeof(pc), &pc);
|
||||
vkCmdDispatch(cb, group_count_x, 1, 1);
|
||||
vkEndCommandBuffer(cb);
|
||||
|
||||
/* --- M1: QPU vs C-ref vs NEON 3-way --- */
|
||||
printf("\n=== M1₅: QPU vs C-ref vs NEON 3-way ===\n");
|
||||
memcpy(buf_dst.mapped, master_dst, dst_bytes);
|
||||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||||
|
||||
int qc_mismatches = 0, qn_mismatches = 0;
|
||||
int prints = 0;
|
||||
for (int i = 0; i < n_blocks; i++) {
|
||||
const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * DST_BYTES;
|
||||
const uint8_t *c = expected_c + (size_t)i * DST_BYTES;
|
||||
const uint8_t *n = expected_n + (size_t)i * DST_BYTES;
|
||||
int qc = memcmp(q, c, DST_BYTES);
|
||||
int qn = memcmp(q, n, DST_BYTES);
|
||||
if (qc) qc_mismatches++;
|
||||
if (qn) qn_mismatches++;
|
||||
if ((qc || qn) && prints < 3) {
|
||||
fprintf(stderr, "MISMATCH block %d (pri=%d sec=%d dir=%d damp=%d):\n",
|
||||
i, pris[i], secs[i], dirs[i], damps[i]);
|
||||
fprintf(stderr, " C ref:");
|
||||
for (int r0 = 0; r0 < 8; r0++) {
|
||||
fprintf(stderr, "\n r%d ", r0);
|
||||
for (int c0 = 0; c0 < 8; c0++) fprintf(stderr, "%3u ", c[r0*8+c0]);
|
||||
}
|
||||
fprintf(stderr, "\n QPU:");
|
||||
for (int r0 = 0; r0 < 8; r0++) {
|
||||
fprintf(stderr, "\n r%d ", r0);
|
||||
for (int c0 = 0; c0 < 8; c0++) fprintf(stderr, "%3u ", q[r0*8+c0]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
prints++;
|
||||
}
|
||||
}
|
||||
printf(" QPU vs C ref: %d / %d blocks bit-exact (%.4f%%)\n",
|
||||
n_blocks - qc_mismatches, n_blocks,
|
||||
100.0 * (n_blocks - qc_mismatches) / n_blocks);
|
||||
printf(" QPU vs NEON: %d / %d blocks bit-exact (%.4f%%)\n",
|
||||
n_blocks - qn_mismatches, n_blocks,
|
||||
100.0 * (n_blocks - qn_mismatches) / n_blocks);
|
||||
|
||||
if (qc_mismatches > 0 || qn_mismatches > 0) {
|
||||
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (verify_only) {
|
||||
v3d_runner_destroy_pipeline(r, &pipe);
|
||||
v3d_runner_destroy_buffer(r, &buf_tmp);
|
||||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||||
v3d_runner_destroy(r);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* --- M2: throughput --- */
|
||||
printf("\n=== M2₅: QPU throughput ===\n");
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
memcpy(buf_dst.mapped, master_dst, dst_bytes);
|
||||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||||
}
|
||||
|
||||
double t0 = now_seconds();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
memcpy(buf_dst.mapped, master_dst, dst_bytes);
|
||||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||||
}
|
||||
double t1 = now_seconds();
|
||||
|
||||
double s0 = now_seconds();
|
||||
for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_dst, dst_bytes);
|
||||
double s1 = now_seconds();
|
||||
|
||||
double kernel_seconds = (t1 - t0) - (s1 - s0);
|
||||
double total_blocks = (double) n_blocks * iters;
|
||||
double mbps = total_blocks / kernel_seconds / 1e6;
|
||||
|
||||
printf(" blocks/dispatch: %d\n", n_blocks);
|
||||
printf(" iters: %d\n", iters);
|
||||
printf(" total blocks: %.0f\n", total_blocks);
|
||||
printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds);
|
||||
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
|
||||
printf(" M2₅ throughput = %.3f Mblock/s\n", mbps);
|
||||
printf(" per-block = %.1f ns\n", kernel_seconds / total_blocks * 1e9);
|
||||
printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6);
|
||||
|
||||
double M3_5 = 3.809;
|
||||
double R5 = mbps / M3_5;
|
||||
printf("\n Cycle 5 NEON M3₅ = %.3f Mblock/s\n", M3_5);
|
||||
printf(" R₅ = M2₅/M3₅ = %.3f\n", R5);
|
||||
if (R5 >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
|
||||
else if (R5 >= 0.5) printf(" decision band = YELLOW: M4 decides\n");
|
||||
else if (R5 >= 0.1) printf(" decision band = ORANGE: M4 may still rescue\n");
|
||||
else printf(" decision band = RED: structural mismatch (predicted)\n");
|
||||
|
||||
/* 30fps@1080p floor: 32400 blocks/frame × 30 fps = 0.972 Mblock/s */
|
||||
double floor_rate = 0.972;
|
||||
printf(" 30fps@1080p floor: %.2fx margin (isolation)\n", mbps / floor_rate);
|
||||
|
||||
v3d_runner_destroy_pipeline(r, &pipe);
|
||||
v3d_runner_destroy_buffer(r, &buf_tmp);
|
||||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||||
v3d_runner_destroy(r);
|
||||
free(master_dst); free(expected_c); free(expected_n);
|
||||
free(pris); free(secs); free(dirs); free(damps);
|
||||
return 0;
|
||||
}
|
||||
+4
-1
@@ -98,7 +98,10 @@ void daedalus_cdef_filter_8x8_pri_sec_ref(
|
||||
{
|
||||
const int pri_tap = 4 - (pri_strength & 1);
|
||||
const int pri_shift = imax(0, damping - ulog2((unsigned) pri_strength));
|
||||
const int sec_shift = damping - ulog2((unsigned) sec_strength);
|
||||
/* Cycle 5 phase 5 RED-2: NEON `uqsub` saturates to 0. Mirror it
|
||||
* here so the C ref is bit-exact against NEON for damping-light
|
||||
* cases (which the original bench param gen didn't exercise). */
|
||||
const int sec_shift = imax(0, damping - ulog2((unsigned) sec_strength));
|
||||
|
||||
/* Walk into the center 8x8 region of the 12×16 padded buffer. */
|
||||
tmp = tmp + 2 * TMP_STRIDE + 2;
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Phase 8 — first end-to-end test through the public API.
|
||||
*
|
||||
 * Exercises `daedalus_dispatch_vp9_idct8` end-to-end:
|
||||
* 1. Create context.
|
||||
* 2. Generate random VP9 coefficient blocks + dst pixels.
|
||||
* 3. Compute reference output via the C ref (tests/vp9_idct8_ref.c).
|
||||
* 4. Run public API dispatch on a copy of dst.
|
||||
* 5. Assert bit-exact.
|
||||
*
|
||||
* In Phase 8 skeleton, the API routes to CPU NEON (QPU dispatch
|
||||
* not yet wired through the API). Bit-exact gate against C ref
|
||||
* still passes because the underlying NEON kernel was the cycle 1
|
||||
* reference.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../include/daedalus.h"
|
||||
|
||||
extern void daedalus_vp9_idct_idct_8x8_add_ref(
|
||||
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
||||
|
||||
#define BLOCKS_W 8
|
||||
#define BLOCKS_H 8
|
||||
#define N_BLOCKS (BLOCKS_W * BLOCKS_H)
|
||||
#define DST_STRIDE (BLOCKS_W * 8)
|
||||
#define DST_BYTES (BLOCKS_H * 8 * DST_STRIDE)
|
||||
|
||||
/* State word of the test PRNG; the fixed seed keeps every run reproducible. */
static uint64_t xs_state = 0xa57edbeef5717ULL;

/*
 * Advance the 64-bit xorshift generator (shift triple 13 / 7 / 17) by one
 * step and return the freshly stored state word.
 */
static inline uint64_t xs(void) {
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
|
||||
|
||||
static int run_once(daedalus_substrate force,
|
||||
const int16_t *coeffs,
|
||||
const daedalus_idct8_meta *meta,
|
||||
const uint8_t *dst_initial,
|
||||
const uint8_t *dst_ref,
|
||||
const char *label)
|
||||
{
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
||||
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||
printf(" [%s] has_qpu=%d force=%d\n", label, has_qpu, (int) force);
|
||||
if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
|
||||
printf(" SKIP — QPU unavailable on this host\n");
|
||||
daedalus_ctx_destroy(ctx); return 0;
|
||||
}
|
||||
uint8_t dst[DST_BYTES];
|
||||
memcpy(dst, dst_initial, DST_BYTES);
|
||||
int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE,
|
||||
coeffs, N_BLOCKS, meta);
|
||||
if (rc) { fprintf(stderr, " dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
|
||||
int diffs = 0;
|
||||
for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
|
||||
printf(" %d / %d bytes bit-exact (%.4f%%)\n",
|
||||
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diffs == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
|
||||
printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
|
||||
|
||||
/* Generate random VP9 IDCT inputs: 64-coef blocks + a dst surface. */
|
||||
int16_t coeffs[N_BLOCKS * 64];
|
||||
memset(coeffs, 0, sizeof(coeffs));
|
||||
for (int i = 0; i < N_BLOCKS; i++) {
|
||||
/* Sparse non-zero coefs to keep range realistic. */
|
||||
int n = 1 + (int)(xs() % 16);
|
||||
for (int j = 0; j < n; j++) {
|
||||
int pos = (int)(xs() % 64);
|
||||
int16_t v = (int16_t)((int)(xs() % 8192) - 4096);
|
||||
coeffs[i * 64 + pos] = v;
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES];
|
||||
for (int i = 0; i < DST_BYTES; i++)
|
||||
dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff);
|
||||
|
||||
/* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
|
||||
* by*8*stride + bx*8. */
|
||||
daedalus_idct8_meta meta[N_BLOCKS];
|
||||
for (int by = 0; by < BLOCKS_H; by++) {
|
||||
for (int bx = 0; bx < BLOCKS_W; bx++) {
|
||||
int i = by * BLOCKS_W + bx;
|
||||
meta[i].dst_off = (uint32_t)(by * 8 * DST_STRIDE + bx * 8);
|
||||
meta[i].block_x = (uint32_t) bx;
|
||||
meta[i].block_y = (uint32_t) by;
|
||||
meta[i]._pad = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Compute reference via the C ref (mutates a scratch copy of
|
||||
* coeffs because the C ref destroys its input). */
|
||||
int16_t scratch[64];
|
||||
for (int i = 0; i < N_BLOCKS; i++) {
|
||||
memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
|
||||
daedalus_vp9_idct_idct_8x8_add_ref(dst_ref + meta[i].dst_off,
|
||||
DST_STRIDE, scratch, 64);
|
||||
}
|
||||
|
||||
int fail = 0;
|
||||
fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU");
|
||||
fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU");
|
||||
fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO");
|
||||
return fail;
|
||||
}
|
||||
@@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Phase 8 — VP9 LPF wd=4 + wd=8 through the public API.
|
||||
*
|
||||
* Exercises both kernels in CPU / QPU / AUTO modes against the
|
||||
* C reference (tests/vp9_lpf_ref.c, vp9_lpf8_ref.c). Bit-exact
|
||||
* gate per cycle 2 and 4 phase 7 docs.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../include/daedalus.h"
|
||||
|
||||
extern void daedalus_vp9_loop_filter_h_4_8_ref(
|
||||
uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
|
||||
extern void daedalus_vp9_loop_filter_h_8_8_ref(
|
||||
uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
|
||||
|
||||
#define N_EDGES 32
|
||||
#define EDGE_STRIDE 8
|
||||
#define EDGE_H 8
|
||||
#define EDGE_BYTES (EDGE_H * EDGE_STRIDE) /* 64 */
|
||||
#define DST_BYTES (N_EDGES * EDGE_BYTES)
|
||||
|
||||
/* State word of the test PRNG; the fixed seed keeps every run reproducible. */
static uint64_t xs_state = 0xa57edbeef5717ULL;

/*
 * Advance the 64-bit xorshift generator (shift triple 13 / 7 / 17) by one
 * step and return the freshly stored state word.
 */
static inline uint64_t xs(void) {
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
|
||||
|
||||
static void gen_edge_pixels(uint8_t *buf)
|
||||
{
|
||||
int side_a_base = (int)(xs() % 200) + 20;
|
||||
int side_b_base = (int)(xs() % 200) + 20;
|
||||
int noise = (int)(xs() % 30);
|
||||
for (int r = 0; r < EDGE_H; r++) {
|
||||
for (int c = 0; c < 8; c++) {
|
||||
int base = (c < 4) ? side_a_base : side_b_base;
|
||||
int n = ((int)(xs() % (2 * noise + 1))) - noise;
|
||||
int v = base + n;
|
||||
buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int run_lpf(int wd_8, daedalus_substrate force,
|
||||
const uint8_t *dst_initial,
|
||||
const uint8_t *dst_ref,
|
||||
const daedalus_lpf_meta *meta,
|
||||
const char *label)
|
||||
{
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||
if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
|
||||
printf(" [%s wd=%d] SKIP — QPU unavailable\n", label, wd_8 ? 8 : 4);
|
||||
daedalus_ctx_destroy(ctx); return 0;
|
||||
}
|
||||
uint8_t dst[DST_BYTES];
|
||||
memcpy(dst, dst_initial, DST_BYTES);
|
||||
int rc = wd_8
|
||||
? daedalus_dispatch_vp9_lpf8(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta)
|
||||
: daedalus_dispatch_vp9_lpf4(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, " rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
|
||||
int diffs = 0;
|
||||
for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
|
||||
printf(" [%s wd=%d] %d/%d bit-exact (%.4f%%)\n",
|
||||
label, wd_8 ? 8 : 4,
|
||||
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diffs == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int run_one_kernel(int wd_8)
|
||||
{
|
||||
/* Per-edge layout: edge i occupies bytes [i*64..i*64+63]. Edge
|
||||
* center is at column 4 of row 0 → byte offset i*64 + 4. */
|
||||
uint8_t initial[DST_BYTES];
|
||||
uint8_t ref[DST_BYTES];
|
||||
daedalus_lpf_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
gen_edge_pixels(initial + i * EDGE_BYTES);
|
||||
meta[i].dst_off = (uint32_t)(i * EDGE_BYTES + 4);
|
||||
meta[i].E = (int32_t)(xs() % 81);
|
||||
meta[i].I = (int32_t)(xs() % 41);
|
||||
meta[i].H = (int32_t)(xs() % 11);
|
||||
}
|
||||
memcpy(ref, initial, DST_BYTES);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
if (wd_8) daedalus_vp9_loop_filter_h_8_8_ref(
|
||||
ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H);
|
||||
else daedalus_vp9_loop_filter_h_4_8_ref(
|
||||
ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H);
|
||||
}
|
||||
|
||||
int fail = 0;
|
||||
fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_CPU, initial, ref, meta, "CPU");
|
||||
fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_QPU, initial, ref, meta, "QPU");
|
||||
fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_AUTO, initial, ref, meta, "AUTO");
|
||||
return fail;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("=== Phase 8 API smoke: VP9 LPF wd=4 + wd=8 ===\n");
|
||||
printf(" recipe for LPF4_INNER: %d (1=CPU, 2=QPU)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER));
|
||||
printf(" recipe for LPF8_INNER: %d\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER));
|
||||
|
||||
int fail = 0;
|
||||
printf("\nLPF wd=4:\n");
|
||||
fail |= run_one_kernel(0);
|
||||
printf("\nLPF wd=8:\n");
|
||||
fail |= run_one_kernel(1);
|
||||
return fail;
|
||||
}
|
||||
Reference in New Issue
Block a user