Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a2575d5e42 | |||
| c3301b0c2e | |||
| 9abc73d308 | |||
| d7100459f2 | |||
| dff610e13d | |||
| c43ee84d8e | |||
| fad600000b | |||
| ce6703a862 | |||
| 5306bf0f61 | |||
| 9b1c106dc5 | |||
| ce436bfd96 | |||
| a5c47aa51c | |||
| f4af24020f | |||
| 818e71560e | |||
| 9d5451e0fe | |||
| 0d54d68f38 | |||
| 79553c6e22 | |||
| a092ee34aa | |||
| c01754e849 | |||
| 74687d9def | |||
| 65bd5c3fe3 | |||
| 737e87980d | |||
| 98553278dd | |||
| 0a042a8e95 | |||
| 3ecfc8b0ef | |||
| c154253432 | |||
| b3de96b21c | |||
| 68dccd2911 | |||
| 7d6f106919 |
@@ -11,3 +11,4 @@ build-*/
|
||||
# Forensic snapshot of the corrupted .git from 2026-05-18 10:25
|
||||
# working-tree wipe. Retained on disk for inspection; not tracked.
|
||||
.git-broken-2026-05-18/
|
||||
.claude/
|
||||
|
||||
+96
-2
@@ -284,7 +284,40 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
# H.264 compute shaders compiled to SPIR-V with glslang.  The *_SPV
# variables are kept by name because the install() rule further down
# lists them individually.
set(H264_IDCT4_SPV     ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
set(H264_IDCT8_SPV     ${CMAKE_BINARY_DIR}/v3d_h264_idct8.spv)
set(H264_QPEL_MC20_SPV ${CMAKE_BINARY_DIR}/v3d_h264_qpel_mc20.spv)

# One build rule per shader: src/<name>.comp -> <build>/<name>.spv.
foreach(h264_shader IN ITEMS v3d_h264_idct4 v3d_h264_idct8 v3d_h264_qpel_mc20)
    add_custom_command(
        OUTPUT ${CMAKE_BINARY_DIR}/${h264_shader}.spv
        COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
                -o ${CMAKE_BINARY_DIR}/${h264_shader}.spv
                ${CMAKE_SOURCE_DIR}/src/${h264_shader}.comp
        DEPENDS ${CMAKE_SOURCE_DIR}/src/${h264_shader}.comp
        COMMENT "glslang: ${h264_shader}.comp -> ${h264_shader}.spv"
        VERBATIM
    )
endforeach()

# Single definition of the aggregate shader target.  A target name may
# only be created once, so the full dependency list lives here.
add_custom_target(daedalus_shaders ALL
    DEPENDS
        ${NOOP_SPV}
        ${IDCT8_SPV}
        ${LPF_SPV}
        ${MC_SPV}
        ${LPF8_SPV}
        ${CDEF_SPV}
        ${H264DEBLOCK_SPV}
        ${H264_IDCT4_SPV}
        ${H264_IDCT8_SPV}
        ${H264_QPEL_MC20_SPV}
)
|
||||
|
||||
# v3d_runner — reusable Vulkan plumbing.
|
||||
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||
@@ -412,6 +445,9 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
${LPF8_SPV}
|
||||
${CDEF_SPV}
|
||||
${H264DEBLOCK_SPV}
|
||||
${H264_IDCT4_SPV}
|
||||
${H264_IDCT8_SPV}
|
||||
${H264_QPEL_MC20_SPV}
|
||||
DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
|
||||
)
|
||||
endif()
|
||||
@@ -419,9 +455,33 @@ endif()
|
||||
# pkg-config file. Vulkan goes in Requires.private (consumer's
|
||||
# pkg-config call gets it via --static). pthread + dl are needed
|
||||
# by the static archive's runtime helpers.
|
||||
#
|
||||
# `prefix` is derived from ${pcfiledir} so the .pc is relocatable:
|
||||
# pkg-config substitutes ${pcfiledir} with the directory holding the
|
||||
# .pc at lookup time, and the relative path from
|
||||
# <prefix>/<libdir>/pkgconfig back to <prefix> tells pkg-config the
|
||||
# install prefix without baking it in. This is why
|
||||
# `cmake --install build --prefix /foo` produces a .pc that correctly
|
||||
# resolves `prefix=/foo` instead of baking whatever CMAKE_INSTALL_PREFIX
|
||||
# was at *configure* time (default /usr/local). DESTDIR-staged
|
||||
# installs work too: at runtime pkg-config sees the .pc at its real
|
||||
# install path and computes the right prefix.
|
||||
#
|
||||
# Relative-path depth is computed from CMAKE_INSTALL_LIBDIR (and
|
||||
# whatever multiarch tuple GNUInstallDirs adds) so Debian-style
|
||||
# `lib/aarch64-linux-gnu/pkgconfig/...` resolves with the right number
|
||||
# of `..` components. Layouts where libdir is *not* under prefix are
|
||||
# not supported by this scheme; if a packager overrides libdir to an
|
||||
# absolute path the relative-path machinery falls back to the absolute
|
||||
# value (CMake's file(RELATIVE_PATH) prepends `..` until they meet),
|
||||
# which is also relocatable but no longer prefix-agnostic.
|
||||
# Start from the absolute pkg-config directory.  CMAKE_INSTALL_FULL_LIBDIR
# (GNUInstallDirs) equals <prefix>/<libdir> when libdir is relative, and is
# the packager's absolute path when libdir was overridden to one.  Joining
# CMAKE_INSTALL_PREFIX and CMAKE_INSTALL_LIBDIR by hand would yield a
# nonsense path ("<prefix>//abs/lib") in that second case.
file(RELATIVE_PATH PKGCONFIG_PCDIR_TO_PREFIX
     "${CMAKE_INSTALL_FULL_LIBDIR}/pkgconfig"
     "${CMAKE_INSTALL_PREFIX}")
|
||||
|
||||
set(PKGCONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/daedalus-fourier.pc)
|
||||
file(WRITE ${PKGCONFIG_OUT}
|
||||
"prefix=${CMAKE_INSTALL_PREFIX}
|
||||
"prefix=\${pcfiledir}/${PKGCONFIG_PCDIR_TO_PREFIX}
|
||||
exec_prefix=\${prefix}
|
||||
libdir=\${prefix}/${CMAKE_INSTALL_LIBDIR}
|
||||
includedir=\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}
|
||||
@@ -459,7 +519,11 @@ add_executable(test_api_h264
|
||||
tests/h264_idct4_ref.c
|
||||
tests/h264_idct8_ref.c
|
||||
tests/h264_deblock_ref.c
|
||||
tests/h264_h_loop_filter_luma_ref.c
|
||||
tests/h264_chroma_loop_filter_ref.c
|
||||
tests/h264_intra_loop_filter_ref.c
|
||||
tests/h264_qpel8_mc20_ref.c
|
||||
tests/h264_qpel8_mc02_ref.c
|
||||
)
|
||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_h264 PRIVATE -O2)
|
||||
@@ -468,6 +532,36 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
|
||||
target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
|
||||
|
||||
# H.264 intra-prediction reference tests.  Each one is pure CPU and
# spec-derived, with no daedalus_core dependency yet: they are the
# bit-exact gate for the eventual shader / dispatch wiring.
#
#   4x4        Intra_4x4 luma, 9 modes.
#   16x16      Intra_16x16 luma, 4 modes (V, H, DC, Plane).
#   chroma8x8  Intra_8x8 chroma, 4 modes (DC, H, V, Plane).  DC is
#              per-quadrant (asymmetric); Plane uses slope coefficient
#              34 instead of luma's 5.
foreach(intra_variant IN ITEMS 4x4 16x16 chroma8x8)
    add_executable(test_intra_pred_${intra_variant}
        tests/test_intra_pred_${intra_variant}.c
        tests/h264_intra_pred_${intra_variant}_ref.c
    )
    target_compile_options(test_intra_pred_${intra_variant} PRIVATE -O2)
endforeach()
|
||||
|
||||
add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
|
||||
target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
|
||||
target_compile_options(bench_pool_overhead PRIVATE -O2)
|
||||
|
||||
if (DAEDALUS_BUILD_VULKAN)
|
||||
# (re-open the conditional so the closing endif() below balances)
|
||||
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
|
||||
This document is forward-looking. It describes the generalized multi-SoC daedalus daemon architecture, but the immediate work block stays "finish Pi 5". Re-read this when:
|
||||
|
||||
- A second aarch64 host without a working kernel-side V4L2 stateless decoder shows up in the fleet (most likely candidate: Pi 4, which has V3D 4.x and no rpivid stable upstream).
|
||||
- A specific working-copy slowdown that the current Pi-5-only daedalus can't address motivates the generalization.
|
||||
- libva-v4l2-request-fourier evolves to need multi-node negotiation (currently it picks the first matching V4L2 node).
|
||||
- HW decode on noether (Pi 4, the user's interactive workstation) becomes a real ask and rpivid upstream is still unstable. This is the most likely trigger — same SoC class as Pi 5 but weaker V3D 4.x, so it would need the caps-file mechanism plus an extra row's worth of substrate measurements.
|
||||
- AV1 playback on boltzmann (RK3588) starts mattering. rkvdec doesn't cover AV1, so the daedalus path becomes the only HW-accelerated option, and Mali Valhall compute substrate decisions need their own caps row.
|
||||
- libva-v4l2-request-fourier evolves to need multi-node negotiation (today it picks the first matching V4L2 node; a host with both rkvdec and daedalus-v4l2 nodes wants a preference policy).
|
||||
|
||||
Until then: this is decision context, not a TODO.
|
||||
|
||||
@@ -51,13 +51,17 @@ The mfritsche fleet has heterogeneous aarch64 hardware decoders:
|
||||
|
||||
| SoC | Host(s) | H.264 | HEVC | VP9 | AV1 | GPU compute |
|
||||
|---|---|---|---|---|---|---|
|
||||
| BCM2712 (Pi 5) | higgs, broglie | none | V3D7 (rpi-hevc-dec — SPS quirks) | none | none | V3D7 (Vulkan compute, queryable) |
|
||||
| BCM2711 (Pi 4) | dcw3 | rpivid (out of tree, unstable) | rpivid (out of tree, unstable) | none | none | V3D4 (Vulkan compute, weaker) |
|
||||
| RK3588 | hertz, tesla | rkvdec V4L2 stateless (upstream) | rkvdec V4L2 stateless | rkvdec V4L2 stateless | none (rkvdec lacks AV1) | Mali Valhall (panvk) + RK NPU |
|
||||
| Allwinner H6 | (not in current fleet, but Cedrus exists) | Cedrus V4L2 | Cedrus V4L2 | none | none | Mali Bifrost |
|
||||
| BCM2712 (Pi 5) | higgs, hertz, broglie, tesla (LXD on hertz) | none | V3D7 (rpi-hevc-dec — SPS quirks) | none | none | V3D7 (Vulkan compute, queryable) |
|
||||
| BCM2711 (Pi 4) | noether (interactive workstation), dcw3, dcw2 | rpivid (out of tree, unstable) | rpivid (out of tree, unstable) | none | none | V3D4 (Vulkan compute, weaker) |
|
||||
| RK3588 | boltzmann (32 GB, kernel-dev / MCP hub, 8 W always-on) | rkvdec V4L2 stateless (upstream) | rkvdec V4L2 stateless | rkvdec V4L2 stateless | none (rkvdec lacks AV1) | Mali Valhall (panvk-bifrost-video in dev) + RK NPU |
|
||||
| Allwinner H6 | (not in current fleet, but Cedrus exists upstream) | Cedrus V4L2 | Cedrus V4L2 | none | none | Mali Bifrost |
|
||||
|
||||
No single SoC has a complete codec set. RK3588 lacks AV1; Pi 5 lacks H.264 + VP9 + AV1; Pi 4 has rpivid (out-of-tree, kernel-version-fragile); Allwinner Cedrus is H.264/HEVC only.
|
||||
|
||||
A note on the Pi 5 row: hertz and tesla share hardware (tesla is an LXD container hosted on hertz) but are operationally distinct — tesla is the distcc/MCP worker, hertz is the LXD host with all the cron automations and the 17-tool lmcp hub. From a daedalus deployment perspective they count as **one** Pi 5 substrate; from a workflow perspective they're separate boxes.
|
||||
|
||||
A note on noether: it's the user's interactive workstation (Pi 4, BCM2711). Firefox + mpv run here. Any "I want HW decode on my main box" pressure lands first on this host, which puts Pi 4 (V3D4 + maybe-rpivid) closer to the front of the queue than the original draft of this document suggested.
|
||||
|
||||
The current daedalus model — "kernel substitution + libavcodec front end" — is the right answer for **Pi 5 specifically**, where no usable kernel V4L2 stateless decoder exists for the codecs we care about, and a Vulkan-capable GPU (V3D7) is available to help on a few kernels.
|
||||
|
||||
The model is **not** the right answer for SoCs that already have working V4L2 stateless decoders for the requested codec — those should be passed through, not re-implemented through libavcodec + kernel substitution.
|
||||
@@ -207,15 +211,15 @@ Pass-through plugins are *thin* — they translate the daedalus daemon's wire pr
|
||||
|
||||
**Today's calculus:**
|
||||
|
||||
- Pi 5 daedalus path is the only thing in the fleet that uses daedalus daemon. Generalizing for a single user is overdesign.
|
||||
- RK3588 uses rkvdec directly through libva-v4l2-request-fourier; daedalus daemon is **not in the path** for any RK3588 codec. The "RK3588 support" the architecture above proposes is mostly a no-op routing decision plus an AV1 fallback that doesn't yet measure on Mali.
|
||||
- Pi 4 with rpivid is the only realistic second motivator. rpivid upstream stability is the gate — if it lands cleanly, Pi 4 takes the pass-through path with no kernel substitution needed. If it stays out-of-tree-fragile, **then** the substrate-composed path with V3D4 + NEON becomes the right backend for Pi 4, and we need the per-SoC caps mechanism to handle V3D4's weaker compute.
|
||||
- Pi 5 (higgs + hertz + broglie + tesla) is **four hosts**, but **one SoC**. Adding a fifth Pi 5 host wouldn't pressure-test the architecture; they all share BCM2712 caps so the substrate decisions are identical across the row.
|
||||
- boltzmann (RK3588) is the only non-Pi-5 always-on host in the fleet, and it uses rkvdec directly through libva-v4l2-request-fourier — daedalus daemon is **not in the path** for any RK3588 codec on it. The "RK3588 support" the architecture above proposes is mostly a no-op routing decision plus an AV1 fallback that hasn't yet been measured on Mali. No forcing pressure from boltzmann today.
|
||||
- noether (Pi 4, this user's interactive workstation) and dcw3/dcw2 (also Pi 4) are the real second-SoC candidates. The gate is rpivid upstream stability: if it lands cleanly, Pi 4 takes the pass-through path with zero kernel substitution work. If it stays out-of-tree-fragile, **then** the substrate-composed path with V3D4 + NEON becomes the right backend for Pi 4, and we need the per-SoC caps mechanism to handle V3D4's weaker compute.
|
||||
- The recipe layer in daedalus-fourier already scales cleanly. Adding more substrates is incremental, not architectural.
|
||||
|
||||
**The forcing function that flips this from "deferred" to "do it":**
|
||||
|
||||
- Pi 4 enters daily use and rpivid is still not stable upstream — implies we need a Pi 4 substrate-composed path, which means at minimum a second caps file and the loader for it. At that point, building the full pluggable scaffold becomes proportionate.
|
||||
- **Or:** an x86 host enters the fleet running mesa-panvk on a Pi-CM5-like board, and we need the daedalus daemon to discover it dynamically rather than being baked at build time.
|
||||
- **noether-as-Firefox-host** — the user starts wanting HW decode on their main workstation and rpivid is still not stable upstream. Implies a Pi 4 substrate-composed path, which means at minimum a second caps file and the loader for it. At that point, building the full pluggable scaffold becomes proportionate. This is the most likely trigger; noether is already a daily-driver Pi 4.
|
||||
- **boltzmann-as-AV1-decoder** — RK3588 has no AV1 HW decoder, and the user wants AV1 playback there (currently CPU-only). Triggers a cycle-5–equivalent measurement campaign on Mali Valhall to see whether `daedalus_recipe_dispatch_cdef_8x8` (or follow-on AV1 kernels) is worth running on Mali compute. If yes, we need an RK3588 caps file that overrides only the AV1 row while leaving H.264/HEVC/VP9 on rkvdec pass-through.
|
||||
- **Or:** a third-party Pi 5 user needs to swap shaders for V3D firmware experiments without rebuilding the daemon — at that point dynamic shader loading + caps overrides become a feature ask.
|
||||
|
||||
Until one of those happens: keep daedalus daemon Pi 5 specific. Push cross-SoC abstraction *up* to libva-v4l2-request-fourier (which already does most of it) rather than *down* into the daemon.
|
||||
@@ -242,6 +246,7 @@ Until one of those happens: keep daedalus daemon Pi 5 specific. Push cross-SoC a
|
||||
|---|---|---|
|
||||
| 2026-05-23 | **Defer generalization.** Finish Pi 5 substitution arc (cycle 9 PR #90 pending), then pivot to bug-fix backlog (daemon SEGV #145, D-state #146) before architecture work. | Architecture pivot is a multi-week scope; Pi 5 path is the only user-visible motivator today; deferring loses nothing because the recipe layer already abstracts kernels and libva-v4l2-request-fourier already abstracts V4L2 nodes. |
|
||||
| 2026-05-23 | **Document the design now, even though it's deferred.** | Captures the conceptual gap (shaders ≠ hardware decoders) and the two-backend conclusion while the analysis is fresh; saves re-litigating in 3–6 months. |
|
||||
| 2026-05-23 | **Correct fleet hardware mapping.** Original draft had hertz/tesla under RK3588 and omitted boltzmann + noether entirely. Verified via `/proc/device-tree/compatible`: hertz + tesla are Pi 5 (BCM2712), noether is Pi 4 (BCM2711), boltzmann is the only RK3588 in the fleet. Adjusted "Why deferred" / forcing-function reasoning accordingly — Pi 5 row is now 4 hosts (one SoC), noether is the realistic Pi 4 trigger, boltzmann is the realistic RK3588 trigger via AV1. | Original draft was speculative on host-to-SoC mapping; verified state changes which forcing functions are credible. |
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -263,6 +263,102 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* H.264 luma "h_loop_filter" — sibling of _v, applies filter
|
||||
* HORIZONTALLY across a VERTICAL edge (16 rows tall; pix points to
|
||||
* row 0 of the right block, col 0 = leftmost output column). Same
|
||||
* non-intra (bS < 4) variant.
|
||||
*
|
||||
* Each tile is 8 cols x 16 rows of context (cols -4..+3 around the
|
||||
* edge). dst_off points to row 0 col 0 of the RIGHT block.
|
||||
*
|
||||
* Constraint: (dst_off % dst_stride) >= 4 (the kernel reads p3 at
|
||||
* pix[-4]). Caller must ensure this.
|
||||
*
|
||||
* QPU shader for the H variant is not yet implemented; recipe table
|
||||
* routes AUTO to CPU NEON. An explicit DAEDALUS_SUBSTRATE_QPU on
|
||||
* the _h dispatch returns -1 rather than silently degrading.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* H.264 chroma (4:2:0) loop filters — bS<4 variant. Chroma uses
|
||||
* the SAME daedalus_h264_deblock_meta struct as luma but on smaller
|
||||
* tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8
|
||||
* rows for H (4 segments of 2 rows). Each segment has its own tc0
|
||||
* strength (tc0[s] applies to both cells in segment s).
|
||||
*
|
||||
* Algorithm difference vs luma: chroma updates only p0 and q0
|
||||
* (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no
|
||||
* luma-style ap/aq side-condition bonus).
|
||||
*
|
||||
* QPU shaders for chroma deblock not implemented yet; recipe table
|
||||
* routes AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* H.264 bS=4 "intra" loop filters — used at I-MB and inter
|
||||
* macroblock boundaries where boundary strength is forced to 4 per
|
||||
* H.264 §8.7.2.1. Different algorithm from bS<4: per-side strong
|
||||
* vs weak filter decided by quad-tree condition (luma only);
|
||||
* chroma is always weak. No tc0 — the daedalus_h264_deblock_meta
|
||||
* struct's tc0[] field is IGNORED for intra dispatches (callers can
|
||||
* leave it uninitialised or share a single edge list across both
|
||||
* intra and non-intra kernels).
|
||||
*
|
||||
* Reuses the same meta layout as bS<4 dispatches for alpha + beta +
|
||||
* dst_off; tile geometry per orientation is identical to the bS<4
|
||||
* sibling (16-col / 16-row luma; 8-col / 8-row chroma).
|
||||
*
|
||||
* QPU shaders not implemented for any of the four; recipe routes
|
||||
* AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1 (fast fail).
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
|
||||
* (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
|
||||
@@ -296,6 +392,29 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* H.264 luma qpel mc02 (vertical half-pel) — mirror of mc20.
|
||||
* 6-tap filter applied vertically:
|
||||
* dst[r,c] = clip255((s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
|
||||
* + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
|
||||
* + 16) >> 5)
|
||||
*
|
||||
* Same single-stride convention as mc20. src + src_off points at
|
||||
* row 0 col 0 of the OUTPUT block; the filter reads rows -2..+3, so
|
||||
* the caller must guarantee 2 rows of top context and 3 rows of
|
||||
* bottom context per block (FFmpeg edge-emulated buffer handles
|
||||
* frame boundaries; same contract as mc20).
|
||||
*
|
||||
* QPU shader not implemented yet; recipe table routes AUTO to CPU
|
||||
* NEON. Explicit DAEDALUS_SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* Recipe query — what does the API recommend for each kernel?
|
||||
* ----------------------------------------------------------------- */
|
||||
@@ -309,6 +428,14 @@ typedef enum {
|
||||
DAEDALUS_KERNEL_H264_IDCT8 = 7,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC20 = 9,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA = 13,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
|
||||
} daedalus_kernel;
|
||||
|
||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||
|
||||
+679
-63
@@ -40,6 +40,12 @@ struct daedalus_ctx {
|
||||
v3d_pipeline cdef_pipe;
|
||||
int h264deblock_pipe_ready;
|
||||
v3d_pipeline h264deblock_pipe;
|
||||
int h264_idct4_pipe_ready;
|
||||
v3d_pipeline h264_idct4_pipe;
|
||||
int h264_idct8_pipe_ready;
|
||||
v3d_pipeline h264_idct8_pipe;
|
||||
int h264_qpel_mc20_pipe_ready;
|
||||
v3d_pipeline h264_qpel_mc20_pipe;
|
||||
};
|
||||
|
||||
daedalus_ctx *daedalus_ctx_create(void)
|
||||
@@ -53,6 +59,25 @@ daedalus_ctx *daedalus_ctx_create(void)
|
||||
|
||||
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
|
||||
{
|
||||
/*
|
||||
* Per the "QPU is default substrate" decree 2026-05-23:
|
||||
* setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
|
||||
* function to a full daedalus_ctx_create(), letting the libavcodec
|
||||
* substitution shims (which call create_no_qpu via pthread_once)
|
||||
* fire the V3D shaders that exist for cycles 1/2/4/5/8. Without
|
||||
* this hook each consumer process (firefox, mpv, daemon) would
|
||||
* need its own shim build to opt into QPU.
|
||||
*
|
||||
* Default behaviour (env var unset / not "1") is unchanged: pure
|
||||
* NEON ctx, no implicit Vulkan init. Firefox / mpv consumers
|
||||
* that dlopen libavcodec without opting in stay on the
|
||||
* Vulkan-free path; the daemon explicitly sets
|
||||
* DAEDALUS_FORCE_QPU=1 before loading libavcodec.
|
||||
*/
|
||||
const char *force = getenv("DAEDALUS_FORCE_QPU");
|
||||
if (force && force[0] == '1' && force[1] == 0)
|
||||
return daedalus_ctx_create();
|
||||
|
||||
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||||
if (!ctx) return NULL;
|
||||
ctx->has_qpu = 0;
|
||||
@@ -75,6 +100,9 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
||||
if (ctx->mc8h_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
|
||||
if (ctx->cdef_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
|
||||
if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
|
||||
if (ctx->h264_idct4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
|
||||
if (ctx->h264_idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
|
||||
if (ctx->h264_qpel_mc20_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc20_pipe);
|
||||
v3d_runner_destroy(ctx->runner);
|
||||
}
|
||||
free(ctx);
|
||||
@@ -84,16 +112,33 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
||||
|
||||
/*
 * Report which substrate the recipe table recommends for kernel k.
 * Values not listed in the switch fall through to
 * DAEDALUS_SUBSTRATE_CPU.
 */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    /*
     * Recipe table per the "QPU is default substrate" decree
     * 2026-05-23.  Any kernel that has a V3D compute shader returns
     * SUBSTRATE_QPU; CPU is the fallback for kernels whose shader is
     * still pending (the H.264 luma-H, chroma and bS=4 intra deblock
     * variants, and qpel mc02).  The dispatch wrappers already fall
     * back to CPU automatically when the ctx doesn't have QPU
     * available (daedalus_ctx_has_qpu == 0).
     */
    switch (k) {
    case DAEDALUS_KERNEL_VP9_IDCT8:             return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:        return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_VP9_MC_8H:             return DAEDALUS_SUBSTRATE_QPU; /* v3d_mc_8h.spv */
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:        return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:          return DAEDALUS_SUBSTRATE_QPU; /* v3d_cdef.spv */
    case DAEDALUS_KERNEL_H264_IDCT4:            return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct4.spv */
    case DAEDALUS_KERNEL_H264_IDCT8:            return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:       return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:       return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:       return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:       return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 luma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_QPEL_MC20:        return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC02:        return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
    }
    return DAEDALUS_SUBSTRATE_CPU;
}
|
||||
@@ -118,8 +163,24 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride
|
||||
extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
/* -------------------- CPU dispatch implementations -------------- */
|
||||
|
||||
@@ -229,6 +290,108 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * CPU path for the bS<4 luma "h" loop filter: calls
 * ff_h264_h_loop_filter_luma_neon once for each of the n_edges
 * descriptors in meta, at dst + dst_off with the descriptor's alpha,
 * beta and tc0.  ctx is unused.  Always returns 0.
 */
static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    (void) ctx;

    size_t edge_idx = 0;
    while (edge_idx < n_edges) {
        const daedalus_h264_deblock_meta *edge = &meta[edge_idx];

        /* The NEON routine takes a non-const int8_t *, while meta is
         * const: hand it a private copy of the four strengths. */
        int8_t strengths[4];
        for (int seg = 0; seg < 4; seg++)
            strengths[seg] = edge->tc0[seg];

        ff_h264_h_loop_filter_luma_neon(dst + edge->dst_off,
                                        (ptrdiff_t) dst_stride,
                                        edge->alpha, edge->beta, strengths);
        edge_idx++;
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for the H.264 chroma "v" loop filter: calls
 * ff_h264_v_loop_filter_chroma_neon() once per edge descriptor.
 *
 *   ctx        - unused on the CPU path
 *   dst        - base of the pixel buffer; each edge is addressed as
 *                dst + meta[i].dst_off
 *   dst_stride - row pitch handed to the filter (cast to ptrdiff_t)
 *   n_edges    - number of entries in meta
 *   meta       - per-edge dst_off / alpha / beta / tc0[4]
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t idx;

    (void) ctx;

    for (idx = 0; idx != n_edges; ++idx) {
        const daedalus_h264_deblock_meta *edge = &meta[idx];
        int8_t tc0_copy[4];
        int k;

        /* The filter takes a non-const int8_t *, while meta is const:
         * hand it a private copy of the four tc0 values. */
        for (k = 0; k < 4; ++k)
            tc0_copy[k] = edge->tc0[k];

        ff_h264_v_loop_filter_chroma_neon(dst + edge->dst_off, pitch,
                                          edge->alpha, edge->beta, tc0_copy);
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for the H.264 chroma "h" loop filter: calls
 * ff_h264_h_loop_filter_chroma_neon() once per edge descriptor.
 *
 *   ctx        - unused on the CPU path
 *   dst        - base of the pixel buffer; each edge is addressed as
 *                dst + meta[i].dst_off
 *   dst_stride - row pitch handed to the filter (cast to ptrdiff_t)
 *   n_edges    - number of entries in meta
 *   meta       - per-edge dst_off / alpha / beta / tc0[4]
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t idx;

    (void) ctx;

    for (idx = 0; idx != n_edges; ++idx) {
        const daedalus_h264_deblock_meta *edge = &meta[idx];
        int8_t tc0_copy[4];
        int k;

        /* The filter takes a non-const int8_t *, while meta is const:
         * hand it a private copy of the four tc0 values. */
        for (k = 0; k < 4; ++k)
            tc0_copy[k] = edge->tc0[k];

        ff_h264_h_loop_filter_chroma_neon(dst + edge->dst_off, pitch,
                                          edge->alpha, edge->beta, tc0_copy);
    }
    return 0;
}
|
||||
|
||||
/* --- bS=4 intra variants. Note: the daedalus_h264_deblock_meta
|
||||
* struct's tc0[] field is unused for intra (the spec hardcodes the
|
||||
* strength). We accept the same meta type so callers can build a
|
||||
* single edge-list and route by kernel — saves an extra struct.
|
||||
*/
|
||||
/*
 * CPU path for the intra luma "v" loop filter: calls
 * ff_h264_v_loop_filter_luma_intra_neon() once per edge descriptor.
 * Only dst_off, alpha and beta are read from each meta entry; tc0 is
 * not passed to the intra filter.
 *
 *   ctx        - unused on the CPU path
 *   dst        - base of the pixel buffer
 *   dst_stride - row pitch handed to the filter (cast to ptrdiff_t)
 *   n_edges    - number of entries in meta
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx,
                                                  uint8_t *dst, size_t dst_stride,
                                                  size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t idx;

    (void) ctx;

    for (idx = 0; idx != n_edges; ++idx) {
        const daedalus_h264_deblock_meta *edge = &meta[idx];

        ff_h264_v_loop_filter_luma_intra_neon(dst + edge->dst_off, pitch,
                                              edge->alpha, edge->beta);
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for the intra luma "h" loop filter: calls
 * ff_h264_h_loop_filter_luma_intra_neon() once per edge descriptor.
 * Only dst_off, alpha and beta are read from each meta entry; tc0 is
 * not passed to the intra filter.
 *
 *   ctx        - unused on the CPU path
 *   dst        - base of the pixel buffer
 *   dst_stride - row pitch handed to the filter (cast to ptrdiff_t)
 *   n_edges    - number of entries in meta
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx,
                                                  uint8_t *dst, size_t dst_stride,
                                                  size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t idx;

    (void) ctx;

    for (idx = 0; idx != n_edges; ++idx) {
        const daedalus_h264_deblock_meta *edge = &meta[idx];

        ff_h264_h_loop_filter_luma_intra_neon(dst + edge->dst_off, pitch,
                                              edge->alpha, edge->beta);
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for the intra chroma "v" loop filter: calls
 * ff_h264_v_loop_filter_chroma_intra_neon() once per edge descriptor.
 * Only dst_off, alpha and beta are read from each meta entry; tc0 is
 * not passed to the intra filter.
 *
 *   ctx        - unused on the CPU path
 *   dst        - base of the pixel buffer
 *   dst_stride - row pitch handed to the filter (cast to ptrdiff_t)
 *   n_edges    - number of entries in meta
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx,
                                                    uint8_t *dst, size_t dst_stride,
                                                    size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t idx;

    (void) ctx;

    for (idx = 0; idx != n_edges; ++idx) {
        const daedalus_h264_deblock_meta *edge = &meta[idx];

        ff_h264_v_loop_filter_chroma_intra_neon(dst + edge->dst_off, pitch,
                                                edge->alpha, edge->beta);
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for the intra chroma "h" loop filter: calls
 * ff_h264_h_loop_filter_chroma_intra_neon() once per edge descriptor.
 * Only dst_off, alpha and beta are read from each meta entry; tc0 is
 * not passed to the intra filter.
 *
 *   ctx        - unused on the CPU path
 *   dst        - base of the pixel buffer
 *   dst_stride - row pitch handed to the filter (cast to ptrdiff_t)
 *   n_edges    - number of entries in meta
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx,
                                                    uint8_t *dst, size_t dst_stride,
                                                    size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t idx;

    (void) ctx;

    for (idx = 0; idx != n_edges; ++idx) {
        const daedalus_h264_deblock_meta *edge = &meta[idx];

        ff_h264_h_loop_filter_chroma_intra_neon(dst + edge->dst_off, pitch,
                                                edge->alpha, edge->beta);
    }
    return 0;
}
|
||||
|
||||
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
@@ -245,6 +408,19 @@ static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * CPU path for H.264 qpel mc02: calls ff_put_h264_qpel8_mc02_neon()
 * once per block descriptor.
 *
 *   ctx      - unused on the CPU path
 *   dst      - destination base; block i is written at dst + meta[i].dst_off
 *   src      - source base; block i is read from src + meta[i].src_off
 *   stride   - single row pitch passed to the kernel (cast to ptrdiff_t)
 *   n_blocks - number of entries in meta
 *
 * Always returns 0.
 */
static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) stride;
    size_t idx;

    (void) ctx;

    for (idx = 0; idx != n_blocks; ++idx) {
        const daedalus_h264_qpel_meta *blk = &meta[idx];

        ff_put_h264_qpel8_mc02_neon(dst + blk->dst_off,
                                    src + blk->src_off,
                                    pitch);
    }
    return 0;
}
|
||||
|
||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||
|
||||
typedef struct {
|
||||
@@ -291,13 +467,13 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
|
||||
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
||||
@@ -325,8 +501,8 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
._pad = 0,
|
||||
};
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->idct8_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
@@ -344,15 +520,15 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
/* Read-back dst. */
|
||||
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -424,9 +600,9 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
size_t dst_window_size = hi - lo;
|
||||
|
||||
v3d_buffer buf_meta = {0}, buf_dst = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1;
|
||||
}
|
||||
|
||||
memcpy(buf_dst.mapped, dst + lo, dst_window_size);
|
||||
@@ -442,8 +618,8 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;
|
||||
|
||||
uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail;
|
||||
VkCommandBuffer cb = p->cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
|
||||
@@ -468,12 +644,12 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
|
||||
memcpy(dst + lo, buf_dst.mapped, dst_window_size);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -509,9 +685,9 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0}, bs = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
memcpy(bs.mapped, src, src_max);
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
@@ -530,8 +706,8 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||
.src_stride_u8 = (uint32_t) src_stride };
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->mc8h_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
|
||||
@@ -545,14 +721,14 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -588,9 +764,9 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0}, bt = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_create_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
/* tmp may need padding before block-origin offset (caller-allocated). Just
|
||||
* copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
|
||||
@@ -615,8 +791,8 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||||
.tmp_stride_u16 = 16,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride };
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->cdef_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
|
||||
@@ -630,14 +806,14 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -670,8 +846,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
uint32_t *m = bm.mapped;
|
||||
@@ -691,8 +867,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
|
||||
h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride };
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->h264deblock_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
|
||||
@@ -706,12 +882,294 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
|
||||
|
||||
typedef struct {
|
||||
uint32_t n_blocks;
|
||||
uint32_t dst_stride_u8;
|
||||
uint32_t _pad0;
|
||||
uint32_t _pad1;
|
||||
} h264_idct4_pc;
|
||||
|
||||
static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
if (!ctx->h264_idct4_pipe_ready) {
|
||||
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
|
||||
3, sizeof(h264_idct4_pc),
|
||||
&ctx->h264_idct4_pipe) != 0)
|
||||
return -1;
|
||||
ctx->h264_idct4_pipe_ready = 1;
|
||||
}
|
||||
|
||||
size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
|
||||
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t); /* uvec4 per block */
|
||||
size_t dst_max = 0;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
|
||||
if (e > dst_max) dst_max = e;
|
||||
}
|
||||
|
||||
v3d_buffer bc = {0}, bd = {0}, bm = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||
}
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||
}
|
||||
|
||||
memcpy(bc.mapped, coeffs, coeff_bytes);
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
uint32_t *m = bm.mapped;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
m[4*i+0] = meta[i].dst_off;
|
||||
m[4*i+1] = 0;
|
||||
m[4*i+2] = 0;
|
||||
m[4*i+3] = 0;
|
||||
}
|
||||
|
||||
v3d_buffer binds[3] = { bc, bd, bm };
|
||||
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
|
||||
goto fail;
|
||||
|
||||
uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16); /* 16 blocks/WG */
|
||||
h264_idct4_pc pc = {
|
||||
.n_blocks = (uint32_t) n_blocks,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||
};
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->h264_idct4_pipe.pipeline);
|
||||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->h264_idct4_pipe.layout, 0, 1,
|
||||
&ctx->h264_idct4_pipe.desc_set, 0, NULL);
|
||||
vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||||
vkEndCommandBuffer(cb);
|
||||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
/* H.264/FFmpeg convention: zero the coeffs block after the
|
||||
* transform (matches the C ref + NEON .S behaviour). */
|
||||
memset(coeffs, 0, coeff_bytes);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
|
||||
|
||||
typedef struct {
|
||||
uint32_t n_blocks;
|
||||
uint32_t dst_stride_u8;
|
||||
uint32_t _pad0;
|
||||
uint32_t _pad1;
|
||||
} h264_idct8_pc;
|
||||
|
||||
static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
if (!ctx->h264_idct8_pipe_ready) {
|
||||
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
|
||||
3, sizeof(h264_idct8_pc),
|
||||
&ctx->h264_idct8_pipe) != 0)
|
||||
return -1;
|
||||
ctx->h264_idct8_pipe_ready = 1;
|
||||
}
|
||||
|
||||
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
|
||||
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
|
||||
size_t dst_max = 0;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
|
||||
if (e > dst_max) dst_max = e;
|
||||
}
|
||||
|
||||
v3d_buffer bc = {0}, bd = {0}, bm = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||
}
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||||
}
|
||||
|
||||
memcpy(bc.mapped, coeffs, coeff_bytes);
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
uint32_t *m = bm.mapped;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
m[4*i+0] = meta[i].dst_off;
|
||||
m[4*i+1] = 0;
|
||||
m[4*i+2] = 0;
|
||||
m[4*i+3] = 0;
|
||||
}
|
||||
|
||||
v3d_buffer binds[3] = { bc, bd, bm };
|
||||
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
|
||||
goto fail;
|
||||
|
||||
uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8); /* 8 blocks/WG */
|
||||
h264_idct8_pc pc = {
|
||||
.n_blocks = (uint32_t) n_blocks,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||
};
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->h264_idct8_pipe.pipeline);
|
||||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->h264_idct8_pipe.layout, 0, 1,
|
||||
&ctx->h264_idct8_pipe.desc_set, 0, NULL);
|
||||
vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||||
vkEndCommandBuffer(cb);
|
||||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
memset(coeffs, 0, coeff_bytes);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* -------------------- H.264 qpel mc20 QPU dispatch (cycle 9) --- */
|
||||
|
||||
typedef struct {
|
||||
uint32_t n_blocks;
|
||||
uint32_t stride_u8;
|
||||
uint32_t _pad0;
|
||||
uint32_t _pad1;
|
||||
} h264_qpel_mc20_pc;
|
||||
|
||||
static int dispatch_h264_qpel_mc20_qpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
{
|
||||
if (!ctx->h264_qpel_mc20_pipe_ready) {
|
||||
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc20.spv",
|
||||
3, sizeof(h264_qpel_mc20_pc),
|
||||
&ctx->h264_qpel_mc20_pipe) != 0)
|
||||
return -1;
|
||||
ctx->h264_qpel_mc20_pipe_ready = 1;
|
||||
}
|
||||
|
||||
/* Compute the smallest contiguous src/dst window that covers
|
||||
* every block's read/write footprint.
|
||||
*
|
||||
* src: filter reads cols (c-2)..(c+3) for c=0..7 across rows 0..7.
|
||||
* Highest read = src_off + 7*stride + (7 + 3) = src_off + 7*stride + 10.
|
||||
* Plus 1 for the byte-count semantic of memcpy (length=N copies
|
||||
* indices 0..N-1) → src_max = src_off + 7*stride + 11.
|
||||
*
|
||||
* dst: writes cols 0..7 across rows 0..7.
|
||||
* Highest write = dst_off + 7*stride + 7; +1 → dst_off + 7*stride + 8. */
|
||||
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
|
||||
size_t src_max = 0, dst_max = 0;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
size_t s_end = meta[i].src_off + (size_t) 7 * stride + 11;
|
||||
size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
|
||||
if (s_end > src_max) src_max = s_end;
|
||||
if (d_end > dst_max) dst_max = d_end;
|
||||
}
|
||||
|
||||
v3d_buffer bs = {0}, bd = {0}, bm = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
|
||||
}
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
|
||||
}
|
||||
|
||||
/* Copy src window (filter needs cols -2..+3, captured by src_max
|
||||
* upper bound above; the lower bound is implicit in src_off >= 2
|
||||
* which the caller guarantees per the public API contract). */
|
||||
memcpy(bs.mapped, src, src_max);
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
uint32_t *m = bm.mapped;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
m[4*i+0] = meta[i].dst_off;
|
||||
m[4*i+1] = meta[i].src_off;
|
||||
m[4*i+2] = 0;
|
||||
m[4*i+3] = 0;
|
||||
}
|
||||
|
||||
v3d_buffer binds[3] = { bs, bd, bm };
|
||||
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_qpel_mc20_pipe, binds, 3))
|
||||
goto fail;
|
||||
|
||||
uint32_t wg_count = (uint32_t) n_blocks; /* 1 block per WG */
|
||||
h264_qpel_mc20_pc pc = {
|
||||
.n_blocks = (uint32_t) n_blocks,
|
||||
.stride_u8 = (uint32_t) stride,
|
||||
};
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->h264_qpel_mc20_pipe.pipeline);
|
||||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
ctx->h264_qpel_mc20_pipe.layout, 0, 1,
|
||||
&ctx->h264_qpel_mc20_pipe.desc_set, 0, NULL);
|
||||
vkCmdPushConstants(cb, ctx->h264_qpel_mc20_pipe.layout,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||||
vkEndCommandBuffer(cb);
|
||||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -803,8 +1261,16 @@ int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
|
||||
dst, dst_stride, coeffs, n_blocks, meta);
|
||||
daedalus_substrate eff = sub;
|
||||
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||||
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4);
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||||
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||
return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
|
||||
coeffs, n_blocks, meta);
|
||||
return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
|
||||
coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
@@ -812,8 +1278,16 @@ int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
int16_t *coeffs, size_t n_blocks,
|
||||
const daedalus_h264_block_meta *meta)
|
||||
{
|
||||
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
|
||||
dst, dst_stride, coeffs, n_blocks, meta);
|
||||
daedalus_substrate eff = sub;
|
||||
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||||
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||||
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||
return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
|
||||
coeffs, n_blocks, meta);
|
||||
return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
|
||||
coeffs, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
@@ -830,12 +1304,106 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
|
||||
return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
/*
 * Route an H.264 luma "h" deblock batch to a substrate.
 *
 * DAEDALUS_SUBSTRATE_AUTO is resolved through
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH).
 * There is no QPU shader for this kernel yet, so:
 *   - QPU resolved but the context has no QPU -> runs on the CPU path;
 *   - QPU resolved and the context has a QPU  -> returns -1 (an explicit
 *     QPU request fails fast rather than silently degrading to CPU);
 *   - anything else                           -> runs on the CPU path.
 *
 * Returns the CPU dispatcher's result, or -1 as described above.
 */
int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
                                          uint8_t *dst, size_t dst_stride,
                                          size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH)
            : sub;

    /* QPU is only "real" when the context has one; without a shader to
     * run there, that combination is an error. */
    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
/*
 * Route an H.264 chroma "v" deblock batch to a substrate.
 *
 * DAEDALUS_SUBSTRATE_AUTO is resolved through
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV).
 * No chroma QPU shader exists yet: a QPU choice on a context that has a
 * QPU returns -1; every other case runs the CPU path.
 */
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)
            : sub;

    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;   /* No chroma QPU shader yet. */

    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
/*
 * Route an H.264 chroma "h" deblock batch to a substrate.
 *
 * DAEDALUS_SUBSTRATE_AUTO is resolved through
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH).
 * A QPU choice on a context that has a QPU returns -1; every other case
 * runs the CPU path.
 */
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)
            : sub;

    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
#define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \
|
||||
int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||||
daedalus_substrate sub, uint8_t *dst, size_t dst_stride, \
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||||
{ \
|
||||
daedalus_substrate eff = sub; \
|
||||
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
|
||||
eff = daedalus_recipe_substrate_for(kernel); \
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||||
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \
|
||||
return cpu_fn(ctx, dst, dst_stride, n_edges, meta); \
|
||||
}
|
||||
|
||||
DEFINE_INTRA_DISPATCH(luma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu)
|
||||
DEFINE_INTRA_DISPATCH(luma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu)
|
||||
DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu)
|
||||
DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu)
|
||||
|
||||
#undef DEFINE_INTRA_DISPATCH
|
||||
|
||||
/*
 * Dispatch the H.264 luma qpel mc20 kernel.
 *
 * `sub` selects the substrate; DAEDALUS_SUBSTRATE_AUTO is resolved through
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20).  A QPU
 * choice on a context without a QPU is downgraded to the CPU path.
 *
 * The earlier ROUTE_CPU_ONLY(...) routing statement is intentionally gone:
 * it was left in front of the substrate-aware routing below and, being a
 * CPU-only route, would have kept the QPU path from ever running.
 *
 * Returns the result of the selected backend.
 */
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate eff = sub;

    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20);

    /* Requested QPU but none is present: use the CPU implementation. */
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;

    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_qpel_mc20_cpu(ctx, dst, src, stride,
                                           n_blocks, meta);

    return dispatch_h264_qpel_mc20_qpu(ctx, dst, src, stride,
                                       n_blocks, meta);
}
|
||||
|
||||
/*
 * Dispatch the H.264 luma qpel mc02 kernel.
 *
 * DAEDALUS_SUBSTRATE_AUTO is resolved through
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02).  There is
 * no mc02 QPU shader yet, so a QPU selection on a QPU-capable context
 * fast-fails with -1; a QPU selection without a QPU falls back to the CPU.
 */
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate target =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02)
            : sub;

    /* No mc02 QPU shader yet — explicit QPU fast-fails. */
    if (target == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
}
|
||||
|
||||
/* -------------------- Recipe convenience wrappers --------------- */
|
||||
@@ -909,6 +1477,46 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
|
||||
dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
/* Recipe wrapper: calls the luma-h deblock dispatcher with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result unchanged. */
int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
                                                 uint8_t *dst, size_t dst_stride,
                                                 size_t n_edges,
                                                 const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate sub = DAEDALUS_SUBSTRATE_AUTO;
    int rc;

    rc = daedalus_dispatch_h264_deblock_luma_h(ctx, sub, dst, dst_stride,
                                               n_edges, meta);
    return rc;
}
|
||||
|
||||
/* Recipe wrapper: calls the chroma-v deblock dispatcher with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result unchanged. */
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
                                                   uint8_t *dst, size_t dst_stride,
                                                   size_t n_edges,
                                                   const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate sub = DAEDALUS_SUBSTRATE_AUTO;
    int rc;

    rc = daedalus_dispatch_h264_deblock_chroma_v(ctx, sub, dst, dst_stride,
                                                 n_edges, meta);
    return rc;
}
|
||||
|
||||
/* Recipe wrapper: calls the chroma-h deblock dispatcher with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result unchanged. */
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
                                                   uint8_t *dst, size_t dst_stride,
                                                   size_t n_edges,
                                                   const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate sub = DAEDALUS_SUBSTRATE_AUTO;
    int rc;

    rc = daedalus_dispatch_h264_deblock_chroma_h(ctx, sub, dst, dst_stride,
                                                 n_edges, meta);
    return rc;
}
|
||||
|
||||
#define DEFINE_INTRA_RECIPE(name) \
|
||||
int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||||
uint8_t *dst, size_t dst_stride, \
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||||
{ \
|
||||
return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \
|
||||
dst, dst_stride, n_edges, meta); \
|
||||
}
|
||||
|
||||
DEFINE_INTRA_RECIPE(luma_v_intra)
|
||||
DEFINE_INTRA_RECIPE(luma_h_intra)
|
||||
DEFINE_INTRA_RECIPE(chroma_v_intra)
|
||||
DEFINE_INTRA_RECIPE(chroma_h_intra)
|
||||
|
||||
#undef DEFINE_INTRA_RECIPE
|
||||
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
@@ -916,3 +1524,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||
return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, src, stride, n_blocks, meta);
|
||||
}
|
||||
|
||||
/* Recipe wrapper: calls the qpel mc02 dispatcher with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result unchanged. */
int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
                                            uint8_t *dst, const uint8_t *src,
                                            size_t stride, size_t n_blocks,
                                            const daedalus_h264_qpel_meta *meta)
{
    const daedalus_substrate sub = DAEDALUS_SUBSTRATE_AUTO;
    int rc;

    rc = daedalus_dispatch_h264_qpel_mc02(ctx, sub, dst, src, stride,
                                          n_blocks, meta);
    return rc;
}
|
||||
|
||||
@@ -0,0 +1,129 @@
|
||||
// daedalus-fourier — H.264 4x4 inverse integer transform + add, V3D 7.1.
|
||||
//
|
||||
// H.264 spec §8.5.12.1. Pure integer arithmetic — no trig constants
|
||||
// (unlike VP9 IDCT 8x8). Row pass first, column pass second; round
|
||||
// (+32) >> 6, add to dst, clip to u8.
|
||||
//
|
||||
// Block memory layout: COLUMN-MAJOR. block[c*4 + r] = coefficient at
|
||||
// (row r, column c). Matches FFmpeg `ff_h264_idct_add_neon`.
|
||||
//
|
||||
// Workgroup layout: 64 invocations = 4 lanes/block × 16 blocks/WG.
|
||||
// - row pass: lane k (0..3) reads row k of the block (4 coefficients,
|
||||
// one from each column), runs the butterfly, writes 4
|
||||
// outputs to one row of tmp_shared.
|
||||
// - column pass: lane k reads column k of tmp_shared (4 rows),
|
||||
// runs the butterfly, writes 4 outputs to dst as
|
||||
// column k at rows 0..3.
|
||||
//
|
||||
// shared = 16 × 16 × 4 B = 1 KiB. Well under V3D's 16 KiB limit.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Coeffs {
|
||||
int16_t coeffs[]; // N × 16 column-major
|
||||
} u_coeffs;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[]; // H × stride bytes (caller-provided base)
|
||||
} u_dst;
|
||||
|
||||
layout(binding = 2) readonly buffer Meta {
|
||||
uvec4 meta[]; // .x = dst_off (byte offset into u_dst.dst)
|
||||
} u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint dst_stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// 16 blocks per WG × 16 ints per block = 256 ints = 1 KiB shared.
|
||||
shared int tmp_shared[16 * 16];
|
||||
|
||||
// 1D butterfly per H.264 §8.5.12.1. d[0..3] in, o[0..3] out.
|
||||
// One 4-point inverse-transform butterfly: four inputs d0..d3, four
// outputs o0..o3.  All inputs are consumed before any output is written.
void idct4_1d(int d0, int d1, int d2, int d3,
              out int o0, out int o1, out int o2, out int o3)
{
    // Even half: sum and difference of d0 and d2.
    int even_sum  = d0 + d2;
    int even_diff = d0 - d2;

    // Odd half: d1 and d3 combined with arithmetic half-shifts.
    int odd_inner = (d1 >> 1) - d3;
    int odd_outer = d1 + (d3 >> 1);

    o0 = even_sum  + odd_outer;
    o1 = even_diff + odd_inner;
    o2 = even_diff - odd_inner;
    o3 = even_sum  - odd_outer;
}
|
||||
|
||||
// Entry point: 64 invocations per workgroup = 16 blocks x 4 lanes.
// Lane k of a block transforms row k into shared memory, then (after a
// workgroup barrier) transforms column k and adds it into dst.
void main()
{
    uint gid         = gl_GlobalInvocationID.x;
    uint lane_in_wg  = gid % 64u;
    uint block_local = lane_in_wg / 4u;              // 0..15
    uint k           = lane_in_wg % 4u;              // 0..3
    uint block_idx   = (gid / 64u) * 16u + block_local;

    // Lanes past n_blocks do no work but must still reach the barrier.
    bool in_range = block_idx < pc.n_blocks;
    uint tile     = block_local * 16u;               // this block's slice of tmp_shared

    int v[4];
    int res[4];

    // ---- Row pass: read block[c*4 + k] for c = 0..3 -----------------
    if (in_range) {
        uint cbase = block_idx * 16u + k;
        for (uint c = 0u; c < 4u; ++c)
            v[c] = int(u_coeffs.coeffs[cbase + c * 4u]);

        idct4_1d(v[0], v[1], v[2], v[3], res[0], res[1], res[2], res[3]);

        uint trow = tile + k * 4u;
        for (uint c = 0u; c < 4u; ++c)
            tmp_shared[trow + c] = res[c];
    }

    barrier();

    // ---- Column pass: read tmp[r][k] for r = 0..3 --------------------
    if (in_range) {
        for (uint r = 0u; r < 4u; ++r)
            v[r] = tmp_shared[tile + r * 4u + k];

        idct4_1d(v[0], v[1], v[2], v[3], res[0], res[1], res[2], res[3]);

        uint dst_off = u_meta.meta[block_idx].x;
        uint stride  = pc.dst_stride_u8;

        // Load all four destination pixels before storing any of them,
        // preserving the read-then-write order of the unrolled form.
        uint addr[4];
        int  pix[4];
        for (uint r = 0u; r < 4u; ++r) {
            addr[r] = dst_off + r * stride + k;
            pix[r]  = int(u_dst.dst[addr[r]]);
        }
        for (uint r = 0u; r < 4u; ++r)
            u_dst.dst[addr[r]] =
                uint8_t(clamp(pix[r] + ((res[r] + 32) >> 6), 0, 255));
    }
}
|
||||
@@ -0,0 +1,175 @@
|
||||
// daedalus-fourier — H.264 8x8 inverse integer transform + add, V3D 7.1.
|
||||
//
|
||||
// H.264 spec §8.5.13.2 (High profile 8x8 IT). Pure integer arithmetic
|
||||
// — different butterfly from VP9 IDCT 8x8 (cycle 1, uses cospi
|
||||
// multipliers). Row pass first, column pass second; round (+32) >> 6,
|
||||
// add to dst, clip to u8.
|
||||
//
|
||||
// Block layout: COLUMN-MAJOR. block[c*8 + r] = coefficient at
|
||||
// (row r, column c). Matches FFmpeg `ff_h264_idct8_add_neon`.
|
||||
//
|
||||
// Workgroup layout: 64 invocations = 8 lanes/block × 8 blocks/WG.
|
||||
// - row pass: lane k (0..7) reads row k of the block (8 coefficients,
|
||||
// one from each column), runs the butterfly, writes 8
|
||||
// outputs to one row of tmp_shared.
|
||||
// - column pass: lane k reads column k of tmp_shared (8 rows),
|
||||
// runs the butterfly, writes 8 outputs to dst as
|
||||
// column k at rows 0..7.
|
||||
//
|
||||
// shared = 8 × 64 × 4 B = 2 KiB. Well under V3D's 16 KiB limit.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Coeffs {
|
||||
int16_t coeffs[]; // N × 64 column-major
|
||||
} u_coeffs;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[]; // H × stride bytes
|
||||
} u_dst;
|
||||
|
||||
layout(binding = 2) readonly buffer Meta {
|
||||
uvec4 meta[]; // .x = dst_off
|
||||
} u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint dst_stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// 8 blocks/WG × 64 ints/block × 4 B = 2 KiB shared.
|
||||
shared int tmp_shared[8 * 64];
|
||||
|
||||
// 1D 8-element butterfly per H.264 §8.5.13.2.
|
||||
// One 8-point inverse-transform butterfly in three stages.  All inputs are
// consumed in stage 1 before any output is written in stage 3.
void idct8_1d(int d0, int d1, int d2, int d3,
              int d4, int d5, int d6, int d7,
              out int g0, out int g1, out int g2, out int g3,
              out int g4, out int g5, out int g6, out int g7)
{
    // Stage 1 — even-indexed inputs (d0, d2, d4, d6).
    int ev_sum04  = d0 + d4;
    int ev_diff04 = d0 - d4;
    int ev_lo26   = (d2 >> 1) - d6;
    int ev_hi26   = d2 + (d6 >> 1);

    // Stage 1 — odd-indexed inputs (d1, d3, d5, d7).
    int od_a = -d3 + d5 - d7 - (d7 >> 1);
    int od_b =  d1 + d7 - d3 - (d3 >> 1);
    int od_c = -d1 + d7 + d5 + (d5 >> 1);
    int od_d =  d3 + d5 + d1 + (d1 >> 1);

    // Stage 2.
    int s0 = ev_sum04  + ev_hi26;
    int s2 = ev_diff04 + ev_lo26;
    int s4 = ev_diff04 - ev_lo26;
    int s6 = ev_sum04  - ev_hi26;
    int s1 = od_a + (od_d >> 2);
    int s3 = od_b + (od_c >> 2);
    int s5 = (od_b >> 2) - od_c;
    int s7 = od_d - (od_a >> 2);

    // Stage 3 — mirrored sum/difference pairs.
    g0 = s0 + s7;    g7 = s0 - s7;
    g1 = s2 + s5;    g6 = s2 - s5;
    g2 = s4 + s3;    g5 = s4 - s3;
    g3 = s6 + s1;    g4 = s6 - s1;
}
|
||||
|
||||
// Entry point: 64 invocations per workgroup = 8 blocks x 8 lanes.
// Lane k of a block transforms row k into shared memory, then (after a
// workgroup barrier) transforms column k and adds it into dst.
void main()
{
    uint gid         = gl_GlobalInvocationID.x;
    uint lane_in_wg  = gid % 64u;
    uint block_local = lane_in_wg / 8u;              // 0..7
    uint k           = lane_in_wg % 8u;              // 0..7
    uint block_idx   = (gid / 64u) * 8u + block_local;

    // Lanes past n_blocks do no work but must still reach the barrier.
    bool in_range = block_idx < pc.n_blocks;
    uint tile     = block_local * 64u;               // this block's slice of tmp_shared

    int v[8];
    int g[8];

    // ---- Row pass: read block[c*8 + k] for c = 0..7 -----------------
    if (in_range) {
        uint cbase = block_idx * 64u + k;
        for (uint c = 0u; c < 8u; ++c)
            v[c] = int(u_coeffs.coeffs[cbase + c * 8u]);

        idct8_1d(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
                 g[0], g[1], g[2], g[3], g[4], g[5], g[6], g[7]);

        uint trow = tile + k * 8u;
        for (uint c = 0u; c < 8u; ++c)
            tmp_shared[trow + c] = g[c];
    }

    barrier();

    // ---- Column pass: read tmp[r][k] for r = 0..7 --------------------
    if (in_range) {
        for (uint r = 0u; r < 8u; ++r)
            v[r] = tmp_shared[tile + r * 8u + k];

        idct8_1d(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
                 g[0], g[1], g[2], g[3], g[4], g[5], g[6], g[7]);

        uint dst_off = u_meta.meta[block_idx].x;
        uint stride  = pc.dst_stride_u8;

        // Load all eight destination pixels before storing any of them,
        // preserving the read-then-write order of the unrolled form.
        uint addr[8];
        int  pix[8];
        for (uint r = 0u; r < 8u; ++r) {
            addr[r] = dst_off + r * stride + k;
            pix[r]  = int(u_dst.dst[addr[r]]);
        }
        for (uint r = 0u; r < 8u; ++r)
            u_dst.dst[addr[r]] =
                uint8_t(clamp(pix[r] + ((g[r] + 32) >> 6), 0, 255));
    }
}
|
||||
@@ -0,0 +1,83 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc20 (8x8, horizontal half-pel), V3D 7.1.
|
||||
//
|
||||
// H.264 spec §8.4.2.2.1 horizontal 6-tap luma interpolation:
|
||||
//
|
||||
// dst[r,c] = clip255(
|
||||
// ( s[r,c-2]
|
||||
// - 5 * s[r,c-1]
|
||||
// + 20 * s[r,c]
|
||||
// + 20 * s[r,c+1]
|
||||
// - 5 * s[r,c+2]
|
||||
// + s[r,c+3]
|
||||
// + 16
|
||||
// ) >> 5)
|
||||
//
|
||||
// Single-stride: dst and src share `stride` (H264QpelContext
|
||||
// convention). src+src_off already points at the leftmost output
|
||||
// column (col 0); the filter reads cols -2..+3. Caller guarantees
|
||||
// edge-padding context per the public API docstring.
|
||||
//
|
||||
// Workgroup layout: 64 invocations = 1 lane per output pixel.
|
||||
// 1 block per WG; n_blocks WGs total. This is the simplest layout
|
||||
// that avoids any inter-lane communication — each lane independently
|
||||
// reads its 6 src samples and writes its 1 dst sample. V3D's L2
|
||||
// cache handles the redundant reads from adjacent lanes.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src {
|
||||
uint8_t src[];
|
||||
} u_src;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[];
|
||||
} u_dst;
|
||||
|
||||
layout(binding = 2) readonly buffer Meta {
|
||||
uvec4 meta[]; // .x = dst_off, .y = src_off
|
||||
} u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output
// pixel.  Each lane reads six source samples of its row (columns c-2..c+3)
// and writes one filtered, clamped destination byte.
void main()
{
    uint block_idx = gl_WorkGroupID.x;
    if (block_idx >= pc.n_blocks)
        return;

    uint lane = gl_LocalInvocationID.x;
    uint r    = lane / 8u;                           // row 0..7
    uint c    = lane % 8u;                           // column 0..7

    uvec4 m      = u_meta.meta[block_idx];           // .x = dst_off, .y = src_off
    uint  stride = pc.stride_u8;

    // Index of the leftmost tap (column c-2).  The unsigned subtraction
    // relies on the caller-guaranteed left context noted in the file header.
    uint first_tap = m.y + r * stride + c - 2u;

    int tap[6];
    for (uint i = 0u; i < 6u; ++i)
        tap[i] = int(u_src.src[first_tap + i]);

    // Symmetric 6-tap kernel (1, -5, 20, 20, -5, 1), rounding term 16.
    int acc = (tap[0] + tap[5])
            - 5  * (tap[1] + tap[4])
            + 20 * (tap[2] + tap[3])
            + 16;

    u_dst.dst[m.x + r * stride + c] = uint8_t(clamp(acc >> 5, 0, 255));
}
|
||||
@@ -17,6 +17,18 @@
|
||||
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||
r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
|
||||
|
||||
/* Power-of-2 size classes from 2^8 (256 B) up to 2^23 (8 MiB). Cycle
|
||||
* 1's largest dispatch with n_blocks ≈ 8K is well under 8 MiB; oversize
|
||||
* requests fall through to non-pooled allocation. */
|
||||
#define V3D_POOL_MIN_LOG2 8
|
||||
#define V3D_POOL_MAX_LOG2 23
|
||||
#define V3D_POOL_BUCKETS (V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1)
|
||||
|
||||
struct v3d_pool_entry {
|
||||
v3d_buffer buf;
|
||||
struct v3d_pool_entry *next;
|
||||
};
|
||||
|
||||
struct v3d_runner {
|
||||
VkInstance instance;
|
||||
VkPhysicalDevice phys;
|
||||
@@ -26,6 +38,15 @@ struct v3d_runner {
|
||||
VkCommandPool pool;
|
||||
char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
|
||||
VkPhysicalDeviceMemoryProperties mem_props;
|
||||
|
||||
/* Buffer pool: per-bucket freelist of previously-released
|
||||
* v3d_buffer. bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2.
|
||||
* pool_total_bytes accumulates every successful vkAllocateMemory
|
||||
* we've done through the pool — never decreases (the freelist
|
||||
* just hands buffers around, no vkFreeMemory until destroy).
|
||||
*/
|
||||
struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS];
|
||||
size_t pool_total_bytes;
|
||||
};
|
||||
|
||||
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
|
||||
@@ -168,6 +189,21 @@ void v3d_runner_destroy(v3d_runner *r)
|
||||
{
|
||||
if (!r) return;
|
||||
if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device);
|
||||
|
||||
/* Drain the buffer pool BEFORE destroying device — the pool
|
||||
* entries own VkBuffer/VkDeviceMemory handles, which need a live
|
||||
* device for vkDestroyBuffer/vkFreeMemory. */
|
||||
for (int b = 0; b < V3D_POOL_BUCKETS; b++) {
|
||||
struct v3d_pool_entry *e = r->pool_free[b];
|
||||
while (e) {
|
||||
struct v3d_pool_entry *next = e->next;
|
||||
v3d_runner_destroy_buffer(r, &e->buf);
|
||||
free(e);
|
||||
e = next;
|
||||
}
|
||||
r->pool_free[b] = NULL;
|
||||
}
|
||||
|
||||
if (r->pool != VK_NULL_HANDLE)
|
||||
vkDestroyCommandPool(r->device, r->pool, NULL);
|
||||
if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL);
|
||||
@@ -175,6 +211,92 @@ void v3d_runner_destroy(v3d_runner *r)
|
||||
free(r);
|
||||
}
|
||||
|
||||
/* ---- Buffer pool ----------------------------------------------- */
|
||||
|
||||
/* ceil_log2 for buffer pool bucket selection. */
|
||||
static int v3d_pool_bucket_for(size_t size)
|
||||
{
|
||||
int log2;
|
||||
size_t m;
|
||||
|
||||
if (size <= ((size_t)1 << V3D_POOL_MIN_LOG2))
|
||||
return 0;
|
||||
m = size - 1;
|
||||
log2 = 0;
|
||||
while (m) { log2++; m >>= 1; }
|
||||
if (log2 < V3D_POOL_MIN_LOG2) log2 = V3D_POOL_MIN_LOG2;
|
||||
if (log2 > V3D_POOL_MAX_LOG2) return -1;
|
||||
return log2 - V3D_POOL_MIN_LOG2;
|
||||
}
|
||||
|
||||
/*
 * Hand out a buffer of at least `size` bytes, reusing a pooled one if the
 * matching size class has a free entry.
 *
 * Returns -1 on a NULL runner, NULL out pointer or zero size; otherwise 0
 * on a pool hit, or the return value of v3d_runner_create_buffer() when a
 * fresh buffer has to be created.
 */
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    struct v3d_pool_entry *hit;
    size_t class_bytes;
    int idx;
    int rc;

    if (r == NULL || out == NULL || size == 0)
        return -1;

    idx = v3d_pool_bucket_for(size);

    /* Oversize: allocate exactly `size` outside the pool.  The release
     * path sees the same negative bucket and destroys the buffer. */
    if (idx < 0)
        return v3d_runner_create_buffer(r, size, out);

    /* Pool hit: pop the freelist head and give its buffer to the caller. */
    hit = r->pool_free[idx];
    if (hit != NULL) {
        r->pool_free[idx] = hit->next;
        *out = hit->buf;
        free(hit);
        return 0;
    }

    /* Pool miss: create a buffer of the full size-class width so a later
     * release/acquire in this class can reuse it. */
    class_bytes = (size_t)1 << (idx + V3D_POOL_MIN_LOG2);
    rc = v3d_runner_create_buffer(r, class_bytes, out);
    if (rc == 0)
        r->pool_total_bytes += class_bytes;

    return rc;
}
|
||||
|
||||
/*
 * Give a buffer back to the runner.
 *
 * The buffer is pushed onto its size class's freelist only when buf->size
 * is exactly that class's byte size.  Anything else is destroyed instead:
 * oversize buffers, buffers whose size is not a pool size, and the case
 * where the freelist node cannot be allocated.  Filing a smaller buffer
 * under a larger class would let a later acquire hand it out as if it had
 * the full class size.
 *
 * In every case *buf is zeroed on return, so the caller's handle cannot be
 * reused by accident.  A NULL runner, NULL buf or null VkBuffer handle is
 * a no-op.
 */
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf)
{
    struct v3d_pool_entry *node = NULL;
    int idx;

    if (!r || !buf || buf->buffer == VK_NULL_HANDLE)
        return;

    idx = v3d_pool_bucket_for(buf->size);

    /* Poolable only if the size matches the bucket exactly. */
    if (idx >= 0 &&
        buf->size == ((size_t)1 << (idx + V3D_POOL_MIN_LOG2)))
        node = malloc(sizeof(*node));

    if (!node) {
        /* Not poolable, or allocator failure: destroy outright.  The pool
         * degenerates to non-pooled behaviour but does not leak. */
        v3d_runner_destroy_buffer(r, buf);
        memset(buf, 0, sizeof(*buf));
        return;
    }

    /* Push onto the head of the bucket's freelist. */
    node->buf = *buf;
    node->next = r->pool_free[idx];
    r->pool_free[idx] = node;
    memset(buf, 0, sizeof(*buf));
}
|
||||
|
||||
/* Pool diagnostic: returns the runner's pool_total_bytes counter, or 0
 * when `r` is NULL. */
size_t v3d_runner_pool_total_bytes(v3d_runner *r)
{
    if (r == NULL)
        return 0;

    return r->pool_total_bytes;
}
|
||||
|
||||
/* Accessor: the runner's VkDevice.  `r` is dereferenced unchecked. */
VkDevice v3d_runner_device(v3d_runner *r)
{
    return r->device;
}
|
||||
/* Accessor: the runner's VkQueue.  `r` is dereferenced unchecked. */
VkQueue v3d_runner_queue(v3d_runner *r)
{
    return r->queue;
}
|
||||
/* Accessor: the runner's queue family index.  `r` is dereferenced unchecked. */
uint32_t v3d_runner_queue_family(v3d_runner *r)
{
    return r->queue_family;
}
|
||||
@@ -364,12 +486,27 @@ int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
|
||||
.pSetLayouts = &out->ds_layout,
|
||||
};
|
||||
CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set));
|
||||
|
||||
/* Persistent command buffer — pool was created with
|
||||
* RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch
|
||||
* sites can call vkResetCommandBuffer on this same cb instead
|
||||
* of paying vkAllocateCommandBuffers per call. */
|
||||
VkCommandBufferAllocateInfo cbai = {
|
||||
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
|
||||
.commandPool = r->pool,
|
||||
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
|
||||
.commandBufferCount = 1,
|
||||
};
|
||||
CHK(vkAllocateCommandBuffers(r->device, &cbai, &out->cb));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
|
||||
{
|
||||
if (!p || p->pipeline == VK_NULL_HANDLE) return;
|
||||
if (p->cb != VK_NULL_HANDLE)
|
||||
vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb);
|
||||
vkDestroyPipeline(r->device, p->pipeline, NULL);
|
||||
vkDestroyPipelineLayout(r->device, p->layout, NULL);
|
||||
vkDestroyDescriptorPool(r->device, p->pool, NULL); /* frees its set */
|
||||
@@ -377,6 +514,13 @@ void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
|
||||
memset(p, 0, sizeof(*p));
|
||||
}
|
||||
|
||||
/*
 * Reset the pipeline's persistent command buffer so it can be re-recorded.
 *
 * `r` is unused.  Returns 0 when vkResetCommandBuffer succeeds, -1 when it
 * fails or when `p` is NULL or has no command buffer.
 */
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p)
{
    VkResult res;

    (void) r;

    if (p == NULL || p->cb == VK_NULL_HANDLE)
        return -1;

    res = vkResetCommandBuffer(p->cb, 0);
    if (res != VK_SUCCESS)
        return -1;

    return 0;
}
|
||||
|
||||
int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
|
||||
const v3d_buffer *bufs, uint32_t n)
|
||||
{
|
||||
|
||||
@@ -34,6 +34,12 @@ typedef struct {
|
||||
VkDescriptorSet desc_set;
|
||||
uint32_t n_ssbos;
|
||||
uint32_t push_const_size;
|
||||
/* Persistent command buffer. Allocated at create-pipeline time;
|
||||
* dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to
|
||||
* vkResetCommandBuffer instead of paying vkAllocateCommandBuffers
|
||||
* per dispatch. Pool flagged RESET_COMMAND_BUFFER_BIT so reset
|
||||
* is permitted. */
|
||||
VkCommandBuffer cb;
|
||||
} v3d_pipeline;
|
||||
|
||||
/*
|
||||
@@ -57,10 +63,43 @@ const char *v3d_runner_device_name(v3d_runner *r);
|
||||
* host side. The mapping persists for the lifetime of the buffer.
|
||||
*
|
||||
* Returns 0 on success, non-zero on failure.
|
||||
*
|
||||
* NOTE: prefer v3d_runner_acquire_buffer() on the dispatch hot path —
|
||||
* create_buffer/destroy_buffer go straight to vkAllocateMemory each
|
||||
* call, which on V3D7's Mesa stack costs ~10-50us. The acquire/
|
||||
* release pair pulls from a freelist and pays vkAllocateMemory only
|
||||
* on a cache miss.
|
||||
*/
|
||||
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
||||
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
|
||||
|
||||
/*
|
||||
* Pooled buffer acquisition. Returns a v3d_buffer whose .size is the
|
||||
* smallest power-of-2 >= the requested size (so callers can pool
|
||||
* across similar-sized requests). Backed by HOST_VISIBLE |
|
||||
* HOST_COHERENT memory; mapped pointer is valid.
|
||||
*
|
||||
* On cache hit: zero-cost reuse of a previously-released buffer.
|
||||
* On miss: falls through to v3d_runner_create_buffer(). Release with
|
||||
* v3d_runner_release_buffer(); pool drains in v3d_runner_destroy().
|
||||
*
|
||||
* Lifetime contract: the returned buffer's .mapped contents are
|
||||
* UNINITIALISED — the previous user's data may still be present.
|
||||
* Callers that need a clean buffer must memset themselves. This is
|
||||
* deliberate; the dispatch hot paths immediately overwrite the
|
||||
* buffer with new coefficients / meta anyway.
|
||||
*
|
||||
* Thread-safety: NOT thread-safe. A daedalus_ctx is single-threaded
|
||||
* by API contract; the pool inherits that constraint.
|
||||
*/
|
||||
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
||||
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf);
|
||||
|
||||
/* Pool diagnostics: total allocated bytes (sum across all size
|
||||
* classes, including currently-released entries). Useful for
|
||||
* watermark logging. */
|
||||
size_t v3d_runner_pool_total_bytes(v3d_runner *r);
|
||||
|
||||
/* Compute pipeline from a SPIR-V file path. The descriptor-set
|
||||
* layout exposes `n_ssbos` storage buffer bindings at binding
|
||||
* indices 0..n_ssbos-1, all visible to the compute stage. A push
|
||||
@@ -88,6 +127,12 @@ int v3d_runner_bind_buffers(v3d_runner *r,
|
||||
/* Allocate a primary command buffer from the runner's pool. */
|
||||
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
|
||||
|
||||
/* Reset @p->cb so it can be re-recorded. Returns 0 on success.
|
||||
* Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path —
|
||||
* vkResetCommandBuffer is O(1) vs vkAllocateCommandBuffers' ~1-5us
|
||||
* driver cost. */
|
||||
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p);
|
||||
|
||||
/* Submit `cb` to the queue and wait for completion. The classic
|
||||
* timed operation. Returns 0 on success.
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* bench_pool_overhead — measure QPU dispatch overhead with and without
|
||||
* the v3d_runner buffer pool warm.
|
||||
*
|
||||
* Times N consecutive daedalus_dispatch_vp9_idct8 calls (explicit QPU substrate) and
|
||||
* prints the per-call distribution. The first call pays
|
||||
* vkAllocateMemory (typically tens of microseconds on V3D7's Mesa);
|
||||
* the second and subsequent should hit the pool freelist and amortise
|
||||
* to the pure dispatch-floor cost.
|
||||
*
|
||||
* Purpose: provide a concrete before/after number for the QPU-default
|
||||
* substrate decree (2026-05-23). Bench is non-gating and runs in
|
||||
* fractions of a second.
|
||||
*
|
||||
* License: BSD-2-Clause.
|
||||
*/
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "../include/daedalus.h"
|
||||
|
||||
extern size_t v3d_runner_pool_total_bytes(void *); /* exposed if we wanted it */
|
||||
|
||||
static double now_seconds(void)
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
|
||||
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
||||
}
|
||||
|
||||
/* qsort comparator for doubles, ascending: -1, 0 or 1. */
static int cmp_double(const void *a, const void *b)
{
    const double lhs = *(const double *) a;
    const double rhs = *(const double *) b;

    /* (greater) - (less) yields 1, -1, or 0 when neither holds. */
    return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int n_calls = argc > 1 ? atoi(argv[1]) : 200;
|
||||
int n_blocks = 8; /* one MB column of 8x8 IDCT blocks */
|
||||
int stride = 64;
|
||||
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
||||
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||
printf("ctx: has_qpu=%d\n", has_qpu);
|
||||
if (!has_qpu) {
|
||||
fprintf(stderr, "QPU not available on this device; bench needs V3D\n");
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
/* Build a representative IDCT 8x8 batch and warm a dst buffer. */
|
||||
int16_t *coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t));
|
||||
uint8_t *dst = calloc((size_t) n_blocks * 8 * stride, 1);
|
||||
daedalus_idct8_meta *meta = calloc((size_t) n_blocks, sizeof(*meta));
|
||||
if (!coeffs || !dst || !meta) { fprintf(stderr, "alloc fail\n"); return 1; }
|
||||
|
||||
uint64_t s = 0x1234567abcdefULL;
|
||||
for (size_t i = 0; i < (size_t) n_blocks * 64; i++) {
|
||||
s ^= s << 13; s ^= s >> 7; s ^= s << 17;
|
||||
coeffs[i] = (int16_t)(s & 0x7ff) - 0x400;
|
||||
}
|
||||
for (int b = 0; b < n_blocks; b++) {
|
||||
meta[b].dst_off = (uint32_t) b * 8;
|
||||
meta[b].block_x = (uint32_t) b;
|
||||
meta[b].block_y = 0;
|
||||
}
|
||||
|
||||
double *t = malloc((size_t) n_calls * sizeof(double));
|
||||
int rc;
|
||||
|
||||
printf("=== dispatching %d times, n_blocks=%d/call ===\n",
|
||||
n_calls, n_blocks);
|
||||
|
||||
for (int i = 0; i < n_calls; i++) {
|
||||
double t0 = now_seconds();
|
||||
rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU,
|
||||
dst, (size_t) stride,
|
||||
coeffs, (size_t) n_blocks, meta);
|
||||
double t1 = now_seconds();
|
||||
if (rc) { fprintf(stderr, "dispatch %d rc=%d\n", i, rc); return 1; }
|
||||
t[i] = (t1 - t0) * 1e6; /* us */
|
||||
}
|
||||
|
||||
/* Per-call distribution (first few + sorted summary on the steady-state) */
|
||||
printf("\nfirst 5 calls (cold-warm transition):\n");
|
||||
for (int i = 0; i < 5 && i < n_calls; i++)
|
||||
printf(" call %d: %.2f us\n", i, t[i]);
|
||||
|
||||
int skip = 10; /* drop warm-up calls from the steady-state stats */
|
||||
if (n_calls > skip + 10) {
|
||||
int n = n_calls - skip;
|
||||
double *s_arr = malloc((size_t) n * sizeof(double));
|
||||
memcpy(s_arr, t + skip, (size_t) n * sizeof(double));
|
||||
qsort(s_arr, (size_t) n, sizeof(double), cmp_double);
|
||||
double sum = 0;
|
||||
for (int i = 0; i < n; i++) sum += s_arr[i];
|
||||
printf("\nsteady-state stats (calls %d..%d, n=%d):\n",
|
||||
skip, n_calls - 1, n);
|
||||
printf(" min: %.2f us\n", s_arr[0]);
|
||||
printf(" p50: %.2f us\n", s_arr[n / 2]);
|
||||
printf(" p90: %.2f us\n", s_arr[(int)(n * 0.9)]);
|
||||
printf(" p99: %.2f us\n", s_arr[(int)(n * 0.99)]);
|
||||
printf(" max: %.2f us\n", s_arr[n - 1]);
|
||||
printf(" mean: %.2f us\n", sum / n);
|
||||
printf("\nfirst-call / steady-state median ratio: %.1fx\n",
|
||||
t[0] / s_arr[n / 2]);
|
||||
free(s_arr);
|
||||
}
|
||||
|
||||
free(t); free(coeffs); free(dst); free(meta);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 chroma loop filters
|
||||
* (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
|
||||
* when added). Covers both orientations:
|
||||
*
|
||||
* v_loop_filter_chroma: filter applied VERTICALLY across a
|
||||
* HORIZONTAL edge. Tile is 8 cols × 4 rows of context
|
||||
* (rows -2..+1); pix points to row 0 of the bottom block.
|
||||
* h_loop_filter_chroma: filter applied HORIZONTALLY across a
|
||||
* VERTICAL edge. Tile is 4 cols × 8 rows of context
|
||||
* (cols -2..+1); pix points to col 0 of the right block.
|
||||
*
|
||||
* Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
|
||||
* `ff_h264_h_loop_filter_chroma_neon` (line 430) in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
|
||||
*
|
||||
* Algorithm per H.264 §8.7.2.3 (chroma bS<4 inter):
|
||||
* - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
|
||||
* - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
|
||||
* - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
|
||||
* - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
|
||||
* - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
|
||||
*
|
||||
* tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge
|
||||
* (matches the 4:2:0 chroma plane geometry in both orientations — 8 cols for V edge or
|
||||
* 8 rows for H edge).
|
||||
*
|
||||
* Signature (matches FFmpeg + the existing luma refs):
|
||||
* void(uint8_t *pix, ptrdiff_t stride,
|
||||
* int alpha, int beta, int8_t tc0[4]);
|
||||
*
|
||||
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Saturate v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}

/* Clamp v to [lo, hi]; the lower bound is tested first. */
static inline int clip3(int v, int lo, int hi)
{
    if (v < lo)
        return lo;
    if (v > hi)
        return hi;
    return v;
}

/* Absolute value of x (no special handling for INT_MIN). */
static inline int abs_i(int x)
{
    return x >= 0 ? x : -x;
}
|
||||
|
||||
/* Chroma bS<4 filter for a single column crossing a horizontal edge.
 * Samples are read at pix[-2*stride] (p1), pix[-stride] (p0), pix[0] (q0)
 * and pix[+stride] (q1); only p0 and q0 are ever written.
 *
 * NOTE(review): the clamp limit is tc0_s + 1.  FFmpeg's own chroma filters
 * appear to expect the caller to have already added 1 to tc0 — confirm
 * which convention callers of this reference use. */
static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
                               int alpha, int beta, int tc0_s)
{
    uint8_t *const p0_at = pix - stride;
    uint8_t *const q0_at = pix;

    const int p1 = pix[-2 * stride];
    const int p0 = *p0_at;
    const int q0 = *q0_at;
    const int q1 = pix[stride];

    /* All three edge preconditions must hold, otherwise leave the pixels alone. */
    const int filter_on = abs_i(p0 - q0) < alpha &&
                          abs_i(p1 - p0) < beta &&
                          abs_i(q1 - q0) < beta;
    if (!filter_on)
        return;

    const int limit = tc0_s + 1;
    const int raw   = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    const int delta = clip3(raw, -limit, limit);

    *p0_at = (uint8_t) clip_u8(p0 + delta);
    *q0_at = (uint8_t) clip_u8(q0 - delta);
}
|
||||
|
||||
/* Chroma bS<4 filter for a single row crossing a vertical edge.
 * Samples are read at pix[-2] (p1), pix[-1] (p0), pix[0] (q0) and
 * pix[+1] (q1); only p0 and q0 are ever written. */
static void h264_chroma_cell_h(uint8_t *pix,
                               int alpha, int beta, int tc0_s)
{
    const int p1 = pix[-2];
    const int p0 = pix[-1];
    const int q0 = pix[0];
    const int q1 = pix[1];

    /* All three edge preconditions must hold, otherwise leave the pixels alone. */
    const int filter_on = abs_i(p0 - q0) < alpha &&
                          abs_i(p1 - p0) < beta &&
                          abs_i(q1 - q0) < beta;
    if (!filter_on)
        return;

    const int limit = tc0_s + 1;
    const int raw   = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    const int delta = clip3(raw, -limit, limit);

    pix[-1] = (uint8_t) clip_u8(p0 + delta);
    pix[0]  = (uint8_t) clip_u8(q0 - delta);
}
|
||||
|
||||
/* Chroma bS<4 deblock across a horizontal edge, 8 columns wide.
 *
 * pix    : first sample of the row just below the edge
 * stride : byte distance between rows
 * alpha, beta : edge thresholds; nothing is filtered if either is 0
 * tc0    : one strength per 2-column segment; a negative entry disables
 *          that segment, and four negative entries disable the call.
 */
void daedalus_h264_v_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    if (!alpha || !beta)
        return;

    /* Early out when no segment has a usable strength. */
    int any_active = 0;
    for (int seg = 0; seg < 4; seg++)
        any_active |= (tc0[seg] >= 0);
    if (!any_active)
        return;

    /* Column c belongs to segment c / 2. */
    for (int col = 0; col < 8; col++) {
        const int seg_tc0 = tc0[col / 2];
        if (seg_tc0 >= 0)
            h264_chroma_cell_v(&pix[col], stride, alpha, beta, seg_tc0);
    }
}
|
||||
|
||||
/* Chroma bS<4 deblock across a vertical edge, 8 rows tall.
 *
 * pix    : sample immediately right of the edge in the first row
 * stride : byte distance between rows
 * alpha, beta : edge thresholds; nothing is filtered if either is 0
 * tc0    : one strength per 2-row segment; a negative entry disables
 *          that segment, and four negative entries disable the call.
 */
void daedalus_h264_h_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    if (!alpha || !beta)
        return;

    /* Early out when no segment has a usable strength. */
    int any_active = 0;
    for (int seg = 0; seg < 4; seg++)
        any_active |= (tc0[seg] >= 0);
    if (!any_active)
        return;

    /* Row r belongs to segment r / 2. */
    for (int row = 0; row < 8; row++) {
        const int seg_tc0 = tc0[row / 2];
        if (seg_tc0 >= 0)
            h264_chroma_cell_h(&pix[row * stride], alpha, beta, seg_tc0);
    }
}
|
||||
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma "horizontal"
|
||||
* loop filter (h_loop_filter_luma): applies filter HORIZONTALLY
|
||||
* across a VERTICAL edge. The edge spans the 16-row macroblock
|
||||
* height, between columns -1 and 0.
|
||||
*
|
||||
* Mirrors FFmpeg `ff_h264_h_loop_filter_luma_neon` in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S
|
||||
* line 134. Operates on an 8-col × 16-row region:
|
||||
* pix[r*stride + c] for r in 0..15, c in -4..+3
|
||||
* With pix pointing to row 0, col 0 of the right block (= the
|
||||
* leftmost column of the right-block half of the edge).
|
||||
*
|
||||
* 16 rows divided into 4 segments of 4 rows; each segment has its
|
||||
* own tc0 strength (tc0[0..3]).
|
||||
*
|
||||
* Note: FFmpeg's "h_loop_filter" naming uses the FILTER DIRECTION
|
||||
* (horizontal = across the edge from the left), not the edge
|
||||
* orientation (vertical). H.264 spec calls this the "vertical
|
||||
* edge" filter.
|
||||
*
|
||||
* This is the column-axis transpose of h264_v_loop_filter_luma_ref:
|
||||
* - v variant: p3..p0 above the edge (pix[-4*stride..-1*stride]),
|
||||
* q0..q3 below (pix[0..+3*stride]). 16 columns × 4 segments.
|
||||
* - h variant: p3..p0 left of the edge (pix[-4..-1]),
|
||||
* q0..q3 right (pix[0..+3]). 16 rows × 4 segments.
|
||||
* Same per-segment kernel; only the address arithmetic transposes.
|
||||
*
|
||||
* Signature:
|
||||
* void(uint8_t *pix, ptrdiff_t stride,
|
||||
* int alpha, int beta, int8_t tc0[4]);
|
||||
*
|
||||
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Saturate v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v > 255)
        return 255;
    return v < 0 ? 0 : v;
}

/* Clamp v to [lo, hi]; the lower bound is tested first. */
static inline int clip3(int v, int lo, int hi)
{
    if (v < lo)
        return lo;
    return v > hi ? hi : v;
}

/* Absolute value of x (no special handling for INT_MIN). */
static inline int abs_i(int x)
{
    if (x < 0)
        return -x;
    return x;
}
|
||||
|
||||
/* Apply the bS<4 luma deblock to one ROW at a vertical edge.
 *
 * pix     : sample immediately right of the edge (q0); p0 is pix[-1].
 * alpha, beta : edge thresholds.
 * tc0_s   : the segment's tc0 value; the caller has already rejected
 *           negative values.
 *
 * Reads p2..p0 (pix[-3..-1]) and q0..q2 (pix[0..+2]).  p3/q3 are not part
 * of the bS<4 filter, so they are no longer loaded.
 * May write pix[-2], pix[-1], pix[0], pix[+1] (= p1, p0, q0, q1); p1/q1
 * are written only when their side condition holds.
 */
static void h264_deblock_luma_row(uint8_t *pix,
                                  int alpha, int beta, int tc0_s)
{
    const int p2 = pix[-3], p1 = pix[-2], p0 = pix[-1];
    const int q0 = pix[ 0], q1 = pix[ 1], q2 = pix[ 2];

    /* Edge pre-conditions. */
    if (abs_i(p0 - q0) >= alpha) return;
    if (abs_i(p1 - p0) >= beta)  return;
    if (abs_i(q1 - q0) >= beta)  return;

    /* Side conditions: each one that holds enables the p1/q1 update on
     * that side and adds 1 to the p0/q0 clamp limit. */
    const int ap_lt_beta = abs_i(p2 - p0) < beta;
    const int aq_lt_beta = abs_i(q2 - q0) < beta;
    const int tc = tc0_s + ap_lt_beta + aq_lt_beta;

    /* Rounded mean of the two edge samples, shared by the p1 and q1 updates. */
    const int edge_avg = (p0 + q0 + 1) >> 1;

    /* p0 / q0 update. */
    const int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);

    if (ap_lt_beta)
        pix[-2] = (uint8_t)(p1 + clip3((p2 + edge_avg - 2 * p1) >> 1,
                                       -tc0_s, tc0_s));
    pix[-1] = (uint8_t) clip_u8(p0 + delta);
    pix[ 0] = (uint8_t) clip_u8(q0 - delta);
    if (aq_lt_beta)
        pix[ 1] = (uint8_t)(q1 + clip3((q2 + edge_avg - 2 * q1) >> 1,
                                       -tc0_s, tc0_s));
}
|
||||
|
||||
/* Luma bS<4 deblock across a vertical edge, 16 rows tall.
 *
 * pix    : sample immediately right of the edge in the first row
 * stride : byte distance between rows
 * alpha, beta : edge thresholds; nothing is filtered if either is 0
 * tc0    : one strength per 4-row segment; a negative entry disables
 *          that segment, and four negative entries disable the call.
 */
void daedalus_h264_h_loop_filter_luma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    /* Outer preconditions: a zero threshold, or no segment with a
     * non-negative tc0, means there is nothing to do. */
    if (!alpha || !beta)
        return;

    int any_active = 0;
    for (int seg = 0; seg < 4; seg++)
        any_active |= (tc0[seg] >= 0);
    if (!any_active)
        return;

    /* Row r belongs to segment r / 4. */
    for (int row = 0; row < 16; row++) {
        const int seg_tc0 = tc0[row / 4];
        if (seg_tc0 >= 0)
            h264_deblock_luma_row(&pix[row * stride], alpha, beta, seg_tc0);
    }
}
|
||||
@@ -0,0 +1,184 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma + chroma "intra"
|
||||
* loop filters (bS = 4 variant, used at I-MB edges where the
|
||||
* boundary strength is forced to 4). Covers all four orientations:
|
||||
*
|
||||
* v_loop_filter_luma_intra — 16 cols × 8 rows, edge between
|
||||
* rows -1 and 0
|
||||
* h_loop_filter_luma_intra — 8 cols × 16 rows, edge between
|
||||
* cols -1 and 0
|
||||
* v_loop_filter_chroma_intra — 8 cols × 4 rows
|
||||
* h_loop_filter_chroma_intra — 4 cols × 8 rows
|
||||
*
|
||||
* Mirrors FFmpeg's `ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon`
|
||||
* in external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
|
||||
*
|
||||
* Algorithm per H.264 §8.7.2.4 (bS=4):
|
||||
*
|
||||
* Preconditions (same as bS<4):
|
||||
* |p0-q0| < α AND |p1-p0| < β AND |q1-q0| < β
|
||||
*
|
||||
* Luma — strong/weak filter selector per side:
|
||||
* strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
|
||||
* strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
|
||||
*
|
||||
* If strong_p, update p0/p1/p2:
|
||||
* p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
|
||||
* p1' = (p2 + p1 + p0 + q0 + 2) >> 2
|
||||
* p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
|
||||
* Else weak (single cell):
|
||||
* p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||
* Mirror for q-side.
|
||||
*
|
||||
* Chroma — always weak (no quad-tree selector):
|
||||
* p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||
* q0' = (2*q1 + q0 + p1 + 2) >> 2
|
||||
* Chroma never updates p1/p2/q1/q2.
|
||||
*
|
||||
* Signature (no tc0 in the intra path — the daedalus_h264_deblock_meta
|
||||
* struct's tc0 field is ignored at the dispatch layer):
|
||||
* void(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
|
||||
*
|
||||
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Saturate v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    return v > 255 ? 255 : v;
}

/* Absolute value of x (no special handling for INT_MIN). */
static inline int abs_i(int x)
{
    return x >= 0 ? x : -x;
}
|
||||
|
||||
/* --- luma intra (bS=4), one column across the horizontal edge ---
 * p[k] is the sample k+1 rows above the edge, q[k] the sample k rows
 * below it.  Each side independently gets either the strong filter
 * (three samples rewritten) or the weak one (only p0 / q0 rewritten). */
static void h264_luma_intra_cell_v(uint8_t *pix, ptrdiff_t stride,
                                   int alpha, int beta)
{
    int p[4], q[4];

    for (int k = 0; k < 4; k++) {
        p[k] = pix[-(k + 1) * stride];
        q[k] = pix[k * stride];
    }

    const int gap = abs_i(p[0] - q[0]);
    if (gap >= alpha ||
        abs_i(p[1] - p[0]) >= beta ||
        abs_i(q[1] - q[0]) >= beta)
        return;

    /* Strong filtering needs a small step across the edge plus a flat side. */
    const int small_gap = gap < ((alpha >> 2) + 2);
    const int strong_p  = small_gap && abs_i(p[2] - p[0]) < beta;
    const int strong_q  = small_gap && abs_i(q[2] - q[0]) < beta;

    if (!strong_p) {
        pix[-1 * stride] = (uint8_t) clip_u8((2 * p[1] + p[0] + q[1] + 2) >> 2);
    } else {
        pix[-1 * stride] = (uint8_t) clip_u8((p[2] + 2 * p[1] + 2 * p[0] + 2 * q[0] + q[1] + 4) >> 3);
        pix[-2 * stride] = (uint8_t) clip_u8((p[2] + p[1] + p[0] + q[0] + 2) >> 2);
        pix[-3 * stride] = (uint8_t) clip_u8((2 * p[3] + 3 * p[2] + p[1] + p[0] + q[0] + 4) >> 3);
    }

    if (!strong_q) {
        pix[0 * stride] = (uint8_t) clip_u8((2 * q[1] + q[0] + p[1] + 2) >> 2);
    } else {
        pix[0 * stride] = (uint8_t) clip_u8((q[2] + 2 * q[1] + 2 * q[0] + 2 * p[0] + p[1] + 4) >> 3);
        pix[1 * stride] = (uint8_t) clip_u8((q[2] + q[1] + q[0] + p[0] + 2) >> 2);
        pix[2 * stride] = (uint8_t) clip_u8((2 * q[3] + 3 * q[2] + q[1] + q[0] + p[0] + 4) >> 3);
    }
}
|
||||
|
||||
/* --- luma intra (bS=4), one row across the vertical edge ---
 * p[k] is the sample k+1 columns left of the edge, q[k] the sample k
 * columns right of it.  Each side independently gets either the strong
 * filter (three samples rewritten) or the weak one (only p0 / q0). */
static void h264_luma_intra_cell_h(uint8_t *pix, int alpha, int beta)
{
    int p[4], q[4];

    for (int k = 0; k < 4; k++) {
        p[k] = pix[-(k + 1)];
        q[k] = pix[k];
    }

    const int gap = abs_i(p[0] - q[0]);
    if (gap >= alpha ||
        abs_i(p[1] - p[0]) >= beta ||
        abs_i(q[1] - q[0]) >= beta)
        return;

    /* Strong filtering needs a small step across the edge plus a flat side. */
    const int small_gap = gap < ((alpha >> 2) + 2);
    const int strong_p  = small_gap && abs_i(p[2] - p[0]) < beta;
    const int strong_q  = small_gap && abs_i(q[2] - q[0]) < beta;

    if (!strong_p) {
        pix[-1] = (uint8_t) clip_u8((2 * p[1] + p[0] + q[1] + 2) >> 2);
    } else {
        pix[-1] = (uint8_t) clip_u8((p[2] + 2 * p[1] + 2 * p[0] + 2 * q[0] + q[1] + 4) >> 3);
        pix[-2] = (uint8_t) clip_u8((p[2] + p[1] + p[0] + q[0] + 2) >> 2);
        pix[-3] = (uint8_t) clip_u8((2 * p[3] + 3 * p[2] + p[1] + p[0] + q[0] + 4) >> 3);
    }

    if (!strong_q) {
        pix[0] = (uint8_t) clip_u8((2 * q[1] + q[0] + p[1] + 2) >> 2);
    } else {
        pix[0] = (uint8_t) clip_u8((q[2] + 2 * q[1] + 2 * q[0] + 2 * p[0] + p[1] + 4) >> 3);
        pix[1] = (uint8_t) clip_u8((q[2] + q[1] + q[0] + p[0] + 2) >> 2);
        pix[2] = (uint8_t) clip_u8((2 * q[3] + 3 * q[2] + q[1] + q[0] + p[0] + 4) >> 3);
    }
}
|
||||
|
||||
/* --- chroma intra (bS=4), one column across the horizontal edge ---
 * Reads p1, p0 above the edge and q0, q1 below it; rewrites p0 and q0
 * with the weak filter when all three edge preconditions hold. */
static void h264_chroma_intra_cell_v(uint8_t *pix, ptrdiff_t stride,
                                     int alpha, int beta)
{
    uint8_t *const p0_at = pix - stride;
    uint8_t *const q0_at = pix;

    const int p1 = pix[-2 * stride];
    const int p0 = *p0_at;
    const int q0 = *q0_at;
    const int q1 = pix[stride];

    const int filter_on = abs_i(p0 - q0) < alpha &&
                          abs_i(p1 - p0) < beta &&
                          abs_i(q1 - q0) < beta;
    if (!filter_on)
        return;

    *p0_at = (uint8_t) clip_u8((2 * p1 + p0 + q1 + 2) >> 2);
    *q0_at = (uint8_t) clip_u8((2 * q1 + q0 + p1 + 2) >> 2);
}
|
||||
|
||||
/* --- chroma intra (bS=4), one row across the vertical edge ---
 * Reads p1, p0 left of the edge and q0, q1 right of it; rewrites p0 and
 * q0 with the weak filter when all three edge preconditions hold. */
static void h264_chroma_intra_cell_h(uint8_t *pix, int alpha, int beta)
{
    const int p1 = pix[-2];
    const int p0 = pix[-1];
    const int q0 = pix[0];
    const int q1 = pix[1];

    const int filter_on = abs_i(p0 - q0) < alpha &&
                          abs_i(p1 - p0) < beta &&
                          abs_i(q1 - q0) < beta;
    if (!filter_on)
        return;

    pix[-1] = (uint8_t) clip_u8((2 * p1 + p0 + q1 + 2) >> 2);
    pix[0]  = (uint8_t) clip_u8((2 * q1 + q0 + p1 + 2) >> 2);
}
|
||||
|
||||
/* --- public refs --- */
|
||||
|
||||
/* Luma intra (bS=4) deblock across a horizontal edge, 16 columns wide.
 * pix addresses the first sample of the row just below the edge. */
void daedalus_h264_v_loop_filter_luma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    /* Bail out only when BOTH thresholds are zero.  With just one of them
     * non-zero the columns are still visited, and the per-column
     * preconditions decide whether any pixel actually changes. */
    if (alpha == 0 && beta == 0)
        return;

    /* The intra path has no per-segment strength, so every one of the
     * 16 columns is handed to the cell kernel. */
    int col = 0;
    while (col != 16) {
        h264_luma_intra_cell_v(&pix[col], stride, alpha, beta);
        ++col;
    }
}
|
||||
|
||||
/* Luma intra (bS=4) deblock across a vertical edge, 16 rows tall.
 * pix addresses the sample immediately right of the edge in row 0.
 * Returns without filtering only when alpha and beta are both zero. */
void daedalus_h264_h_loop_filter_luma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    if (alpha == 0 && beta == 0)
        return;

    int row = 0;
    while (row != 16) {
        h264_luma_intra_cell_h(&pix[row * stride], alpha, beta);
        ++row;
    }
}
|
||||
|
||||
/* Chroma intra (bS=4) deblock across a horizontal edge, 8 columns wide.
 * pix addresses the first sample of the row just below the edge.
 * Returns without filtering only when alpha and beta are both zero. */
void daedalus_h264_v_loop_filter_chroma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    if (alpha == 0 && beta == 0)
        return;

    int col = 0;
    while (col != 8) {
        h264_chroma_intra_cell_v(&pix[col], stride, alpha, beta);
        ++col;
    }
}
|
||||
|
||||
/* Chroma intra (bS=4) deblock across a vertical edge, 8 rows tall.
 * pix addresses the sample immediately right of the edge in row 0.
 * Returns without filtering only when alpha and beta are both zero. */
void daedalus_h264_h_loop_filter_chroma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    if (alpha == 0 && beta == 0)
        return;

    int row = 0;
    while (row != 8) {
        h264_chroma_intra_cell_h(&pix[row * stride], alpha, beta);
        ++row;
    }
}
|
||||
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma Intra_16x16
|
||||
* prediction modes (per H.264 spec §8.3.2). All 4 modes.
|
||||
*
|
||||
* Mode index → name (per H.264 Table 7-15):
|
||||
* 0 = Vertical
|
||||
* 1 = Horizontal
|
||||
* 2 = DC
|
||||
* 3 = Plane
|
||||
*
|
||||
* Calling convention (FFmpeg-style, matches the Intra_4x4 ref):
|
||||
* pred_16x16_<mode>(uint8_t *dst, ptrdiff_t stride)
|
||||
*
|
||||
* `dst` points at row 0, col 0 of the 16x16 output block. Neighbours:
|
||||
* top[0..15] = dst[-stride + 0 .. -stride + 15]
|
||||
* top-left = dst[-stride - 1]
|
||||
* left[0..15] = dst[ 0*stride - 1 .. 15*stride - 1]
|
||||
*
|
||||
* AVAILABILITY: assumes all neighbours valid (interior-MB case). The
|
||||
* H.264 spec defines fallback for boundary cases (DC averages just
|
||||
* the available side, etc.); the eventual libavcodec intercept
|
||||
* handles availability before calling.
|
||||
*
|
||||
* License: BSD-2-Clause.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Saturate v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* Mode 0 — Vertical: every row of the 16x16 block is a copy of the 16
 * samples directly above it (dst[-stride + 0 .. -stride + 15]). */
void daedalus_h264_pred_16x16_vertical_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    for (int row = 0; row < 16; row++) {
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 16; col++)
            out[col] = above[col];
    }
}
|
||||
|
||||
/* Mode 1 — Horizontal: every row of the 16x16 block is filled with the
 * sample immediately to its left (dst[row * stride - 1]). */
void daedalus_h264_pred_16x16_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
{
    for (int row = 0; row < 16; row++) {
        uint8_t *out = dst + row * stride;
        const uint8_t fill = out[-1];
        for (int col = 0; col < 16; col++)
            out[col] = fill;
    }
}
|
||||
|
||||
/* Mode 2 — DC: fill the 16x16 block with the rounded mean of the 16 top
 * and 16 left neighbours, i.e. (sum_top + sum_left + 16) >> 5. */
void daedalus_h264_pred_16x16_dc_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int total = 0;

    for (int k = 0; k < 16; k++) {
        total += above[k];
        total += dst[k * stride - 1];
    }

    /* +16 rounds the division by 32. */
    const uint8_t mean = (uint8_t)((total + 16) >> 5);

    for (int row = 0; row < 16; row++) {
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 16; col++)
            out[col] = mean;
    }
}
|
||||
|
||||
/* Mode 3 — Plane (Intra_16x16_Plane):
 *   H = sum_{i=0..7} (i+1) * (top[8+i]  - top[6-i])
 *   V = sum_{j=0..7} (j+1) * (left[8+j] - left[6-j])
 *   b = (5*H + 32) >> 6
 *   c = (5*V + 32) >> 6
 *   a = 16 * (left[15] + top[15])
 *   pred[y][x] = Clip1((a + b*(x-7) + c*(y-7) + 16) >> 5)
 *
 * Indexing is pred[y][x] = pred[row][col].  H measures the left-vs-right
 * asymmetry of the TOP row, V the top-vs-bottom asymmetry of the LEFT
 * column.
 *
 * The top-left corner sample DOES participate: for i = 7 (resp. j = 7)
 * the subtracted term is top[-1] (resp. left[-1]), and both of those are
 * the corner dst[-stride - 1].  No special case is needed, because
 * indexing one position before the start of the top row / left column
 * lands on that corner.
 *
 * Reads top[-1..15] and left[-1..15]; writes the 16x16 block at dst.
 */
void daedalus_h264_pred_16x16_plane_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *top  = dst - stride;   /* top[c]           = sample above column c */
    const uint8_t *left = dst - 1;        /* left[r * stride] = sample left of row r  */
    int H = 0, V = 0;

    for (int i = 0; i < 8; i++) {
        /* For i == 7 both "6 - i" terms address the shared corner sample. */
        H += (i + 1) * (top[8 + i] - top[6 - i]);
        V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
    }

    const int b = (5 * H + 32) >> 6;
    const int c = (5 * V + 32) >> 6;
    const int a = 16 * (left[15 * stride] + top[15]);

    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            const int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
            dst[y * stride + x] = (uint8_t) clip_u8(v);
        }
    }
}
|
||||
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma Intra_4x4
|
||||
* prediction modes (per H.264 spec §8.3.1.4). All 9 modes.
|
||||
*
|
||||
* Mode index → name (per H.264 Table 8-2):
|
||||
* 0 = Vertical
|
||||
* 1 = Horizontal
|
||||
* 2 = DC
|
||||
* 3 = Diagonal_Down_Left
|
||||
* 4 = Diagonal_Down_Right
|
||||
* 5 = Vertical_Right
|
||||
* 6 = Horizontal_Down
|
||||
* 7 = Vertical_Left
|
||||
* 8 = Horizontal_Up
|
||||
*
|
||||
* Calling convention matches FFmpeg's h264pred:
|
||||
* pred_4x4_<mode>(uint8_t *dst, ptrdiff_t stride)
|
||||
*
|
||||
* `dst` points at row 0, col 0 of the 4x4 output block. Neighbour
|
||||
* pixels come from the already-decoded surrounding pixels in the same
|
||||
* buffer:
|
||||
* top-left = dst[-stride - 1]
|
||||
* top[0..3] = dst[-stride + 0 .. -stride + 3]
|
||||
* top-right = dst[-stride + 4 .. -stride + 7] (DDL / VL only)
|
||||
* left[0..3] = dst[ 0*stride - 1 .. 3*stride - 1]
|
||||
*
|
||||
* AVAILABILITY: this reference assumes ALL neighbours are available
|
||||
* (the "interior MB" case). The H.264 spec defines fallback behaviour
|
||||
* for unavailable neighbours (e.g. DC averages only the available
|
||||
* side, top-right substitution from top[3] for DDL/VL near the right
|
||||
* frame edge); those branches are NOT modelled here. Tests must
|
||||
* exercise the kernel with all 13 neighbour bytes valid. The eventual
|
||||
* libavcodec intercept handles availability before calling.
|
||||
*
|
||||
* License: BSD-2-Clause for the reference + tests; the underlying
|
||||
* algorithm is from H.264/ITU-T H.264 (2003) and AVC standards, free
|
||||
* to implement.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Helper: rounded 1-2-1 weighted average, (a + 2*b + c + 2) >> 2,
 * truncated to uint8_t. */
static inline uint8_t avg3(int a, int b, int c)
{
    const int weighted = a + 2 * b + c;
    return (uint8_t)((weighted + 2) >> 2);
}
|
||||
|
||||
/* Helper: rounded two-sample mean, (a + b + 1) >> 1, truncated to uint8_t. */
static inline uint8_t avg2(int a, int b)
{
    const int pair = a + b;
    return (uint8_t)((pair + 1) >> 1);
}
|
||||
|
||||
/* Mode 0 — Vertical: every row of the 4x4 block is a copy of the four
 * samples directly above it (dst[-stride + 0 .. -stride + 3]). */
void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    for (int row = 0; row < 4; row++) {
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 4; col++)
            out[col] = above[col];
    }
}
|
||||
|
||||
/* Mode 1 — Horizontal: every row of the 4x4 block is filled with the
 * sample immediately to its left (dst[row * stride - 1]). */
void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
{
    for (int row = 0; row < 4; row++) {
        uint8_t *out = dst + row * stride;
        const uint8_t fill = out[-1];
        for (int col = 0; col < 4; col++)
            out[col] = fill;
    }
}
|
||||
|
||||
/* Mode 2 — DC: fill the 4x4 block with the rounded mean of the four top
 * and four left neighbours, i.e. (sum_top + sum_left + 4) >> 3. */
void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int total = 0;

    for (int k = 0; k < 4; k++) {
        total += above[k];
        total += dst[k * stride - 1];
    }

    /* +4 rounds the division by 8. */
    const uint8_t mean = (uint8_t)((total + 4) >> 3);

    for (int row = 0; row < 4; row++) {
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 4; col++)
            out[col] = mean;
    }
}
|
||||
|
||||
/* Mode 3 — Diagonal_Down_Left.  Reads top[0..7] (top plus top-right).
 * Each output sample depends only on row + col: it is the 1-2-1 filter of
 * three consecutive top samples starting at that index, with top[7]
 * repeated past the end of the row. */
void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int t[9];
    uint8_t diag[7];

    for (int k = 0; k < 8; k++)
        t[k] = above[k];
    t[8] = t[7];   /* last sample doubled at the boundary */

    for (int k = 0; k < 7; k++)
        diag[k] = avg3(t[k], t[k + 1], t[k + 2]);

    for (int row = 0; row < 4; row++) {
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 4; col++)
            out[col] = diag[row + col];
    }
}
|
||||
|
||||
/* Mode 4 — Diagonal_Down_Right.  Reads top-left, top[0..3] and left[0..3].
 * The nine neighbours are laid out as one run from left[3] up the left
 * column, through the corner, and along the top row; each output sample
 * is the 1-2-1 filter of three consecutive entries of that run, selected
 * by (col - row). */
void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride)
{
    int edge[9];
    uint8_t diag[7];

    /* edge[0..3] = left[3..0], edge[4] = corner, edge[5..8] = top[0..3] */
    for (int k = 0; k < 4; k++) {
        edge[3 - k] = dst[k * stride - 1];
        edge[5 + k] = dst[-stride + k];
    }
    edge[4] = dst[-stride - 1];

    /* diag[d + 3] holds the value for col - row == d, d in -3..+3. */
    for (int k = 0; k < 7; k++)
        diag[k] = avg3(edge[k], edge[k + 1], edge[k + 2]);

    for (int row = 0; row < 4; row++) {
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 4; col++)
            out[col] = diag[col - row + 3];
    }
}
|
||||
|
||||
/* Mode 5 — Vertical_Right.  Reads top-left, top[0..3] and left[0..2].
 * Rows 0 and 2 share one 5-entry sequence, rows 1 and 3 another; the
 * lower row of each pair is the upper one shifted right by one sample,
 * with a left-column-derived value entering at column 0. */
void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    const int tl = above[-1];
    const int t0 = above[0], t1 = above[1], t2 = above[2], t3 = above[3];
    const int l0 = dst[0 * stride - 1];
    const int l1 = dst[1 * stride - 1];
    const int l2 = dst[2 * stride - 1];

    /* Row 0 is even_seq[1..4], row 2 is even_seq[0..3]. */
    const uint8_t even_seq[5] = {
        avg3(tl, l0, l1),
        avg2(tl, t0), avg2(t0, t1), avg2(t1, t2), avg2(t2, t3),
    };
    /* Row 1 is odd_seq[1..4], row 3 is odd_seq[0..3]. */
    const uint8_t odd_seq[5] = {
        avg3(l0, l1, l2),
        avg3(l0, tl, t0), avg3(tl, t0, t1), avg3(t0, t1, t2), avg3(t1, t2, t3),
    };

    for (int col = 0; col < 4; col++) {
        dst[0 * stride + col] = even_seq[col + 1];
        dst[1 * stride + col] = odd_seq[col + 1];
        dst[2 * stride + col] = even_seq[col];
        dst[3 * stride + col] = odd_seq[col];
    }
}
|
||||
|
||||
/* Mode 6 — Horizontal_Down.  Reads top-left, top[0..2] and left[0..3].
 * All four rows are windows into one 10-entry sequence: each row is the
 * previous one shifted right by two samples, with two new
 * left-column-derived values entering at columns 0 and 1. */
void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    const int tl = above[-1];
    const int t0 = above[0], t1 = above[1], t2 = above[2];
    const int l0 = dst[0 * stride - 1];
    const int l1 = dst[1 * stride - 1];
    const int l2 = dst[2 * stride - 1];
    const int l3 = dst[3 * stride - 1];

    /* Row r occupies seq[6 - 2r .. 9 - 2r]. */
    const uint8_t seq[10] = {
        avg2(l2, l3), avg3(l1, l2, l3),
        avg2(l1, l2), avg3(l0, l1, l2),
        avg2(l0, l1), avg3(tl, l0, l1),
        avg2(tl, l0), avg3(l0, tl, t0),
        avg3(tl, t0, t1), avg3(t0, t1, t2),
    };

    for (int row = 0; row < 4; row++) {
        const uint8_t *window = seq + (6 - 2 * row);
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 4; col++)
            out[col] = window[col];
    }
}
|
||||
|
||||
/* Mode 7 — Vertical_Left.  Reads top[0..6] only; top[7] is not needed by
 * any of the 16 outputs, so it is no longer loaded.
 *
 * With k = col + (row >> 1):
 *   even rows (0, 2): pred = avg2(top[k], top[k+1])
 *   odd rows  (1, 3): pred = avg3(top[k], top[k+1], top[k+2])
 * The largest index touched is k + 2 = 6 (row 3, col 3).
 */
void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int t[7];

    /* Snapshot the neighbours before writing any output. */
    for (int k = 0; k < 7; k++)
        t[k] = above[k];

    for (int row = 0; row < 4; row++) {
        uint8_t *out = dst + row * stride;
        const int shift = row >> 1;   /* rows 2 and 3 start one sample further right */

        for (int col = 0; col < 4; col++) {
            const int k = col + shift;
            out[col] = (row & 1) ? avg3(t[k], t[k + 1], t[k + 2])
                                 : avg2(t[k], t[k + 1]);
        }
    }
}
|
||||
|
||||
/* Mode 8 — Horizontal_Up.  Reads left[0..3] only.
 * All four rows are windows into one 10-entry sequence that interleaves
 * 2-tap and 3-tap averages walking down the left column and then
 * saturates at left[3]; each row starts two entries after the previous. */
void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride)
{
    const int l0 = dst[0 * stride - 1];
    const int l1 = dst[1 * stride - 1];
    const int l2 = dst[2 * stride - 1];
    const int l3 = dst[3 * stride - 1];
    const uint8_t last = (uint8_t) l3;

    /* Row r occupies seq[2r .. 2r + 3]. */
    const uint8_t seq[10] = {
        avg2(l0, l1), avg3(l0, l1, l2),
        avg2(l1, l2), avg3(l1, l2, l3),
        avg2(l2, l3), avg3(l2, l3, l3),
        last, last, last, last,
    };

    for (int row = 0; row < 4; row++) {
        const uint8_t *window = seq + 2 * row;
        uint8_t *out = dst + row * stride;
        for (int col = 0; col < 4; col++)
            out[col] = window[col];
    }
}
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 chroma Intra_8x8
|
||||
* prediction modes (per H.264 §8.3.3), used for both Cb and Cr
|
||||
* planes at 4:2:0. All 4 modes.
|
||||
*
|
||||
* Mode index → name (per H.264 Table 7-16):
|
||||
* 0 = DC (per-quadrant — asymmetric, see §8.3.3.2)
|
||||
* 1 = Horizontal
|
||||
* 2 = Vertical
|
||||
* 3 = Plane (slope coefficient 34, distinct from luma's 5)
|
||||
*
|
||||
* Calling convention (same shape as luma intra refs):
|
||||
* pred_chroma8x8_<mode>(uint8_t *dst, ptrdiff_t stride)
|
||||
*
|
||||
* `dst` points at row 0, col 0 of the 8x8 output block (single
|
||||
* component plane — Cb or Cr, dispatched independently). Neighbours:
|
||||
* top[0..7] = dst[-stride + 0 .. -stride + 7]
|
||||
* top-left = dst[-stride - 1]
|
||||
* left[0..7] = dst[ 0*stride - 1 .. 7*stride - 1]
|
||||
*
|
||||
* AVAILABILITY: assumes all neighbours valid (interior-MB case).
|
||||
* The H.264 spec defines per-quadrant fallback for the DC mode at
|
||||
* MB boundaries; that's caller-side via the libavcodec intercept.
|
||||
*
|
||||
* License: BSD-2-Clause.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
|
||||
|
||||
/* Mode 0 — DC (per-quadrant, 4:2:0 layout per §8.3.3.2).
|
||||
*
|
||||
* The 8×8 block is split into four 4×4 quadrants. For interior
|
||||
* MBs (all neighbours available), the DC value per quadrant uses:
|
||||
* (0,0) top-left : (sum_top[0..3] + sum_left[0..3] + 4) >> 3
|
||||
* (0,1) top-right : (sum_top[4..7] + 2) >> 2
|
||||
* (1,0) bot-left : (sum_left[4..7] + 2) >> 2
|
||||
* (1,1) bot-right : (sum_top[4..7] + sum_left[4..7] + 4) >> 3
|
||||
*
|
||||
* The asymmetry mirrors what neighbours are "logically available"
|
||||
* for each quadrant in the spec's availability model. Top-right
|
||||
* quadrant ignores the top-left-half because that half is "vertically
|
||||
* above" the top-left quadrant; the spec uses top[4..7] only.
|
||||
*/
|
||||
void daedalus_h264_pred_chroma8x8_dc_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;   /* top[0..7]                */
    const uint8_t *left  = dst - 1;        /* left[k] == left[k*stride] */

    /* Index 0 accumulates samples 0..3, index 1 accumulates samples 4..7. */
    int sum_top[2]  = { 0, 0 };
    int sum_left[2] = { 0, 0 };
    for (int k = 0; k < 8; k++) {
        sum_top[k >> 2]  += above[k];
        sum_left[k >> 2] += left[k * stride];
    }

    /* dc[qy][qx] is the fill value of the 4x4 quadrant at (qy, qx). */
    uint8_t dc[2][2];
    dc[0][0] = (uint8_t)((sum_top[0] + sum_left[0] + 4) >> 3);  /* top-left  */
    dc[0][1] = (uint8_t)((sum_top[1] + 2) >> 2);                /* top-right */
    dc[1][0] = (uint8_t)((sum_left[1] + 2) >> 2);               /* bot-left  */
    dc[1][1] = (uint8_t)((sum_top[1] + sum_left[1] + 4) >> 3);  /* bot-right */

    /* All neighbour reads are finished; fill the block row by row. */
    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++)
            row[x] = dc[y >> 2][x >> 2];
    }
}
|
||||
|
||||
/* Mode 1 — Horizontal: each row = left[row]. */
|
||||
void daedalus_h264_pred_chroma8x8_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
{
    int y = 0;
    while (y < 8) {
        uint8_t *row = dst + y * stride;
        /* The sample just left of the row is replicated across it. */
        const uint8_t fill = row[-1];
        int x;
        for (x = 0; x < 8; x++)
            row[x] = fill;
        y++;
    }
}
|
||||
|
||||
/* Mode 2 — Vertical: each col = top[col]. */
|
||||
void daedalus_h264_pred_chroma8x8_vertical_ref(uint8_t *dst, ptrdiff_t stride)
{
    /* Row directly above the block; each of its 8 samples is copied
     * straight down its column. */
    const uint8_t *above = dst - stride;
    for (int x = 0; x < 8; x++) {
        const uint8_t fill = above[x];
        for (int y = 0; y < 8; y++)
            dst[y * stride + x] = fill;
    }
}
|
||||
|
||||
/* Mode 3 — Plane (per H.264 §8.3.3.4):
|
||||
* H = sum_{i=0..3} (i+1) * (p[4+i, -1] - p[2-i, -1]) ; i=3 uses p[-1,-1]
|
||||
* V = sum_{j=0..3} (j+1) * (p[-1, 4+j] - p[-1, 2-j]) ; j=3 uses p[-1,-1]
|
||||
* b = (34 * H + 32) >> 6
|
||||
* c = (34 * V + 32) >> 6
|
||||
* a = 16 * (p[-1, 7] + p[7, -1])
|
||||
* pred[y][x] = Clip1((a + b*(x - 3) + c*(y - 3) + 16) >> 5)
|
||||
*
|
||||
* Distinct from the Intra_16x16 luma Plane:
|
||||
* - Slope coefficient is 34 (not 5).
|
||||
* - Centre is (x-3, y-3) (not x-7, y-7).
|
||||
* - Spans 4 differences per sum (not 8).
|
||||
*/
|
||||
void daedalus_h264_pred_chroma8x8_plane_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;   /* above[-1] is the top-left corner */
    const uint8_t *left  = dst - 1;        /* left[k * stride] is left[k]      */

    /* Weighted gradients along the top row and the left column.  For
     * k == 3 the "near" index 2 - k is -1, which lands on the top-left
     * corner in both directions, so no special case is required. */
    int grad_h = 0;
    int grad_v = 0;
    for (int k = 0; k < 4; k++) {
        const int weight = k + 1;
        grad_h += weight * (above[4 + k] - above[2 - k]);
        grad_v += weight * (left[(4 + k) * stride] - left[(2 - k) * stride]);
    }

    const int slope_x = (34 * grad_h + 32) >> 6;
    const int slope_y = (34 * grad_v + 32) >> 6;
    const int base    = 16 * (left[7 * stride] + above[7]);

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        /* Row-constant part of the plane equation, rounding term included. */
        const int row_term = base + slope_y * (y - 3) + 16;
        for (int x = 0; x < 8; x++) {
            int sample = (row_term + slope_x * (x - 3)) >> 5;
            /* Clamp to the 8-bit sample range. */
            if (sample < 0)
                sample = 0;
            else if (sample > 255)
                sample = 255;
            row[x] = (uint8_t) sample;
        }
    }
}
|
||||
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma qpel 8×8 mc02
|
||||
* (vertical half-pel, "put" variant). Mirror of mc20 with rows
|
||||
* and columns transposed. 6-tap filter applied vertically:
|
||||
*
|
||||
* dst[r,c] = clip255( (s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
|
||||
* + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
|
||||
* + 16) >> 5 )
|
||||
*
|
||||
* Mirrors FFmpeg `ff_put_h264_qpel8_mc02_neon` (in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||
* line 678, which tail-calls put_h264_qpel8_v_lowpass_neon).
|
||||
*
|
||||
* Signature:
|
||||
* void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
*
|
||||
* Both dst and src use the SAME stride. src points at row 0 col 0
|
||||
* of the output block; the filter reads rows -2..+3 (2 rows of top
|
||||
* context, 3 rows of bottom context). Caller must guarantee the
|
||||
* source buffer has those rows available (FFmpeg's edge-emulated
|
||||
* buffer handles this at the frame boundary; matches the contract
|
||||
* documented for mc20).
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
|
||||
|
||||
void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    /* Vertical 6-tap kernel; tap k is applied to the source row at
     * offset k - 2 relative to the output row. */
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };

    for (int y = 0; y < 8; y++) {
        /* First source row touched when producing output row y. */
        const uint8_t *window = src + (y - 2) * stride;
        uint8_t *out = dst + y * stride;

        for (int x = 0; x < 8; x++) {
            int acc = 16;                       /* rounding term */
            for (int k = 0; k < 6; k++)
                acc += taps[k] * window[k * stride + x];

            int sample = acc >> 5;
            /* Clamp to the 8-bit sample range. */
            if (sample < 0)
                sample = 0;
            else if (sample > 255)
                sample = 255;
            out[x] = (uint8_t) sample;
        }
    }
}
|
||||
@@ -16,8 +16,24 @@
|
||||
|
||||
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
@@ -145,6 +161,206 @@ static int test_deblock(void)
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_h(void)
|
||||
{
|
||||
/* Mirror of test_deblock but for the H variant. Per-tile layout
|
||||
* is now 8 cols x 16 rows (one vertical edge between cols 3 and 4
|
||||
* of the tile); EDGE_COL = 4 puts dst_off at the leftmost output
|
||||
* column of the right block so the kernel's pix[-4..+3] read sits
|
||||
* inside the tile. */
|
||||
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||||
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs() % 8);
|
||||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||||
daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
|
||||
N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, "deblock_h dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_chroma_v(void)
|
||||
{
|
||||
/* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
|
||||
* (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
|
||||
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
|
||||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||||
TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
|
||||
EDGE_OFF = EDGE_ROW * TILE_STRIDE };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs() % 8);
|
||||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||||
daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
|
||||
N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_chroma_h(void)
|
||||
{
|
||||
/* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
|
||||
* (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
|
||||
enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
|
||||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||||
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs() % 8);
|
||||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||||
daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
|
||||
N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
/* --- bS=4 intra-strength deblock tests ---
|
||||
* Tile geometry per orientation matches the bS<4 variant; only the
|
||||
* dispatch + reference function change. alpha/beta are non-trivial
|
||||
* (the C ref + NEON both early-return when alpha|beta == 0).
|
||||
*/
|
||||
typedef struct {
|
||||
const char *name;
|
||||
int n_edges, tile_stride, tile_rows, edge_off;
|
||||
void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
|
||||
int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
} intra_test_spec;
|
||||
|
||||
static int run_intra_test(const intra_test_spec *t)
|
||||
{
|
||||
int total = t->n_edges * t->tile_stride * t->tile_rows;
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t *dst = malloc((size_t) total);
|
||||
uint8_t *dst_ref = malloc((size_t) total);
|
||||
daedalus_h264_deblock_meta *meta = calloc((size_t) t->n_edges, sizeof(*meta));
|
||||
if (!dst || !dst_ref || !meta) return 1;
|
||||
|
||||
for (int i = 0; i < total; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
int tile_bytes = t->tile_stride * t->tile_rows;
|
||||
for (int i = 0; i < t->n_edges; i++) {
|
||||
meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off);
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
/* tc0[] unused for intra; leave at 0 from calloc. */
|
||||
}
|
||||
for (int i = 0; i < t->n_edges; i++) {
|
||||
t->ref(dst_ref + meta[i].dst_off,
|
||||
(ptrdiff_t) t->tile_stride,
|
||||
meta[i].alpha, meta[i].beta);
|
||||
}
|
||||
int rc = t->dispatch(ctx, dst, (size_t) t->tile_stride,
|
||||
(size_t) t->n_edges, meta);
|
||||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc); return 1; }
|
||||
|
||||
int diff = 0;
|
||||
for (int i = 0; i < total; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
t->name, total - diff, total, 100.0 * (total - diff) / total);
|
||||
|
||||
free(meta); free(dst_ref); free(dst);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_intra_all(void)
|
||||
{
|
||||
intra_test_spec specs[] = {
|
||||
{ "luma v intra", 8, 16, 8, 4 * 16,
|
||||
daedalus_h264_v_loop_filter_luma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_luma_v_intra },
|
||||
{ "luma h intra", 8, 8, 16, 4,
|
||||
daedalus_h264_h_loop_filter_luma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_luma_h_intra },
|
||||
{ "chroma v intra", 8, 8, 4, 2 * 8,
|
||||
daedalus_h264_v_loop_filter_chroma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_chroma_v_intra },
|
||||
{ "chroma h intra", 8, 4, 8, 2,
|
||||
daedalus_h264_h_loop_filter_chroma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_chroma_h_intra },
|
||||
};
|
||||
int fail = 0;
|
||||
for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++)
|
||||
fail |= run_intra_test(&specs[i]);
|
||||
return fail;
|
||||
}
|
||||
|
||||
static int test_qpel_mc20(void)
|
||||
{
|
||||
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
|
||||
@@ -185,6 +401,46 @@ static int test_qpel_mc20(void)
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_mc02(void)
|
||||
{
|
||||
/* mc02: vertical 6-tap. Tile is 16 cols × 16 rows so the kernel
|
||||
* can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer.
|
||||
* SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of
|
||||
* the tile) and rows 8..10 below (rows 11..13). */
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
memset(dst, 0, sizeof(dst));
|
||||
memset(dst_ref, 0, sizeof(dst_ref));
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
|
||||
src + meta[i].src_off,
|
||||
TILE_STRIDE);
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
|
||||
TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||
@@ -197,10 +453,24 @@ int main(void)
|
||||
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
|
||||
|
||||
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
|
||||
printf(" H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
|
||||
printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
|
||||
printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (CPU, bS=4 set)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA));
|
||||
|
||||
int fail = 0;
|
||||
fail |= test_idct4();
|
||||
fail |= test_idct8();
|
||||
fail |= test_deblock();
|
||||
fail |= test_deblock_h();
|
||||
fail |= test_deblock_chroma_v();
|
||||
fail |= test_deblock_chroma_h();
|
||||
fail |= test_deblock_intra_all();
|
||||
fail |= test_qpel_mc20();
|
||||
fail |= test_qpel_mc02();
|
||||
return fail;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
/*
|
||||
* Tests the 4 H.264 Intra_16x16 luma prediction modes against
|
||||
* spec-derived expected patterns. Same layout as the 4x4 test:
|
||||
* a buffer that holds the 16x16 output plus 1-pixel top/left
|
||||
* context and 1-pixel top-left corner.
|
||||
*
|
||||
* row 0: [tl][t0..t15]
|
||||
* row 1: [l0][output row 0]
|
||||
* row 2: [l1][output row 1]
|
||||
* ...
|
||||
* row 16: [l15][output row 15]
|
||||
*
|
||||
* Buffer dimensions: 17 rows × 17 cols, total 289 bytes.
|
||||
* dst (passed to the pred fns) points at row 1 col 1.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_pred_16x16_vertical_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_16x16_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_16x16_dc_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_16x16_plane_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
#define STRIDE 17
|
||||
#define ROWS 17
|
||||
|
||||
static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
|
||||
const int t[16], const int l[16])
|
||||
{
|
||||
for (int r = 0; r < ROWS; r++)
|
||||
for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;
|
||||
buf[0][0] = (uint8_t) tl;
|
||||
for (int c = 0; c < 16; c++) buf[0][1 + c] = (uint8_t) t[c];
|
||||
for (int r = 0; r < 16; r++) buf[1 + r][0] = (uint8_t) l[r];
|
||||
}
|
||||
|
||||
/* Compares the 16x16 output region of buf (rows/cols 1..16) against
 * expect_at(r, c, cookie), scanning in row-major order.  Prints a PASS
 * or FAIL line for `name`; the FAIL line reports the mismatch count and
 * the first mismatching position.  Returns 0 on success, 1 otherwise. */
static int check(const uint8_t buf[ROWS][STRIDE], const char *name,
                 uint8_t (*expect_at)(int r, int c, void *), void *cookie)
{
    int mismatches = 0;
    int bad_r = 0, bad_c = 0;
    unsigned bad_got = 0, bad_exp = 0;

    for (int cell = 0; cell < 16 * 16; cell++) {
        const int r = cell / 16;
        const int c = cell % 16;
        const uint8_t actual = buf[r + 1][c + 1];
        const uint8_t wanted = expect_at(r, c, cookie);
        if (actual == wanted)
            continue;
        if (mismatches == 0) {
            /* Remember only the first mismatch for the report. */
            bad_r = r;
            bad_c = c;
            bad_got = actual;
            bad_exp = wanted;
        }
        mismatches++;
    }

    if (mismatches != 0) {
        printf("  %-30s FAIL (%d/256 wrong, first r=%d c=%d got=%u exp=%u)\n",
               name, mismatches, bad_r, bad_c, bad_got, bad_exp);
        return 1;
    }
    printf("  %-30s PASS\n", name);
    return 0;
}
|
||||
|
||||
/* Expectation helpers for each mode. */
|
||||
/* Expectation callback: every position expects the single byte that
 * cookie points at. */
static uint8_t expect_uniform(int r, int c, void *cookie)
{
    const uint8_t *value = cookie;
    (void) r;
    (void) c;
    return *value;
}
|
||||
|
||||
/* Cookie for expect_vertical: t points at the top-neighbour values. */
struct vertical_ctx {
    const int *t;
};

/* Expectation callback: position (r, c) expects t[c], independent of r. */
static uint8_t expect_vertical(int r, int c, void *cookie)
{
    const struct vertical_ctx *ctx = cookie;
    (void) r;
    return (uint8_t) ctx->t[c];
}
|
||||
|
||||
/* Cookie for expect_horizontal: l points at the left-neighbour values. */
struct horizontal_ctx {
    const int *l;
};

/* Expectation callback: position (r, c) expects l[r], independent of c. */
static uint8_t expect_horizontal(int r, int c, void *cookie)
{
    const struct horizontal_ctx *ctx = cookie;
    (void) c;
    return (uint8_t) ctx->l[r];
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int fail = 0;
|
||||
|
||||
/* --- Mode 0 Vertical: each col = top[col] --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = 10 + i; l[i] = 0; }
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_16x16_vertical_ref(&buf[1][1], STRIDE);
|
||||
struct vertical_ctx vc = { t };
|
||||
fail |= check(buf, "Vertical (mode 0)", expect_vertical, &vc);
|
||||
}
|
||||
|
||||
/* --- Mode 1 Horizontal: each row = left[row] --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16] = {0}, l[16];
|
||||
for (int i = 0; i < 16; i++) l[i] = 50 + i;
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_16x16_horizontal_ref(&buf[1][1], STRIDE);
|
||||
struct horizontal_ctx hc = { l };
|
||||
fail |= check(buf, "Horizontal (mode 1)", expect_horizontal, &hc);
|
||||
}
|
||||
|
||||
/* --- Mode 2 DC: ((sum + 16) >> 5) --- */
|
||||
/* All top = 2, all left = 6: sum = 32 + 96 = 128, +16 = 144,
|
||||
* >>5 = 144/32 = 4. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = 2; l[i] = 6; }
|
||||
set_ctx(buf, 99, t, l);
|
||||
daedalus_h264_pred_16x16_dc_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp_val = 4;
|
||||
fail |= check(buf, "DC (mode 2)", expect_uniform, &exp_val);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane: uniform neighbours → uniform output --- */
|
||||
/* H=V=0 when neighbours are uniform. a = 16*(p+p) = 32p.
|
||||
* pred[y][x] = (32p + 0 + 0 + 16) >> 5 = (32p + 16) >> 5 = p
|
||||
* (exact integer for any p, since 32p/32 = p and +16/32 = 0).
|
||||
* Verifies the orientation-free portion of the formula. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = 100; l[i] = 100; }
|
||||
set_ctx(buf, 100, t, l); /* uniform tl too — H/V sums actually zero */
|
||||
daedalus_h264_pred_16x16_plane_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp_val = 100;
|
||||
fail |= check(buf, "Plane (mode 3, uniform)", expect_uniform, &exp_val);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane: gradient sanity ---
|
||||
* Top row = 0..15 (gradient), left col = 0..15, tl = 0.
|
||||
* H = sum_{i=0..7} (i+1) * (t[8+i] - t[6-i] for i<7; or t[15]-tl=15 for i=7)
|
||||
* = 1*(8-6) + 2*(9-5) + 3*(10-4) + 4*(11-3) + 5*(12-2) + 6*(13-1)
|
||||
* + 7*(14-0) + 8*(15-0)
|
||||
* = 2 + 8 + 18 + 32 + 50 + 72 + 98 + 120 = 400
|
||||
* V = same shape on left col = 400
|
||||
* b = (5*400 + 32) >> 6 = 2032 >> 6 = 31
|
||||
* c = (5*400 + 32) >> 6 = 31
|
||||
* a = 16 * (l[15] + t[15]) = 16 * (15 + 15) = 480
|
||||
* pred[0][0] = (480 + 31*(-7) + 31*(-7) + 16) >> 5
|
||||
* = (480 - 217 - 217 + 16) >> 5
|
||||
* = 62 >> 5 = 1
|
||||
* pred[15][15] = (480 + 31*8 + 31*8 + 16) >> 5
|
||||
* = (480 + 248 + 248 + 16) >> 5
|
||||
* = 992 >> 5 = 31
|
||||
* Just spot-check those two corners. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = i; l[i] = i; }
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_16x16_plane_ref(&buf[1][1], STRIDE);
|
||||
uint8_t tl_actual = buf[1 + 0][1 + 0];
|
||||
uint8_t br_actual = buf[1 + 15][1 + 15];
|
||||
int spot_fail = 0;
|
||||
if (tl_actual != 1) { fprintf(stderr, "Plane gradient pred[0][0] = %u, expected 1\n", tl_actual); spot_fail = 1; }
|
||||
if (br_actual != 31) { fprintf(stderr, "Plane gradient pred[15][15] = %u, expected 31\n", br_actual); spot_fail = 1; }
|
||||
if (!spot_fail) printf(" %-30s PASS (corners 1, 31)\n", "Plane (mode 3, gradient)");
|
||||
else printf(" %-30s FAIL\n", "Plane (mode 3, gradient)");
|
||||
fail |= spot_fail;
|
||||
}
|
||||
|
||||
if (fail == 0) printf("\nALL Intra_16x16 mode references PASS\n");
|
||||
else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
|
||||
return fail ? 1 : 0;
|
||||
}
|
||||
@@ -0,0 +1,246 @@
|
||||
/*
|
||||
* Tests the 9 H.264 Intra_4x4 luma prediction modes against
|
||||
* spec-derived expected patterns. Goal: catch any mistake in
|
||||
* the reference (sign / shift / table mapping) before it lands
|
||||
* downstream. Each mode is exercised with a deterministic
|
||||
* neighbour context and checked against a hand-computed (or
|
||||
* spec-derived) expected 4x4 output.
|
||||
*
|
||||
* The test buffer layout reserves a 1-pixel top/left context border
|
||||
* + a 4-pixel top-right (for modes 3 / 7):
|
||||
*
|
||||
* row 0: [tl][t0 t1 t2 t3 t4 t5 t6 t7] <- STRIDE = 9 bytes
|
||||
* row 1: [l0][ 4x4 output goes here ]
|
||||
* row 2: [l1][ ]
|
||||
* row 3: [l2][ ]
|
||||
* row 4: [l3][ ]
|
||||
*
|
||||
* dst (passed to the pred fns) points at row 1 col 1.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
#define STRIDE 9
|
||||
typedef void (*pred_fn)(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
/* Set up the buffer: 5 rows × STRIDE cols.
|
||||
* top-left = tl, top[0..7] = t[0..7], left[0..3] = l[0..3].
|
||||
* The 4x4 output region (rows 1..4, cols 1..4) is filled with 0xff
|
||||
* sentinels so any unwritten cell shows up as 255 in the compare. */
|
||||
static void set_ctx(uint8_t buf[5][STRIDE], int tl, const int t[8], const int l[4])
|
||||
{
|
||||
for (int r = 0; r < 5; r++) for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;
|
||||
buf[0][0] = (uint8_t) tl;
|
||||
for (int c = 0; c < 8; c++) buf[0][1 + c] = (uint8_t) t[c];
|
||||
for (int r = 0; r < 4; r++) buf[1 + r][0] = (uint8_t) l[r];
|
||||
}
|
||||
|
||||
/* Compares the 4x4 output region of buf (rows/cols 1..4) against
 * expect[][], scanning in row-major order.  The first mismatch is
 * reported on stderr; a PASS or FAIL summary line for `name` goes to
 * stdout.  Returns 0 on success, 1 otherwise. */
static int check(const uint8_t buf[5][STRIDE], const char *name,
                 const uint8_t expect[4][4])
{
    int mismatches = 0;

    for (int cell = 0; cell < 4 * 4; cell++) {
        const int r = cell / 4;
        const int c = cell % 4;
        const uint8_t got = buf[r + 1][c + 1];
        const uint8_t exp = expect[r][c];
        if (got == exp)
            continue;
        if (mismatches == 0)
            fprintf(stderr,
                    "%s: first mismatch r=%d c=%d got=%u exp=%u\n",
                    name, r, c, got, exp);
        mismatches++;
    }

    if (mismatches != 0) {
        printf("  %-26s FAIL (%d/16 bytes wrong)\n", name, mismatches);
        return 1;
    }
    printf("  %-26s PASS\n", name);
    return 0;
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int fail = 0;
|
||||
|
||||
/* Mode 0 — Vertical: each col = top[col]. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int tl = 0;
|
||||
int t[8] = { 10, 20, 30, 40, 0, 0, 0, 0 };
|
||||
int l[4] = { 0, 0, 0, 0 };
|
||||
set_ctx(buf, tl, t, l);
|
||||
daedalus_h264_pred_4x4_vertical_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{10,20,30,40}, {10,20,30,40}, {10,20,30,40}, {10,20,30,40}
|
||||
};
|
||||
fail |= check(buf, "Vertical (mode 0)", exp);
|
||||
}
|
||||
|
||||
/* Mode 1 — Horizontal: each row = left[row]. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 0,0,0,0, 0,0,0,0 };
|
||||
int l[4] = { 50, 60, 70, 80 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_horizontal_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{50,50,50,50}, {60,60,60,60}, {70,70,70,70}, {80,80,80,80}
|
||||
};
|
||||
fail |= check(buf, "Horizontal (mode 1)", exp);
|
||||
}
|
||||
|
||||
/* Mode 2 — DC: all 8 neighbours valid → ((sum + 4) >> 3) broadcast.
|
||||
* top sum = 4*1 = 4, left sum = 4*3 = 12, total 16, +4 = 20,
|
||||
* >>3 = 2. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 1,1,1,1, 0,0,0,0 };
|
||||
int l[4] = { 3,3,3,3 };
|
||||
set_ctx(buf, 99, t, l); /* tl unused for DC */
|
||||
daedalus_h264_pred_4x4_dc_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{2,2,2,2}, {2,2,2,2}, {2,2,2,2}, {2,2,2,2}
|
||||
};
|
||||
fail |= check(buf, "DC (mode 2)", exp);
|
||||
}
|
||||
|
||||
/* Mode 3 — Diagonal_Down_Left: zz[i] = avg3(t[i], t[i+1], t[i+2]);
|
||||
* dst[r][c] = zz[c + r].
|
||||
* With all t[]=100 → all zz=100 → all dst=100. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 100,100,100,100, 100,100,100,100 };
|
||||
int l[4] = { 0,0,0,0 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_ddl_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{100,100,100,100}, {100,100,100,100},
|
||||
{100,100,100,100}, {100,100,100,100}
|
||||
};
|
||||
fail |= check(buf, "DiagDownLeft (mode 3)", exp);
|
||||
}
|
||||
|
||||
/* Mode 4 — Diagonal_Down_Right: zz[c-r] with c-r ∈ {-3..+3}.
|
||||
* If all 9 surrounding pixels = 200 → all zz = 200 → all dst = 200. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 200,200,200,200, 0,0,0,0 };
|
||||
int l[4] = { 200,200,200,200 };
|
||||
set_ctx(buf, 200, t, l);
|
||||
daedalus_h264_pred_4x4_ddr_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{200,200,200,200}, {200,200,200,200},
|
||||
{200,200,200,200}, {200,200,200,200}
|
||||
};
|
||||
fail |= check(buf, "DiagDownRight (mode 4)", exp);
|
||||
}
|
||||
|
||||
/* Mode 5 — Vertical_Right. With all neighbours = 80 the 3-tap
|
||||
* (a+2b+c+2)>>2 and 2-tap (a+b+1)>>1 both yield 80. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 80,80,80,80, 0,0,0,0 };
|
||||
int l[4] = { 80,80,80,80 };
|
||||
set_ctx(buf, 80, t, l);
|
||||
daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{80,80,80,80}, {80,80,80,80}, {80,80,80,80}, {80,80,80,80}
|
||||
};
|
||||
fail |= check(buf, "VerticalRight (mode 5)", exp);
|
||||
}
|
||||
|
||||
/* Mode 6 — Horizontal_Down. Same uniform-context degenerate case. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 120,120,120,120, 0,0,0,0 };
|
||||
int l[4] = { 120,120,120,120 };
|
||||
set_ctx(buf, 120, t, l);
|
||||
daedalus_h264_pred_4x4_hd_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{120,120,120,120}, {120,120,120,120},
|
||||
{120,120,120,120}, {120,120,120,120}
|
||||
};
|
||||
fail |= check(buf, "HorizontalDown (mode 6)", exp);
|
||||
}
|
||||
|
||||
/* Mode 7 — Vertical_Left. Uniform context. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 64,64,64,64, 64,64,64,64 };
|
||||
int l[4] = { 0,0,0,0 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_vl_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{64,64,64,64}, {64,64,64,64}, {64,64,64,64}, {64,64,64,64}
|
||||
};
|
||||
fail |= check(buf, "VerticalLeft (mode 7)", exp);
|
||||
}
|
||||
|
||||
/* Mode 8 — Horizontal_Up. Uniform context. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 0,0,0,0, 0,0,0,0 };
|
||||
int l[4] = { 200,200,200,200 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_hu_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{200,200,200,200}, {200,200,200,200},
|
||||
{200,200,200,200}, {200,200,200,200}
|
||||
};
|
||||
fail |= check(buf, "HorizontalUp (mode 8)", exp);
|
||||
}
|
||||
|
||||
/* Asymmetric Vertical_Right test: detects orientation /
|
||||
* row-vs-col confusion. Top=10,20,30,40, Left=50,60,70,
|
||||
* top-left=5. Spec-derived expected output computed by hand
|
||||
* from §8.3.1.4.6.
|
||||
*
|
||||
* d[0][0] = (tl+t0+1)>>1 = (5+10+1)>>1 = 8
|
||||
* d[0][1] = (t0+t1+1)>>1 = (10+20+1)>>1 = 15
|
||||
* d[0][2] = (t1+t2+1)>>1 = (20+30+1)>>1 = 25
|
||||
* d[0][3] = (t2+t3+1)>>1 = (30+40+1)>>1 = 35
|
||||
* d[1][0] = avg3(l0,tl,t0) = (50+2*5+10+2)>>2 = 72/4 = 18
|
||||
* d[1][1] = avg3(tl,t0,t1) = (5+20+20+2)>>2 = 47/4 = 11
|
||||
* d[1][2] = avg3(t0,t1,t2) = (10+40+30+2)>>2 = 82/4 = 20
|
||||
* d[1][3] = avg3(t1,t2,t3) = (20+60+40+2)>>2 = 122/4 = 30
|
||||
* d[2][0] = avg3(tl,l0,l1) = (5+100+60+2)>>2 = 167/4 = 41
|
||||
* d[2][1] = d[0][0] = 8
|
||||
* d[2][2] = d[0][1] = 15
|
||||
* d[2][3] = d[0][2] = 25
|
||||
* d[3][0] = avg3(l0,l1,l2) = (50+120+70+2)>>2 = 242/4 = 60
|
||||
* d[3][1] = d[1][0] = 18
|
||||
* d[3][2] = d[1][1] = 11
|
||||
* d[3][3] = d[1][2] = 20
|
||||
*/
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 10,20,30,40, 0,0,0,0 };
|
||||
int l[4] = { 50,60,70,0 };
|
||||
set_ctx(buf, 5, t, l);
|
||||
daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{ 8,15,25,35},
|
||||
{18,11,20,30},
|
||||
{41, 8,15,25},
|
||||
{60,18,11,20},
|
||||
};
|
||||
fail |= check(buf, "VR asym (sanity)", exp);
|
||||
}
|
||||
|
||||
if (fail == 0) printf("\nALL %d intra-4x4 mode references PASS\n", 10);
|
||||
else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
|
||||
return fail ? 1 : 0;
|
||||
}
|
||||
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Tests the 4 H.264 Intra_8x8 chroma prediction modes against
|
||||
* spec-derived expected patterns. Same buffer layout idea as the
|
||||
* other intra tests: a buffer that holds the 8x8 output + 1-pixel
|
||||
* top/left context + 1-pixel top-left corner.
|
||||
*
|
||||
* row 0: [tl][t0..t7]
|
||||
* row 1: [l0][output row 0]
|
||||
* ...
|
||||
* row 8: [l7][output row 7]
|
||||
*
|
||||
* Dimensions: 9 rows × 9 cols. dst (passed to pred fns) = &buf[1][1].
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_pred_chroma8x8_dc_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_chroma8x8_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_chroma8x8_vertical_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_chroma8x8_plane_ref(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
#define STRIDE 9
|
||||
#define ROWS 9
|
||||
|
||||
static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
|
||||
const int t[8], const int l[8])
|
||||
{
|
||||
for (int r = 0; r < ROWS; r++)
|
||||
for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;
|
||||
buf[0][0] = (uint8_t) tl;
|
||||
for (int c = 0; c < 8; c++) buf[0][1 + c] = (uint8_t) t[c];
|
||||
for (int r = 0; r < 8; r++) buf[1 + r][0] = (uint8_t) l[r];
|
||||
}
|
||||
|
||||
/*
 * Compare the 8x8 output region of buf (rows 1..8, columns 1..8) with
 * expect, walking the cells in row-major order.
 *
 * Prints a single PASS/FAIL line tagged with name to stdout; on failure
 * the line carries the mismatch count and the position/values of the
 * first differing cell.
 *
 * Returns 0 when all 64 cells match, 1 otherwise.
 */
static int check_per_cell(const uint8_t buf[ROWS][STRIDE], const char *name,
                          const uint8_t expect[8][8])
{
    int bad = 0;
    int bad_row = 0, bad_col = 0, bad_got = 0, bad_exp = 0;

    for (int idx = 0; idx < 64; idx++) {
        const int row = idx / 8;
        const int col = idx % 8;
        const uint8_t actual = buf[row + 1][col + 1];
        const uint8_t wanted = expect[row][col];

        if (actual == wanted)
            continue;

        /* Remember only the first offender for the summary line. */
        if (bad == 0) {
            bad_row = row;
            bad_col = col;
            bad_got = actual;
            bad_exp = wanted;
        }
        bad++;
    }

    if (bad != 0) {
        printf(" %-30s FAIL (%d/64 wrong, first r=%d c=%d got=%u exp=%u)\n",
               name, bad, bad_row, bad_col, bad_got, bad_exp);
        return 1;
    }

    printf(" %-30s PASS\n", name);
    return 0;
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int fail = 0;
|
||||
|
||||
/* --- Mode 1 Horizontal --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8] = {0}, l[8] = {10, 20, 30, 40, 50, 60, 70, 80};
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_chroma8x8_horizontal_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8];
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = (uint8_t) l[r];
|
||||
fail |= check_per_cell(buf, "Horizontal (mode 1)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 2 Vertical --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8] = {15, 25, 35, 45, 55, 65, 75, 85}, l[8] = {0};
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_chroma8x8_vertical_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8];
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = (uint8_t) t[c];
|
||||
fail |= check_per_cell(buf, "Vertical (mode 2)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 0 DC: per-quadrant. Test with distinct halves so any
|
||||
* quadrant mix-up surfaces immediately.
|
||||
*
|
||||
* top[0..3] = 4 × 8 → sum_top_lo = 32
|
||||
* top[4..7] = 4 × 16 → sum_top_hi = 64
|
||||
* left[0..3] = 4 × 24 → sum_left_lo = 96
|
||||
* left[4..7] = 4 × 40 → sum_left_hi = 160
|
||||
*
|
||||
* dc00 = (32 + 96 + 4) >> 3 = 132/8 = 16
|
||||
* dc01 = (64 + 2) >> 2 = 66/4 = 16
|
||||
* dc10 = ( 160 + 2) >> 2 = 162/4 = 40
|
||||
* dc11 = (64 + 160 + 4) >> 3 = 228/8 = 28
|
||||
*/
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8] = { 8, 8, 8, 8, 16, 16, 16, 16 };
|
||||
int l[8] = { 24, 24, 24, 24, 40, 40, 40, 40 };
|
||||
set_ctx(buf, 99, t, l);
|
||||
daedalus_h264_pred_chroma8x8_dc_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8] = {
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
};
|
||||
fail |= check_per_cell(buf, "DC quadrants (mode 0)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane (uniform): H = V = 0; a = 16 * (100 + 100) = 3200.
|
||||
* pred[y][x] = (3200 + 0 + 0 + 16) >> 5 = 3216 >> 5 = 100. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8], l[8];
|
||||
for (int i = 0; i < 8; i++) { t[i] = 100; l[i] = 100; }
|
||||
set_ctx(buf, 100, t, l);
|
||||
daedalus_h264_pred_chroma8x8_plane_ref(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8];
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = 100;
|
||||
fail |= check_per_cell(buf, "Plane uniform (mode 3)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane gradient sanity ---
|
||||
* t = 0..7, l = 0..7, tl = 0.
|
||||
* H = 1*(t[4]-t[2]) + 2*(t[5]-t[1]) + 3*(t[6]-t[0]) + 4*(t[7]-tl)
|
||||
* = 1*(4-2) + 2*(5-1) + 3*(6-0) + 4*(7-0)
|
||||
* = 2 + 8 + 18 + 28 = 56
|
||||
* V = same shape on left = 56
|
||||
* b = (34*56 + 32) >> 6 = 1936 >> 6 = 30
|
||||
* c = 30
|
||||
* a = 16 * (l[7] + t[7]) = 16 * (7 + 7) = 224
|
||||
*
|
||||
* pred[0][0] = (224 + 30*(-3) + 30*(-3) + 16) >> 5
|
||||
* = (224 - 90 - 90 + 16) >> 5
|
||||
* = 60 >> 5 = 1
|
||||
* pred[7][7] = (224 + 30*4 + 30*4 + 16) >> 5
|
||||
* = (224 + 120 + 120 + 16) >> 5
|
||||
* = 480 >> 5 = 15
|
||||
* Spot-check those two corners. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8], l[8];
|
||||
for (int i = 0; i < 8; i++) { t[i] = i; l[i] = i; }
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_chroma8x8_plane_ref(&buf[1][1], STRIDE);
|
||||
uint8_t tl_actual = buf[1 + 0][1 + 0];
|
||||
uint8_t br_actual = buf[1 + 7][1 + 7];
|
||||
int spot_fail = 0;
|
||||
if (tl_actual != 1) { fprintf(stderr, "Plane gradient pred[0][0] = %u, expected 1\n", tl_actual); spot_fail = 1; }
|
||||
if (br_actual != 15) { fprintf(stderr, "Plane gradient pred[7][7] = %u, expected 15\n", br_actual); spot_fail = 1; }
|
||||
if (!spot_fail) printf(" %-30s PASS (corners 1, 15)\n", "Plane gradient (mode 3)");
|
||||
else printf(" %-30s FAIL\n", "Plane gradient (mode 3)");
|
||||
fail |= spot_fail;
|
||||
}
|
||||
|
||||
if (fail == 0) printf("\nALL Intra_8x8 chroma mode references PASS\n");
|
||||
else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
|
||||
return fail ? 1 : 0;
|
||||
}
|
||||
Reference in New Issue
Block a user