Files
daedalus-fourier/CMakeLists.txt
T
marfrit 5c8b09349c Cycle 9 closed: H.264 luma qpel mc20 = 131 Mblock/s NEON, CPU-only
Last unmeasured H.264 kernel. mc20 picked as representative
(horizontal half-pel, 6-tap filter; canonical for the H.264 luma
qpel family). M1 PASS 10000/10000 first try, M3 = 131.477
Mblock/s on a single core (7.6 ns/block), 135x the 1080p30 floor.

Per the cycles 6+7 lightweight-kernel rationale, Phase 4 deferred:
QPU dispatch floor (~250 ns/block) is 33x above the NEON per-block
cost; R9 ≈ 0.03 deep RED. No realistic QPU offload value.

Generalization: all H.264 luma MC variants (mc02, mc11, mc22,
etc.) will share this verdict. No need to measure each variant
individually.

H.264 NEON is dramatically faster than VP9 NEON across the board:
- IDCT 4x4: 175 vs N/A    (no VP9 analog)
- IDCT 8x8: 151 vs 8.2 Mblock/s (18x faster)
- MC 6/8-tap: 131 vs 7.0   (19x faster)
- Deblock: 92 vs 48 Medge/s (2x faster)

H.264 deployment recipe: all CPU NEON except deblock (opportunistic
QPU). On a Pi 5 running H.264-only, the QPU is mostly idle.

Cycles 1-9 complete. Public API exposes all 9.
Next: daedalus-v4l2 sibling repo per locked Phase 8 architecture
(B + γ + sibling), then README polish.

- external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
  vendored (1467 lines, all qpel variants)
- tests/h264_qpel8_mc20_ref.c: 40-line C ref (clip255 of
  6-tap convolution)
- tests/bench_neon_h264qpel_mc20.c: M1 + M3 bench
- docs/k9_h264qpel_mc20.md: cycle 9 closure with comparison
  matrix

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:53:21 +00:00

474 lines
16 KiB
CMake

# daedalus-fourier — Phase 3 baseline + (later) Phase 6 implementation.
#
# Builds (among others — see the add_executable()/add_library() calls below):
# bench_neon_idct — NEON throughput baseline (Phase 3 M3) +
# bit-exact correctness gate (Phase 1 M1).
# bench_vulkan_dispatch — Vulkan compute dispatch-overhead baseline (M5).
#
# Linkage note: bench_neon_idct statically links the vendored
# FFmpeg n7.1.3 NEON snapshot (LGPL-2.1+); see
# external/ffmpeg-snapshot/PROVENANCE.md.
cmake_minimum_required(VERSION 3.20)
project(daedalus-fourier LANGUAGES C ASM)

# Require C11 for every C source in the project.
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_C_STANDARD 11)

# Fall back to a Release build when no build type was requested.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Only aarch64 hosts are supported; abort configuration anywhere else.
if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message(FATAL_ERROR
    "daedalus-fourier targets aarch64 (Pi 5 / BCM2712). "
    "Cross-compile not yet wired.")
endif()

# Directory-wide warning flags.
add_compile_options(-Wall -Wextra -Wno-unused-parameter)
# ---- Vendored FFmpeg snapshot (LGPL-2.1+) -----------------------------------
# Rooted at PROJECT_SOURCE_DIR so the path still resolves when this project is
# consumed via add_subdirectory() (CMAKE_SOURCE_DIR would then point at the
# parent project's tree).
set(FFSNAP ${PROJECT_SOURCE_DIR}/external/ffmpeg-snapshot)

# Include paths for the assembly preamble (config.h shim + FFmpeg's asm
# helpers) used by the vendored .S files:
#   - FFSNAP/                     so `#include "config.h"` finds our shim and
#                                 `#include "libavutil/aarch64/asm.S"` resolves
#                                 against the vendored copy
#   - FFSNAP/libavcodec/aarch64/  so `#include "neon.S"` finds the helper
# (The snapshot root serves both of the first two purposes, so it is listed
# only once.)
set(FFASM_FLAGS
  -I${FFSNAP}
  -I${FFSNAP}/libavcodec/aarch64
)
# ---- Vendored dav1d snapshot (BSD-2-Clause) — cycle 5+ ----------------------
# Rooted at PROJECT_SOURCE_DIR so the path still resolves when this project is
# consumed via add_subdirectory() (CMAKE_SOURCE_DIR would then point at the
# parent project's tree).
set(DAV1DSNAP ${PROJECT_SOURCE_DIR}/external/dav1d-snapshot)

# dav1d's asm preamble expects "src/arm/asm.S" and "cdef_tmpl.S" / "util.S"
# (the latter two as bare basenames from within src/arm/64/). Include paths:
set(DAV1D_ASM_FLAGS
  -I${DAV1DSNAP}              # for config.h shim + src/arm/asm.S
  -I${DAV1DSNAP}/src/arm/64   # for util.S, cdef_tmpl.S
)

# CDEF kernel inputs: the NEON assembly and the C tables subset.
set(DAV1D_CDEF_ASM_SOURCES
  ${DAV1DSNAP}/src/arm/64/cdef.S
)
set(DAV1D_CDEF_C_SOURCES
  ${DAV1DSNAP}/src/tables_cdef_subset.c
)

# Build cdef.S as ASM with the dav1d include paths attached.
set_source_files_properties(${DAV1D_CDEF_ASM_SOURCES} PROPERTIES
  COMPILE_OPTIONS "${DAV1D_ASM_FLAGS}"
  LANGUAGE ASM)
# VP9 itxfm NEON source (linked by bench_neon_idct).
set(FFASM_SOURCES
  ${FFSNAP}/libavcodec/aarch64/vp9itxfm_neon.S
)

# Cycle 6 — H.264 IDCT 4x4 + 8x8 NEON (vendored 2026-05-18).
set(FFASM_H264IDCT_SOURCES
  ${FFSNAP}/libavcodec/aarch64/h264idct_neon.S
)

# Cycle 2 — VP9 loop filter NEON source (vendored 2026-05-18).
set(FFASM_LPF_SOURCES
  ${FFSNAP}/libavcodec/aarch64/vp9lpf_neon.S
)

# Cycle 3 — VP9 MC interpolation NEON source + filter coefficient table
# (vendored 2026-05-18). The .c table provides the ff_vp9_subpel_filters
# symbol which vp9mc_neon.S references via movrel.
set(FFASM_MC_SOURCES
  ${FFSNAP}/libavcodec/aarch64/vp9mc_neon.S
)
set(FFC_MC_SOURCES
  ${FFSNAP}/libavcodec/vp9_subpel_filters_table.c
)

# Tell CMake/gas to preprocess the .S sources: every assembly file above is
# built as ASM with the FFmpeg snapshot include paths.
set_source_files_properties(
  ${FFASM_H264IDCT_SOURCES}
  ${FFASM_LPF_SOURCES}
  ${FFASM_MC_SOURCES}
  ${FFASM_SOURCES}
  PROPERTIES
    COMPILE_OPTIONS "${FFASM_FLAGS}"
    LANGUAGE ASM
)
# ---- NEON baseline microbenches --------------------------------------------
# Cycle 6 (4x4) and cycle 7 (8x8) — H.264 IDCT NEON M3 baseline benches.
# Both targets have the same shape: bench driver + C reference + the vendored
# h264idct_neon.S, so they are declared in one loop over the block size.
foreach(h264idct_n IN ITEMS 4 8)
  add_executable(bench_neon_h264idct${h264idct_n}
    tests/bench_neon_h264idct${h264idct_n}.c
    tests/h264_idct${h264idct_n}_ref.c
    ${FFASM_H264IDCT_SOURCES}
  )
  target_compile_options(bench_neon_h264idct${h264idct_n}
    PRIVATE -O3 -march=armv8-a+simd)
endforeach()
# Cycle 8 — H.264 luma vertical deblock NEON M3 baseline bench.
set(FFASM_H264DSP_SOURCES
  ${FFSNAP}/libavcodec/aarch64/h264dsp_neon.S
)
set_source_files_properties(${FFASM_H264DSP_SOURCES}
  PROPERTIES
    COMPILE_OPTIONS "${FFASM_FLAGS}"
    LANGUAGE ASM
)
add_executable(bench_neon_h264deblock
  tests/bench_neon_h264deblock.c
  tests/h264_deblock_ref.c
  ${FFASM_H264DSP_SOURCES}
)
target_compile_options(bench_neon_h264deblock
  PRIVATE -O3 -march=armv8-a+simd)

# Cycle 9 — H.264 luma qpel MC NEON: vendored source + mc20 M3 baseline bench.
set(FFASM_H264QPEL_SOURCES
  ${FFSNAP}/libavcodec/aarch64/h264qpel_neon.S
)
set_source_files_properties(${FFASM_H264QPEL_SOURCES}
  PROPERTIES
    COMPILE_OPTIONS "${FFASM_FLAGS}"
    LANGUAGE ASM
)
add_executable(bench_neon_h264qpel_mc20
  tests/bench_neon_h264qpel_mc20.c
  tests/h264_qpel8_mc20_ref.c
  ${FFASM_H264QPEL_SOURCES}
)
target_compile_options(bench_neon_h264qpel_mc20
  PRIVATE -O3 -march=armv8-a+simd)
# VP9 IDCT NEON baseline. Pure CPU: this bench needs neither Vulkan nor DRM.
add_executable(bench_neon_idct
  tests/bench_neon_idct.c
  tests/vp9_idct8_ref.c
  ${FFASM_SOURCES}
)
target_compile_options(bench_neon_idct
  PRIVATE -O3 -march=armv8-a+simd)

# Cycle 2 — VP9 loop filter NEON baseline.
add_executable(bench_neon_lpf
  tests/bench_neon_lpf.c
  tests/vp9_lpf_ref.c
  ${FFASM_LPF_SOURCES}
)
target_compile_options(bench_neon_lpf
  PRIVATE -O3 -march=armv8-a+simd)

# Cycle 3 — VP9 MC interpolation NEON baseline (assembly + coefficient table).
add_executable(bench_neon_mc
  tests/bench_neon_mc.c
  tests/vp9_mc_ref.c
  ${FFASM_MC_SOURCES}
  ${FFC_MC_SOURCES}
)
target_compile_options(bench_neon_mc
  PRIVATE -O3 -march=armv8-a+simd)

# Cycle 4 — VP9 LPF wd=8 NEON baseline (same vendored .S as cycle 2).
add_executable(bench_neon_lpf8
  tests/bench_neon_lpf8.c
  tests/vp9_lpf8_ref.c
  ${FFASM_LPF_SOURCES}
)
target_compile_options(bench_neon_lpf8
  PRIVATE -O3 -march=armv8-a+simd)

# Cycle 5 — AV1 CDEF NEON baseline (dav1d snapshot).
add_executable(bench_neon_cdef
  tests/bench_neon_cdef.c
  tests/cdef_ref.c
  ${DAV1D_CDEF_ASM_SOURCES}
  ${DAV1D_CDEF_C_SOURCES}
)
target_compile_options(bench_neon_cdef
  PRIVATE -O3 -march=armv8-a+simd)
# ---- Vulkan dispatch-overhead microbench + QPU benches ----------------------
# Enabled by default. Configure with -DDAEDALUS_BUILD_VULKAN=OFF to skip the
# SPIR-V rules, the v3d_runner library and the bench_vulkan_dispatch /
# bench_v3d_* targets declared in this block. Targets declared outside this
# block that link Vulkan::Vulkan (e.g. daedalus_core) are not controlled by
# this option.
option(DAEDALUS_BUILD_VULKAN "Build Vulkan compute-dispatch microbench" ON)
if(DAEDALUS_BUILD_VULKAN)
  find_package(Vulkan REQUIRED)

  # Compile GLSL compute shaders to SPIR-V via glslangValidator.
  # The binary loads them at runtime from the build dir (cwd-relative).
  find_program(GLSLANG_VALIDATOR
    NAMES glslangValidator glslang
    REQUIRED)

  # daedalus_add_spirv(<spv_out> <comp_in> [<extra glslang args>...])
  #
  # Registers the build rule that turns one GLSL compute shader into SPIR-V.
  #   spv_out  absolute path of the .spv file to produce
  #   comp_in  absolute path of the .comp source (also the rule's dependency)
  #   ARGN     extra options inserted between `-V` and `-o`
  #            (e.g. `--target-env vulkan1.3`)
  function(daedalus_add_spirv spv_out comp_in)
    get_filename_component(spv_name "${spv_out}" NAME)
    get_filename_component(comp_name "${comp_in}" NAME)
    add_custom_command(
      OUTPUT ${spv_out}
      COMMAND ${GLSLANG_VALIDATOR} -V ${ARGN} -o ${spv_out} ${comp_in}
      DEPENDS ${comp_in}
      COMMENT "glslang: ${comp_name} -> ${spv_name}"
      VERBATIM
    )
  endfunction()

  # The no-op dispatch shader lives under tests/ and is compiled without an
  # explicit target environment; the kernel shaders under src/ target
  # Vulkan 1.3.
  set(NOOP_SPV ${CMAKE_BINARY_DIR}/noop.spv)
  daedalus_add_spirv(${NOOP_SPV}
    ${CMAKE_SOURCE_DIR}/tests/shaders/noop.comp)

  set(IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_idct8.spv)
  daedalus_add_spirv(${IDCT8_SPV}
    ${CMAKE_SOURCE_DIR}/src/v3d_idct8.comp
    --target-env vulkan1.3)

  set(LPF_SPV ${CMAKE_BINARY_DIR}/v3d_lpf_h_4_8.spv)
  daedalus_add_spirv(${LPF_SPV}
    ${CMAKE_SOURCE_DIR}/src/v3d_lpf_h_4_8.comp
    --target-env vulkan1.3)

  set(MC_SPV ${CMAKE_BINARY_DIR}/v3d_mc_8h.spv)
  daedalus_add_spirv(${MC_SPV}
    ${CMAKE_SOURCE_DIR}/src/v3d_mc_8h.comp
    --target-env vulkan1.3)

  set(LPF8_SPV ${CMAKE_BINARY_DIR}/v3d_lpf_h_8_8.spv)
  daedalus_add_spirv(${LPF8_SPV}
    ${CMAKE_SOURCE_DIR}/src/v3d_lpf_h_8_8.comp
    --target-env vulkan1.3)

  set(CDEF_SPV ${CMAKE_BINARY_DIR}/v3d_cdef.spv)
  daedalus_add_spirv(${CDEF_SPV}
    ${CMAKE_SOURCE_DIR}/src/v3d_cdef.comp
    --target-env vulkan1.3)

  set(H264DEBLOCK_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock.spv)
  daedalus_add_spirv(${H264DEBLOCK_SPV}
    ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock.comp
    --target-env vulkan1.3)

  # Umbrella target: part of `all`, and the hook other targets use (through
  # add_dependencies) to make sure every .spv file is built first.
  add_custom_target(daedalus_shaders ALL
    DEPENDS
      ${NOOP_SPV}
      ${IDCT8_SPV}
      ${LPF_SPV}
      ${MC_SPV}
      ${LPF8_SPV}
      ${CDEF_SPV}
      ${H264DEBLOCK_SPV}
  )

  # v3d_runner — reusable Vulkan plumbing.
  add_library(v3d_runner STATIC src/v3d_runner.c)
  target_include_directories(v3d_runner PUBLIC src)
  target_link_libraries(v3d_runner PUBLIC Vulkan::Vulkan)
  target_compile_options(v3d_runner PRIVATE -O2)

  # Raw dispatch-overhead bench: links Vulkan directly, not v3d_runner.
  add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c)
  add_dependencies(bench_vulkan_dispatch daedalus_shaders)
  target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan)
  target_compile_options(bench_vulkan_dispatch PRIVATE -O2)

  # QPU IDCT bench, built together with the VP9 IDCT C reference.
  add_executable(bench_v3d_idct
    tests/bench_v3d_idct.c
    tests/vp9_idct8_ref.c
  )
  add_dependencies(bench_v3d_idct daedalus_shaders)
  target_link_libraries(bench_v3d_idct PRIVATE v3d_runner Vulkan::Vulkan)
  target_compile_options(bench_v3d_idct PRIVATE -O2)

  # Cycle 2 — QPU LPF bench.
  add_executable(bench_v3d_lpf
    tests/bench_v3d_lpf.c
    tests/vp9_lpf_ref.c
  )
  add_dependencies(bench_v3d_lpf daedalus_shaders)
  target_link_libraries(bench_v3d_lpf PRIVATE v3d_runner Vulkan::Vulkan)
  target_compile_options(bench_v3d_lpf PRIVATE -O2)

  # Cycle 3 — QPU MC bench.
  add_executable(bench_v3d_mc
    tests/bench_v3d_mc.c
    tests/vp9_mc_ref.c
  )
  add_dependencies(bench_v3d_mc daedalus_shaders)
  target_link_libraries(bench_v3d_mc PRIVATE v3d_runner Vulkan::Vulkan)
  target_compile_options(bench_v3d_mc PRIVATE -O2)

  # Cycle 4 — QPU LPF wd=8 bench.
  add_executable(bench_v3d_lpf8
    tests/bench_v3d_lpf8.c
    tests/vp9_lpf8_ref.c
  )
  add_dependencies(bench_v3d_lpf8 daedalus_shaders)
  target_link_libraries(bench_v3d_lpf8 PRIVATE v3d_runner Vulkan::Vulkan)
  target_compile_options(bench_v3d_lpf8 PRIVATE -O2)

  # Cycle 5 — QPU CDEF bench (3-way M1 against NEON + C ref).
  add_executable(bench_v3d_cdef
    tests/bench_v3d_cdef.c
    tests/cdef_ref.c
    ${DAV1D_CDEF_ASM_SOURCES}
    ${DAV1D_CDEF_C_SOURCES}
  )
  add_dependencies(bench_v3d_cdef daedalus_shaders)
  target_link_libraries(bench_v3d_cdef PRIVATE v3d_runner Vulkan::Vulkan)
  target_compile_options(bench_v3d_cdef PRIVATE -O2)

  # Cycle 8 — QPU H.264 deblock bench (3-way).
  add_executable(bench_v3d_h264deblock
    tests/bench_v3d_h264deblock.c
    tests/h264_deblock_ref.c
    ${FFASM_H264DSP_SOURCES}
  )
  add_dependencies(bench_v3d_h264deblock daedalus_shaders)
  target_link_libraries(bench_v3d_h264deblock PRIVATE v3d_runner Vulkan::Vulkan)
  target_compile_options(bench_v3d_h264deblock PRIVATE -O2)
endif()
# ---- Phase 8 — public C API library + smoke test ---------------------------
# daedalus_core compiles src/v3d_runner.c and links Vulkan::Vulkan regardless
# of DAEDALUS_BUILD_VULKAN, so the imported target has to exist even when that
# option is OFF. Locate Vulkan here unconditionally; repeating find_package()
# after an earlier successful call is harmless.
find_package(Vulkan REQUIRED)

# Static library behind the public headers in include/. It bundles the
# Vulkan runner with the vendored NEON kernels listed below.
# NOTE(review): FFASM_H264QPEL_SOURCES (cycle 9) is not part of this list —
# confirm whether src/daedalus_core.c references any qpel symbol.
add_library(daedalus_core STATIC
  src/daedalus_core.c
  src/v3d_runner.c
  ${FFASM_SOURCES}
  ${FFASM_LPF_SOURCES}
  ${FFASM_MC_SOURCES}
  ${FFC_MC_SOURCES}
  ${FFASM_H264IDCT_SOURCES}
  ${FFASM_H264DSP_SOURCES}
  ${DAV1D_CDEF_ASM_SOURCES}
  ${DAV1D_CDEF_C_SOURCES}
)
target_include_directories(daedalus_core PUBLIC include)
target_include_directories(daedalus_core PRIVATE src)
target_link_libraries(daedalus_core PUBLIC Vulkan::Vulkan)
target_compile_options(daedalus_core PRIVATE -O2)

# The shader target only exists when the Vulkan benches are enabled; when it
# does, make sure the .spv files are built before the library.
if(DAEDALUS_BUILD_VULKAN)
  add_dependencies(daedalus_core daedalus_shaders)
endif()

# API smoke tests: each links daedalus_core plus the C reference
# implementations it is built with.
add_executable(test_api_idct
  tests/test_api_idct.c
  tests/vp9_idct8_ref.c
)
target_link_libraries(test_api_idct PRIVATE daedalus_core)
target_compile_options(test_api_idct PRIVATE -O2)

add_executable(test_api_lpf
  tests/test_api_lpf.c
  tests/vp9_lpf_ref.c
  tests/vp9_lpf8_ref.c
)
target_link_libraries(test_api_lpf PRIVATE daedalus_core)
target_compile_options(test_api_lpf PRIVATE -O2)

add_executable(test_api_h264
  tests/test_api_h264.c
  tests/h264_idct4_ref.c
  tests/h264_idct8_ref.c
  tests/h264_deblock_ref.c
)
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
target_compile_options(test_api_h264 PRIVATE -O2)

add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
if(DAEDALUS_BUILD_VULKAN)
  # Second DAEDALUS_BUILD_VULKAN section: the concurrent benches link
  # v3d_runner and depend on daedalus_shaders, both of which are only
  # declared when the option is ON.

  # Link threads through the imported Threads::Threads target instead of a
  # raw `pthread` library name, so CMake selects the right flag/library for
  # the toolchain.
  find_package(Threads REQUIRED)

  # M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON
  # snapshot so we can run real NEON kernels on pinned CPU cores
  # while the QPU runs its dispatch loop concurrently.
  add_executable(bench_concurrent
    tests/bench_concurrent.c
    ${FFASM_SOURCES}
  )
  add_dependencies(bench_concurrent daedalus_shaders)
  target_link_libraries(bench_concurrent
    PRIVATE v3d_runner Vulkan::Vulkan Threads::Threads)
  target_compile_options(bench_concurrent
    PRIVATE -O3 -march=armv8-a+simd)

  # Cycle 2 M4'' — concurrent LPF.
  add_executable(bench_concurrent_lpf
    tests/bench_concurrent_lpf.c
    ${FFASM_LPF_SOURCES}
  )
  add_dependencies(bench_concurrent_lpf daedalus_shaders)
  target_link_libraries(bench_concurrent_lpf
    PRIVATE v3d_runner Vulkan::Vulkan Threads::Threads)
  target_compile_options(bench_concurrent_lpf
    PRIVATE -O3 -march=armv8-a+simd)

  # Cycle 3 M4''' — concurrent MC.
  add_executable(bench_concurrent_mc
    tests/bench_concurrent_mc.c
    ${FFASM_MC_SOURCES}
    ${FFC_MC_SOURCES}
  )
  add_dependencies(bench_concurrent_mc daedalus_shaders)
  target_link_libraries(bench_concurrent_mc
    PRIVATE v3d_runner Vulkan::Vulkan Threads::Threads)
  target_compile_options(bench_concurrent_mc
    PRIVATE -O3 -march=armv8-a+simd)

  # Cycle 4 M4'''' — concurrent LPF wd=8.
  add_executable(bench_concurrent_lpf8
    tests/bench_concurrent_lpf8.c
    ${FFASM_LPF_SOURCES}
  )
  add_dependencies(bench_concurrent_lpf8 daedalus_shaders)
  target_link_libraries(bench_concurrent_lpf8
    PRIVATE v3d_runner Vulkan::Vulkan Threads::Threads)
  target_compile_options(bench_concurrent_lpf8
    PRIVATE -O3 -march=armv8-a+simd)

  # Issue 003 — mixed-kernel M4 bench (NEON-N kernel A + QPU kernel B).
  # Links the VP9 itxfm / LPF / MC, H.264 deblock (h264dsp) and dav1d CDEF
  # NEON sources; the H.264 IDCT and qpel sources are not part of this target.
  add_executable(bench_concurrent_mixed
    tests/bench_concurrent_mixed.c
    ${FFASM_SOURCES}
    ${FFASM_LPF_SOURCES}
    ${FFASM_MC_SOURCES}
    ${FFC_MC_SOURCES}
    ${FFASM_H264DSP_SOURCES}
    ${DAV1D_CDEF_ASM_SOURCES}
    ${DAV1D_CDEF_C_SOURCES}
  )
  add_dependencies(bench_concurrent_mixed daedalus_shaders)
  target_link_libraries(bench_concurrent_mixed
    PRIVATE v3d_runner Vulkan::Vulkan Threads::Threads)
  target_compile_options(bench_concurrent_mixed
    PRIVATE -O3 -march=armv8-a+simd)
endif()
# ---- Summary ----------------------------------------------------------------
# message() runs at configure time, where generator expressions are never
# evaluated (they would be printed literally), so the optional part of the
# target list is assembled with a plain if().
set(daedalus_summary_targets "bench_neon_idct")
if(DAEDALUS_BUILD_VULKAN)
  string(APPEND daedalus_summary_targets "; bench_vulkan_dispatch")
endif()

message(STATUS "daedalus-fourier build configured for ${CMAKE_SYSTEM_PROCESSOR}")
message(STATUS " FFmpeg snapshot: ${FFSNAP}")
message(STATUS " Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS " Targets: ${daedalus_summary_targets}")