Files
daedalus-fourier/tests/bench_h264_primitives.c
T
claude-noether ba5bbae8e2 bench: H.264 primitives NEON CPU baseline (1080p budget projection)
Adds bench_h264_primitives — a non-ctest binary that times the
H.264 pixel-math primitives at their representative per-frame N and
projects 1080p frame budgets.  Lets us answer "how much of the
33-ms 30fps deadline does the pixel-math layer eat on NEON alone,
before the intercept patch adds entropy decode + metadata work."

Results on hertz (Pi 5 / 4×Cortex-A76, NEON path):

  Per-kernel ns/op (CPU NEON):
    IDCT 4x4 luma            10.78 ns/block
    IDCT 8x8 luma            29.73 ns/block
    Deblock luma_v           18.04 ns/edge
    Deblock luma_h           41.65 ns/edge   (H access pattern less SIMD-friendly)
    qpel mc20  (H half-pel)  25.66 ns/block
    qpel mc02  (V half-pel)  15.06 ns/block  (faster than mc20!)
    qpel mc22  (HV half-pel) 71.50 ns/block  (cascaded H+V, expected)

  Projected 1080p frame budgets (worst-case, CPU NEON only):
    IDCT 4x4 (all-4x4 MBs):       1.41 ms   (130,560 blocks)
    IDCT 8x8 (all-8x8 MBs):       0.97 ms   ( 32,640 blocks)
    Deblock luma_v (all MBs):     0.59 ms   ( 32,640 edges)
    Deblock luma_h (all MBs):     1.36 ms   ( 32,640 edges)
    qpel mc22 (all 8x8 blocks):   2.33 ms   ( 32,640 blocks)

    Sum (IDCT 4x4 + deblock luma + MC all-mc22):    5.69 ms
    30 fps deadline:                              33.33 ms
    Margin:                                       +27.64 ms

What this validates:

  - The "30fps@1080p is the fine floor" memory note holds with
    huge headroom on the pixel-math layer alone.  17% of the
    deadline goes to pixel math (worst case); 83% is available
    for entropy decode + reference frame management + intra
    prediction + chroma deblock + chroma IDCT + the libavcodec
    intercept overhead.
  - The CPU-vs-QPU substrate finding from earlier (PR #10 on
    daedalus-decoder showed CPU NEON is 4x faster than QPU for
    IDCT) is consistent here.  All these kernels have CPU-only
    recipes by default; the data suggests that's the right call
    for now.  The recipe substrate decision can be revisited
    per-kernel once QPU shaders catch up.
  - mc22 (2D HV half-pel) is the most expensive single qpel
    position at ~71 ns/block — roughly 3-5x the 1D variants
    (2.8x mc20, 4.7x mc02).
    Real B-slice biprediction with two mc22 calls per MB would
    add ~4.7 ms/frame; still comfortable but worth knowing.

What this DOESN'T measure (intentionally — they aren't on the
critical path at NEON speeds):

  - Chroma IDCT (4 cb + 4 cr 4x4 per MB).  At similar ns/block to
    luma, that's ~0.7 ms/frame.
  - Chroma deblock (smaller tile, simpler kernel — sub-ms).
  - Intra prediction (per-block, ~50 ops at NEON, but serialized
    in z-scan order so cache-friendly; ~0.5 ms/frame estimate).
  - bS=4 intra deblock variants — different algorithm, similar
    cost to bS<4.
  - chroma DC Hadamard — trivial.

Adding all of those in the worst case would maybe double the 5.69
ms number to ~12 ms.  Still leaves 20+ ms for entropy decode +
metadata work in the intercept patch.
2026-05-25 11:26:11 +02:00

221 lines
9.3 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/* Needed for clock_gettime()/CLOCK_MONOTONIC when built as strict -std=c11 (CMAKE_C_EXTENSIONS=OFF). */
#define _POSIX_C_SOURCE 200809L
/*
* bench_h264_primitives — NEON-path latency baseline for the H.264
* primitive library landed across PRs #9-#23.
*
* Each kernel is exercised at a representative per-frame N for 1080p
* (8160 MBs); the per-kernel total + ns/op + ms/frame are reported.
* Lets us answer "what's the total NEON-only budget for the H.264
* decode at 1080p" — useful for sizing intercept-patch decisions
* (which kernels NEED QPU shaders vs which are budget-fine on NEON).
*
* NOT a ctest — produces wall-time numbers, doesn't pass/fail.
*
* Invoke: ./build/bench_h264_primitives [iters] [warmup]
* (defaults: iters = 50 timed calls, warmup = 5 untimed calls)
*
* NB: results are inherently approximate — single-core, includes
* loop overhead + memory access patterns that may not match what
* a real decode would hit (we touch a small set of pages repeatedly).
* The numbers are useful for relative comparison and order-of-
* magnitude sizing, not absolute perf claims.
*/
#include "daedalus.h"
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Generator state; fixed non-zero seed so every run fills the buffers with
 * the same pseudo-random data. */
static uint64_t xs64_state = 0xfeedface5a5a5a5aULL;

/* Advance the 64-bit xorshift generator (shift triple 13, 7, 17) by one
 * step and return the new state.  A state of 0 stays 0 forever. */
static uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs64_state = s;
    return s;
}
/* Current CLOCK_MONOTONIC reading expressed in milliseconds (as a double).
 * The return value of clock_gettime() is not checked. */
static double now_ms(void)
{
    struct timespec t;

    clock_gettime(CLOCK_MONOTONIC, &t);

    const double whole_ms = t.tv_sec * 1000.0;
    const double frac_ms = t.tv_nsec / 1.0e6;
    return whole_ms + frac_ms;
}
/* Per-frame work counts for 1080p.  The frame is coded as 1920x1088, i.e.
 * 120 x 68 = 8160 16x16 macroblocks (MBs).  main() scales each ns/op figure
 * by MBS_1080P times a per-MB count to get ms/frame.  The chroma counts are
 * recorded for reference only: the projection printed by main() covers luma
 * IDCT, luma deblock and qpel. */
#define MBS_1080P 8160
#define LUMA_4x4_PER_MB 16 /* 4x4 luma transform blocks per MB (transform_8x8=0) */
#define LUMA_8x8_PER_MB 4 /* 8x8 luma transform blocks per MB (transform_8x8=1) */
#define CHROMA_4x4_PER_MB 8 /* 4 Cb + 4 Cr */
#define DEBLOCK_LUMA_EDGES_PER_MB 4 /* per direction: ~4 luma_v plus ~4 luma_h edges per MB (MB edge + internal), 8 in total */
#define DEBLOCK_CHROMA_EDGES_PER_MB 2 /* per direction */
/* Signature of a benchmark body: one call performs one dispatch of a fixed
 * number of operations on the file-scope buffers. */
typedef void (*bench_fn)(void);

/* Time `fn` and report the average cost of a single operation.
 *
 *   name         - label printed in the result line
 *   iters        - number of timed calls to fn()
 *   warmup       - number of untimed calls made first
 *   ops_per_iter - operations (blocks / edges) performed by ONE call to fn()
 *   fn           - benchmark body
 *
 * Prints one result line to stdout and returns nanoseconds per operation,
 * i.e. elapsed wall time divided by (iters * ops_per_iter).  No guard is
 * applied here: the caller must pass iters > 0 and ops_per_iter > 0. */
static double bench_ns(const char *name, int iters, int warmup,
                       int ops_per_iter, bench_fn fn)
{
    int remaining;

    /* Untimed warm-up calls. */
    for (remaining = warmup; remaining > 0; remaining--)
        fn();

    const double start_ms = now_ms();
    for (remaining = iters; remaining > 0; remaining--)
        fn();
    const double elapsed_ms = now_ms() - start_ms;

    /* Multiply in double so iters * ops_per_iter cannot overflow int. */
    const double total_ops = (double) iters * ops_per_iter;
    const double ns_per_op = (elapsed_ms * 1e6) / total_ops;

    printf(" %-32s %8.2f ns/op (%d iters x %d ops)\n",
           name, ns_per_op, iters, ops_per_iter);
    return ns_per_op;
}
/* ---- Per-kernel scaffolding. Each section below declares the static
 * buffers + meta arrays for one kernel family, then defines a void(void)
 * wrapper that issues a single dispatch call with a fixed N, so the
 * wrapper can be handed to bench_ns() as a bench_fn.  The buffers and
 * meta entries themselves are filled in by main(). */
/* Library context shared by every wrapper; assigned in main() before any
 * benchmark runs and destroyed there before exit. */
static daedalus_ctx *ctx;
/* --- IDCT 4x4 luma.  One dispatch call covers 1024 blocks (64 MBs at
 * 16 blocks per MB) so the fixed per-call overhead is spread over many
 * blocks and bench_ns() can report a per-block figure. */
enum {
    BENCH_IDCT4_BLOCKS = 1024,  /* blocks handed to one dispatch call */
    /* Fourth dispatch argument; presumably the destination row stride in
     * bytes - TODO confirm against daedalus.h. */
    BENCH_IDCT4_DST_STRIDE = 64 * 16
};

/* 16 coefficients per 4x4 block. */
static int16_t idct4_coeffs[BENCH_IDCT4_BLOCKS * 16];
static daedalus_h264_block_meta idct4_meta[BENCH_IDCT4_BLOCKS];
/* Destination buffer (65536 bytes), also used by the 8x8 IDCT bench. */
static uint8_t idct_dst[64 * 4 * 16 * 16];

/* bench_fn body: one CPU-substrate dispatch of all 1024 4x4 blocks. */
static void bench_idct4(void)
{
    daedalus_dispatch_h264_idct4(ctx, DAEDALUS_SUBSTRATE_CPU,
                                 idct_dst, BENCH_IDCT4_DST_STRIDE,
                                 idct4_coeffs, BENCH_IDCT4_BLOCKS,
                                 idct4_meta);
}
/* --- IDCT 8x8 luma: 256 8x8 blocks per dispatch call, written into the
 * idct_dst buffer declared with the 4x4 bench. */
enum {
    BENCH_IDCT8_BLOCKS = 256,   /* blocks handed to one dispatch call */
    /* Fourth dispatch argument; presumably the destination row stride in
     * bytes - TODO confirm against daedalus.h. */
    BENCH_IDCT8_DST_STRIDE = 64 * 16
};

/* 64 coefficients per 8x8 block. */
static int16_t idct8_coeffs[BENCH_IDCT8_BLOCKS * 64];
static daedalus_h264_block_meta idct8_meta[BENCH_IDCT8_BLOCKS];

/* bench_fn body: one CPU-substrate dispatch of all 256 8x8 blocks. */
static void bench_idct8(void)
{
    daedalus_dispatch_h264_idct8(ctx, DAEDALUS_SUBSTRATE_CPU,
                                 idct_dst, BENCH_IDCT8_DST_STRIDE,
                                 idct8_coeffs, BENCH_IDCT8_BLOCKS,
                                 idct8_meta);
}
/* --- Deblock luma_v / luma_h (cycle 8 baseline; M3 path).  Both wrappers
 * share one meta array and one destination buffer of 256 16x16 tiles, and
 * each dispatch call processes 256 edges.
 *
 * NOTE(review): main() places edge i at byte offset i*256 + 4*16.  If the
 * luma_h kernel touches 16 rows at stride 16 starting from that offset,
 * the last tile would reach past the end of deblock_dst - confirm the
 * kernel's footprint and pad the buffer if so. */
enum {
    BENCH_DEBLOCK_EDGES = 256,  /* edges handed to one dispatch call */
    BENCH_DEBLOCK_STRIDE = 16   /* fourth dispatch argument; presumably the row stride in bytes - TODO confirm */
};

static daedalus_h264_deblock_meta deblock_meta[BENCH_DEBLOCK_EDGES];
static uint8_t deblock_dst[BENCH_DEBLOCK_EDGES * 16 * 16];

/* bench_fn body: one CPU-substrate luma_v dispatch over all 256 edges. */
static void bench_deblock_v(void)
{
    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU,
                                          deblock_dst, BENCH_DEBLOCK_STRIDE,
                                          BENCH_DEBLOCK_EDGES, deblock_meta);
}

/* bench_fn body: one CPU-substrate luma_h dispatch over all 256 edges. */
static void bench_deblock_h(void)
{
    daedalus_dispatch_h264_deblock_luma_h(ctx, DAEDALUS_SUBSTRATE_CPU,
                                          deblock_dst, BENCH_DEBLOCK_STRIDE,
                                          BENCH_DEBLOCK_EDGES, deblock_meta);
}
/* --- qpel mc20 + mc02 + mc22 (the H/V/HV anchors).  The three wrappers
 * share source, destination and meta storage for 256 16x16 tiles; each
 * dispatch call processes 256 blocks. */
enum {
    BENCH_QPEL_BLOCKS = 256,    /* blocks handed to one dispatch call */
    BENCH_QPEL_STRIDE = 16      /* fifth dispatch argument; presumably the row stride in bytes - TODO confirm */
};

static uint8_t qpel_src[BENCH_QPEL_BLOCKS * 16 * 16];
static uint8_t qpel_dst[BENCH_QPEL_BLOCKS * 16 * 16];
static daedalus_h264_qpel_meta qpel_meta[BENCH_QPEL_BLOCKS];

/* bench_fn body: one CPU-substrate mc20 dispatch over all 256 blocks. */
static void bench_qpel_mc20(void)
{
    daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_CPU,
                                     qpel_dst, qpel_src, BENCH_QPEL_STRIDE,
                                     BENCH_QPEL_BLOCKS, qpel_meta);
}

/* bench_fn body: one CPU-substrate mc02 dispatch over all 256 blocks. */
static void bench_qpel_mc02(void)
{
    daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_CPU,
                                     qpel_dst, qpel_src, BENCH_QPEL_STRIDE,
                                     BENCH_QPEL_BLOCKS, qpel_meta);
}

/* bench_fn body: one CPU-substrate mc22 dispatch over all 256 blocks. */
static void bench_qpel_mc22(void)
{
    daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_CPU,
                                     qpel_dst, qpel_src, BENCH_QPEL_STRIDE,
                                     BENCH_QPEL_BLOCKS, qpel_meta);
}
int main(int argc, char **argv)
{
int iters = argc > 1 ? atoi(argv[1]) : 50;
int warmup = argc > 2 ? atoi(argv[2]) : 5;
ctx = daedalus_ctx_create();
if (!ctx) {
fprintf(stderr, "ctx create failed (Vulkan?)\n");
return 1;
}
/* Pre-fill all input buffers with random data so the NEON inner
* loops see realistic memory access patterns. */
for (size_t i = 0; i < sizeof(idct4_coeffs)/2; i++)
idct4_coeffs[i] = (int16_t)((int)(xs64() % 1024) - 512);
for (size_t i = 0; i < sizeof(idct8_coeffs)/2; i++)
idct8_coeffs[i] = (int16_t)((int)(xs64() % 1024) - 512);
for (size_t i = 0; i < sizeof(qpel_src); i++) qpel_src[i] = (uint8_t)(xs64() & 0xff);
/* IDCT meta: each block at offset i*16 (row layout matters less
* here since we're just measuring per-block latency). */
for (size_t i = 0; i < 1024; i++)
idct4_meta[i].dst_off = (uint32_t)((i / 16) * 64 + (i % 16) * 4);
for (size_t i = 0; i < 256; i++)
idct8_meta[i].dst_off = (uint32_t)((i / 8) * 64 + (i % 8) * 8);
/* Deblock meta: edge offsets within 256 16x16 tiles. */
for (size_t i = 0; i < 256; i++) {
deblock_meta[i].dst_off = (uint32_t)(i * 256 + 4 * 16);
deblock_meta[i].alpha = 30;
deblock_meta[i].beta = 10;
for (int s = 0; s < 4; s++) deblock_meta[i].tc0[s] = (int8_t)(s + 1);
}
/* qpel meta: src and dst at row 3 col 3 of each 16x16 tile. */
for (size_t i = 0; i < 256; i++) {
qpel_meta[i].src_off = (uint32_t)(i * 256 + 3 * 16 + 3);
qpel_meta[i].dst_off = (uint32_t)(i * 256 + 3 * 16 + 3);
}
printf("bench_h264_primitives: %d iters (%d warmup), substrate=CPU NEON\n",
iters, warmup);
printf("Per-call N is set per kernel; ns/op is per BLOCK or EDGE.\n\n");
double idct4_ns = bench_ns("IDCT 4x4 luma", iters, warmup, 1024, bench_idct4);
double idct8_ns = bench_ns("IDCT 8x8 luma", iters, warmup, 256, bench_idct8);
double debl_v_ns = bench_ns("Deblock luma_v", iters, warmup, 256, bench_deblock_v);
double debl_h_ns = bench_ns("Deblock luma_h", iters, warmup, 256, bench_deblock_h);
double qmc20_ns = bench_ns("qpel mc20 (8x8)", iters, warmup, 256, bench_qpel_mc20);
double qmc02_ns = bench_ns("qpel mc02 (8x8)", iters, warmup, 256, bench_qpel_mc02);
double qmc22_ns = bench_ns("qpel mc22 (8x8)", iters, warmup, 256, bench_qpel_mc22);
/* Per-frame budget summary at 1080p (8160 MBs). Worst-case
* assumptions:
* - All MBs are transform_4x4 (16 4x4 IDCTs each) — so 130,560
* IDCT 4x4 blocks per frame. If High profile transform_8x8,
* it'd be 32,640 IDCT 8x8 blocks instead.
* - All MBs are intra (no MC — qpel zero) OR all inter (no
* intra prediction). We report MC at "all inter, all qpel
* mc22" worst case.
* - Deblock: ~4 luma_v + 4 luma_h edges per MB; assume all 8
* edges trigger filtering. */
printf("\nProjected 1080p frame budgets (worst-case, CPU NEON only):\n");
printf(" IDCT 4x4 (all-4x4 MBs): %7.2f ms (%d blocks)\n",
idct4_ns * MBS_1080P * 16 / 1e6, MBS_1080P * 16);
printf(" IDCT 8x8 (all-8x8 MBs): %7.2f ms (%d blocks)\n",
idct8_ns * MBS_1080P * 4 / 1e6, MBS_1080P * 4);
printf(" Deblock luma_v (all MBs): %7.2f ms (%d edges)\n",
debl_v_ns * MBS_1080P * 4 / 1e6, MBS_1080P * 4);
printf(" Deblock luma_h (all MBs): %7.2f ms (%d edges)\n",
debl_h_ns * MBS_1080P * 4 / 1e6, MBS_1080P * 4);
printf(" qpel mc22 (all 8x8 blocks): %7.2f ms (%d blocks)\n",
qmc22_ns * MBS_1080P * 4 / 1e6, MBS_1080P * 4);
double sum_idct_4x4 = idct4_ns * MBS_1080P * 16 / 1e6;
double sum_deblock = (debl_v_ns + debl_h_ns) * MBS_1080P * 4 / 1e6;
double sum_mc = qmc22_ns * MBS_1080P * 4 / 1e6; /* worst-case all-mc22 */
printf("\n Sum (IDCT 4x4 + deblock luma + MC all-mc22): %7.2f ms\n",
sum_idct_4x4 + sum_deblock + sum_mc);
printf(" 30 fps deadline: 33.33 ms\n");
printf(" Margin: %+.2f ms\n",
33.33 - (sum_idct_4x4 + sum_deblock + sum_mc));
printf("\n(NOT included: chroma deblock, chroma IDCT, intra prediction,\n");
printf(" CABAC/CAVLC entropy. These bench numbers are a budget LOWER\n");
printf(" bound; the real decode stack adds 20-40%% on top.)\n");
(void) qmc20_ns; (void) qmc02_ns;
daedalus_ctx_destroy(ctx);
return 0;
}