From 814b74d0bbea54e96afad83685cbd879c6ac257a Mon Sep 17 00:00:00 2001 From: claude-noether Date: Thu, 21 May 2026 20:17:09 +0200 Subject: [PATCH] daemon: per-frame decode_us + periodic stats summary (#11 step 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Establishes observable baseline metrics before any daedalus-fourier kernel substitution lands. Step 1 of the daemon-rewrite arc tracked at daedalus-v4l2#11. Changes ------- - Per-frame `decoder: OK ...` log line now carries decode_us=N (the send_packet + receive_frame wall-clock cost in microseconds — exclusively the libavcodec round-trip, not the bitstream pack / SPS-PPS synth / pack-to-planes work). - New "decoder stats" summary line every DAEDALUS_STATS_EVERY (60) decoded frames, reporting: codec, frame count, window seconds, fps, avg decode_us, MBs/s throughput, bytes/MB bitrate. Sample ------ decoder stats: codec=h264 frames=300 window=12.32s fps=24.35 avg_decode_us=4216.4 mbs_per_s=87643 bs_b_per_mb=1.56 What this tells us ------------------ Steady-state on higgs (Pi CM5) decoding bbb_720p_h264.mp4: ~4 ms decode_us, ~90 K MBs/s, well under the daedalus-fourier NEON kernel ceilings (IDCT 4×4 @ 175 Mblocks/s, deblock @ 92 Medges/s, qpel mc20 @ 131 Mblocks/s — all 100-1000× over our actual workload). Means the 4 ms/frame is mostly libavcodec's CABAC + MV prediction + intra prediction overhead, NOT the pixel-math primitives. Substituting a single primitive would shave only a small slice of the 4 ms. Useful as guidance for the upcoming substitution work — we'll pick the primitive with the largest cycle cost relative to the alternative, and measure CPU saved per substitution. No behaviour change: counters are static + unsynchronised (the chardev event loop is single-threaded); reset when codec_id changes. clock_gettime(CLOCK_MONOTONIC) for timing. --- daemon/src/decoder.c | 114 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 3 deletions(-) diff --git a/daemon/src/decoder.c b/daemon/src/decoder.c index a43d6ab..10f70f5 100644 --- a/daemon/src/decoder.c +++ b/daemon/src/decoder.c @@ -10,12 +10,55 @@ #include #include #include +#include #include #include #include +/* + * Per-codec running stats — daedalus-v4l2#11 step 1. Establishes + * baseline observability before any daedalus-fourier kernel + * substitution lands, so we can see what each substitution actually + * shifted. Per-frame `decoder: OK` line now carries decode_us; a + * "decoder stats" summary line lands every DAEDALUS_STATS_EVERY OK + * frames with throughput + per-frame budget aggregates. + * + * Counters are static (process-local) and unsynchronised — the + * daemon's chardev event loop is single-threaded, so no atomics or + * locking needed. Reset when codec_id changes (different stream). + */ +#define DAEDALUS_STATS_EVERY 60u + +struct daedalus_decode_stats { + uint32_t codec_id; + uint64_t frames; + uint64_t total_decode_ns; + uint64_t total_bitstream_bytes; + uint64_t total_mbs; /* derived from frame WxH; H.264-style 16x16 */ + struct timespec window_start; +}; + +static struct daedalus_decode_stats g_stats; + +static inline uint64_t timespec_delta_ns(const struct timespec *a, + const struct timespec *b) +{ + return (uint64_t)(b->tv_sec - a->tv_sec) * 1000000000ull + + (uint64_t)(b->tv_nsec - a->tv_nsec); +} + +static const char *codec_id_name(uint32_t cid) +{ + switch (cid) { + case DAEDALUS_CODEC_VP9: return "vp9"; + case DAEDALUS_CODEC_AV1: return "av1"; + case DAEDALUS_CODEC_H264: return "h264"; + default: return "?"; + } +} + /* * FNV-1a 32-bit hash. Used as a compact digest of the decoded * frame's YUV planes so the kernel can verify "the daemon produced @@ -467,6 +510,17 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, dec->pkt->size = (int) req->bitstream_len; } + /* + * Time send_packet+receive_frame for the per-frame `decoder: OK` + * line + the periodic stats summary. Includes only the + * libavcodec round-trip — not the bitstream packing, SPS/PPS + * synth, or pack-to-planes work (those are accounted for + * separately in the request's overall handle time). + */ + struct timespec t_decode_start, t_decode_end; + uint64_t decode_ns = 0; + clock_gettime(CLOCK_MONOTONIC, &t_decode_start); + rc = fm->avcodec_send_packet(ctx, dec->pkt); if (rc < 0) { log_err("decoder: avcodec_send_packet failed: %d", rc); @@ -476,8 +530,11 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, fm->av_frame_unref(dec->frame); rc = fm->avcodec_receive_frame(ctx, dec->frame); + clock_gettime(CLOCK_MONOTONIC, &t_decode_end); + decode_ns = timespec_delta_ns(&t_decode_start, &t_decode_end); if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) { - log_debug("decoder: no frame ready yet (rc=%d)", rc); + log_debug("decoder: no frame ready yet (rc=%d, %lu us)", + rc, (unsigned long)(decode_ns / 1000)); resp->status = DAEDALUS_DECODE_NO_FRAME; goto out; } @@ -584,10 +641,61 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, fr->format, req->capture_pix_fmt); } - log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u", + log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u decode_us=%lu", fr->width, fr->height, fr->format, desc ? desc->name : "?", - h, luma_len, chroma_len); + h, luma_len, chroma_len, + (unsigned long)(decode_ns / 1000)); + + /* + * Periodic stats summary (every DAEDALUS_STATS_EVERY frames). + * Reset window on codec change. Gives observable baseline + * for the daedalus-v4l2#11 substitution arc: fps, average + * decode_us, MB/s throughput, bitstream B/MB. Compare + * against daedalus-fourier README's per-kernel NEON + * baselines (e.g. H.264 IDCT 4x4 = 175 Mblock/s) to gauge + * which substitutions are worth pursuing. + */ + if (g_stats.codec_id != req->codec_id) { + g_stats.codec_id = req->codec_id; + g_stats.frames = 0; + g_stats.total_decode_ns = 0; + g_stats.total_bitstream_bytes = 0; + g_stats.total_mbs = 0; + clock_gettime(CLOCK_MONOTONIC, &g_stats.window_start); + } + g_stats.frames++; + g_stats.total_decode_ns += decode_ns; + g_stats.total_bitstream_bytes += req->bitstream_len; + g_stats.total_mbs += (uint64_t)((fr->width + 15) / 16) * + (uint64_t)((fr->height + 15) / 16); + + if (g_stats.frames % DAEDALUS_STATS_EVERY == 0) { + struct timespec t_now; + clock_gettime(CLOCK_MONOTONIC, &t_now); + uint64_t window_ns = + timespec_delta_ns(&g_stats.window_start, &t_now); + double window_s = (double)window_ns / 1e9; + double fps = window_s > 0 ? + (double)g_stats.frames / window_s : 0.0; + double avg_decode_us = g_stats.frames > 0 ? + (double)g_stats.total_decode_ns / + (double)g_stats.frames / 1000.0 : 0.0; + double mb_per_s = window_s > 0 ? + (double)g_stats.total_mbs / window_s : 0.0; + double bs_b_per_mb = g_stats.total_mbs > 0 ? + (double)g_stats.total_bitstream_bytes / + (double)g_stats.total_mbs : 0.0; + + log_info("decoder stats: codec=%s " + "frames=%llu window=%.2fs fps=%.2f " + "avg_decode_us=%.1f mbs_per_s=%.0f " + "bs_b_per_mb=%.2f", + codec_id_name(g_stats.codec_id), + (unsigned long long)g_stats.frames, + window_s, fps, avg_decode_us, + mb_per_s, bs_b_per_mb); + } } fm->av_frame_unref(dec->frame); -- 2.47.3