daemon: per-frame decode_us + periodic stats summary (#11 step 1)

Establishes observable baseline metrics before any daedalus-fourier
kernel substitution lands.  Step 1 of the daemon-rewrite arc tracked
at daedalus-v4l2#11.

Changes
-------
- Per-frame `decoder: OK ...` log line now carries decode_us=N (the
  send_packet + receive_frame wall-clock cost in microseconds —
  exclusively the libavcodec round-trip, not the bitstream pack /
  SPS-PPS synth / pack-to-planes work).
- New "decoder stats" summary line every DAEDALUS_STATS_EVERY (60)
  decoded frames, reporting: codec, frame count, window seconds,
  fps, avg decode_us, MBs/s throughput, bytes/MB bitrate.

Sample
------
  decoder stats: codec=h264 frames=300 window=12.32s fps=24.35
                 avg_decode_us=4216.4 mbs_per_s=87643 bs_b_per_mb=1.56

What this tells us
------------------
Steady-state on higgs (Pi CM5) decoding bbb_720p_h264.mp4:
~4 ms decode_us, ~90 K MBs/s, well under the daedalus-fourier
NEON kernel ceilings (IDCT 4×4 @ 175 Mblocks/s, deblock @ 92 Medges/s,
qpel mc20 @ 131 Mblocks/s — all 100-1000× over our actual workload).

Means the 4 ms/frame is mostly libavcodec's CABAC + MV prediction +
intra prediction overhead, NOT the pixel-math primitives.
Substituting a single primitive would shave only a small slice of
the 4 ms.  Useful as guidance for the upcoming substitution work —
we'll pick the primitive with the largest cycle cost relative to
the alternative, and measure CPU saved per substitution.

No behaviour change: counters are static + unsynchronised (the
chardev event loop is single-threaded); reset when codec_id changes.
clock_gettime(CLOCK_MONOTONIC) for timing.
This commit is contained in:
2026-05-21 20:17:09 +02:00
parent 77e14e5a19
commit 814b74d0bb
+111 -3
View File
@@ -10,12 +10,55 @@
#include <errno.h> #include <errno.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <time.h>
#include <linux/videodev2.h> #include <linux/videodev2.h>
#include <libavcodec/avcodec.h> #include <libavcodec/avcodec.h>
#include <libavutil/pixfmt.h> #include <libavutil/pixfmt.h>
/*
* Per-codec running stats — daedalus-v4l2#11 step 1. Establishes
* baseline observability before any daedalus-fourier kernel
* substitution lands, so we can see what each substitution actually
* shifted. Per-frame `decoder: OK` line now carries decode_us; a
* "decoder stats" summary line lands every DAEDALUS_STATS_EVERY OK
* frames with throughput + per-frame budget aggregates.
*
* Counters are static (process-local) and unsynchronised — the
* daemon's chardev event loop is single-threaded, so no atomics or
* locking needed. Reset when codec_id changes (different stream).
*/
#define DAEDALUS_STATS_EVERY 60u
struct daedalus_decode_stats {
uint32_t codec_id;
uint64_t frames;
uint64_t total_decode_ns;
uint64_t total_bitstream_bytes;
uint64_t total_mbs; /* derived from frame WxH; H.264-style 16x16 */
struct timespec window_start;
};
static struct daedalus_decode_stats g_stats;
static inline uint64_t timespec_delta_ns(const struct timespec *a,
const struct timespec *b)
{
return (uint64_t)(b->tv_sec - a->tv_sec) * 1000000000ull +
(uint64_t)(b->tv_nsec - a->tv_nsec);
}
static const char *codec_id_name(uint32_t cid)
{
switch (cid) {
case DAEDALUS_CODEC_VP9: return "vp9";
case DAEDALUS_CODEC_AV1: return "av1";
case DAEDALUS_CODEC_H264: return "h264";
default: return "?";
}
}
/* /*
* FNV-1a 32-bit hash. Used as a compact digest of the decoded * FNV-1a 32-bit hash. Used as a compact digest of the decoded
* frame's YUV planes so the kernel can verify "the daemon produced * frame's YUV planes so the kernel can verify "the daemon produced
@@ -467,6 +510,17 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec,
dec->pkt->size = (int) req->bitstream_len; dec->pkt->size = (int) req->bitstream_len;
} }
/*
* Time send_packet+receive_frame for the per-frame `decoder: OK`
* line + the periodic stats summary. Includes only the
* libavcodec round-trip — not the bitstream packing, SPS/PPS
* synth, or pack-to-planes work (those are accounted for
* separately in the request's overall handle time).
*/
struct timespec t_decode_start, t_decode_end;
uint64_t decode_ns = 0;
clock_gettime(CLOCK_MONOTONIC, &t_decode_start);
rc = fm->avcodec_send_packet(ctx, dec->pkt); rc = fm->avcodec_send_packet(ctx, dec->pkt);
if (rc < 0) { if (rc < 0) {
log_err("decoder: avcodec_send_packet failed: %d", rc); log_err("decoder: avcodec_send_packet failed: %d", rc);
@@ -476,8 +530,11 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec,
fm->av_frame_unref(dec->frame); fm->av_frame_unref(dec->frame);
rc = fm->avcodec_receive_frame(ctx, dec->frame); rc = fm->avcodec_receive_frame(ctx, dec->frame);
clock_gettime(CLOCK_MONOTONIC, &t_decode_end);
decode_ns = timespec_delta_ns(&t_decode_start, &t_decode_end);
if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) { if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
log_debug("decoder: no frame ready yet (rc=%d)", rc); log_debug("decoder: no frame ready yet (rc=%d, %lu us)",
rc, (unsigned long)(decode_ns / 1000));
resp->status = DAEDALUS_DECODE_NO_FRAME; resp->status = DAEDALUS_DECODE_NO_FRAME;
goto out; goto out;
} }
@@ -584,10 +641,61 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec,
fr->format, req->capture_pix_fmt); fr->format, req->capture_pix_fmt);
} }
log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u", log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u decode_us=%lu",
fr->width, fr->height, fr->format, fr->width, fr->height, fr->format,
desc ? desc->name : "?", desc ? desc->name : "?",
h, luma_len, chroma_len); h, luma_len, chroma_len,
(unsigned long)(decode_ns / 1000));
/*
* Periodic stats summary (every DAEDALUS_STATS_EVERY frames).
* Reset window on codec change. Gives observable baseline
* for the daedalus-v4l2#11 substitution arc: fps, average
* decode_us, MB/s throughput, bitstream B/MB. Compare
* against daedalus-fourier README's per-kernel NEON
* baselines (e.g. H.264 IDCT 4x4 = 175 Mblock/s) to gauge
* which substitutions are worth pursuing.
*/
if (g_stats.codec_id != req->codec_id) {
g_stats.codec_id = req->codec_id;
g_stats.frames = 0;
g_stats.total_decode_ns = 0;
g_stats.total_bitstream_bytes = 0;
g_stats.total_mbs = 0;
clock_gettime(CLOCK_MONOTONIC, &g_stats.window_start);
}
g_stats.frames++;
g_stats.total_decode_ns += decode_ns;
g_stats.total_bitstream_bytes += req->bitstream_len;
g_stats.total_mbs += (uint64_t)((fr->width + 15) / 16) *
(uint64_t)((fr->height + 15) / 16);
if (g_stats.frames % DAEDALUS_STATS_EVERY == 0) {
struct timespec t_now;
clock_gettime(CLOCK_MONOTONIC, &t_now);
uint64_t window_ns =
timespec_delta_ns(&g_stats.window_start, &t_now);
double window_s = (double)window_ns / 1e9;
double fps = window_s > 0 ?
(double)g_stats.frames / window_s : 0.0;
double avg_decode_us = g_stats.frames > 0 ?
(double)g_stats.total_decode_ns /
(double)g_stats.frames / 1000.0 : 0.0;
double mb_per_s = window_s > 0 ?
(double)g_stats.total_mbs / window_s : 0.0;
double bs_b_per_mb = g_stats.total_mbs > 0 ?
(double)g_stats.total_bitstream_bytes /
(double)g_stats.total_mbs : 0.0;
log_info("decoder stats: codec=%s "
"frames=%llu window=%.2fs fps=%.2f "
"avg_decode_us=%.1f mbs_per_s=%.0f "
"bs_b_per_mb=%.2f",
codec_id_name(g_stats.codec_id),
(unsigned long long)g_stats.frames,
window_s, fps, avg_decode_us,
mb_per_s, bs_b_per_mb);
}
} }
fm->av_frame_unref(dec->frame); fm->av_frame_unref(dec->frame);