daemon: per-frame decode_us + periodic stats summary (#11 step 1)
Establishes observable baseline metrics before any daedalus-fourier
kernel substitution lands. Step 1 of the daemon-rewrite arc tracked
at daedalus-v4l2#11.
Changes
-------
- Per-frame `decoder: OK ...` log line now carries decode_us=N (the
send_packet + receive_frame wall-clock cost in microseconds —
exclusively the libavcodec round-trip, not the bitstream pack /
SPS-PPS synth / pack-to-planes work).
- New "decoder stats" summary line every DAEDALUS_STATS_EVERY (60)
decoded frames, reporting: codec, frame count, window seconds,
fps, avg decode_us, MBs/s throughput, bytes/MB bitrate.
Sample
------
decoder stats: codec=h264 frames=300 window=12.32s fps=24.35
avg_decode_us=4216.4 mbs_per_s=87643 bs_b_per_mb=1.56
What this tells us
------------------
Steady-state on higgs (Pi CM5) decoding bbb_720p_h264.mp4:
~4 ms decode_us, ~90 K MBs/s, well under the daedalus-fourier
NEON kernel ceilings (IDCT 4×4 @ 175 Mblocks/s, deblock @ 92 Medges/s,
qpel mc20 @ 131 Mblocks/s — all 100-1000× over our actual workload).
Means the 4 ms/frame is mostly libavcodec's CABAC + MV prediction +
intra prediction overhead, NOT the pixel-math primitives.
Substituting a single primitive would shave only a small slice of
the 4 ms. Useful as guidance for the upcoming substitution work —
we'll pick the primitive with the largest cycle cost relative to
the alternative, and measure CPU saved per substitution.
No behaviour change: counters are static + unsynchronised (the
chardev event loop is single-threaded); reset when codec_id changes.
clock_gettime(CLOCK_MONOTONIC) for timing.
This commit is contained in:
+111
-3
@@ -10,12 +10,55 @@
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
#include <linux/videodev2.h>
|
#include <linux/videodev2.h>
|
||||||
|
|
||||||
#include <libavcodec/avcodec.h>
|
#include <libavcodec/avcodec.h>
|
||||||
#include <libavutil/pixfmt.h>
|
#include <libavutil/pixfmt.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Per-codec running stats — daedalus-v4l2#11 step 1. Establishes
|
||||||
|
* baseline observability before any daedalus-fourier kernel
|
||||||
|
* substitution lands, so we can see what each substitution actually
|
||||||
|
* shifted. Per-frame `decoder: OK` line now carries decode_us; a
|
||||||
|
* "decoder stats" summary line lands every DAEDALUS_STATS_EVERY OK
|
||||||
|
* frames with throughput + per-frame budget aggregates.
|
||||||
|
*
|
||||||
|
* Counters are static (process-local) and unsynchronised — the
|
||||||
|
* daemon's chardev event loop is single-threaded, so no atomics or
|
||||||
|
* locking needed. Reset when codec_id changes (different stream).
|
||||||
|
*/
|
||||||
|
#define DAEDALUS_STATS_EVERY 60u
|
||||||
|
|
||||||
|
struct daedalus_decode_stats {
|
||||||
|
uint32_t codec_id;
|
||||||
|
uint64_t frames;
|
||||||
|
uint64_t total_decode_ns;
|
||||||
|
uint64_t total_bitstream_bytes;
|
||||||
|
uint64_t total_mbs; /* derived from frame WxH; H.264-style 16x16 */
|
||||||
|
struct timespec window_start;
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct daedalus_decode_stats g_stats;
|
||||||
|
|
||||||
|
static inline uint64_t timespec_delta_ns(const struct timespec *a,
|
||||||
|
const struct timespec *b)
|
||||||
|
{
|
||||||
|
return (uint64_t)(b->tv_sec - a->tv_sec) * 1000000000ull +
|
||||||
|
(uint64_t)(b->tv_nsec - a->tv_nsec);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char *codec_id_name(uint32_t cid)
|
||||||
|
{
|
||||||
|
switch (cid) {
|
||||||
|
case DAEDALUS_CODEC_VP9: return "vp9";
|
||||||
|
case DAEDALUS_CODEC_AV1: return "av1";
|
||||||
|
case DAEDALUS_CODEC_H264: return "h264";
|
||||||
|
default: return "?";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* FNV-1a 32-bit hash. Used as a compact digest of the decoded
|
* FNV-1a 32-bit hash. Used as a compact digest of the decoded
|
||||||
* frame's YUV planes so the kernel can verify "the daemon produced
|
* frame's YUV planes so the kernel can verify "the daemon produced
|
||||||
@@ -467,6 +510,17 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec,
|
|||||||
dec->pkt->size = (int) req->bitstream_len;
|
dec->pkt->size = (int) req->bitstream_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Time send_packet+receive_frame for the per-frame `decoder: OK`
|
||||||
|
* line + the periodic stats summary. Includes only the
|
||||||
|
* libavcodec round-trip — not the bitstream packing, SPS/PPS
|
||||||
|
* synth, or pack-to-planes work (those are accounted for
|
||||||
|
* separately in the request's overall handle time).
|
||||||
|
*/
|
||||||
|
struct timespec t_decode_start, t_decode_end;
|
||||||
|
uint64_t decode_ns = 0;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &t_decode_start);
|
||||||
|
|
||||||
rc = fm->avcodec_send_packet(ctx, dec->pkt);
|
rc = fm->avcodec_send_packet(ctx, dec->pkt);
|
||||||
if (rc < 0) {
|
if (rc < 0) {
|
||||||
log_err("decoder: avcodec_send_packet failed: %d", rc);
|
log_err("decoder: avcodec_send_packet failed: %d", rc);
|
||||||
@@ -476,8 +530,11 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec,
|
|||||||
|
|
||||||
fm->av_frame_unref(dec->frame);
|
fm->av_frame_unref(dec->frame);
|
||||||
rc = fm->avcodec_receive_frame(ctx, dec->frame);
|
rc = fm->avcodec_receive_frame(ctx, dec->frame);
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &t_decode_end);
|
||||||
|
decode_ns = timespec_delta_ns(&t_decode_start, &t_decode_end);
|
||||||
if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
|
if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
|
||||||
log_debug("decoder: no frame ready yet (rc=%d)", rc);
|
log_debug("decoder: no frame ready yet (rc=%d, %lu us)",
|
||||||
|
rc, (unsigned long)(decode_ns / 1000));
|
||||||
resp->status = DAEDALUS_DECODE_NO_FRAME;
|
resp->status = DAEDALUS_DECODE_NO_FRAME;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
@@ -584,10 +641,61 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec,
|
|||||||
fr->format, req->capture_pix_fmt);
|
fr->format, req->capture_pix_fmt);
|
||||||
}
|
}
|
||||||
|
|
||||||
log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u",
|
log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u decode_us=%lu",
|
||||||
fr->width, fr->height, fr->format,
|
fr->width, fr->height, fr->format,
|
||||||
desc ? desc->name : "?",
|
desc ? desc->name : "?",
|
||||||
h, luma_len, chroma_len);
|
h, luma_len, chroma_len,
|
||||||
|
(unsigned long)(decode_ns / 1000));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Periodic stats summary (every DAEDALUS_STATS_EVERY frames).
|
||||||
|
* Reset window on codec change. Gives observable baseline
|
||||||
|
* for the daedalus-v4l2#11 substitution arc: fps, average
|
||||||
|
* decode_us, MB/s throughput, bitstream B/MB. Compare
|
||||||
|
* against daedalus-fourier README's per-kernel NEON
|
||||||
|
* baselines (e.g. H.264 IDCT 4x4 = 175 Mblock/s) to gauge
|
||||||
|
* which substitutions are worth pursuing.
|
||||||
|
*/
|
||||||
|
if (g_stats.codec_id != req->codec_id) {
|
||||||
|
g_stats.codec_id = req->codec_id;
|
||||||
|
g_stats.frames = 0;
|
||||||
|
g_stats.total_decode_ns = 0;
|
||||||
|
g_stats.total_bitstream_bytes = 0;
|
||||||
|
g_stats.total_mbs = 0;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &g_stats.window_start);
|
||||||
|
}
|
||||||
|
g_stats.frames++;
|
||||||
|
g_stats.total_decode_ns += decode_ns;
|
||||||
|
g_stats.total_bitstream_bytes += req->bitstream_len;
|
||||||
|
g_stats.total_mbs += (uint64_t)((fr->width + 15) / 16) *
|
||||||
|
(uint64_t)((fr->height + 15) / 16);
|
||||||
|
|
||||||
|
if (g_stats.frames % DAEDALUS_STATS_EVERY == 0) {
|
||||||
|
struct timespec t_now;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &t_now);
|
||||||
|
uint64_t window_ns =
|
||||||
|
timespec_delta_ns(&g_stats.window_start, &t_now);
|
||||||
|
double window_s = (double)window_ns / 1e9;
|
||||||
|
double fps = window_s > 0 ?
|
||||||
|
(double)g_stats.frames / window_s : 0.0;
|
||||||
|
double avg_decode_us = g_stats.frames > 0 ?
|
||||||
|
(double)g_stats.total_decode_ns /
|
||||||
|
(double)g_stats.frames / 1000.0 : 0.0;
|
||||||
|
double mb_per_s = window_s > 0 ?
|
||||||
|
(double)g_stats.total_mbs / window_s : 0.0;
|
||||||
|
double bs_b_per_mb = g_stats.total_mbs > 0 ?
|
||||||
|
(double)g_stats.total_bitstream_bytes /
|
||||||
|
(double)g_stats.total_mbs : 0.0;
|
||||||
|
|
||||||
|
log_info("decoder stats: codec=%s "
|
||||||
|
"frames=%llu window=%.2fs fps=%.2f "
|
||||||
|
"avg_decode_us=%.1f mbs_per_s=%.0f "
|
||||||
|
"bs_b_per_mb=%.2f",
|
||||||
|
codec_id_name(g_stats.codec_id),
|
||||||
|
(unsigned long long)g_stats.frames,
|
||||||
|
window_s, fps, avg_decode_us,
|
||||||
|
mb_per_s, bs_b_per_mb);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fm->av_frame_unref(dec->frame);
|
fm->av_frame_unref(dec->frame);
|
||||||
|
|||||||
Reference in New Issue
Block a user