Phase 8.8: throughput baseline + multi-codec streams + HDR
Per the correctness-before-speed principle: measure before
optimising. Roadmap going in said "QPU dispatch substitution
to hit 30fps@1080p". Measurement on hertz shows the FFmpeg
software path already hits 65-88 fps@1080p across all three
codecs — QPU substitution would be premature optimisation.
So 8.8 ships what's actually useful:
1. Per-frame timing in test_m2m_stream.
2. Multi-frame AV1 + H.264 streams verified byte-exact at
1080p (closes the "VP9-only stream tests" gap from 8.7).
3. HDR / 10-bit via V4L2_PIX_FMT_P010 + daemon
pack_p010_to_plane.
Test harness (tools/test_m2m_stream.c):
- Per-frame µs timing via CLOCK_MONOTONIC; reports mean/p50/
p99/min/max + wall ms + fps.
- Annex-B H.264 parser: split on 3-/4-byte start codes,
accumulate NALs into access units (push on VCL NAL types
1 or 5). Without AU grouping FFmpeg rejects SPS/PPS-only
buffers as "no frame!".
- Format auto-detect (DKIF magic → IVF; else Annex-B).
- Optional 6th arg `[capture]`: nv12m | p010.
- CAPTURE mmap path generalised for num_planes==1 (P010).
Kernel (kernel/daedalus_v4l2_main.c):
- CAPTURE formats array {NV12M, P010}; enum_fmt walks it.
- daedalus_fill_capture_fmt takes a fourcc:
NV12M: 2 planes, W*H + W*H/2 bytes, bpl=W
P010: 1 plane, W*H*2 + W*H bytes, bpl=W*2
- try_fmt preserves caller fourcc when supported.
- daedalus_complete_resp_frame's dmabuf path now sets each
plane's payload to vb2_plane_size(vb,p) — generalises
cleanly across 1-plane (P010) and 2-plane (NV12M) layouts;
the daemon fully populates the plane so payload =
sizeimage.
Daemon (daemon/src/decoder.c):
- pack_p010_to_plane: YUV420P10LE → P010 single-plane.
10-bit samples shifted left by 6 to MSB-align in 16-bit
words per V4L2 ABI. Y at base+0, interleaved CbCr right
after Y plane (per format spec for single-plane P010).
Strips source stride padding; respects destination stride.
- daedalus_decoder_run_request dispatches on
req->capture_pix_fmt (NV12M → pack_nv12_to_planes; P010
→ pack_p010_to_plane; else warn + skip).
- Includes <linux/videodev2.h> for fourcc constants.
Verification on hertz (Pi 5, 6.12.75+rpt-rpi-2712):
1080p throughput baseline (30 frames testsrc, dmabuf path):
VP9 1080p: mean 12.0 ms, p99 15.9 ms, fps **83.1**, byte-exact ✓
AV1 1080p: mean 15.4 ms, p99 41.0 ms, fps **65.0**, byte-exact ✓
H.264 1080p: mean 11.3 ms, p99 21.5 ms, fps **88.3**, byte-exact ✓
All 2-3× over the 30fps-floor-is-fine criterion.
HDR / 10-bit 1080p P010:
10 frames, 62 MB output, fps **48.8**, byte-exact vs
`ffmpeg -pix_fmt p010le -f rawvideo`.
Small-frame P010 (320×240): fps 966 — fixed daemon overhead
dominates at low resolutions.
v4l2-compliance unchanged from 8.7: 49/49 passing.
Format enumeration confirms NM12 + P010 on CAPTURE.
Clean SIGTERM + rmmod; no kernel oops/WARN.
Roadmap update (docs/roadmap.md):
- 8.8 marked closed with closure-doc reference, including
the explicit "QPU substitution not needed" rationale.
- 8.9 reshaped: libva-v4l2-request consumer integration
(per project_consumer_target memory) — the actual
user-facing endpoint.
Per correctness-before-speed:
- Measured first; QPU work explicitly justified-out via data.
- Byte-exact pixel comparison for every codec/format combo
(NV12: VP9, AV1, H.264; P010: VP9 10-bit at 320×240 and
1080p).
- AU grouping in the Annex-B parser is the correct
semantic boundary, not just a workaround.
- vb2_plane_size for payload generalises to any plane
count, not hardcoded to 2.
Phase 8.9 next: libva-v4l2-request integration — close
the loop from YouTube/Firefox to /dev/video0 + daemon
playback.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+242
-15
@@ -25,6 +25,7 @@
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
@@ -42,11 +43,152 @@ static void die(const char *msg)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
 * Current CLOCK_MONOTONIC time in microseconds.
 *
 * Returns 0 if clock_gettime() fails, so callers never consume an
 * uninitialised timespec.
 */
static uint64_t now_us(void)
{
	struct timespec ts = { 0 };

	if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
		return 0;

	/* Whole seconds scaled to us, plus the sub-second part rounded down. */
	return (uint64_t) ts.tv_sec * 1000000ull +
	       (uint64_t) ts.tv_nsec / 1000ull;
}
|
||||
|
||||
/*
 * qsort() comparator ordering uint64_t values ascending.
 *
 * Returns -1, 0 or 1. Explicit comparisons are used instead of a
 * subtraction, which could not represent the full 64-bit difference
 * in an int.
 */
static int cmp_u64(const void *a, const void *b)
{
	const uint64_t *lhs = a;
	const uint64_t *rhs = b;

	if (*lhs < *rhs)
		return -1;
	if (*lhs > *rhs)
		return 1;
	return 0;
}
|
||||
|
||||
/*
 * One chunk of compressed bitstream to submit to the decoder.
 *
 * Despite the name this is not IVF-specific: parse_annexb() returns
 * H.264 access units in the same structure.
 */
struct ivf_frame {
	uint8_t *data;	/* start of the chunk's bytes */
	uint32_t size;	/* number of valid bytes at data */
};
|
||||
|
||||
/*
|
||||
* Parse an Annex-B H.264 stream into ACCESS UNITS. An access
|
||||
* unit contains zero or more non-VCL NALs (SPS/PPS/SEI/AUD)
|
||||
* followed by one VCL NAL (slice). Submitting NALs individually
|
||||
* confuses FFmpeg's H.264 decoder — it needs SPS+PPS plus a
|
||||
* complete slice to produce a frame. We accumulate NALs in a
|
||||
* pending buffer; when we see a VCL NAL (type 1 or 5) we flush
|
||||
* (pending + that VCL NAL) as one access unit.
|
||||
*
|
||||
* Width/height aren't carried in the Annex-B framing; caller
|
||||
* must supply them via the [w] [h] command-line args.
|
||||
*/
|
||||
/*
 * Locate the next Annex-B start code at or after byte offset `off`.
 *
 * d:   buffer to scan
 * off: offset at which the scan begins
 * len: number of valid bytes in d
 *
 * Both the 3-byte (00 00 01) and the 4-byte (00 00 00 01) forms are
 * recognised. Returns the offset of the start code's first zero byte,
 * or -1 if there is none.
 *
 * Because the result is an int, offsets beyond what an int can hold
 * are never reported: the scan stops there and returns -1 instead of
 * a truncated value.
 */
static int find_next_startcode(const uint8_t *d, size_t off, size_t len)
{
	/* Largest offset the int return value can carry. */
	const size_t max_off = (size_t) (~0u >> 1);
	size_t last;

	/* A start code needs at least three bytes. */
	if (len < 3)
		return -1;

	/*
	 * Last offset at which a 3-byte code still fits. Comparing against
	 * len - 3 (rather than testing off + 3 <= len) cannot wrap when
	 * off is close to SIZE_MAX.
	 */
	last = len - 3;
	if (last > max_off)
		last = max_off;

	for (; off <= last; off++) {
		if (d[off] != 0 || d[off + 1] != 0)
			continue;
		if (d[off + 2] == 1)
			return (int) off;
		/* 4-byte form: only look at d[off + 3] if it is in range. */
		if (d[off + 2] == 0 && len - off >= 4 && d[off + 3] == 1)
			return (int) off;
	}
	return -1;
}
|
||||
|
||||
/*
 * Return the H.264 NAL unit type of an Annex-B NAL chunk.
 *
 * nal: chunk beginning with a 00 00 01 or 00 00 00 01 start code
 * sz:  chunk length in bytes, start code included
 *
 * The type is the byte right after the start code masked with 0x1F.
 * Returns -1 if the chunk does not begin with a well-formed start
 * code (including its two leading zero bytes) or has no header byte
 * after it.
 */
static int h264_nal_type(const uint8_t *nal, size_t sz)
{
	size_t hdr;

	/* Shortest usable chunk: 3-byte start code plus one header byte. */
	if (sz < 4)
		return -1;

	/* Both start-code forms open with two zero bytes. */
	if (nal[0] != 0 || nal[1] != 0)
		return -1;

	/* skip the 3- or 4-byte start code */
	if (nal[2] == 1)
		hdr = 3;
	else if (nal[2] == 0 && nal[3] == 1)
		hdr = 4;
	else
		return -1;

	/* A 4-byte start code in a 4-byte chunk leaves no header byte. */
	if (hdr >= sz)
		return -1;
	return nal[hdr] & 0x1F;
}
|
||||
|
||||
static struct ivf_frame *parse_annexb(const char *path, int *out_count)
|
||||
{
|
||||
uint8_t *buf;
|
||||
struct stat st;
|
||||
int fd;
|
||||
ssize_t n;
|
||||
int count = 0, cap = 16;
|
||||
struct ivf_frame *frames;
|
||||
int off, next;
|
||||
uint8_t *pending = NULL;
|
||||
size_t pending_len = 0;
|
||||
|
||||
fd = open(path, O_RDONLY);
|
||||
if (fd < 0)
|
||||
die("open annex-b");
|
||||
if (fstat(fd, &st) < 0)
|
||||
die("fstat");
|
||||
buf = malloc(st.st_size);
|
||||
if (!buf)
|
||||
die("malloc annex-b");
|
||||
n = read(fd, buf, st.st_size);
|
||||
if (n != st.st_size)
|
||||
die("read annex-b");
|
||||
close(fd);
|
||||
|
||||
frames = malloc(cap * sizeof(*frames));
|
||||
if (!frames)
|
||||
die("malloc frames");
|
||||
|
||||
off = find_next_startcode(buf, 0, (size_t) st.st_size);
|
||||
if (off < 0) {
|
||||
fprintf(stderr, "no Annex-B start code in %s\n", path);
|
||||
exit(1);
|
||||
}
|
||||
while (off < st.st_size) {
|
||||
size_t start = (size_t) off;
|
||||
size_t end, sz;
|
||||
int nal_type;
|
||||
|
||||
next = find_next_startcode(buf, start + 3,
|
||||
(size_t) st.st_size);
|
||||
end = (next < 0) ? (size_t) st.st_size : (size_t) next;
|
||||
sz = end - start;
|
||||
|
||||
nal_type = h264_nal_type(buf + start, sz);
|
||||
/* Append this NAL to the pending access unit. */
|
||||
pending = realloc(pending, pending_len + sz);
|
||||
if (!pending)
|
||||
die("realloc pending au");
|
||||
memcpy(pending + pending_len, buf + start, sz);
|
||||
pending_len += sz;
|
||||
|
||||
/* VCL NAL types 1 (non-IDR slice) and 5 (IDR slice)
|
||||
* close the access unit. */
|
||||
if (nal_type == 1 || nal_type == 5) {
|
||||
if (count >= cap) {
|
||||
cap *= 2;
|
||||
frames = realloc(frames,
|
||||
cap * sizeof(*frames));
|
||||
if (!frames)
|
||||
die("realloc frames");
|
||||
}
|
||||
frames[count].size = (uint32_t) pending_len;
|
||||
frames[count].data = pending;
|
||||
count++;
|
||||
pending = NULL;
|
||||
pending_len = 0;
|
||||
}
|
||||
|
||||
off = (next < 0) ? (int) st.st_size : next;
|
||||
}
|
||||
free(pending);
|
||||
free(buf);
|
||||
*out_count = count;
|
||||
return frames;
|
||||
}
|
||||
|
||||
/* Parse an IVF file into a vector of frames (caller frees). */
|
||||
static struct ivf_frame *parse_ivf(const char *path, int *out_count,
|
||||
uint32_t *out_w, uint32_t *out_h)
|
||||
@@ -123,6 +265,8 @@ int main(int argc, char **argv)
|
||||
const char *ivf_path, *out_path;
|
||||
uint32_t override_w = 0, override_h = 0;
|
||||
uint32_t output_fourcc = V4L2_PIX_FMT_VP9_FRAME;
|
||||
uint32_t capture_fourcc = V4L2_PIX_FMT_NV12M;
|
||||
int capture_num_planes = 2;
|
||||
uint32_t w, h;
|
||||
int fd, frame_count;
|
||||
struct ivf_frame *frames;
|
||||
@@ -140,6 +284,8 @@ int main(int argc, char **argv)
|
||||
|
||||
FILE *of;
|
||||
int i, decoded = 0;
|
||||
uint64_t *per_frame_us = NULL;
|
||||
uint64_t total_start, total_us;
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr,
|
||||
@@ -164,8 +310,45 @@ int main(int argc, char **argv)
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
if (argc >= 7) {
|
||||
const char *cf = argv[6];
|
||||
if (!strcmp(cf, "nv12m")) {
|
||||
capture_fourcc = V4L2_PIX_FMT_NV12M;
|
||||
capture_num_planes = 2;
|
||||
} else if (!strcmp(cf, "p010")) {
|
||||
capture_fourcc = V4L2_PIX_FMT_P010;
|
||||
capture_num_planes = 1;
|
||||
} else {
|
||||
fprintf(stderr, "unknown capture format %s\n", cf);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
frames = parse_ivf(ivf_path, &frame_count, &w, &h);
|
||||
/*
|
||||
* Format detection: IVF starts with 'DKIF' magic; anything
|
||||
* else is treated as Annex-B (H.264 NAL stream). Width/
|
||||
* height come from the IVF header for IVF, or must be
|
||||
* provided as CLI args for Annex-B.
|
||||
*/
|
||||
{
|
||||
uint8_t hdr4[4] = { 0 };
|
||||
int hfd = open(ivf_path, O_RDONLY);
|
||||
if (hfd < 0) die("open input");
|
||||
if (read(hfd, hdr4, 4) != 4) die("read header");
|
||||
close(hfd);
|
||||
if (!memcmp(hdr4, "DKIF", 4)) {
|
||||
frames = parse_ivf(ivf_path, &frame_count, &w, &h);
|
||||
} else {
|
||||
if (!override_w || !override_h) {
|
||||
fprintf(stderr,
|
||||
"non-IVF input: explicit [w] [h] required\n");
|
||||
return 2;
|
||||
}
|
||||
w = override_w;
|
||||
h = override_h;
|
||||
frames = parse_annexb(ivf_path, &frame_count);
|
||||
}
|
||||
}
|
||||
if (override_w) w = override_w;
|
||||
if (override_h) h = override_h;
|
||||
printf("parsed %d frames, %ux%u\n", frame_count, w, h);
|
||||
@@ -188,11 +371,16 @@ int main(int argc, char **argv)
|
||||
fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
|
||||
fmt.fmt.pix_mp.width = w;
|
||||
fmt.fmt.pix_mp.height = h;
|
||||
fmt.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_NV12M;
|
||||
fmt.fmt.pix_mp.pixelformat = capture_fourcc;
|
||||
if (ioctl(fd, VIDIOC_S_FMT, &fmt) < 0)
|
||||
die("S_FMT CAPTURE");
|
||||
cap_y_size = fmt.fmt.pix_mp.plane_fmt[0].sizeimage;
|
||||
cap_uv_size = fmt.fmt.pix_mp.plane_fmt[1].sizeimage;
|
||||
cap_uv_size = capture_num_planes > 1 ?
|
||||
fmt.fmt.pix_mp.plane_fmt[1].sizeimage : 0;
|
||||
printf("CAPTURE fmt=%c%c%c%c planes=%u sizeimage=[%zu,%zu]\n",
|
||||
capture_fourcc & 0xff, (capture_fourcc >> 8) & 0xff,
|
||||
(capture_fourcc >> 16) & 0xff, (capture_fourcc >> 24) & 0xff,
|
||||
fmt.fmt.pix_mp.num_planes, cap_y_size, cap_uv_size);
|
||||
|
||||
/* REQBUFS OUTPUT + mmap each */
|
||||
memset(&reqbuf, 0, sizeof(reqbuf));
|
||||
@@ -237,17 +425,23 @@ int main(int argc, char **argv)
|
||||
buf.memory = V4L2_MEMORY_MMAP;
|
||||
buf.index = i;
|
||||
buf.m.planes = planes;
|
||||
buf.length = 2;
|
||||
buf.length = capture_num_planes;
|
||||
if (ioctl(fd, VIDIOC_QUERYBUF, &buf) < 0)
|
||||
die("QUERYBUF CAPTURE");
|
||||
cap_y[i] = mmap(NULL, planes[0].length,
|
||||
PROT_READ, MAP_SHARED, fd,
|
||||
planes[0].m.mem_offset);
|
||||
cap_uv[i] = mmap(NULL, planes[1].length,
|
||||
PROT_READ, MAP_SHARED, fd,
|
||||
planes[1].m.mem_offset);
|
||||
if (cap_y[i] == MAP_FAILED || cap_uv[i] == MAP_FAILED)
|
||||
die("mmap CAPTURE");
|
||||
if (cap_y[i] == MAP_FAILED)
|
||||
die("mmap CAPTURE Y");
|
||||
if (capture_num_planes > 1) {
|
||||
cap_uv[i] = mmap(NULL, planes[1].length,
|
||||
PROT_READ, MAP_SHARED, fd,
|
||||
planes[1].m.mem_offset);
|
||||
if (cap_uv[i] == MAP_FAILED)
|
||||
die("mmap CAPTURE UV");
|
||||
} else {
|
||||
cap_uv[i] = NULL;
|
||||
}
|
||||
|
||||
/* QBUF all capture buffers up front */
|
||||
memset(&buf, 0, sizeof(buf));
|
||||
@@ -256,7 +450,7 @@ int main(int argc, char **argv)
|
||||
buf.memory = V4L2_MEMORY_MMAP;
|
||||
buf.index = i;
|
||||
buf.m.planes = planes;
|
||||
buf.length = 2;
|
||||
buf.length = capture_num_planes;
|
||||
if (ioctl(fd, VIDIOC_QBUF, &buf) < 0)
|
||||
die("QBUF CAPTURE init");
|
||||
}
|
||||
@@ -273,12 +467,18 @@ int main(int argc, char **argv)
|
||||
if (!of)
|
||||
die("fopen out");
|
||||
|
||||
per_frame_us = calloc((size_t) frame_count, sizeof(*per_frame_us));
|
||||
if (!per_frame_us)
|
||||
die("calloc per_frame_us");
|
||||
total_start = now_us();
|
||||
|
||||
/* Feed one bitstream frame at a time; serialise DQBUF after each. */
|
||||
for (i = 0; i < frame_count; i++) {
|
||||
int idx = i % NUM_OUTPUT_BUFS;
|
||||
struct pollfd p = { .fd = fd, .events = POLLIN | POLLOUT };
|
||||
size_t y_actual, uv_actual;
|
||||
int cap_idx;
|
||||
uint64_t frame_start = now_us();
|
||||
|
||||
if (frames[i].size > out_map_size) {
|
||||
fprintf(stderr, "frame %d too big: %u > %zu\n",
|
||||
@@ -317,7 +517,7 @@ int main(int argc, char **argv)
|
||||
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
|
||||
buf.memory = V4L2_MEMORY_MMAP;
|
||||
buf.m.planes = planes;
|
||||
buf.length = 2;
|
||||
buf.length = capture_num_planes;
|
||||
if (ioctl(fd, VIDIOC_DQBUF, &buf) < 0)
|
||||
die("DQBUF CAPTURE");
|
||||
cap_idx = buf.index;
|
||||
@@ -327,10 +527,12 @@ int main(int argc, char **argv)
|
||||
}
|
||||
y_actual = planes[0].bytesused ? planes[0].bytesused
|
||||
: cap_y_size;
|
||||
uv_actual = planes[1].bytesused ? planes[1].bytesused
|
||||
: cap_uv_size;
|
||||
uv_actual = (capture_num_planes > 1 && planes[1].bytesused)
|
||||
? planes[1].bytesused : cap_uv_size;
|
||||
fwrite(cap_y[cap_idx], 1, y_actual, of);
|
||||
fwrite(cap_uv[cap_idx], 1, uv_actual, of);
|
||||
if (capture_num_planes > 1 && cap_uv[cap_idx])
|
||||
fwrite(cap_uv[cap_idx], 1, uv_actual, of);
|
||||
per_frame_us[decoded] = now_us() - frame_start;
|
||||
decoded++;
|
||||
|
||||
/* Recycle the CAPTURE buffer */
|
||||
@@ -340,14 +542,39 @@ int main(int argc, char **argv)
|
||||
buf.memory = V4L2_MEMORY_MMAP;
|
||||
buf.index = cap_idx;
|
||||
buf.m.planes = planes;
|
||||
buf.length = 2;
|
||||
buf.length = capture_num_planes;
|
||||
if (ioctl(fd, VIDIOC_QBUF, &buf) < 0)
|
||||
die("QBUF CAPTURE recycle");
|
||||
}
|
||||
|
||||
total_us = now_us() - total_start;
|
||||
fclose(of);
|
||||
printf("decoded %d / %d frames to %s\n", decoded, frame_count, out_path);
|
||||
|
||||
if (decoded > 0) {
|
||||
uint64_t *sorted = malloc(decoded * sizeof(*sorted));
|
||||
uint64_t sum = 0;
|
||||
double mean_us, fps;
|
||||
int i;
|
||||
|
||||
memcpy(sorted, per_frame_us, decoded * sizeof(*sorted));
|
||||
qsort(sorted, decoded, sizeof(*sorted), cmp_u64);
|
||||
for (i = 0; i < decoded; i++)
|
||||
sum += per_frame_us[i];
|
||||
mean_us = (double) sum / (double) decoded;
|
||||
fps = 1e6 * (double) decoded / (double) total_us;
|
||||
printf("perf: mean=%.0fus p50=%luus p99=%luus min=%luus max=%luus | wall=%lums fps=%.1f\n",
|
||||
mean_us,
|
||||
(unsigned long) sorted[decoded / 2],
|
||||
(unsigned long) sorted[(decoded * 99) / 100],
|
||||
(unsigned long) sorted[0],
|
||||
(unsigned long) sorted[decoded - 1],
|
||||
(unsigned long) (total_us / 1000),
|
||||
fps);
|
||||
free(sorted);
|
||||
}
|
||||
free(per_frame_us);
|
||||
|
||||
t = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
|
||||
ioctl(fd, VIDIOC_STREAMOFF, &t);
|
||||
t = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
|
||||
|
||||
Reference in New Issue
Block a user