Phase 8.8: throughput baseline + multi-codec streams + HDR
Per the correctness-before-speed principle: measure before
optimising. The roadmap going into this phase called for "QPU
dispatch substitution to hit 30fps@1080p". Measurement on hertz
shows the FFmpeg software path already hits 65-88 fps@1080p
across all three codecs, so QPU substitution would be premature
optimisation.
So 8.8 ships what's actually useful:
1. Per-frame timing in test_m2m_stream.
2. Multi-frame AV1 + H.264 streams verified byte-exact at
1080p (closes the "VP9-only stream tests" gap from 8.7).
3. HDR / 10-bit via V4L2_PIX_FMT_P010 + daemon
pack_p010_to_plane.
Test harness (tools/test_m2m_stream.c):
- Per-frame µs timing via CLOCK_MONOTONIC; reports mean/p50/
p99/min/max + wall ms + fps.
- Annex-B H.264 parser: split on 3-/4-byte start codes,
accumulate NALs into access units (push on VCL NAL types
1 or 5). Without AU grouping FFmpeg rejects SPS/PPS-only
buffers as "no frame!".
- Format auto-detect (DKIF magic → IVF; else Annex-B).
- Optional 6th arg `[capture]`: nv12m | p010.
- CAPTURE mmap path generalised for num_planes==1 (P010).
Kernel (kernel/daedalus_v4l2_main.c):
- CAPTURE formats array {NV12M, P010}; enum_fmt walks it.
- daedalus_fill_capture_fmt takes a fourcc:
NV12M: 2 planes, W*H + W*H/2 bytes, bpl=W
P010: 1 plane, W*H*2 + W*H bytes, bpl=W*2
- try_fmt preserves caller fourcc when supported.
- daedalus_complete_resp_frame's dmabuf path now sets each
plane's payload to vb2_plane_size(vb,p) — generalises
cleanly across 1-plane (P010) and 2-plane (NV12M) layouts;
the daemon fully populates the plane so payload =
sizeimage.
Daemon (daemon/src/decoder.c):
- pack_p010_to_plane: YUV420P10LE → P010 single-plane.
10-bit samples shifted left by 6 to MSB-align in 16-bit
words per V4L2 ABI. Y at base+0, interleaved CbCr right
after Y plane (per format spec for single-plane P010).
Strips source stride padding; respects destination stride.
- daedalus_decoder_run_request dispatches on
req->capture_pix_fmt (NV12M → pack_nv12_to_planes; P010
→ pack_p010_to_plane; else warn + skip).
- Includes <linux/videodev2.h> for fourcc constants.
Verification on hertz (Pi 5, 6.12.75+rpt-rpi-2712):
1080p throughput baseline (30 frames testsrc, dmabuf path):
VP9 1080p: mean 12.0 ms, p99 15.9 ms, fps **83.1**, byte-exact ✓
AV1 1080p: mean 15.4 ms, p99 41.0 ms, fps **65.0**, byte-exact ✓
H.264 1080p: mean 11.3 ms, p99 21.5 ms, fps **88.3**, byte-exact ✓
All three codecs run 2-3× above the 30 fps floor that the acceptance criterion treats as sufficient.
HDR / 10-bit 1080p P010:
10 frames, 62 MB output, fps **48.8**, byte-exact vs
`ffmpeg -pix_fmt p010le -f rawvideo`.
Small-frame P010 (320×240): fps 966 — fixed daemon overhead
dominates at low resolutions.
v4l2-compliance unchanged from 8.7: 49/49 passing.
Format enumeration confirms NM12 (the fourcc code of NV12M) + P010 on CAPTURE.
Clean SIGTERM + rmmod; no kernel oops/WARN.
Roadmap update (docs/roadmap.md):
- 8.8 marked closed with closure-doc reference, including
the explicit "QPU substitution not needed" rationale.
- 8.9 reshaped: libva-v4l2-request consumer integration
(per project_consumer_target memory) — the actual
user-facing endpoint.
Per correctness-before-speed:
- Measured first; QPU work explicitly justified-out via data.
- Byte-exact pixel comparison for every codec/format combo
(NV12: VP9, AV1, H.264; P010: VP9 10-bit at 320×240 and
1080p).
- AU grouping in the Annex-B parser is the correct
semantic boundary, not just a workaround.
- vb2_plane_size for payload generalises to any plane
count, not hardcoded to 2.
Phase 8.9 next: libva-v4l2-request integration — close
the loop from YouTube/Firefox to /dev/video0 + daemon
playback.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+79
-32
@@ -55,9 +55,11 @@
|
||||
#define DAEDALUS_VIDEO_NAME "daedalus"
|
||||
|
||||
/*
|
||||
* Phase 8.6: OUTPUT side advertises VP9 + AV1 + H.264 stateless
|
||||
* formats (the daemon decodes all three via FFmpeg dlopen).
|
||||
* CAPTURE is NV12M for now; HDR / 10-bit comes later.
|
||||
* OUTPUT side advertises VP9 + AV1 + H.264 stateless formats
|
||||
* (the daemon decodes all three via FFmpeg dlopen). CAPTURE
|
||||
* advertises NV12M (8-bit, two-plane) + P010 (10-bit,
|
||||
* single-plane interleaved CbCr) added in Phase 8.8 for HDR
|
||||
* content.
|
||||
*/
|
||||
static const u32 daedalus_output_formats[] = {
|
||||
V4L2_PIX_FMT_VP9_FRAME,
|
||||
@@ -66,7 +68,22 @@ static const u32 daedalus_output_formats[] = {
|
||||
};
|
||||
#define DAEDALUS_NUM_OUTPUT_FMTS ARRAY_SIZE(daedalus_output_formats)
|
||||
#define DAEDALUS_DEFAULT_OUTPUT_FOURCC V4L2_PIX_FMT_VP9_FRAME
|
||||
#define DAEDALUS_CAPTURE_FOURCC V4L2_PIX_FMT_NV12M /* planar Y + interleaved CbCr */
|
||||
|
||||
static const u32 daedalus_capture_formats[] = {
|
||||
V4L2_PIX_FMT_NV12M,
|
||||
V4L2_PIX_FMT_P010,
|
||||
};
|
||||
#define DAEDALUS_NUM_CAPTURE_FMTS ARRAY_SIZE(daedalus_capture_formats)
|
||||
#define DAEDALUS_DEFAULT_CAPTURE_FOURCC V4L2_PIX_FMT_NV12M
|
||||
|
||||
/*
 * Report whether @fourcc is one of the pixel formats listed in
 * daedalus_capture_formats[].
 *
 * Returns true on the first matching table entry, false when the whole
 * table has been scanned without a match.
 */
static bool daedalus_is_supported_capture(u32 fourcc)
{
	const u32 *fmt = daedalus_capture_formats;
	const u32 *const end = fmt + DAEDALUS_NUM_CAPTURE_FMTS;

	while (fmt != end) {
		if (*fmt++ == fourcc)
			return true;
	}

	return false;
}
|
||||
|
||||
static u32 daedalus_fourcc_to_codec_id(u32 fourcc)
|
||||
{
|
||||
@@ -186,21 +203,40 @@ static int daedalus_register_stateless_ctrls(struct v4l2_ctrl_handler *hdl)
|
||||
|
||||
/* -- format helpers -------------------------------------------------- */
|
||||
|
||||
/* NV12M = 2 planes: plane 0 = Y (W*H), plane 1 = interleaved CbCr (W*H/2). */
|
||||
/*
|
||||
* CAPTURE format fill. Two layouts supported:
|
||||
* NV12M (default, 8-bit) — 2 planes: Y (W*H bytes) + interleaved
|
||||
* CbCr at half-res (W*H/2 bytes).
|
||||
* P010 (10-bit HDR) — 1 plane: Y first (W*H*2 bytes) then
|
||||
* interleaved CbCr at half-res
|
||||
* (W*H bytes); 16-bit samples,
|
||||
* MSB-aligned 10-bit data (low 6
|
||||
* bits zero per V4L2 ABI).
|
||||
*/
|
||||
static void daedalus_fill_capture_fmt(struct v4l2_pix_format_mplane *f,
|
||||
u32 w, u32 h)
|
||||
u32 fourcc, u32 w, u32 h)
|
||||
{
|
||||
if (!daedalus_is_supported_capture(fourcc))
|
||||
fourcc = DAEDALUS_DEFAULT_CAPTURE_FOURCC;
|
||||
f->width = w;
|
||||
f->height = h;
|
||||
f->pixelformat = DAEDALUS_CAPTURE_FOURCC;
|
||||
f->pixelformat = fourcc;
|
||||
f->field = V4L2_FIELD_NONE;
|
||||
f->colorspace = V4L2_COLORSPACE_REC709;
|
||||
f->num_planes = 2;
|
||||
|
||||
f->plane_fmt[0].bytesperline = w;
|
||||
f->plane_fmt[0].sizeimage = w * h;
|
||||
f->plane_fmt[1].bytesperline = w;
|
||||
f->plane_fmt[1].sizeimage = w * h / 2;
|
||||
if (fourcc == V4L2_PIX_FMT_P010) {
|
||||
f->num_planes = 1;
|
||||
f->plane_fmt[0].bytesperline = w * 2;
|
||||
f->plane_fmt[0].sizeimage = w * h * 2 + w * h;
|
||||
f->plane_fmt[1].bytesperline = 0;
|
||||
f->plane_fmt[1].sizeimage = 0;
|
||||
} else {
|
||||
f->num_planes = 2;
|
||||
f->plane_fmt[0].bytesperline = w;
|
||||
f->plane_fmt[0].sizeimage = w * h;
|
||||
f->plane_fmt[1].bytesperline = w;
|
||||
f->plane_fmt[1].sizeimage = w * h / 2;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -612,30 +648,32 @@ void daedalus_complete_resp_frame(u32 cookie,
|
||||
/*
|
||||
* Two routes the daemon can take, both supported:
|
||||
*
|
||||
* (a) Phase 8.6 dmabuf path — daemon called
|
||||
* (a) dmabuf path (Phase 8.6+) — daemon called
|
||||
* DAEDALUS_IOC_GET_DMABUF, mmap'd the CAPTURE buffer,
|
||||
* wrote pixels in place. RESP_FRAME carries metadata
|
||||
* only (pixels_len == 0). We just set the payload
|
||||
* per plane from the daemon's reported sizes.
|
||||
* only (pixels_len == 0). Each plane's payload is
|
||||
* the full plane size (the daemon wrote everything
|
||||
* the format requires).
|
||||
*
|
||||
* (b) Phase 8.5 inline path — daemon shipped raw NV12 in
|
||||
* the chardev payload (≤ 64 KiB cap). We memcpy
|
||||
* into the vb2 buffer's vmalloc-backed plane. Still
|
||||
* supported for small frames where the daemon hasn't
|
||||
* picked up the GET_DMABUF path.
|
||||
* into the vb2 buffer. Plane payloads come from
|
||||
* the daemon's NV12 luma/chroma counts.
|
||||
*/
|
||||
if (state == VB2_BUF_STATE_DONE) {
|
||||
struct vb2_buffer *vb = &inf->dst_buf->vb2_buf;
|
||||
|
||||
y_size = min_t(u32, fr->luma_len,
|
||||
(u32) vb2_plane_size(vb, 0));
|
||||
uv_size = min_t(u32, fr->chroma_len,
|
||||
(u32) vb2_plane_size(vb, 1));
|
||||
unsigned int p;
|
||||
|
||||
if (pixels_len) {
|
||||
/* (b) inline copy */
|
||||
/* (b) inline NV12 copy — legacy 2-plane only */
|
||||
y_size = min_t(u32, fr->luma_len,
|
||||
(u32) vb2_plane_size(vb, 0));
|
||||
uv_size = vb->num_planes > 1 ?
|
||||
min_t(u32, fr->chroma_len,
|
||||
(u32) vb2_plane_size(vb, 1)) : 0;
|
||||
dst_y = vb2_plane_vaddr(vb, 0);
|
||||
dst_uv = vb2_plane_vaddr(vb, 1);
|
||||
dst_uv = vb->num_planes > 1 ?
|
||||
vb2_plane_vaddr(vb, 1) : NULL;
|
||||
if (dst_y && y_size && pixels_len >= y_size)
|
||||
memcpy(dst_y, pixels, y_size);
|
||||
else
|
||||
@@ -645,11 +683,16 @@ void daedalus_complete_resp_frame(u32 cookie,
|
||||
memcpy(dst_uv, pixels + y_size, uv_size);
|
||||
else
|
||||
uv_size = 0;
|
||||
vb2_set_plane_payload(vb, 0, y_size);
|
||||
if (vb->num_planes > 1)
|
||||
vb2_set_plane_payload(vb, 1, uv_size);
|
||||
} else {
|
||||
/* (a) dmabuf path: plane is fully populated by
|
||||
* the daemon, so payload == sizeimage. */
|
||||
for (p = 0; p < vb->num_planes; p++)
|
||||
vb2_set_plane_payload(vb, p,
|
||||
vb2_plane_size(vb, p));
|
||||
}
|
||||
/* (a) dmabuf path: pixels already there; just set payload */
|
||||
vb2_set_plane_payload(vb, 0, y_size);
|
||||
if (vb->num_planes > 1)
|
||||
vb2_set_plane_payload(vb, 1, uv_size);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -689,9 +732,9 @@ static int daedalus_enum_fmt(struct file *file, void *priv,
|
||||
f->flags |= V4L2_FMT_FLAG_COMPRESSED;
|
||||
return 0;
|
||||
}
|
||||
if (f->index != 0)
|
||||
if (f->index >= DAEDALUS_NUM_CAPTURE_FMTS)
|
||||
return -EINVAL;
|
||||
f->pixelformat = DAEDALUS_CAPTURE_FOURCC;
|
||||
f->pixelformat = daedalus_capture_formats[f->index];
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -735,7 +778,10 @@ static int daedalus_try_fmt(struct file *file, void *priv,
|
||||
fourcc = DAEDALUS_DEFAULT_OUTPUT_FOURCC;
|
||||
daedalus_fill_output_fmt(p, fourcc, w, h);
|
||||
} else if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) {
|
||||
daedalus_fill_capture_fmt(p, w, h);
|
||||
u32 fourcc = p->pixelformat;
|
||||
if (!daedalus_is_supported_capture(fourcc))
|
||||
fourcc = DAEDALUS_DEFAULT_CAPTURE_FOURCC;
|
||||
daedalus_fill_capture_fmt(p, fourcc, w, h);
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
@@ -834,6 +880,7 @@ static int daedalus_open(struct file *file)
|
||||
DAEDALUS_DEFAULT_OUTPUT_FOURCC,
|
||||
DAEDALUS_DEFAULT_W, DAEDALUS_DEFAULT_H);
|
||||
daedalus_fill_capture_fmt(&ctx->dst_fmt,
|
||||
DAEDALUS_DEFAULT_CAPTURE_FOURCC,
|
||||
DAEDALUS_DEFAULT_W, DAEDALUS_DEFAULT_H);
|
||||
|
||||
ctx->m2m_ctx = v4l2_m2m_ctx_init(dev->m2m_dev, ctx,
|
||||
|
||||
Reference in New Issue
Block a user