diff --git a/daemon/src/chardev_client.c b/daemon/src/chardev_client.c index f8e8340..957011d 100644 --- a/daemon/src/chardev_client.c +++ b/daemon/src/chardev_client.c @@ -138,6 +138,8 @@ static int handle_req_decode(struct chardev_client *cli, { struct daedalus_req_decode req; struct daedalus_resp_frame resp; + uint8_t *resp_buf = NULL; + size_t pix_cap, pix_used = 0; int rc; if (hdr->payload_len < sizeof(req)) { @@ -161,13 +163,31 @@ static int handle_req_decode(struct chardev_client *cli, log_info("REQ_DECODE cookie=%u codec=%u bitstream=%u bytes", hdr->cookie, req.codec_id, req.bitstream_len); - rc = daedalus_decoder_run_request(cli->decoder, &req, - payload + sizeof(req), &resp); - if (rc < 0) - return rc; + /* + * Build the response buffer as { struct daedalus_resp_frame, + * NV12 pixel bytes }. Cap pixel area at MAX_PAYLOAD - sizeof(resp) + * for now (Phase 8.5); Phase 8.6 dmabuf removes the cap. + */ + pix_cap = DAEDALUS_PROTO_MAX_PAYLOAD - sizeof(resp); + resp_buf = malloc(sizeof(resp) + pix_cap); + if (!resp_buf) + return -ENOMEM; - return send_response(cli, DAEDALUS_MSG_RESP_FRAME, hdr->cookie, - &resp, sizeof(resp)); + rc = daedalus_decoder_run_request(cli->decoder, &req, + payload + sizeof(req), &resp, + resp_buf + sizeof(resp), + pix_cap, &pix_used); + if (rc < 0) { + free(resp_buf); + return rc; + } + /* Header at front; pixels follow. Truncate to actual used. 
*/ + memcpy(resp_buf, &resp, sizeof(resp)); + + rc = send_response(cli, DAEDALUS_MSG_RESP_FRAME, hdr->cookie, + resp_buf, sizeof(resp) + pix_used); + free(resp_buf); + return rc; } static int handle_ping(struct chardev_client *cli, diff --git a/daemon/src/decoder.c b/daemon/src/decoder.c index b78b627..04ce51c 100644 --- a/daemon/src/decoder.c +++ b/daemon/src/decoder.c @@ -132,10 +132,67 @@ static int decoder_open_codec(struct daedalus_decoder *dec, uint32_t codec_id, return 0; } +/* + * Pack the decoded YUV planes into NV12 (Y followed by interleaved + * CbCr) in @out, tracking total bytes written in @used. A plane + * that does not fit in @cap is not written. Returns 0 on success, + * -EINVAL on bad arguments, fewer than 3 components, or a plane exceeding @cap. + */ +static int pack_nv12(struct AVFrame *fr, const AVPixFmtDescriptor *desc, + uint8_t *out, size_t cap, size_t *used) +{ + int y, cw, ch, h, w; + size_t y_size, uv_size; + + *used = 0; + if (!desc || !out || !cap) + return -EINVAL; + if (desc->nb_components < 3) + return -EINVAL; + + w = fr->width; + h = fr->height; + cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); + ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); + y_size = (size_t) w * (size_t) h; + uv_size = (size_t) cw * (size_t) ch * 2u; + + /* Y plane: w bytes per line stripped of stride padding. */ + if (*used + y_size > cap) + return -EINVAL; + for (y = 0; y < h; y++) + memcpy(out + *used + (size_t) y * (size_t) w, + fr->data[0] + (size_t) y * (size_t) fr->linesize[0], + (size_t) w); + *used += y_size; + + /* Interleave Cb and Cr into a single NV12 chroma plane. 
*/ + if (*used + uv_size > cap) + return -EINVAL; + { + uint8_t *uv = out + *used; + int x; + for (y = 0; y < ch; y++) { + const uint8_t *u = fr->data[1] + + (size_t) y * (size_t) fr->linesize[1]; + const uint8_t *v = fr->data[2] + + (size_t) y * (size_t) fr->linesize[2]; + for (x = 0; x < cw; x++) { + uv[(y * cw + x) * 2 + 0] = u[x]; + uv[(y * cw + x) * 2 + 1] = v[x]; + } + } + *used += uv_size; + } + return 0; +} + int daedalus_decoder_run_request(struct daedalus_decoder *dec, const struct daedalus_req_decode *req, const uint8_t *bitstream, - struct daedalus_resp_frame *resp) + struct daedalus_resp_frame *resp, + uint8_t *nv12_out, size_t nv12_cap, + size_t *nv12_used) { struct ffmpeg_loader *fm = dec->loader; struct AVCodecContext *ctx = NULL; @@ -143,6 +200,8 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, memset(resp, 0, sizeof(*resp)); resp->codec_id = req->codec_id; + if (nv12_used) + *nv12_used = 0; rc = decoder_open_codec(dec, req->codec_id, &ctx); if (rc == -ENOSYS) { @@ -255,10 +314,26 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, resp->chroma_len = chroma_len; resp->fnv1a_yuv = h; - log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u", + /* + * Pack pixels as NV12 into the caller's buffer if + * provided and big enough. Truncation is silent — + * the kernel will only copy as much as fits in the + * CAPTURE plane, so partial fills are fine for Phase + * 8.5. Phase 8.6 (dmabuf) eliminates the truncation. + */ + if (nv12_out && nv12_cap) { + int prc = pack_nv12(fr, desc, nv12_out, nv12_cap, + nv12_used); + if (prc < 0) + log_warn("decoder: NV12 pack failed (cap=%zu pix_fmt=%d) — kernel will see metadata only", + nv12_cap, fr->format); + } + + log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u nv12=%zu", fr->width, fr->height, fr->format, desc ? desc->name : "?", - h, luma_len, chroma_len); + h, luma_len, chroma_len, + nv12_used ? 
*nv12_used : 0u); } fm->av_frame_unref(dec->frame); diff --git a/daemon/src/decoder.h b/daemon/src/decoder.h index 9d0776f..57013e5 100644 --- a/daemon/src/decoder.h +++ b/daemon/src/decoder.h @@ -56,15 +56,24 @@ void daedalus_decoder_cleanup(struct daedalus_decoder *dec); * @req: REQ_DECODE prefix (from the wire) * @bitstream: bitstream blob (req->bitstream_len bytes) * @resp: caller-allocated RESP_FRAME output (zeroed by callee) + * @nv12_out: caller-allocated buffer for NV12 pixel data + * (Y plane followed by interleaved CbCr); may be NULL + * if @nv12_cap is 0 + * @nv12_cap: bytes available at @nv12_out (truncated silently + * if smaller than @resp->luma_len + @resp->chroma_len) + * @nv12_used: out: bytes actually written into @nv12_out * - * Populates @resp with the decode outcome. Always returns 0; - * decode-level failures are reported via @resp->status so the - * kernel sees a structured response rather than a dropped - * request. + * Populates @resp with the decode outcome and @nv12_out with the + * NV12-packed pixels (Y plane, then interleaved Cb/Cr). Always + * returns 0; decode-level failures are reported via @resp->status + * and @nv12_used = 0 so the kernel sees a structured response + * rather than a dropped request. */ int daedalus_decoder_run_request(struct daedalus_decoder *dec, const struct daedalus_req_decode *req, const uint8_t *bitstream, - struct daedalus_resp_frame *resp); + struct daedalus_resp_frame *resp, + uint8_t *nv12_out, size_t nv12_cap, + size_t *nv12_used); #endif /* DAEDALUS_V4L2_DECODER_H */ diff --git a/docs/phase_8_5_closure.md b/docs/phase_8_5_closure.md new file mode 100644 index 0000000..6e2c168 --- /dev/null +++ b/docs/phase_8_5_closure.md @@ -0,0 +1,291 @@ +# Phase 8.5 closure — full V4L2 m2m driver, VP9 decode via QBUF/DQBUF + +**Status:** closed 2026-05-18. + +Replaces the Phase 8.4 debugfs-triggered chardev path with a real +V4L2 m2m driver. 
Userspace clients now drive decoding the +standard way — `S_FMT` / `REQBUFS` / `QBUF` on the OUTPUT +(bitstream) queue, `DQBUF` on the CAPTURE (NV12M) queue. Kernel +device_run packs the bitstream into REQ_DECODE; the daemon +decodes via FFmpeg; RESP_FRAME's inline NV12 pixel payload lands +in the CAPTURE buffer. Phase 8.6 swaps the inline payload for +dmabuf so big frames stop being capped at 64 KiB. + +## What lands + +### Kernel (`kernel/daedalus_v4l2_main.c`) +- Per-open `struct daedalus_ctx`: v4l2_fh, m2m_ctx, + ctrl_handler, per-queue formats. +- Two vb2_queues via `daedalus_queue_init`: + - OUTPUT: V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, + V4L2_PIX_FMT_VP9_FRAME, `vb2_vmalloc_memops`. + - CAPTURE: V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE, + V4L2_PIX_FMT_NV12M, `vb2_vmalloc_memops`. +- Full `v4l2_ioctl_ops` table: querycap / enum_fmt / + g_fmt / s_fmt / try_fmt for both queues; reqbufs / querybuf / + qbuf / dqbuf / create_bufs / prepare_buf / expbuf / + streamon / streamoff via the `v4l2_m2m_ioctl_*` helpers. +- `v4l2_m2m_ops.device_run`: pulls the next OUTPUT buf, + kmaps the bitstream, builds REQ_DECODE inline (capped at + `DAEDALUS_PROTO_MAX_PAYLOAD - sizeof(struct + daedalus_req_decode)`), enqueues to the chardev with a + cookie, stores `{ctx, src_buf, dst_buf}` in a per-device + inflight list. Job stays open until RESP_FRAME comes back. +- `daedalus_complete_resp_frame` (called from the chardev + write path): pops the inflight entry, memcpys inline NV12 + pixels into the CAPTURE vb2 buffer (Y plane + interleaved + CbCr), finishes the m2m job via + `v4l2_m2m_buf_done_and_job_finish` so both buffers complete + cleanly and the scheduler doesn't immediately re-run + device_run on the same src. + +### Kernel header (`kernel/daedalus_v4l2_main.h`) +- New private header so the chardev source can reach + `daedalus_complete_resp_frame()` without `extern`-only + declarations. +- Contains the canonical `struct daedalus_dev` definition + (was previously inline in main.c). 
+ +### Kernel (`kernel/daedalus_v4l2_chardev.c`) +- RESP_FRAME handler now passes the inline pixel payload to + `daedalus_complete_resp_frame` so it can land in the + CAPTURE buffer. The previous Phase 8.4 path (debugfs + `test_decode` injection) still works, just hits a + ratelimited `unknown cookie` log line because it bypasses + the V4L2 m2m queue. + +### Daemon (`daemon/src/decoder.c`, `decoder.h`) +- `daedalus_decoder_run_request` signature extended with + `(nv12_out, nv12_cap, nv12_used)`. After the FNV-1a digest + it packs the decoded YUV420P planes into NV12 in the + caller's buffer (Y plane line-by-line stripped of stride + padding; CbCr interleaved into the chroma plane). +- Truncation silent — kernel only memcpys what fits in the + CAPTURE plane. + +### Daemon (`daemon/src/chardev_client.c`) +- `handle_req_decode` allocates a response buffer sized + `sizeof(resp) + (MAX_PAYLOAD - sizeof(resp))`, lets the + decoder fill the pixel area, then sends the full payload + (struct + pixels) via `send_response`. + +### Test harness (`tools/test_m2m_decode.c`) +- New: minimal V4L2 m2m client that drives one full + QBUF/DQBUF round-trip. Used for end-to-end verification + in this phase. v4l2-ctl could substitute eventually, + but its `--stream-from-hdr` format isn't compatible with + a raw VP9 frame; a small custom client is the cleanest + test until we add framing. + +## Verification + +Built clean (kernel `make`, daemon `cmake --build`, tools +`make`). All `-Wall -Wextra` warning-free. 
+ +### End-to-end round-trip + +``` +$ ffmpeg -f lavfi -i 'testsrc=duration=0.04:size=128x96:rate=25' \ + -pix_fmt yuv420p -c:v libvpx-vp9 -frames:v 1 -y /tmp/vp9_small.ivf +$ python3 strip-ivf-header → /tmp/vp9_small_kf.bin (1566 B) + +$ sudo insmod kernel/daedalus_v4l2.ko +$ daedalus_v4l2_daemon -v daemon & +$ sudo ./tools/test_m2m_decode /tmp/vp9_small_kf.bin /tmp/out_m2m.nv12 128 96 + + loaded bitstream: 1566 bytes + OUTPUT sizeimage = 65524 + CAPTURE planes = 2, [0].sizeimage=12288 [1].sizeimage=6144 + OUTPUT REQBUFS -> 2 + CAPTURE REQBUFS -> 2 + QBUF OUTPUT[0] bytesused=1566 + QBUF CAPTURE[0] + STREAMON both + poll revents=0x5 + DQBUF OUTPUT[0] flags=0x4001 # V4L2_BUF_FLAG_DONE + DQBUF CAPTURE[0] flags=0x4000 payloads=[12288, 6144] + wrote 12288 Y + 6144 UV bytes to /tmp/out_m2m.nv12 + OK + +daemon log: + REQ_DECODE cookie=1 codec=1 bitstream=1566 bytes + decoder: opened vp9 context + decoder: OK 128x96 fmt=0 (yuv420p) fnv1a=0x1eb34bfe luma=12288 chroma=6144 nv12=18432 +``` + +### Pixel correctness + +``` +$ ffmpeg -i vp9_small.ivf -pix_fmt nv12 -f rawvideo -y ref.nv12 +$ cmp out_m2m.nv12 ref.nv12 +$ echo $? +0 +``` + +**Byte-for-byte match against `ffmpeg -pix_fmt nv12`.** Whole +18432-byte NV12 frame matches exactly — the full kernel ↔ +daemon ↔ FFmpeg pipeline produces the same pixels as a plain +FFmpeg CLI decode. + +### v4l2-compliance + +``` +$ sudo v4l2-compliance -d /dev/video0 +… +Detected Stateless Decoder + +Required ioctls: test VIDIOC_QUERYCAP: OK + test invalid ioctls: OK +Allow for multiple opens: all OK +Format ioctls: VIDIOC_S_FMT FAIL (colorspace mismatch) +Codec ioctls: VIDIOC_(TRY_)DECODER_CMD FAIL + (stateless decoder requires media controller + OR decoder commands; we have neither) +Buffer ioctls: REQBUFS/CREATE_BUFS/QUERYBUF/REMOVE_BUFS/EXPBUF: OK +``` + +Two expected fails: + +- **S_FMT colorspace**: `try_fmt` always rewrites colorspace to + REC709 from the canonical fill helper. 
Should preserve the +userspace-supplied value when valid. Trivial fix; lands in 8.6. +- **DECODER_CMD / media controller**: stateless decoders are + required by spec to provide either a media controller (for the + request API) OR decoder commands (`V4L2_DEC_CMD_*`). We have + neither — the daemon handles per-frame state internally via + FFmpeg, so we never needed the request API. Phase 8.6 adds + the media controller binding when AV1/H.264 controls land. + +### Clean teardown + +``` +$ pkill -TERM daedalus_v4l2_daemon # SIGTERM, daemon exits cleanly +$ sudo rmmod daedalus_v4l2 # ok +$ sudo dmesg | grep -E 'BUG|oops' +(empty) +``` + +No kernel oops / WARN traces from the production flow. The +initial run had a bug — see [Bugs found and fixed] below. + +## Design decisions + +### Why vb2_vmalloc instead of vb2_dma_contig? + +Two reasons: + +1. **No DMA needed in Phase 8.5.** device_run reads bitstream + bytes via `vb2_plane_vaddr` and the chardev path writes + decoded pixels via `memcpy`. No hardware DMA touches these + buffers, so vb2_dma_contig's CMA pressure buys us nothing + yet. +2. **Phase 8.6 will switch CAPTURE to vb2_dma_contig** for + dmabuf-export — at that point the daemon mmaps the dmabuf + directly and writes pixels in place, bypassing the chardev + payload entirely. Doing the switch in Phase 8.6 (rather + than now) keeps each phase's scope clean: 8.5 = "real V4L2 + m2m flow", 8.6 = "stop truncating big frames + add more + codecs + add request API". + +### Why inline pixels in RESP_FRAME? + +Inline pixels are subject to the same 64 KiB cap as REQ_DECODE. For Phase 8.5 we want to +prove the QBUF/DQBUF round-trip works without introducing +dmabuf-fd passing (which needs `dma_buf_fd` in the daemon's +task context + a new chardev ioctl + daemon-side mmap of the +returned fd — all real work, but orthogonal to the m2m +verification). Phase 8.6 adds the dmabuf path. + +For now: small frames whose NV12 size fits under the 64 KiB cap (e.g. 256×144 = 54 KiB; 256×192 = 72 KiB no longer fits; safely +128×96 = 18 KiB) work end-to-end. 
The pixel-match test +above proves the path is bit-exact, not just "approximately +right." + +### Why a custom test client instead of v4l2-ctl? + +v4l2-ctl's `--stream-from-hdr` expects a v4l2-ctl-specific +length-prefix format (which I tried with a `>I` length prefix +and got "Unknown header ID" — there's apparently a magic +ID byte we didn't supply). Writing a 20-line test harness +that does the V4L2 ioctls directly is faster than reverse- +engineering v4l2-ctl's framing, and the harness stays useful +for regression tests after the per-frame format work in 8.6. + +### Cookie collision between debugfs and V4L2 paths + +Both `daedalus_decode_cookie` (in chardev.c, for debugfs +`test_decode`) and `daedalus_cookie_seq` (in main.c, for m2m +device_run) are independent atomics starting from 0. After a +fresh insmod, both begin issuing cookie=1 → collisions are +likely in mixed-use scenarios. This is harmless today — +debugfs is a test fixture and doesn't have a V4L2 inflight to +complete, so RESP_FRAME for "debugfs cookies" just logs +"unknown cookie" and moves on. Phase 8.6 either unifies the +counter or makes debugfs use cookies with bit 31 set so the +two namespaces don't overlap. + +## Bugs found and fixed during the phase + +### Bug 1: device_run re-runs on same src buf, eventual stop_streaming oops + +**Symptom:** test client logged TWO REQ_DECODE messages for one +QBUF. After teardown, dmesg showed: + +``` +lr : daedalus_stop_streaming+0x3c/0x80 [daedalus_v4l2] +``` + +**Cause:** `daedalus_complete_resp_frame` called +`v4l2_m2m_buf_done(src_buf, ...)` + `v4l2_m2m_buf_done(dst_buf, +...)` + `v4l2_m2m_job_finish(...)`. This marks the vb2 buffers +as DONE but does NOT pop them off the m2m's internal src/dst +queues. The scheduler immediately re-runs device_run with the +same still-queued src buf — which then double-frees on +stop_streaming. 
+ +**Fix:** use `v4l2_m2m_buf_done_and_job_finish` — the canonical +helper that pops both buffers AND marks them done AND finishes +the job in one atomic sequence. Caught on the first end-to-end +run; second run was clean. + +### Bug 2: missing `<media/v4l2-event.h>` + +`v4l2_event_unsubscribe` undeclared. Trivial; added the +include. + +### Bug 3 (build): missing `<stdint.h>` in tools/ + +The test harness used `(uint32_t)` casts without including +`<stdint.h>`. Added. + +## What's NOT here (deferred to 8.6) + +- **Per-frame dmabuf export.** CAPTURE buffers come back + through inline pixel data in RESP_FRAME today; ≤ 64 KiB cap + rules out 1080p. +- **V4L2 stateless controls.** No + `V4L2_CID_STATELESS_VP9_FRAME` etc. — the daemon parses VP9 + headers itself. Compliance complains accordingly. +- **Media controller.** v4l2-compliance flags this as the + "stateless decoder requires media controller OR decoder cmds" + failure. +- **Colorspace round-trip in TRY_FMT.** Documented compliance + failure; trivial fix. +- **AV1 + H.264 codec contexts.** Phase 8.6. + +## Phase 8.6 plan + +1. dmabuf-export on CAPTURE: switch mem_ops to + `vb2_dma_contig_memops`; add `DAEDALUS_IOC_GET_DMABUF` on + the chardev that calls `vb2_core_expbuf` in daemon context. +2. Daemon mmaps the dmabuf, decodes into it directly, + RESP_FRAME carries metadata only. +3. Add AV1 + H.264 to the decoder's codec switch (FFmpeg + already supports both). +4. Add V4L2 stateless controls (VP9_FRAME, then AV1_FRAME, + then H264_SLICE_PARAMS) — these can be NULL-handled by the + daemon initially (it just ignores them since FFmpeg parses + on its own), but adding them satisfies the spec / compliance. +5. Media controller binding via `v4l2_m2m_register_media_controller`. +6. Fix the S_FMT colorspace preservation. +7. Fix the cookie namespace collision. diff --git a/docs/roadmap.md b/docs/roadmap.md index 24ccd2e..89f25fd 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -32,35 +32,55 @@ Deliverable: ping-pong test passes. 
Deliverable: daemon can parse a VP9 frame and walk the block-level info. -### Phase 8.4 — VP9 end-to-end via daedalus-fourier +### Phase 8.4 — daemon ↔ kernel decode round-trip (closed 2026-05-18) -- Wire daemon's per-block walker to `daedalus_dispatch_*` calls. -- Kernel module passes bitstream + controls to daemon over - chardev. -- Daemon decodes, writes pixels to a shared buffer, returns - result to kernel. -- Kernel returns via DQBUF. +Shipped as a debugfs-triggered chardev round-trip rather than +the original V4L2-ioctl plan (which moved to Phase 8.5). -Deliverable: `v4l2-ctl --stream-from=foo.ivf` produces -decoded frames (output via `--stream-to` PNG dump). +- REQ_DECODE / RESP_FRAME wire protocol +- Daemon decodes VP9 via FFmpeg dlopen, returns FNV-1a digest +- Verified content-dependent + deterministic; structured + error handling for bad bitstreams -### Phase 8.5 — dmabuf / DRM PRIME +See `docs/phase_8_4_closure.md`. -- Kernel module allocates dma-coherent buffers. -- Export via VIDIOC_EXPBUF. -- Daemon writes via mmap into kernel-allocated dmabuf. -- Test: `v4l2-ctl --capture-mmap-dmabuf` works. +### Phase 8.5 — full V4L2 m2m driver (closed 2026-05-18) -Deliverable: dmabuf-fd is exportable; first browser-friendly -frame. +Real V4L2 m2m driver — userspace clients drive +`S_FMT`/`REQBUFS`/`QBUF`/`DQBUF` the standard way. Bitstream +flows kernel→daemon as inline REQ_DECODE payload; decoded NV12 +pixels flow daemon→kernel as inline RESP_FRAME payload. Works +end-to-end for small frames (≤ ~64 KiB NV12). -### Phase 8.6 — AV1 + H.264 +Deliverable hit: kernel m2m driver passes most v4l2-compliance +checks; `tools/test_m2m_decode` produces a NV12 frame that's +byte-for-byte identical to `ffmpeg -pix_fmt nv12` reference. -- Add codec support for AV1 (using CDEF QPU helper) and - H.264 (using deblock QPU helper for the one cycle 8 path, - everything else CPU). +See `docs/phase_8_5_closure.md`. -Deliverable: real AV1/H.264 clips decode end-to-end. 
+### Phase 8.6 — dmabuf + AV1 + H.264 + stateless controls + +Two interleaved tracks: + +**Track A (dmabuf):** +- Switch CAPTURE mem_ops to `vb2_dma_contig_memops`. +- New `DAEDALUS_IOC_GET_DMABUF` on the chardev — daemon + fetches a dmabuf-fd for the in-flight CAPTURE buffer, + mmaps it, decodes pixels in place. +- RESP_FRAME shrinks to metadata-only; chardev payload + stops capping frame size. + +**Track B (codecs + compliance):** +- AV1 + H.264 codec contexts in the daemon (FFmpeg supports + both already). +- V4L2 stateless controls (VP9_FRAME, AV1_FRAME, + H264_SLICE_PARAMS) — even if NULL-handled by the daemon, + presence is needed for spec/compliance. +- Media controller binding via + `v4l2_m2m_register_media_controller`. +- TRY_FMT colorspace round-trip. + +Deliverable: real AV1/H.264 1080p clips decode end-to-end. ### Phase 8.7 — performance + 30fps@1080p diff --git a/kernel/daedalus_v4l2_chardev.c b/kernel/daedalus_v4l2_chardev.c index d0aae57..aac11ce 100644 --- a/kernel/daedalus_v4l2_chardev.c +++ b/kernel/daedalus_v4l2_chardev.c @@ -28,6 +28,7 @@ #include "daedalus_v4l2_proto.h" #include "daedalus_v4l2_chardev.h" +#include "daedalus_v4l2_main.h" #define DAEDALUS_CHARDEV_NAME "daedalus-v4l2" @@ -268,6 +269,8 @@ static ssize_t daedalus_chardev_write(struct file *file, switch (hdr.type) { case DAEDALUS_MSG_RESP_FRAME: { struct daedalus_resp_frame fr; + const u8 *pixels = NULL; + size_t pixels_len = 0; if (hdr.payload_len < sizeof(fr)) { pr_warn("daedalus_v4l2: RESP_FRAME payload too short (%u < %zu)\n", @@ -276,10 +279,25 @@ static ssize_t daedalus_chardev_write(struct file *file, return -EBADMSG; } memcpy(&fr, payload, sizeof(fr)); - pr_info("daedalus_v4l2: RESP_FRAME cookie=%u status=%u codec=%u %ux%u pixfmt=%d luma=%u chroma=%u fnv1a=0x%08x\n", - hdr.cookie, fr.status, fr.codec_id, - fr.width, fr.height, fr.pix_fmt, - fr.luma_len, fr.chroma_len, fr.fnv1a_yuv); + if (hdr.payload_len > sizeof(fr)) { + pixels = payload + sizeof(fr); + pixels_len = 
hdr.payload_len - sizeof(fr); + } + pr_debug("daedalus_v4l2: RESP_FRAME cookie=%u status=%u codec=%u %ux%u pixfmt=%d luma=%u chroma=%u fnv1a=0x%08x inline_pixels=%zu\n", + hdr.cookie, fr.status, fr.codec_id, + fr.width, fr.height, fr.pix_fmt, + fr.luma_len, fr.chroma_len, fr.fnv1a_yuv, + pixels_len); + + /* + * Hand off to the V4L2 m2m completion path. If no + * V4L2 device is registered yet (e.g. debugfs-only + * test_decode used and no V4L2 m2m_ctx exists), + * daedalus_complete_resp_frame returns silently after + * a ratelimited warn. + */ + daedalus_complete_resp_frame(hdr.cookie, &fr, pixels, + pixels_len); break; } default: diff --git a/kernel/daedalus_v4l2_main.c b/kernel/daedalus_v4l2_main.c index 6e0236a..c0bd784 100644 --- a/kernel/daedalus_v4l2_main.c +++ b/kernel/daedalus_v4l2_main.c @@ -3,16 +3,26 @@ * daedalus-v4l2 — V4L2 stateless decoder shim. * * Out-of-tree Linux kernel module that exposes a /dev/videoNN - * V4L2 device for the daedalus-fourier kernel library. Real - * decoding work happens in a userspace daemon (this module - * forwards bitstream + stateless-codec control structs via a - * chardev bridge — that part lands in Phase 8.2). + * V4L2 m2m (mem2mem) device for the daedalus-fourier kernel + * library. Real decoding happens in a userspace daemon; this + * module ferries bitstream buffers to the daemon via the + * /dev/daedalus-v4l2 chardev bridge and ferries decoded pixels + * back into the V4L2 client's CAPTURE buffer. * - * Phase 8.1 (this commit): minimal viable skeleton. Registers a - * platform device + v4l2_device + video_device and answers - * VIDIOC_QUERYCAP with reasonable values. Other ioctls fall - * through to v4l2-core defaults; modprobe / rmmod is a clean - * round-trip. + * Phase 8.5 (this revision): full V4L2 m2m driver with vb2 + * queues, real v4l2_ioctl_ops table, device_run wired to REQ_DECODE + * over the chardev, RESP_FRAME completion path back into + * v4l2_m2m_buf_done. 
Bitstream + decoded pixel data travel + * inline through the 64 KiB chardev payload — enough for small + * frames and proof-of-pipe; Phase 8.6 adds dmabuf-export so + * larger CAPTURE buffers don't have to round-trip through the + * chardev. + * + * Phase 8.5 does NOT implement the V4L2 stateless control set + * (V4L2_CID_STATELESS_VP9_FRAME etc.). The daemon parses VP9 + * headers itself via dlopen'd FFmpeg, so per-buffer controls are + * not needed for the proof-of-pipe. Phase 8.6 adds the proper + * stateless controls when AV1/H.264 land. * * Project: https://git.reauktion.de/reauktion/daedalus-v4l2 * Sibling kernel library: https://git.reauktion.de/marfrit/daedalus-fourier @@ -23,97 +33,625 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include +#include #include "daedalus_v4l2_chardev.h" +#include "daedalus_v4l2_proto.h" +#include "daedalus_v4l2_main.h" #define DAEDALUS_DRV_NAME "daedalus_v4l2" #define DAEDALUS_VIDEO_NAME "daedalus" +/* Coding-format coverage Phase 8.5: VP9 only. 8.6 adds AV1+H.264. */ +#define DAEDALUS_OUTPUT_FOURCC V4L2_PIX_FMT_VP9_FRAME +#define DAEDALUS_CAPTURE_FOURCC V4L2_PIX_FMT_NV12M /* planar Y + interleaved CbCr */ + +/* Conservative defaults; userspace S_FMT overrides. */ +#define DAEDALUS_DEFAULT_W 320 +#define DAEDALUS_DEFAULT_H 240 + +/* Bound bitstream buffer size to the chardev payload cap. 
*/ +#define DAEDALUS_MAX_BITSTREAM (DAEDALUS_PROTO_MAX_PAYLOAD - \ + sizeof(struct daedalus_req_decode)) + +/* -- module-wide state ----------------------------------------------- */ + +static struct daedalus_dev *g_daedalus_dev; + +struct daedalus_dev *daedalus_get_dev(void) +{ + return g_daedalus_dev; +} + +/* -- per-open context ------------------------------------------------ */ + /** - * struct daedalus_dev - top-level device state - * @pdev: owning platform device (synthesised in module_init) - * @v4l2_dev: V4L2 device parent for any video_device we register - * @vdev: video_device exposed as /dev/videoNN + * struct daedalus_ctx - per-open instance state + * @fh: V4L2 file handle (must be first to satisfy v4l2-core) + * @dev: parent daedalus_dev + * @m2m_ctx: v4l2 mem2mem context (one job queue per open) + * @hdl: v4l2_ctrl_handler (no controls yet; placeholder for 8.6) + * @src_fmt: current OUTPUT (bitstream) format + * @dst_fmt: current CAPTURE (decoded) format * - * One-instance singleton for Phase 8.1. Multi-instance support - * (one decoder per /dev/videoNN) lands when m2m wiring goes in. + * One context per open() of /dev/videoNN. v4l2-core's m2m + * scheduler picks one context at a time to call device_run on. */ -struct daedalus_dev { - struct platform_device *pdev; - struct v4l2_device v4l2_dev; - struct video_device vdev; +struct daedalus_ctx { + struct v4l2_fh fh; + struct daedalus_dev *dev; + struct v4l2_m2m_ctx *m2m_ctx; + struct v4l2_ctrl_handler hdl; + + struct v4l2_pix_format_mplane src_fmt; + struct v4l2_pix_format_mplane dst_fmt; }; +static inline struct daedalus_ctx *file_to_ctx(struct file *file) +{ + return container_of(file->private_data, struct daedalus_ctx, fh); +} + +/* -- format helpers -------------------------------------------------- */ + +/* NV12M = 2 planes: plane 0 = Y (W*H), plane 1 = interleaved CbCr (W*H/2). 
*/ +static void daedalus_fill_capture_fmt(struct v4l2_pix_format_mplane *f, + u32 w, u32 h) +{ + f->width = w; + f->height = h; + f->pixelformat = DAEDALUS_CAPTURE_FOURCC; + f->field = V4L2_FIELD_NONE; + f->colorspace = V4L2_COLORSPACE_REC709; + f->num_planes = 2; + + f->plane_fmt[0].bytesperline = w; + f->plane_fmt[0].sizeimage = w * h; + f->plane_fmt[1].bytesperline = w; + f->plane_fmt[1].sizeimage = w * h / 2; +} + /* - * V4L2 ioctl dispatch table. Phase 8.1 only implements - * VIDIOC_QUERYCAP; everything else returns -ENOTTY via the - * v4l2-core's default handler when the op is NULL. + * OUTPUT is a parsed VP9 access unit. V4L2 convention for + * compressed bitstream formats: single plane, sizeimage = + * worst-case bitstream size we're willing to accept. */ +static void daedalus_fill_output_fmt(struct v4l2_pix_format_mplane *f, + u32 w, u32 h) +{ + f->width = w; + f->height = h; + f->pixelformat = DAEDALUS_OUTPUT_FOURCC; + f->field = V4L2_FIELD_NONE; + f->colorspace = V4L2_COLORSPACE_REC709; + f->num_planes = 1; + f->plane_fmt[0].bytesperline = 0; /* compressed */ + f->plane_fmt[0].sizeimage = DAEDALUS_MAX_BITSTREAM; +} + +/* -- vb2 queue ops --------------------------------------------------- */ + +static int daedalus_queue_setup(struct vb2_queue *vq, + unsigned int *nbuffers, + unsigned int *nplanes, + unsigned int sizes[], + struct device *alloc_devs[]) +{ + struct daedalus_ctx *ctx = vb2_get_drv_priv(vq); + const struct v4l2_pix_format_mplane *fmt; + unsigned int p; + + fmt = V4L2_TYPE_IS_OUTPUT(vq->type) ? 
&ctx->src_fmt : &ctx->dst_fmt; + + if (*nplanes) { + if (*nplanes != fmt->num_planes) + return -EINVAL; + for (p = 0; p < *nplanes; p++) + if (sizes[p] < fmt->plane_fmt[p].sizeimage) + return -EINVAL; + return 0; + } + + *nplanes = fmt->num_planes; + for (p = 0; p < *nplanes; p++) + sizes[p] = fmt->plane_fmt[p].sizeimage; + + if (*nbuffers < 2) + *nbuffers = 2; + return 0; +} + +static int daedalus_buf_prepare(struct vb2_buffer *vb) +{ + struct daedalus_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); + const struct v4l2_pix_format_mplane *fmt; + unsigned int p; + + fmt = V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type) ? &ctx->src_fmt + : &ctx->dst_fmt; + + for (p = 0; p < vb->num_planes; p++) { + unsigned long need = fmt->plane_fmt[p].sizeimage; + + if (vb2_plane_size(vb, p) < need) { + v4l2_err(&ctx->dev->v4l2_dev, + "buf_prepare: plane %u size %lu < %lu\n", + p, vb2_plane_size(vb, p), need); + return -EINVAL; + } + /* + * For OUTPUT (bitstream), payload is set by userspace + * via VIDIOC_QBUF (bytesused). For CAPTURE we set the + * full plane size; device_run / buf_done updates it on + * completion if needed. + */ + if (!V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) + vb2_set_plane_payload(vb, p, need); + } + return 0; +} + +static void daedalus_buf_queue(struct vb2_buffer *vb) +{ + struct daedalus_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); + struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); + + v4l2_m2m_buf_queue(ctx->m2m_ctx, vbuf); +} + +static int daedalus_start_streaming(struct vb2_queue *vq, unsigned int count) +{ + return 0; +} + +static void daedalus_stop_streaming(struct vb2_queue *vq) +{ + struct daedalus_ctx *ctx = vb2_get_drv_priv(vq); + struct vb2_v4l2_buffer *vbuf; + + while ((vbuf = V4L2_TYPE_IS_OUTPUT(vq->type) + ? 
v4l2_m2m_src_buf_remove(ctx->m2m_ctx) + : v4l2_m2m_dst_buf_remove(ctx->m2m_ctx)) != NULL) + v4l2_m2m_buf_done(vbuf, VB2_BUF_STATE_ERROR); +} + +static const struct vb2_ops daedalus_qops = { + .queue_setup = daedalus_queue_setup, + .buf_prepare = daedalus_buf_prepare, + .buf_queue = daedalus_buf_queue, + .start_streaming = daedalus_start_streaming, + .stop_streaming = daedalus_stop_streaming, + .wait_prepare = vb2_ops_wait_prepare, + .wait_finish = vb2_ops_wait_finish, +}; + +/* -- m2m queue init -------------------------------------------------- */ + +static int daedalus_queue_init(void *priv, struct vb2_queue *src_vq, + struct vb2_queue *dst_vq) +{ + struct daedalus_ctx *ctx = priv; + int ret; + + src_vq->type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + src_vq->io_modes = VB2_MMAP | VB2_USERPTR; + src_vq->drv_priv = ctx; + src_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer); + src_vq->ops = &daedalus_qops; + src_vq->mem_ops = &vb2_vmalloc_memops; + src_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY; + src_vq->lock = &ctx->dev->m2m_lock; + + ret = vb2_queue_init(src_vq); + if (ret) + return ret; + + dst_vq->type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + dst_vq->io_modes = VB2_MMAP; + dst_vq->drv_priv = ctx; + dst_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer); + dst_vq->ops = &daedalus_qops; + dst_vq->mem_ops = &vb2_vmalloc_memops; + dst_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY; + dst_vq->lock = &ctx->dev->m2m_lock; + + return vb2_queue_init(dst_vq); +} + +/* -- in-flight tracking (cookie → ctx + bufs) ------------------------ */ + +/* + * The chardev RESP_FRAME path needs to find the per-request + * context + source/destination buffer pair so it can complete the + * V4L2 m2m job. Track in-flight requests in a small list keyed + * by cookie. Cookies are monotonically increasing (see + * device_run); collisions on wrap-around are astronomically + * unlikely in normal use and would self-clear once the older + * cookie's response arrives. 
+ */ +struct daedalus_inflight { + struct list_head list; + u32 cookie; + struct daedalus_ctx *ctx; + struct vb2_v4l2_buffer *src_buf; + struct vb2_v4l2_buffer *dst_buf; +}; + +static struct daedalus_inflight * +daedalus_inflight_pop_locked(struct daedalus_dev *dev, u32 cookie) +{ + struct daedalus_inflight *e; + + list_for_each_entry(e, &dev->inflight, list) { + if (e->cookie == cookie) { + list_del(&e->list); + return e; + } + } + return NULL; +} + +/* -- v4l2_m2m_ops.device_run ----------------------------------------- */ + +static atomic_t daedalus_cookie_seq = ATOMIC_INIT(0); + +static void daedalus_device_run(void *priv) +{ + struct daedalus_ctx *ctx = priv; + struct daedalus_dev *dev = ctx->dev; + struct vb2_v4l2_buffer *src_buf, *dst_buf; + struct daedalus_inflight *inf = NULL; + struct daedalus_req_decode *req = NULL; + void *bitstream; + size_t blen, payload_len; + u32 cookie; + int ret; + + src_buf = v4l2_m2m_next_src_buf(ctx->m2m_ctx); + dst_buf = v4l2_m2m_next_dst_buf(ctx->m2m_ctx); + if (!src_buf || !dst_buf) { + v4l2_warn(&dev->v4l2_dev, + "device_run with no src/dst buf — scheduler bug?\n"); + goto fail_job_finish; + } + + blen = vb2_get_plane_payload(&src_buf->vb2_buf, 0); + if (!blen || blen > DAEDALUS_MAX_BITSTREAM) { + v4l2_err(&dev->v4l2_dev, + "device_run: bitstream length %zu out of range [1, %lu]\n", + blen, (unsigned long) DAEDALUS_MAX_BITSTREAM); + goto fail_buf_error; + } + + bitstream = vb2_plane_vaddr(&src_buf->vb2_buf, 0); + if (!bitstream) { + v4l2_err(&dev->v4l2_dev, "device_run: vaddr NULL\n"); + goto fail_buf_error; + } + + payload_len = sizeof(*req) + blen; + req = kmalloc(payload_len, GFP_KERNEL); + if (!req) + goto fail_buf_error; + + req->codec_id = DAEDALUS_CODEC_VP9; + req->bitstream_len = (u32) blen; + req->flags = 0; + memcpy((u8 *) req + sizeof(*req), bitstream, blen); + + inf = kzalloc(sizeof(*inf), GFP_KERNEL); + if (!inf) + goto fail_buf_error; + cookie = (u32) atomic_inc_return(&daedalus_cookie_seq); + inf->cookie = 
cookie; + inf->ctx = ctx; + inf->src_buf = src_buf; + inf->dst_buf = dst_buf; + + mutex_lock(&dev->inflight_lock); + list_add_tail(&inf->list, &dev->inflight); + mutex_unlock(&dev->inflight_lock); + + ret = daedalus_chardev_enqueue_req(DAEDALUS_MSG_REQ_DECODE, cookie, + req, payload_len); + kfree(req); + req = NULL; + if (ret) { + v4l2_err(&dev->v4l2_dev, + "device_run: enqueue_req failed: %d\n", ret); + mutex_lock(&dev->inflight_lock); + list_del(&inf->list); + mutex_unlock(&dev->inflight_lock); + kfree(inf); + goto fail_buf_error; + } + + v4l2_dbg(1, 0, &dev->v4l2_dev, + "device_run: REQ_DECODE cookie=%u blen=%zu\n", cookie, blen); + /* + * Job stays open until RESP_FRAME comes back; chardev path + * calls v4l2_m2m_buf_done_and_job_finish then. + */ + return; + +fail_buf_error: + if (src_buf) { + v4l2_m2m_src_buf_remove(ctx->m2m_ctx); + v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_ERROR); + } + if (dst_buf) { + v4l2_m2m_dst_buf_remove(ctx->m2m_ctx); + v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_ERROR); + } + kfree(req); +fail_job_finish: + v4l2_m2m_job_finish(dev->m2m_dev, ctx->m2m_ctx); +} + +static const struct v4l2_m2m_ops daedalus_m2m_ops = { + .device_run = daedalus_device_run, +}; + +/* -- chardev RESP_FRAME → buf_done bridge ---------------------------- */ + +void daedalus_complete_resp_frame(u32 cookie, + const struct daedalus_resp_frame *fr, + const u8 *pixels, size_t pixels_len) +{ + struct daedalus_dev *dev = g_daedalus_dev; + struct daedalus_inflight *inf; + enum vb2_buffer_state state; + void *dst_y, *dst_uv; + u32 y_size, uv_size; + + if (!dev) + return; + + mutex_lock(&dev->inflight_lock); + inf = daedalus_inflight_pop_locked(dev, cookie); + mutex_unlock(&dev->inflight_lock); + if (!inf) { + pr_warn_ratelimited( + "daedalus_v4l2: RESP_FRAME for unknown cookie=%u\n", + cookie); + return; + } + + state = (fr->status == DAEDALUS_DECODE_OK) + ? 
VB2_BUF_STATE_DONE : VB2_BUF_STATE_ERROR; + + /* + * Copy inline pixel data into the CAPTURE buffer if the + * daemon supplied any. Phase 8.5: bytes-after-header in the + * RESP_FRAME payload carry the Y plane followed by the + * interleaved CbCr plane (NV12M layout), truncated to fit + * the 64 KiB chardev payload cap. Phase 8.6 swaps this for + * dmabuf-export so big frames don't get truncated. + */ + if (state == VB2_BUF_STATE_DONE && pixels_len) { + struct vb2_buffer *vb = &inf->dst_buf->vb2_buf; + + dst_y = vb2_plane_vaddr(vb, 0); + dst_uv = vb2_plane_vaddr(vb, 1); + y_size = min_t(u32, fr->luma_len, + (u32) vb2_plane_size(vb, 0)); + uv_size = min_t(u32, fr->chroma_len, + (u32) vb2_plane_size(vb, 1)); + + if (dst_y && y_size && pixels_len >= y_size) { + memcpy(dst_y, pixels, y_size); + vb2_set_plane_payload(vb, 0, y_size); + } else { + vb2_set_plane_payload(vb, 0, 0); + } + if (dst_uv && uv_size && pixels_len >= y_size + uv_size) { + memcpy(dst_uv, pixels + y_size, uv_size); + vb2_set_plane_payload(vb, 1, uv_size); + } else { + vb2_set_plane_payload(vb, 1, 0); + } + } + + /* + * Use the buf_done_and_job_finish helper rather than plain + * buf_done + job_finish: the helper pops the buffers off + * the m2m queue before marking them done, otherwise the + * scheduler immediately re-runs device_run on the same + * still-queued src buffer. Caught during Phase 8.5 first + * run — second REQ_DECODE with identical bitstream + oops + * in stop_streaming when the test client tore down. + */ + v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, inf->ctx->m2m_ctx, + state); + + kfree(inf); +} + +/* -- v4l2_ioctl_ops -------------------------------------------------- */ + static int daedalus_querycap(struct file *file, void *priv, struct v4l2_capability *cap) { strscpy(cap->driver, DAEDALUS_DRV_NAME, sizeof(cap->driver)); - /* - * cap->card is 32 bytes incl. NUL terminator. Pick a name - * that fits without truncation. 
- */ - strscpy(cap->card, "daedalus-fourier V3D7+NEON", - sizeof(cap->card)); + strscpy(cap->card, "daedalus-fourier V3D7+NEON", sizeof(cap->card)); snprintf(cap->bus_info, sizeof(cap->bus_info), "platform:%s", DAEDALUS_DRV_NAME); return 0; } -static const struct v4l2_ioctl_ops daedalus_ioctl_ops = { - .vidioc_querycap = daedalus_querycap, - /* - * Phase 8.2+ adds: - * .vidioc_enum_fmt_vid_{cap,out}_mplane - * .vidioc_g/s_fmt_vid_{cap,out}_mplane - * .vidioc_reqbufs / vidioc_{q,dq,query}buf - * .vidioc_streamon / .vidioc_streamoff - * stateless-codec controls via the v4l2_ctrl_handler. - */ -}; - -/* - * Phase 8.1 placeholder for .release. We DON'T yet have a - * vb2_queue (that's Phase 8.2), so the real vb2_fop_release - * isn't usable. Use the minimal v4l2_fh_release for now; the - * Phase 8.2 patch swaps this for vb2_fop_release. - */ -static int daedalus_release_phase81(struct file *file) +static int daedalus_enum_fmt(struct file *file, void *priv, + struct v4l2_fmtdesc *f) { - return v4l2_fh_release(file); + if (f->index != 0) + return -EINVAL; + f->pixelformat = (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) + ? 
DAEDALUS_OUTPUT_FOURCC + : DAEDALUS_CAPTURE_FOURCC; + if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) + f->flags |= V4L2_FMT_FLAG_COMPRESSED; + return 0; +} + +static int daedalus_g_fmt(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct daedalus_ctx *ctx = file_to_ctx(file); + + if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) + f->fmt.pix_mp = ctx->src_fmt; + else if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) + f->fmt.pix_mp = ctx->dst_fmt; + else + return -EINVAL; + return 0; +} + +static int daedalus_try_fmt(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct v4l2_pix_format_mplane *p = &f->fmt.pix_mp; + u32 w = clamp_t(u32, p->width, 16, 1920); + u32 h = clamp_t(u32, p->height, 16, 1088); + + if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) { + daedalus_fill_output_fmt(p, w, h); + } else if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) { + daedalus_fill_capture_fmt(p, w, h); + } else { + return -EINVAL; + } + return 0; +} + +static int daedalus_s_fmt(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct daedalus_ctx *ctx = file_to_ctx(file); + struct vb2_queue *vq; + int ret; + + vq = v4l2_m2m_get_vq(ctx->m2m_ctx, f->type); + if (!vq) + return -EINVAL; + if (vb2_is_busy(vq)) + return -EBUSY; + + ret = daedalus_try_fmt(file, priv, f); + if (ret) + return ret; + if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) + ctx->src_fmt = f->fmt.pix_mp; + else + ctx->dst_fmt = f->fmt.pix_mp; + return 0; +} + +static const struct v4l2_ioctl_ops daedalus_ioctl_ops = { + .vidioc_querycap = daedalus_querycap, + + .vidioc_enum_fmt_vid_out = daedalus_enum_fmt, + .vidioc_enum_fmt_vid_cap = daedalus_enum_fmt, + .vidioc_g_fmt_vid_out_mplane = daedalus_g_fmt, + .vidioc_g_fmt_vid_cap_mplane = daedalus_g_fmt, + .vidioc_s_fmt_vid_out_mplane = daedalus_s_fmt, + .vidioc_s_fmt_vid_cap_mplane = daedalus_s_fmt, + .vidioc_try_fmt_vid_out_mplane = daedalus_try_fmt, + .vidioc_try_fmt_vid_cap_mplane = daedalus_try_fmt, + + .vidioc_reqbufs 
= v4l2_m2m_ioctl_reqbufs, + .vidioc_querybuf = v4l2_m2m_ioctl_querybuf, + .vidioc_qbuf = v4l2_m2m_ioctl_qbuf, + .vidioc_dqbuf = v4l2_m2m_ioctl_dqbuf, + .vidioc_create_bufs = v4l2_m2m_ioctl_create_bufs, + .vidioc_prepare_buf = v4l2_m2m_ioctl_prepare_buf, + .vidioc_expbuf = v4l2_m2m_ioctl_expbuf, + + .vidioc_streamon = v4l2_m2m_ioctl_streamon, + .vidioc_streamoff = v4l2_m2m_ioctl_streamoff, + + .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, + .vidioc_unsubscribe_event = v4l2_event_unsubscribe, +}; + +/* -- file operations ------------------------------------------------- */ + +static int daedalus_open(struct file *file) +{ + struct daedalus_dev *dev = video_drvdata(file); + struct daedalus_ctx *ctx; + int ret; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->dev = dev; + + v4l2_fh_init(&ctx->fh, &dev->vdev); + file->private_data = &ctx->fh; + + v4l2_ctrl_handler_init(&ctx->hdl, 0); + ctx->fh.ctrl_handler = &ctx->hdl; + + daedalus_fill_output_fmt(&ctx->src_fmt, + DAEDALUS_DEFAULT_W, DAEDALUS_DEFAULT_H); + daedalus_fill_capture_fmt(&ctx->dst_fmt, + DAEDALUS_DEFAULT_W, DAEDALUS_DEFAULT_H); + + ctx->m2m_ctx = v4l2_m2m_ctx_init(dev->m2m_dev, ctx, + daedalus_queue_init); + if (IS_ERR(ctx->m2m_ctx)) { + ret = PTR_ERR(ctx->m2m_ctx); + v4l2_err(&dev->v4l2_dev, "m2m_ctx_init: %d\n", ret); + goto err_ctrl; + } + ctx->fh.m2m_ctx = ctx->m2m_ctx; + v4l2_fh_add(&ctx->fh); + return 0; + +err_ctrl: + v4l2_ctrl_handler_free(&ctx->hdl); + v4l2_fh_exit(&ctx->fh); + kfree(ctx); + return ret; +} + +static int daedalus_release(struct file *file) +{ + struct daedalus_ctx *ctx = file_to_ctx(file); + + v4l2_fh_del(&ctx->fh); + v4l2_m2m_ctx_release(ctx->m2m_ctx); + v4l2_ctrl_handler_free(&ctx->hdl); + v4l2_fh_exit(&ctx->fh); + kfree(ctx); + return 0; } -/* - * File operations. 
v4l2_fh_open provides the default open the - * v4l2-core machinery expects; .release is our Phase 8.1 - * placeholder; .unlocked_ioctl uses the kernel's video_ioctl2 - * dispatcher against our v4l2_ioctl_ops table. - */ static const struct v4l2_file_operations daedalus_fops = { .owner = THIS_MODULE, - .open = v4l2_fh_open, - .release = daedalus_release_phase81, + .open = daedalus_open, + .release = daedalus_release, + .poll = v4l2_m2m_fop_poll, .unlocked_ioctl = video_ioctl2, + .mmap = v4l2_m2m_fop_mmap, }; static void daedalus_vdev_release(struct video_device *vdev) { - /* - * The video_device is embedded inside our daedalus_dev which - * lives as long as the platform_device. Nothing to free here - * directly; this no-op release just satisfies v4l2-core's - * requirement that .release be set. - */ + /* embedded in daedalus_dev (devm) — nothing to free here */ } +/* -- platform driver bind -------------------------------------------- */ + static int daedalus_probe(struct platform_device *pdev) { struct daedalus_dev *dev; @@ -124,6 +662,9 @@ static int daedalus_probe(struct platform_device *pdev) return -ENOMEM; dev->pdev = pdev; platform_set_drvdata(pdev, dev); + mutex_init(&dev->m2m_lock); + mutex_init(&dev->inflight_lock); + INIT_LIST_HEAD(&dev->inflight); ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev); if (ret) { @@ -131,36 +672,50 @@ static int daedalus_probe(struct platform_device *pdev) return ret; } - /* Set up video_device. Embedded; vdev->release is no-op. 
*/ + dev->m2m_dev = v4l2_m2m_init(&daedalus_m2m_ops); + if (IS_ERR(dev->m2m_dev)) { + ret = PTR_ERR(dev->m2m_dev); + v4l2_err(&dev->v4l2_dev, "v4l2_m2m_init: %d\n", ret); + goto err_v4l2_dev; + } + strscpy(dev->vdev.name, DAEDALUS_VIDEO_NAME, sizeof(dev->vdev.name)); - dev->vdev.fops = &daedalus_fops; - dev->vdev.ioctl_ops = &daedalus_ioctl_ops; - dev->vdev.release = daedalus_vdev_release; - dev->vdev.v4l2_dev = &dev->v4l2_dev; - dev->vdev.vfl_dir = VFL_DIR_M2M; /* mem2mem: bitstream in, frames out */ - dev->vdev.device_caps = V4L2_CAP_VIDEO_M2M_MPLANE - | V4L2_CAP_STREAMING; + dev->vdev.fops = &daedalus_fops; + dev->vdev.ioctl_ops = &daedalus_ioctl_ops; + dev->vdev.release = daedalus_vdev_release; + dev->vdev.v4l2_dev = &dev->v4l2_dev; + dev->vdev.vfl_dir = VFL_DIR_M2M; + dev->vdev.device_caps = V4L2_CAP_VIDEO_M2M_MPLANE + | V4L2_CAP_STREAMING; + dev->vdev.lock = &dev->m2m_lock; video_set_drvdata(&dev->vdev, dev); ret = video_register_device(&dev->vdev, VFL_TYPE_VIDEO, -1); if (ret) { - dev_err(&pdev->dev, "video_register_device: %d\n", ret); - v4l2_device_unregister(&dev->v4l2_dev); - return ret; + v4l2_err(&dev->v4l2_dev, "video_register_device: %d\n", ret); + goto err_m2m; } + g_daedalus_dev = dev; v4l2_info(&dev->v4l2_dev, - "daedalus-v4l2 registered as /dev/video%d (Phase 8.1 skeleton)\n", + "daedalus-v4l2 m2m registered as /dev/video%d (Phase 8.5)\n", dev->vdev.num); - return 0; + +err_m2m: + v4l2_m2m_release(dev->m2m_dev); +err_v4l2_dev: + v4l2_device_unregister(&dev->v4l2_dev); + return ret; } static void daedalus_remove(struct platform_device *pdev) { struct daedalus_dev *dev = platform_get_drvdata(pdev); + g_daedalus_dev = NULL; video_unregister_device(&dev->vdev); + v4l2_m2m_release(dev->m2m_dev); v4l2_device_unregister(&dev->v4l2_dev); } @@ -172,11 +727,6 @@ static struct platform_driver daedalus_platform_driver = { }, }; -/* - * The platform device that our driver binds to. 
Synthesised - * at module load time since we have no device tree node yet - * (out-of-tree module; not vendored into the rpi DT). - */ static struct platform_device *daedalus_platform_device; static int __init daedalus_init(void) @@ -225,4 +775,4 @@ module_exit(daedalus_exit); MODULE_AUTHOR("Markus Fritsche "); MODULE_DESCRIPTION("V4L2 stateless decoder shim for daedalus-fourier (Pi 5 / VC7)"); MODULE_LICENSE("GPL v2"); -MODULE_VERSION("0.0.1"); +MODULE_VERSION("0.0.2"); diff --git a/kernel/daedalus_v4l2_main.h b/kernel/daedalus_v4l2_main.h new file mode 100644 index 0000000..bad639d --- /dev/null +++ b/kernel/daedalus_v4l2_main.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * daedalus-v4l2 — kernel-internal device/state declarations. + * + * Shared between daedalus_v4l2_main.c (V4L2 m2m driver) and + * daedalus_v4l2_chardev.c (kernel↔daemon bridge). The chardev + * needs to look up in-flight V4L2 requests by cookie to complete + * the m2m job when RESP_FRAME arrives — that path lives in + * daedalus_complete_resp_frame(). + */ +#ifndef DAEDALUS_V4L2_MAIN_H +#define DAEDALUS_V4L2_MAIN_H + +#include +#include +#include + +#include +#include +#include + +#include "daedalus_v4l2_proto.h" + +/** + * struct daedalus_dev - top-level device state (singleton for now) + * @pdev: owning platform device (synthesised in module_init) + * @v4l2_dev: V4L2 device parent for any video_device we register + * @vdev: video_device exposed as /dev/videoNN + * @m2m_dev: mem2mem device shared by all per-open contexts + * @m2m_lock: serialises vb2 queue + v4l2 ioctl ops + * @inflight: list of struct daedalus_inflight (REQ_DECODE sent, + * RESP_FRAME not yet returned) + * @inflight_lock: protects @inflight + * + * Singleton per-module instance. Multi-instance support (one + * decoder per /dev/videoNN) would require breaking g_daedalus_dev + * out of daedalus_v4l2_main.c; not needed yet. 
+ */ +struct daedalus_dev { + struct platform_device *pdev; + struct v4l2_device v4l2_dev; + struct video_device vdev; + struct v4l2_m2m_dev *m2m_dev; + struct mutex m2m_lock; + struct list_head inflight; + struct mutex inflight_lock; +}; + +/* Module-wide singleton accessor (chardev needs this for RESP_FRAME). */ +struct daedalus_dev *daedalus_get_dev(void); + +/** + * daedalus_complete_resp_frame() - chardev RESP_FRAME completion + * @cookie: cookie carried by the matching REQ_DECODE + * @fr: RESP_FRAME header from the daemon + * @pixels: inline pixel bytes following the header in the + * chardev payload (may be NULL if @pixels_len == 0) + * @pixels_len: number of inline pixel bytes + * + * Called from the chardev write() path on RESP_FRAME. Looks up + * the in-flight request, copies inline pixel data into the + * CAPTURE vb2 buffer if available, then completes both src+dst + * buffers and finishes the m2m job. Silently drops responses + * for unknown cookies (pr_warn_ratelimited). + */ +void daedalus_complete_resp_frame(u32 cookie, + const struct daedalus_resp_frame *fr, + const u8 *pixels, size_t pixels_len); + +#endif /* DAEDALUS_V4L2_MAIN_H */ diff --git a/tools/Makefile b/tools/Makefile index 6243836..060fe35 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -6,7 +6,7 @@ CC ?= cc CFLAGS ?= -Wall -Wextra -O2 CFLAGS += -I../include -TOOLS := test_chardev_pingpong +TOOLS := test_chardev_pingpong test_m2m_decode all: $(TOOLS) diff --git a/tools/test_m2m_decode.c b/tools/test_m2m_decode.c new file mode 100644 index 0000000..67be310 --- /dev/null +++ b/tools/test_m2m_decode.c @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * test_m2m_decode — minimal V4L2 m2m stateless decoder client. + * + * Drives /dev/videoNN through one full QBUF/DQBUF round-trip: + * 1. open the m2m device + * 2. S_FMT on OUTPUT (VP9_FRAME) and CAPTURE (NV12M) + * 3. REQBUFS 1 on both queues + * 4. mmap the OUTPUT buffer, copy bitstream into it, QBUF + * 5. 
/*
 * Read the whole file at @path into a freshly malloc()ed buffer.
 *
 * On success the buffer is returned and its length stored in *@out_len;
 * the caller owns the buffer and must free() it.  An empty file yields a
 * valid (non-NULL) buffer with *@out_len == 0.
 *
 * Any failure is fatal, reported the same way die() does it: a message on
 * stderr followed by exit(1).  perror() is only used where errno really
 * describes the failure; a premature end-of-file gets its own message
 * instead of a misleading errno text.
 */
static void *read_file(const char *path, size_t *out_len)
{
	struct stat st;
	unsigned char *buf;
	size_t size;
	size_t done = 0;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open bitstream");
		exit(1);
	}
	if (fstat(fd, &st) < 0) {
		perror("fstat");
		exit(1);
	}
	if (st.st_size < 0) {
		fprintf(stderr, "fstat: negative size for %s\n", path);
		exit(1);
	}
	size = (size_t) st.st_size;

	/* malloc(0) may legally return NULL; never ask for zero bytes. */
	buf = malloc(size ? size : 1);
	if (!buf) {
		perror("malloc bitstream");
		exit(1);
	}

	/* read() may return less than requested; loop until complete. */
	while (done < size) {
		ssize_t n = read(fd, buf + done, size - done);

		if (n < 0) {
			perror("read bitstream");
			exit(1);
		}
		if (n == 0) {
			fprintf(stderr,
				"read bitstream: unexpected EOF after %zu of %zu bytes\n",
				done, size);
			exit(1);
		}
		done += (size_t) n;
	}

	close(fd);
	*out_len = size;
	return buf;
}
V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + fmt.fmt.pix_mp.width = w; + fmt.fmt.pix_mp.height = h; + fmt.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_VP9_FRAME; + if (ioctl(fd, VIDIOC_S_FMT, &fmt) < 0) + die("S_FMT OUTPUT"); + printf("OUTPUT sizeimage = %u\n", fmt.fmt.pix_mp.plane_fmt[0].sizeimage); + + /* --- S_FMT CAPTURE (NV12M) --- */ + memset(&fmt, 0, sizeof(fmt)); + fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + fmt.fmt.pix_mp.width = w; + fmt.fmt.pix_mp.height = h; + fmt.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_NV12M; + if (ioctl(fd, VIDIOC_S_FMT, &fmt) < 0) + die("S_FMT CAPTURE"); + printf("CAPTURE planes = %u, [0].sizeimage=%u [1].sizeimage=%u\n", + fmt.fmt.pix_mp.num_planes, + fmt.fmt.pix_mp.plane_fmt[0].sizeimage, + fmt.fmt.pix_mp.plane_fmt[1].sizeimage); + cap_y_size = fmt.fmt.pix_mp.plane_fmt[0].sizeimage; + cap_uv_size = fmt.fmt.pix_mp.plane_fmt[1].sizeimage; + + /* --- REQBUFS OUTPUT --- */ + memset(&reqbuf, 0, sizeof(reqbuf)); + reqbuf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + reqbuf.memory = V4L2_MEMORY_MMAP; + reqbuf.count = 1; + if (ioctl(fd, VIDIOC_REQBUFS, &reqbuf) < 0) + die("REQBUFS OUTPUT"); + printf("OUTPUT REQBUFS -> %u\n", reqbuf.count); + + /* --- REQBUFS CAPTURE --- */ + memset(&reqbuf, 0, sizeof(reqbuf)); + reqbuf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + reqbuf.memory = V4L2_MEMORY_MMAP; + reqbuf.count = 1; + if (ioctl(fd, VIDIOC_REQBUFS, &reqbuf) < 0) + die("REQBUFS CAPTURE"); + printf("CAPTURE REQBUFS -> %u\n", reqbuf.count); + + /* --- QUERYBUF OUTPUT[0] + mmap + fill --- */ + memset(&buf, 0, sizeof(buf)); + memset(planes, 0, sizeof(planes)); + buf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + buf.memory = V4L2_MEMORY_MMAP; + buf.index = 0; + buf.m.planes = planes; + buf.length = 1; + if (ioctl(fd, VIDIOC_QUERYBUF, &buf) < 0) + die("QUERYBUF OUTPUT"); + out_buf_offset = planes[0].m.mem_offset; + out_map = mmap(NULL, planes[0].length, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, out_buf_offset); + if (out_map == MAP_FAILED) + die("mmap OUTPUT"); + 
if (bs_len > planes[0].length) { + fprintf(stderr, "bitstream too big: %zu > %u\n", + bs_len, planes[0].length); + return 1; + } + memcpy(out_map, bitstream, bs_len); + planes[0].bytesused = (uint32_t) bs_len; + + if (ioctl(fd, VIDIOC_QBUF, &buf) < 0) + die("QBUF OUTPUT"); + printf("QBUF OUTPUT[0] bytesused=%zu\n", bs_len); + + /* --- QBUF CAPTURE[0] --- */ + memset(&buf, 0, sizeof(buf)); + memset(planes, 0, sizeof(planes)); + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + buf.memory = V4L2_MEMORY_MMAP; + buf.index = 0; + buf.m.planes = planes; + buf.length = 2; + if (ioctl(fd, VIDIOC_QBUF, &buf) < 0) + die("QBUF CAPTURE"); + printf("QBUF CAPTURE[0]\n"); + + /* --- STREAMON both --- */ + t = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + if (ioctl(fd, VIDIOC_STREAMON, &t) < 0) + die("STREAMON OUTPUT"); + t = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + if (ioctl(fd, VIDIOC_STREAMON, &t) < 0) + die("STREAMON CAPTURE"); + printf("STREAMON both\n"); + + /* --- poll for CAPTURE completion --- */ + { + struct pollfd p = { .fd = fd, .events = POLLIN | POLLOUT }; + rc = poll(&p, 1, POLL_TIMEOUT_MS); + if (rc < 0) + die("poll"); + if (rc == 0) { + fprintf(stderr, "poll timeout\n"); + return 1; + } + printf("poll revents=0x%x\n", p.revents); + } + + /* --- DQBUF OUTPUT (return the bitstream buffer) --- */ + memset(&buf, 0, sizeof(buf)); + memset(planes, 0, sizeof(planes)); + buf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + buf.memory = V4L2_MEMORY_MMAP; + buf.m.planes = planes; + buf.length = 1; + if (ioctl(fd, VIDIOC_DQBUF, &buf) < 0) + die("DQBUF OUTPUT"); + printf("DQBUF OUTPUT[%u] flags=0x%x\n", buf.index, buf.flags); + + /* --- DQBUF CAPTURE (get the decoded frame) --- */ + memset(&buf, 0, sizeof(buf)); + memset(planes, 0, sizeof(planes)); + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + buf.memory = V4L2_MEMORY_MMAP; + buf.m.planes = planes; + buf.length = 2; + if (ioctl(fd, VIDIOC_DQBUF, &buf) < 0) + die("DQBUF CAPTURE"); + printf("DQBUF CAPTURE[%u] flags=0x%x payloads=[%u, 
%u]\n", + buf.index, buf.flags, + planes[0].bytesused, planes[1].bytesused); + if (buf.flags & V4L2_BUF_FLAG_ERROR) { + fprintf(stderr, "CAPTURE buffer flagged ERROR\n"); + return 1; + } + + /* --- mmap CAPTURE plane 0 + 1 and dump --- */ + { + struct v4l2_buffer qb; + struct v4l2_plane pl[2]; + + memset(&qb, 0, sizeof(qb)); + memset(pl, 0, sizeof(pl)); + qb.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + qb.memory = V4L2_MEMORY_MMAP; + qb.index = 0; + qb.m.planes = pl; + qb.length = 2; + if (ioctl(fd, VIDIOC_QUERYBUF, &qb) < 0) + die("QUERYBUF CAPTURE"); + cap_y = mmap(NULL, pl[0].length, PROT_READ, MAP_SHARED, fd, + pl[0].m.mem_offset); + if (cap_y == MAP_FAILED) + die("mmap cap Y"); + cap_uv = mmap(NULL, pl[1].length, PROT_READ, MAP_SHARED, fd, + pl[1].m.mem_offset); + if (cap_uv == MAP_FAILED) + die("mmap cap UV"); + } + + { + FILE *of = fopen(out_path, "wb"); + size_t y_actual = planes[0].bytesused + ? planes[0].bytesused : cap_y_size; + size_t uv_actual = planes[1].bytesused + ? planes[1].bytesused : cap_uv_size; + if (!of) + die("fopen out"); + fwrite(cap_y, 1, y_actual, of); + fwrite(cap_uv, 1, uv_actual, of); + fclose(of); + printf("wrote %zu Y + %zu UV bytes to %s\n", + y_actual, uv_actual, out_path); + } + + /* --- STREAMOFF, cleanup --- */ + t = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + ioctl(fd, VIDIOC_STREAMOFF, &t); + t = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + ioctl(fd, VIDIOC_STREAMOFF, &t); + + close(fd); + free(bitstream); + printf("OK\n"); + (void) expbuf; + (void) i; + return 0; +}