Files
daedalus-v4l2/kernel/daedalus_v4l2_chardev.c
T
claude-noether 94be8c3d03 kernel: drain in-flight m2m jobs on daemon disconnect
Fixes issue #146 — a daemon crash (SIGKILL, SEGV, anything that
triggers chardev release) leaves V4L2 consumers stuck in unkillable
TASK_UNINTERRUPTIBLE sleep when they close /dev/video0.

## Root cause

device_run() adds an entry to dev->inflight when it sends a
REQ_DECODE to the daemon, marking the m2m job as "running".
The job is only cleared via v4l2_m2m_buf_done_and_job_finish()
in daedalus_complete_resp_frame(), which only fires on RESP_FRAME.

If the daemon dies (SIGKILL, SEGV, exit) BEFORE writing the
matching RESP_FRAME:
  - the inflight entry is never popped
  - v4l2_m2m_buf_done_and_job_finish is never called
  - the m2m scheduler still thinks a job is running

Later, when the V4L2 consumer's close() runs (or the consumer is
signalled to exit), v4l2_m2m_ctx_release() → v4l2_m2m_cancel_job()
waits indefinitely for !job_running.  The consumer enters D-state
and survives SIGKILL until reboot.

Reproduced on hertz 2026-05-23, kernel 6.12.75+rpt-rpi-2712:

  $ sudo kill -STOP $DAEMON_PID            # block daemon I/O
  $ ./test_m2m_decode keyframe.bin out.nv12 1920 1080 vp9 &
  $ sudo kill -9 $DAEMON_PID               # chardev_release fires
  $ kill -9 $CLIENT_PID                    # ignored — D-state
  # client stack:
  v4l2_m2m_cancel_job+0x14c [v4l2_mem2mem]
  v4l2_m2m_ctx_release+0x20 [v4l2_mem2mem]
  daedalus_release+0x2c [daedalus_v4l2]
  v4l2_release+0x7c [videodev]
  __fput → do_exit → SIGKILL never delivered

## Fix

New API daedalus_drain_inflight_on_disconnect() in main.{c,h}:
walks the in-flight list, marks both src+dst buffers
VB2_BUF_STATE_ERROR via v4l2_m2m_buf_done_and_job_finish(), and
releases the bound media_request if any.  Same completion shape
as daedalus_complete_resp_frame() takes on the success path,
just with state = ERROR for every in-flight entry.

chardev_release calls the drain after flushing dev->req_queue
(messages still in req_queue weren't released to the daemon yet,
so they don't need the m2m-job-finish dance — freeing them is
sufficient).  The order matters: queue first (cheap), then m2m
drain (heavier, takes the inflight list).

Locking: list_splice_init under inflight_lock to take the entire
list atomically; lock dropped before iterating because
v4l2_m2m_buf_done_and_job_finish can sleep via vb2's buffer-done
dispatch and can re-enter device_run via the scheduler (which
would need inflight_lock again on the next REQ_DECODE).

## Verification path

Cannot rmmod the running module on hertz right now — the D-state
corpse from the repro session pins the refcount.  Verification
of the fixed module needs a reboot or fresh test host:

  $ sudo reboot                            # clears hung client
  $ sudo make modules_install              # install new .ko
  $ sudo modprobe daedalus_v4l2
  $ # rerun the repro script — client should die cleanly with
  $ # an -EIO / similar return from poll/DQBUF instead of hanging.

Build: clean on Linux 6.12.75 + rpt-rpi-2712, no new warnings.
The pre-existing "frame size 2128 > 2048" warning on
daedalus_device_run is unchanged by this commit.

## Followup not in scope

If a new V4L2 consumer races a REQ_DECODE through device_run
AFTER the drain has spliced the list (but before the daemon
chardev is reopened), the new entry sits in a freshly-empty
inflight list and the same hang can recur for that consumer
when the systemd auto-restart of the daemon either fails or
takes longer than the consumer's patience.  A secondary
safeguard would be to fail-fast in device_run when dev->chardev
is unopened — proposing as a separate ticket if this race
materialises in practice.

Closes #146.
2026-05-23 17:06:06 +02:00

551 lines
15 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* daedalus-v4l2 — kernel ↔ daemon chardev bridge.
*
* Exposes /dev/daedalus-v4l2 (a misc-class character device)
* for the userspace daemon to attach to. Single-instance:
* only one open file at a time. Blocking read() pulls the next
* request from a kernel-side FIFO; write() submits a response.
*
* Phase 8.2 scope: PING request handling — the daemon writes a
* PONG response to a PING request that arrives via read(). In
* Phase 8.2 the kernel injects test PING requests itself via a
* debugfs trigger (no V4L2 ioctl flow yet); Phase 8.4 wires
* real DECODE requests from the V4L2 path.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/debugfs.h>
#include "daedalus_v4l2_proto.h"
#include "daedalus_v4l2_chardev.h"
#include "daedalus_v4l2_main.h"
/* Misc-device name; the node is announced as /dev/daedalus-v4l2 at init. */
#define DAEDALUS_CHARDEV_NAME "daedalus-v4l2"
/* Cap the number of pending requests so a stuck daemon can't OOM us. */
#define DAEDALUS_QUEUE_MAX 64
/**
* struct daedalus_chardev_msg - in-kernel queued message
* @list: queue linkage (node on daedalus_chardev.req_queue)
* @hdr: wire header; copied to the daemon ahead of the payload by read()
* @payload: heap copy of the payload bytes; size = hdr.payload_len.
*           Left NULL when the message was enqueued with a zero length.
*
* The message and its @payload are released together by
* daedalus_chardev_msg_free().
*/
struct daedalus_chardev_msg {
struct list_head list;
struct daedalus_msg_hdr hdr;
u8 *payload;
};
/**
* struct daedalus_chardev - per-singleton chardev state
* @misc: misc-class device registration
* @open_lock: serialises open()/release()
* @opened: non-zero when the chardev is currently open; written only
*          with @open_lock held
* @req_lock: protects @req_queue / @req_count
* @req_queue: list of pending REQ_* messages waiting for daemon read()
* @req_count: current number of queued requests; updated under
*             @req_lock, also sampled without the lock by poll() and
*             by the read() wait condition
* @req_wait: read() blocks here until a request arrives
* @debugfs_dir: "daedalus_v4l2" debugfs directory holding the test_ping
*               and test_decode triggers; may hold an ERR_PTR if
*               directory creation failed
*/
struct daedalus_chardev {
struct miscdevice misc;
struct mutex open_lock;
int opened;
struct mutex req_lock;
struct list_head req_queue;
int req_count;
wait_queue_head_t req_wait;
struct dentry *debugfs_dir;
};
/*
* The single chardev instance. NULL until daedalus_chardev_init() has
* finished, and reset to NULL by daedalus_chardev_exit(). Accessed
* without a lock, so users must tolerate it being NULL.
*/
static struct daedalus_chardev *g_chardev;
/* -- internal helpers ------------------------------------------------ */
/*
 * Detach and return the oldest queued request, or NULL when the queue
 * is empty. The callers in this file invoke it with dev->req_lock held.
 * Ownership of the returned message passes to the caller.
 */
static struct daedalus_chardev_msg *
daedalus_chardev_dequeue_locked(struct daedalus_chardev *dev)
{
	struct daedalus_chardev_msg *head;

	head = list_first_entry_or_null(&dev->req_queue,
					struct daedalus_chardev_msg, list);
	if (head) {
		list_del(&head->list);
		dev->req_count--;
	}
	return head;
}
/* Release a message together with its payload copy; NULL is a no-op. */
static void daedalus_chardev_msg_free(struct daedalus_chardev_msg *msg)
{
	if (msg) {
		kfree(msg->payload);
		kfree(msg);
	}
}
/**
 * daedalus_chardev_enqueue_req() - queue a request for the daemon's read()
 * @type: message type; values with bit 31 set are responses and rejected
 * @cookie: caller-chosen identifier stored in the wire header
 * @payload: bytes to copy into the message (only read when @payload_len != 0)
 * @payload_len: payload size; at most DAEDALUS_PROTO_MAX_PAYLOAD
 *
 * The payload is duplicated, so the caller keeps ownership of @payload.
 * On success the message is appended to the request FIFO and sleepers
 * on the request wait queue are woken.
 *
 * Return: 0 on success, -ENODEV if the chardev is not initialised,
 * -EMSGSIZE if the payload is too large, -EINVAL for a response type,
 * -ENOMEM on allocation failure, -ENOSPC if DAEDALUS_QUEUE_MAX requests
 * are already pending.
 */
int daedalus_chardev_enqueue_req(u32 type, u32 cookie,
				 const void *payload, size_t payload_len)
{
	struct daedalus_chardev *cdev = g_chardev;
	struct daedalus_chardev_msg *req;
	int err = 0;

	if (!cdev)
		return -ENODEV;
	if (payload_len > DAEDALUS_PROTO_MAX_PAYLOAD)
		return -EMSGSIZE;
	/* Bit 31 marks a response; responses don't get queued here. */
	if (type & 0x80000000u)
		return -EINVAL;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	if (payload_len) {
		req->payload = kmemdup(payload, payload_len, GFP_KERNEL);
		if (!req->payload) {
			err = -ENOMEM;
			goto drop;
		}
	}

	req->hdr.magic = DAEDALUS_PROTO_MAGIC;
	req->hdr.version = DAEDALUS_PROTO_VERSION;
	req->hdr.type = type;
	req->hdr.cookie = cookie;
	req->hdr.payload_len = (u32) payload_len;
	req->hdr.reserved = 0;

	mutex_lock(&cdev->req_lock);
	if (cdev->req_count < DAEDALUS_QUEUE_MAX) {
		list_add_tail(&req->list, &cdev->req_queue);
		cdev->req_count++;
	} else {
		err = -ENOSPC;
	}
	mutex_unlock(&cdev->req_lock);
	if (err)
		goto drop;

	wake_up_interruptible(&cdev->req_wait);
	return 0;

drop:
	daedalus_chardev_msg_free(req);
	return err;
}
/* -- file operations ------------------------------------------------- */
static int daedalus_chardev_open(struct inode *inode, struct file *file)
{
struct daedalus_chardev *dev = g_chardev;
mutex_lock(&dev->open_lock);
if (dev->opened) {
mutex_unlock(&dev->open_lock);
return -EBUSY;
}
dev->opened = 1;
mutex_unlock(&dev->open_lock);
file->private_data = dev;
return 0;
}
/*
 * release() handler: runs when the daemon's file goes away. Discards
 * every request still queued, fails the in-flight V4L2 work, then
 * marks the device as free for the next open().
 */
static int daedalus_chardev_release(struct inode *inode, struct file *file)
{
	struct daedalus_chardev *dev = file->private_data;
	struct daedalus_chardev_msg *pending;

	/* Free queued requests one at a time, never freeing under req_lock. */
	for (;;) {
		mutex_lock(&dev->req_lock);
		pending = daedalus_chardev_dequeue_locked(dev);
		mutex_unlock(&dev->req_lock);
		if (!pending)
			break;
		daedalus_chardev_msg_free(pending);
	}

	/*
	 * Issue #146: fail the V4L2-side in-flight list now that the
	 * daemon is gone. A REQ_DECODE that was already handed to the
	 * daemon will never see its RESP_FRAME; if it is not completed
	 * here, v4l2_m2m_cancel_job() in the V4L2 consumer's close()
	 * path (or in vb2's STREAMOFF path) waits forever for a
	 * job_finish and the consumer ends up in unkillable D-state.
	 *
	 * This runs AFTER the request queue has been emptied: a
	 * REQ_DECODE still sitting in dev->req_queue was by definition
	 * never released to the daemon, so it is not "in flight" and
	 * freeing the message is enough. The inflight list holds the
	 * entries the kernel already committed to (added in device_run
	 * after the message was queued or written) — exactly the ones
	 * that must be failed back to vb2.
	 */
	daedalus_drain_inflight_on_disconnect();

	mutex_lock(&dev->open_lock);
	dev->opened = 0;
	mutex_unlock(&dev->open_lock);

	return 0;
}
/*
 * read() handler: deliver exactly one queued request (wire header
 * followed by its payload) to the daemon.
 *
 * Blocks until a request is available unless the file is O_NONBLOCK.
 *
 * Returns the number of bytes copied (header + payload) on success, or:
 *   -EINVAL    @count cannot even hold a header
 *   -EAGAIN    O_NONBLOCK and the queue is empty
 *   -EMSGSIZE  @count is too small for this message; the message is
 *              put back at the head of the queue so the caller can
 *              retry with a larger buffer
 *   -EFAULT    copying to userspace failed; the message is dropped
 *   or the error from an interrupted wait.
 */
static ssize_t daedalus_chardev_read(struct file *file, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct daedalus_chardev *dev = file->private_data;
	struct daedalus_chardev_msg *msg;
	size_t total;
	ssize_t ret;

	if (count < sizeof(struct daedalus_msg_hdr))
		return -EINVAL;

	for (;;) {
		mutex_lock(&dev->req_lock);
		msg = daedalus_chardev_dequeue_locked(dev);
		mutex_unlock(&dev->req_lock);
		if (msg)
			break;
		if (file->f_flags & O_NONBLOCK)
			return -EAGAIN;
		/*
		 * req_count is sampled without req_lock here, as poll()
		 * does; the dequeue above re-checks under the lock.
		 */
		ret = wait_event_interruptible(dev->req_wait,
					       READ_ONCE(dev->req_count) > 0);
		if (ret)
			return ret;
	}

	total = sizeof(msg->hdr) + msg->hdr.payload_len;
	if (count < total) {
		/*
		 * Requeue so the caller can retry with a bigger buffer.
		 * Re-enqueue at HEAD to preserve FIFO order.
		 */
		mutex_lock(&dev->req_lock);
		list_add(&msg->list, &dev->req_queue);
		dev->req_count++;
		mutex_unlock(&dev->req_lock);
		/*
		 * Another reader or poller may have observed the queue
		 * as empty while this message was detached; wake them so
		 * the requeued request is not left unnoticed.
		 */
		wake_up_interruptible(&dev->req_wait);
		return -EMSGSIZE;
	}

	/*
	 * NOTE(review): on a failed copy the request is consumed and
	 * dropped, matching the previous behaviour.
	 */
	if (copy_to_user(buf, &msg->hdr, sizeof(msg->hdr)) ||
	    (msg->hdr.payload_len &&
	     copy_to_user(buf + sizeof(msg->hdr), msg->payload,
			  msg->hdr.payload_len)))
		ret = -EFAULT;
	else
		ret = total;

	daedalus_chardev_msg_free(msg);
	return ret;
}
/*
 * write() handler: accept one response message (wire header plus
 * payload) from the daemon.
 *
 * The header is validated first (magic, version, payload size), then
 * the payload is copied into a temporary kernel buffer. RESP_FRAME
 * messages are parsed and handed to daedalus_complete_resp_frame();
 * every other type is only logged at debug level.
 *
 * Returns the number of bytes consumed (header + payload_len, which
 * may be less than @count), or:
 *   -EINVAL    @count is shorter than the header or the full message
 *   -EFAULT    copying from userspace failed
 *   -EBADMSG   bad magic, or a RESP_FRAME payload shorter than
 *              struct daedalus_resp_frame
 *   -EPROTO    protocol version mismatch
 *   -EMSGSIZE  payload_len exceeds DAEDALUS_PROTO_MAX_PAYLOAD
 *   -ENOMEM    payload buffer allocation failed
 */
static ssize_t daedalus_chardev_write(struct file *file,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct daedalus_msg_hdr hdr;
	size_t msg_len;
	ssize_t ret;
	u8 *body = NULL;

	if (count < sizeof(hdr))
		return -EINVAL;
	if (copy_from_user(&hdr, buf, sizeof(hdr)))
		return -EFAULT;

	if (hdr.magic != DAEDALUS_PROTO_MAGIC)
		return -EBADMSG;
	if (hdr.version != DAEDALUS_PROTO_VERSION)
		return -EPROTO;
	if (hdr.payload_len > DAEDALUS_PROTO_MAX_PAYLOAD)
		return -EMSGSIZE;

	msg_len = sizeof(hdr) + hdr.payload_len;
	if (count < msg_len)
		return -EINVAL;

	if (hdr.payload_len) {
		body = kmalloc(hdr.payload_len, GFP_KERNEL);
		if (!body)
			return -ENOMEM;
		if (copy_from_user(body, buf + sizeof(hdr),
				   hdr.payload_len)) {
			ret = -EFAULT;
			goto out;
		}
	}

	ret = msg_len;

	if (hdr.type == DAEDALUS_MSG_RESP_FRAME) {
		struct daedalus_resp_frame fr;
		const u8 *pixels = NULL;
		size_t pixels_len;

		if (hdr.payload_len < sizeof(fr)) {
			pr_warn("daedalus_v4l2: RESP_FRAME payload too short (%u < %zu)\n",
				hdr.payload_len, sizeof(fr));
			ret = -EBADMSG;
			goto out;
		}
		memcpy(&fr, body, sizeof(fr));

		/* Anything after the fixed struct is inline pixel data. */
		pixels_len = hdr.payload_len - sizeof(fr);
		if (pixels_len)
			pixels = body + sizeof(fr);

		pr_debug("daedalus_v4l2: RESP_FRAME cookie=%u status=%u codec=%u %ux%u pixfmt=%d luma=%u chroma=%u fnv1a=0x%08x inline_pixels=%zu\n",
			 hdr.cookie, fr.status, fr.codec_id,
			 fr.width, fr.height, fr.pix_fmt,
			 fr.luma_len, fr.chroma_len, fr.fnv1a_yuv,
			 pixels_len);
		/*
		 * Pass the result on to the V4L2 m2m completion path.
		 * Per the original note, when no V4L2 device/m2m_ctx
		 * exists (e.g. only the debugfs test_decode trigger was
		 * used), daedalus_complete_resp_frame is expected to
		 * return quietly after a ratelimited warning.
		 */
		daedalus_complete_resp_frame(hdr.cookie, &fr, pixels,
					     pixels_len);
	} else {
		pr_debug("daedalus_v4l2: chardev got response type=0x%08x cookie=%u plen=%u\n",
			 hdr.type, hdr.cookie, hdr.payload_len);
	}

out:
	kfree(body);
	return ret;
}
static __poll_t daedalus_chardev_poll(struct file *file,
struct poll_table_struct *wait)
{
struct daedalus_chardev *dev = file->private_data;
__poll_t mask = EPOLLOUT | EPOLLWRNORM;
poll_wait(file, &dev->req_wait, wait);
if (READ_ONCE(dev->req_count) > 0)
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
}
/*
* Phase 8.6 chardev ioctl: daemon uses DAEDALUS_IOC_GET_DMABUF
* to fetch a dmabuf fd for the CAPTURE buffer the kernel
* scheduled. The fd is installed in the calling task's fd
* table by vb2_core_expbuf, so the daemon can mmap it directly.
*/
static long daedalus_chardev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
switch (cmd) {
case DAEDALUS_IOC_GET_DMABUF: {
struct daedalus_get_dmabuf k;
int fd;
int rc;
if (copy_from_user(&k, (void __user *) arg, sizeof(k)))
return -EFAULT;
rc = daedalus_export_capture_dmabuf(k.cookie, k.plane,
k.flags, &fd);
if (rc)
return rc;
k.fd = fd;
if (copy_to_user((void __user *) arg, &k, sizeof(k))) {
/* fd is already installed in caller's table; daemon
* still must close it on this error path. */
return -EFAULT;
}
return 0;
}
default:
return -ENOTTY;
}
}
/*
 * File operations for /dev/daedalus-v4l2.
 *
 * .llseek is deliberately left NULL: the chardev is a streaming
 * request/response channel with no positional semantics. Recent
 * kernels removed `no_llseek`; an empty slot gets the generic
 * "no-op or -ESPIPE" behaviour the v6.12+ vfs picks.
 */
static const struct file_operations daedalus_chardev_fops = {
	.owner		= THIS_MODULE,
	.read		= daedalus_chardev_read,
	.write		= daedalus_chardev_write,
	.poll		= daedalus_chardev_poll,
	.unlocked_ioctl	= daedalus_chardev_ioctl,
	.open		= daedalus_chardev_open,
	.release	= daedalus_chardev_release,
};
/* -- debugfs test trigger (Phase 8.2 only) --------------------------- */
/*
 * debugfs trigger: a write to
 * /sys/kernel/debug/daedalus_v4l2/test_ping enqueues one PING request
 * with cookie 0x1234 and a fixed 24-byte payload
 * "DAEDALUS-V4L2-PING-PL\0\0\0". The bytes written by the caller are
 * not inspected; only the act of writing matters.
 *
 * The userspace test daemon (tools/test_chardev_pingpong.c) is meant
 * to read the PING back and answer with a PONG, which the kernel logs
 * at pr_debug level.
 *
 * Phase 8.4 replaces this with real REQ_DECODE injection from the
 * V4L2 buffer-submit path; the debugfs entry can be removed then.
 *
 * Returns @count on success or the enqueue error.
 */
static ssize_t daedalus_test_ping_write(struct file *file,
					const char __user *buf,
					size_t count, loff_t *ppos)
{
	static const char payload[24] = "DAEDALUS-V4L2-PING-PL";
	int err = daedalus_chardev_enqueue_req(DAEDALUS_MSG_PING, 0x1234u,
					       payload, sizeof(payload));

	if (!err)
		return count;
	return err;
}
/* Write-only debugfs file operations for the test_ping trigger. */
static const struct file_operations daedalus_test_ping_fops = {
	.write	= daedalus_test_ping_write,
	.owner	= THIS_MODULE,
};
/*
 * debugfs trigger: bytes written to
 * /sys/kernel/debug/daedalus_v4l2/test_decode are enqueued as a
 * REQ_DECODE carrying them as a VP9 access unit (debugging utility;
 * the real production path is the V4L2 m2m queue). The wire payload
 * is a zeroed struct daedalus_req_decode header, with codec_id and
 * bitstream_len filled in, followed by the written bytes.
 *
 * Phase 8.6: the cookie comes from the shared module-wide allocator
 * (daedalus_next_cookie) so debugfs and V4L2 cookies never collide
 * and RESP_FRAME logs stay deterministic.
 *
 * Returns @count on success, -EINVAL for an empty write, -EMSGSIZE if
 * header + data exceed DAEDALUS_PROTO_MAX_PAYLOAD, -ENOMEM, -EFAULT,
 * or the enqueue error.
 */
static ssize_t daedalus_test_decode_write(struct file *file,
					  const char __user *buf,
					  size_t count, loff_t *ppos)
{
	struct daedalus_req_decode *req_hdr;
	size_t wire_len;
	u8 *wire;
	u32 cookie;
	int err;

	if (!count)
		return -EINVAL;
	if (count + sizeof(*req_hdr) > DAEDALUS_PROTO_MAX_PAYLOAD)
		return -EMSGSIZE;

	wire_len = sizeof(*req_hdr) + count;
	wire = kmalloc(wire_len, GFP_KERNEL);
	if (!wire)
		return -ENOMEM;

	/*
	 * Build the request header in place at the front of the blob.
	 * No CAPTURE plane info is filled in for the debugfs path —
	 * there's no V4L2 client backing this REQ_DECODE. The daemon
	 * will see capture_num_planes == 0 and run the decode without
	 * writing pixels anywhere.
	 */
	req_hdr = (struct daedalus_req_decode *) wire;
	memset(req_hdr, 0, sizeof(*req_hdr));
	req_hdr->codec_id = DAEDALUS_CODEC_VP9;
	req_hdr->bitstream_len = (u32) count;

	if (copy_from_user(wire + sizeof(*req_hdr), buf, count)) {
		kfree(wire);
		return -EFAULT;
	}

	cookie = daedalus_next_cookie();
	err = daedalus_chardev_enqueue_req(DAEDALUS_MSG_REQ_DECODE, cookie,
					   wire, wire_len);
	/* enqueue_req made its own copy; the staging blob is done. */
	kfree(wire);
	if (err)
		return err;

	pr_info("daedalus_v4l2: REQ_DECODE (debugfs) cookie=%u codec=VP9 bitstream=%zu\n",
		cookie, count);
	return count;
}
/* Write-only debugfs file operations for the test_decode trigger. */
static const struct file_operations daedalus_test_decode_fops = {
	.write	= daedalus_test_decode_write,
	.owner	= THIS_MODULE,
};
/* -- registration ---------------------------------------------------- */
/**
 * daedalus_chardev_init() - allocate and register the chardev singleton
 *
 * Allocates the device state, registers the misc device with a dynamic
 * minor and mode 0660, creates the debugfs test triggers (a debugfs
 * directory failure is tolerated), and finally publishes the instance
 * through g_chardev.
 *
 * Return: 0 on success, -ENOMEM if allocation fails, or the error
 * returned by misc_register().
 */
int daedalus_chardev_init(void)
{
	struct daedalus_chardev *cdev;
	int err;

	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
	if (!cdev)
		return -ENOMEM;

	mutex_init(&cdev->open_lock);
	mutex_init(&cdev->req_lock);
	INIT_LIST_HEAD(&cdev->req_queue);
	init_waitqueue_head(&cdev->req_wait);

	cdev->misc.minor = MISC_DYNAMIC_MINOR;
	cdev->misc.name = DAEDALUS_CHARDEV_NAME;
	cdev->misc.fops = &daedalus_chardev_fops;
	cdev->misc.mode = 0660; /* root:video, like /dev/videoNN */

	err = misc_register(&cdev->misc);
	if (err)
		goto err_free;

	cdev->debugfs_dir = debugfs_create_dir("daedalus_v4l2", NULL);
	if (IS_ERR(cdev->debugfs_dir))
		goto publish;	/* debugfs is optional; carry on without it */

	debugfs_create_file("test_ping", 0200, cdev->debugfs_dir,
			    NULL, &daedalus_test_ping_fops);
	debugfs_create_file("test_decode", 0200, cdev->debugfs_dir,
			    NULL, &daedalus_test_decode_fops);

publish:
	g_chardev = cdev;
	pr_info("daedalus_v4l2: /dev/%s registered\n", DAEDALUS_CHARDEV_NAME);
	return 0;

err_free:
	kfree(cdev);
	return err;
}
void daedalus_chardev_exit(void)
{
struct daedalus_chardev *dev = g_chardev;
struct daedalus_chardev_msg *msg;
if (!dev)
return;
debugfs_remove_recursive(dev->debugfs_dir);
misc_deregister(&dev->misc);
while ((msg = list_first_entry_or_null(&dev->req_queue,
struct daedalus_chardev_msg,
list)) != NULL) {
list_del(&msg->list);
daedalus_chardev_msg_free(msg);
}
mutex_destroy(&dev->req_lock);
mutex_destroy(&dev->open_lock);
kfree(dev);
g_chardev = NULL;
}