kernel: claim src/dst at device_run, not at buf_done

Hard reboot observed on higgs (Pi CM5) during the first mpv vaapi-copy
playback against the freshly-deployed r28+g79256dc stack — kernel
panic, no persistent journal, no recoverable trace.  Bug introduced
by the daedalus-v4l2#6 reorder fix (#7).

Cause
-----
The new completion path runs `v4l2_m2m_job_finish` on SRC_CONSUMED
even when the dst_buf is still parked (waiting for a future
HAS_PIXELS).  job_finish moves the m2m_ctx back to IDLE, the
scheduler dispatches the next device_run — which calls
`v4l2_m2m_next_dst_buf`, which returns the head of the CAPTURE
ready-queue, which is STILL the parked dst_buf because we never
removed it.  Two inflight entries now reference the same vb2_buffer;
the later HAS_PIXELS triggers `v4l2_m2m_dst_buf_remove_by_buf` on a
vb2_buffer whose list_head is no longer linked to that queue, and
`list_del()` smashes the next/prev pointers of whatever ELSE was at
those addresses.

Fix
---
Take both src and dst off `m2m_ctx`'s rdy_queue at device_run — as
soon as `v4l2_m2m_next_*_buf` has peeked them and all early-exit
validation has passed.  After that, the daemon owns both halves
exclusively via the inflight item; the m2m scheduler can't re-issue
them on the next device_run.  Completion path drops the redundant
`_remove_by_buf` calls — list is already detached, so `buf_done`
alone is correct.

Matches the amphion `vdec.c`/`venc.c` pattern (which also claims at
device_run for the same reason: amphion's encode pipeline parks
output buffers across multiple frames waiting for the codec to
finish, structurally the same as our H.264 B-frame DPB parking).

`fail_buf_error` learns about the new `claimed` flag and skips the
`v4l2_m2m_*_buf_remove` calls when the buffers have already been
removed by-buf at device_run.

Verified
--------
Builds clean against 6.18.29+rpt-rpi-2712.  Field test pending —
deploy via marfrit-packages bump in lock-step with the daemon
(which doesn't need to change for this fix; PROTO_VERSION stays at 1).
This commit is contained in:
2026-05-21 13:49:44 +02:00
parent 79256dc7ef
commit f10a26d883
+35 -4
View File
@@ -731,6 +731,7 @@ static void daedalus_device_run(void *priv)
size_t blen, payload_len;
u32 cookie;
int ret;
bool claimed = false; /* src/dst removed from m2m rdy_queue */
src_buf = v4l2_m2m_next_src_buf(ctx->m2m_ctx);
dst_buf = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);
@@ -856,6 +857,28 @@ static void daedalus_device_run(void *priv)
inf = kzalloc(sizeof(*inf), GFP_KERNEL);
if (!inf)
goto fail_buf_error;
/*
* Take both buffers off the m2m ready-queue HERE — before the
* inflight list grows. Once src_consumed releases the src side
* and the m2m scheduler can dispatch the next device_run, the
* NEW device_run mustn't see this dst_buf (which we're still
* holding for a future HAS_PIXELS). Without this claim,
* v4l2_m2m_next_dst_buf at the next device_run returns the same
* parked dst_buf, two inflight entries reference it, and the
* later HAS_PIXELS triggers a list_del on an already-removed
* vb2_buffer → kernel panic (observed on Pi CM5 hard reboot
* during mpv vaapi-copy playback of 720p H.264, 2026-05-21).
*
* Both helpers are inline list_del+counter-decrement under the
* q_ctx rdy_spinlock — safe to call from device_run on the
* buffer we just peeked via next_*_buf above. Mirrors the
* amphion vdec/venc pattern.
*/
v4l2_m2m_src_buf_remove_by_buf(ctx->m2m_ctx, src_buf);
v4l2_m2m_dst_buf_remove_by_buf(ctx->m2m_ctx, dst_buf);
claimed = true;
cookie = daedalus_next_cookie();
inf->cookie = cookie;
inf->ctx = ctx;
@@ -909,11 +932,13 @@ static void daedalus_device_run(void *priv)
fail_buf_error:
if (src_buf) {
v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
if (!claimed)
v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_ERROR);
}
if (dst_buf) {
v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
if (!claimed)
v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_ERROR);
}
kfree(req);
@@ -1074,7 +1099,13 @@ void daedalus_complete_resp_frame(u32 cookie,
daedalus_pack_pixels_into_dst(dst_to_complete, fr,
pixels, pixels_len);
dst_to_complete->vb2_buf.timestamp = dst_timestamp;
v4l2_m2m_dst_buf_remove_by_buf(ctx->m2m_ctx, dst_to_complete);
/*
* The buffer was already removed from m2m's rdy_queue at
* device_run time (see the "Take both buffers off ..."
* block). Just call buf_done here — calling
* v4l2_m2m_dst_buf_remove_by_buf again would list_del a
* list_head that's no longer linked, smashing the list.
*/
v4l2_m2m_buf_done(dst_to_complete, state);
}
@@ -1091,7 +1122,7 @@ void daedalus_complete_resp_frame(u32 cookie,
if (src_to_complete) {
if (req_to_complete)
v4l2_ctrl_request_complete(req_to_complete, &ctx->hdl);
v4l2_m2m_src_buf_remove_by_buf(ctx->m2m_ctx, src_to_complete);
/* Already off the rdy_queue (see device_run claim) — buf_done only. */
v4l2_m2m_buf_done(src_to_complete, state);
if (req_to_complete)
media_request_put(req_to_complete);