From f10a26d88366f34c8e9183a6bdb1212707fbf2c5 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Thu, 21 May 2026 13:49:44 +0200 Subject: [PATCH] kernel: claim src/dst at device_run, not at buf_done MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hard reboot observed on higgs (Pi CM5) during the first mpv vaapi-copy playback against the freshly-deployed r28+g79256dc stack — kernel panic, no persistent journal, no recoverable trace. Bug introduced by the daedalus-v4l2#6 reorder fix (#7). Cause ----- The new completion path runs `v4l2_m2m_job_finish` on SRC_CONSUMED even when the dst_buf is still parked (waiting for a future HAS_PIXELS). job_finish moves the m2m_ctx back to IDLE, the scheduler dispatches the next device_run — which calls `v4l2_m2m_next_dst_buf`, which returns the head of the CAPTURE ready-queue, which is STILL the parked dst_buf because we never removed it. Two inflight entries now reference the same vb2_buffer; the later HAS_PIXELS triggers `v4l2_m2m_dst_buf_remove_by_buf` on a vb2_buffer whose list_head is no longer linked to that queue, and `list_del()` smashes the next/prev pointers of whatever ELSE was at those addresses. Fix --- Take both src and dst off `m2m_ctx`'s rdy_queue at device_run — as soon as `v4l2_m2m_next_*_buf` has peeked them and all early-exit validation has passed. After that, the daemon owns both halves exclusively via the inflight item; the m2m scheduler can't re-issue them on the next device_run. Completion path drops the redundant `_remove_by_buf` calls — list is already detached, so `buf_done` alone is correct. Matches the amphion `vdec.c`/`venc.c` pattern (which also claims at device_run for the same reason: amphion's encode pipeline parks output buffers across multiple frames waiting for the codec to finish, structurally the same as our H.264 B-frame DPB parking). `fail_buf_error` learns about the new `claimed` flag and skips the `v4l2_m2m_*_buf_remove` calls when the buffers have already been removed by-buf at device_run. Verified -------- Builds clean against 6.18.29+rpt-rpi-2712. Field test pending — deploy via marfrit-packages bump in lock-step with the daemon (which doesn't need to change for this fix; PROTO_VERSION stays at 1). --- kernel/daedalus_v4l2_main.c | 39 +++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/kernel/daedalus_v4l2_main.c b/kernel/daedalus_v4l2_main.c index 899600a..dce98e3 100644 --- a/kernel/daedalus_v4l2_main.c +++ b/kernel/daedalus_v4l2_main.c @@ -731,6 +731,7 @@ static void daedalus_device_run(void *priv) size_t blen, payload_len; u32 cookie; int ret; + bool claimed = false; /* src/dst removed from m2m rdy_queue */ src_buf = v4l2_m2m_next_src_buf(ctx->m2m_ctx); dst_buf = v4l2_m2m_next_dst_buf(ctx->m2m_ctx); @@ -856,6 +857,28 @@ static void daedalus_device_run(void *priv) inf = kzalloc(sizeof(*inf), GFP_KERNEL); if (!inf) goto fail_buf_error; + + /* + * Take both buffers off the m2m ready-queue HERE — before the + * inflight list grows. Once src_consumed releases the src side + * and the m2m scheduler can dispatch the next device_run, the + * NEW device_run mustn't see this dst_buf (which we're still + * holding for a future HAS_PIXELS). Without this claim, + * v4l2_m2m_next_dst_buf at the next device_run returns the same + * parked dst_buf, two inflight entries reference it, and the + * later HAS_PIXELS triggers a list_del on an already-removed + * vb2_buffer → kernel panic (observed on Pi CM5 hard reboot + * during mpv vaapi-copy playback of 720p H.264, 2026-05-21). + * + * Both helpers are inline list_del+counter-decrement under the + * q_ctx rdy_spinlock — safe to call from device_run on the + * buffer we just peeked via next_*_buf above. Mirrors the + * amphion vdec/venc pattern. + */ + v4l2_m2m_src_buf_remove_by_buf(ctx->m2m_ctx, src_buf); + v4l2_m2m_dst_buf_remove_by_buf(ctx->m2m_ctx, dst_buf); + claimed = true; + cookie = daedalus_next_cookie(); inf->cookie = cookie; inf->ctx = ctx; @@ -909,11 +932,13 @@ static void daedalus_device_run(void *priv) fail_buf_error: if (src_buf) { - v4l2_m2m_src_buf_remove(ctx->m2m_ctx); + if (!claimed) + v4l2_m2m_src_buf_remove(ctx->m2m_ctx); v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_ERROR); } if (dst_buf) { - v4l2_m2m_dst_buf_remove(ctx->m2m_ctx); + if (!claimed) + v4l2_m2m_dst_buf_remove(ctx->m2m_ctx); v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_ERROR); } kfree(req); @@ -1074,7 +1099,13 @@ void daedalus_complete_resp_frame(u32 cookie, daedalus_pack_pixels_into_dst(dst_to_complete, fr, pixels, pixels_len); dst_to_complete->vb2_buf.timestamp = dst_timestamp; - v4l2_m2m_dst_buf_remove_by_buf(ctx->m2m_ctx, dst_to_complete); + /* + * The buffer was already removed from m2m's rdy_queue at + * device_run time (see the "Take both buffers off ..." + * block). Just call buf_done here — calling + * v4l2_m2m_dst_buf_remove_by_buf again would list_del a + * list_head that's no longer linked, smashing the list. + */ v4l2_m2m_buf_done(dst_to_complete, state); } @@ -1091,7 +1122,7 @@ void daedalus_complete_resp_frame(u32 cookie, if (src_to_complete) { if (req_to_complete) v4l2_ctrl_request_complete(req_to_complete, &ctx->hdl); - v4l2_m2m_src_buf_remove_by_buf(ctx->m2m_ctx, src_to_complete); + /* Already off the rdy_queue (see device_run claim) — buf_done only. */ v4l2_m2m_buf_done(src_to_complete, state); if (req_to_complete) media_request_put(req_to_complete); -- 2.47.3