kernel: claim src/dst at device_run, not at buf_done
Hard reboot observed on higgs (Pi CM5) during the first mpv vaapi-copy playback against the freshly-deployed r28+g79256dc stack — kernel panic, no persistent journal, no recoverable trace. Bug introduced by the daedalus-v4l2#6 reorder fix (#7). Cause ----- The new completion path runs `v4l2_m2m_job_finish` on SRC_CONSUMED even when the dst_buf is still parked (waiting for a future HAS_PIXELS). job_finish moves the m2m_ctx back to IDLE, the scheduler dispatches the next device_run — which calls `v4l2_m2m_next_dst_buf`, which returns the head of the CAPTURE ready-queue, which is STILL the parked dst_buf because we never removed it. Two inflight entries now reference the same vb2_buffer; the later HAS_PIXELS triggers `v4l2_m2m_dst_buf_remove_by_buf` on a vb2_buffer whose list_head is no longer linked to that queue, and `list_del()` smashes the next/prev pointers of whatever ELSE was at those addresses. Fix --- Take both src and dst off `m2m_ctx`'s rdy_queue at device_run — as soon as `v4l2_m2m_next_*_buf` has peeked them and all early-exit validation has passed. After that, the daemon owns both halves exclusively via the inflight item; the m2m scheduler can't re-issue them on the next device_run. Completion path drops the redundant `_remove_by_buf` calls — list is already detached, so `buf_done` alone is correct. Matches the amphion `vdec.c`/`venc.c` pattern (which also claims at device_run for the same reason: amphion's encode pipeline parks output buffers across multiple frames waiting for the codec to finish, structurally the same as our H.264 B-frame DPB parking). `fail_buf_error` learns about the new `claimed` flag and skips the `v4l2_m2m_*_buf_remove` calls when the buffers have already been removed by-buf at device_run. Verified -------- Builds clean against 6.18.29+rpt-rpi-2712. Field test pending — deploy via marfrit-packages bump in lock-step with the daemon (which doesn't need to change for this fix; PROTO_VERSION stays at 1).
This commit is contained in:
@@ -731,6 +731,7 @@ static void daedalus_device_run(void *priv)
|
||||
size_t blen, payload_len;
|
||||
u32 cookie;
|
||||
int ret;
|
||||
bool claimed = false; /* src/dst removed from m2m rdy_queue */
|
||||
|
||||
src_buf = v4l2_m2m_next_src_buf(ctx->m2m_ctx);
|
||||
dst_buf = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);
|
||||
@@ -856,6 +857,28 @@ static void daedalus_device_run(void *priv)
|
||||
inf = kzalloc(sizeof(*inf), GFP_KERNEL);
|
||||
if (!inf)
|
||||
goto fail_buf_error;
|
||||
|
||||
/*
|
||||
* Take both buffers off the m2m ready-queue HERE — before the
|
||||
* inflight list grows. Once src_consumed releases the src side
|
||||
* and the m2m scheduler can dispatch the next device_run, the
|
||||
* NEW device_run mustn't see this dst_buf (which we're still
|
||||
* holding for a future HAS_PIXELS). Without this claim,
|
||||
* v4l2_m2m_next_dst_buf at the next device_run returns the same
|
||||
* parked dst_buf, two inflight entries reference it, and the
|
||||
* later HAS_PIXELS triggers a list_del on an already-removed
|
||||
* vb2_buffer → kernel panic (observed on Pi CM5 hard reboot
|
||||
* during mpv vaapi-copy playback of 720p H.264, 2026-05-21).
|
||||
*
|
||||
* Both helpers are inline list_del+counter-decrement under the
|
||||
* q_ctx rdy_spinlock — safe to call from device_run on the
|
||||
* buffer we just peeked via next_*_buf above. Mirrors the
|
||||
* amphion vdec/venc pattern.
|
||||
*/
|
||||
v4l2_m2m_src_buf_remove_by_buf(ctx->m2m_ctx, src_buf);
|
||||
v4l2_m2m_dst_buf_remove_by_buf(ctx->m2m_ctx, dst_buf);
|
||||
claimed = true;
|
||||
|
||||
cookie = daedalus_next_cookie();
|
||||
inf->cookie = cookie;
|
||||
inf->ctx = ctx;
|
||||
@@ -909,11 +932,13 @@ static void daedalus_device_run(void *priv)
|
||||
|
||||
fail_buf_error:
|
||||
if (src_buf) {
|
||||
v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
|
||||
if (!claimed)
|
||||
v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
|
||||
v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_ERROR);
|
||||
}
|
||||
if (dst_buf) {
|
||||
v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
|
||||
if (!claimed)
|
||||
v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
|
||||
v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_ERROR);
|
||||
}
|
||||
kfree(req);
|
||||
@@ -1074,7 +1099,13 @@ void daedalus_complete_resp_frame(u32 cookie,
|
||||
daedalus_pack_pixels_into_dst(dst_to_complete, fr,
|
||||
pixels, pixels_len);
|
||||
dst_to_complete->vb2_buf.timestamp = dst_timestamp;
|
||||
v4l2_m2m_dst_buf_remove_by_buf(ctx->m2m_ctx, dst_to_complete);
|
||||
/*
|
||||
* The buffer was already removed from m2m's rdy_queue at
|
||||
* device_run time (see the "Take both buffers off ..."
|
||||
* block). Just call buf_done here — calling
|
||||
* v4l2_m2m_dst_buf_remove_by_buf again would list_del a
|
||||
* list_head that's no longer linked, smashing the list.
|
||||
*/
|
||||
v4l2_m2m_buf_done(dst_to_complete, state);
|
||||
}
|
||||
|
||||
@@ -1091,7 +1122,7 @@ void daedalus_complete_resp_frame(u32 cookie,
|
||||
if (src_to_complete) {
|
||||
if (req_to_complete)
|
||||
v4l2_ctrl_request_complete(req_to_complete, &ctx->hdl);
|
||||
v4l2_m2m_src_buf_remove_by_buf(ctx->m2m_ctx, src_to_complete);
|
||||
/* Already off the rdy_queue (see device_run claim) — buf_done only. */
|
||||
v4l2_m2m_buf_done(src_to_complete, state);
|
||||
if (req_to_complete)
|
||||
media_request_put(req_to_complete);
|
||||
|
||||
Reference in New Issue
Block a user