Merge pull request 'daemon: shadow_decoder wiring (PR-Q3a.1)' (#25 ) from noether/daemon-shadow-decoder-wiring into main

Reviewed-on: #25
daemon: shadow_decoder wiring (PR-Q3a.1)
2026-05-26 12:28:16 +00:00 · 2026-05-26 14:15:13 +02:00 · 2026-05-23 17:16:27 +00:00 · 2026-05-23 18:31:41 +02:00 · 2026-05-23 15:12:16 +00:00 · 2026-05-23 15:11:57 +00:00
17 changed files with 2517 additions and 750 deletions
@@ -28,16 +28,37 @@ find_package(PkgConfig REQUIRED)
 pkg_check_modules(FFMPEG REQUIRED IMPORTED_TARGET
 	libavformat libavcodec libavutil)

+# daedalus-fourier — VC VII (V3D) + ARM NEON back-end kernel library.
+# Linked statically.  Today only the no-QPU smoke-test path is wired
+# (a ctx_create_no_qpu at daemon startup, log-and-destroy at exit);
+# follow-up patches (per daedalus-v4l2#11) substitute the
+# `daedalus_recipe_dispatch_h264_*` family for libavcodec's per-MB
+# pixel primitives, one cycle at a time.
+#
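+# We bypass IMPORTED_TARGET and consume pkg-config's result variables
+# (<PREFIX>_LIBRARIES / <PREFIX>_LIBRARY_DIRS) directly so we control
+# the link order: libdaedalus_core.a must precede -lvulkan because the
+# static archive references vulkan symbols and the linker resolves
+# left-to-right.
+# NOTE(review): these are not the <PREFIX>_STATIC_* variables, so any
+# Libs.private entries in the .pc files are not picked up — confirm
+# that is intended for a static link.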
+pkg_check_modules(DAEDALUS_FOURIER REQUIRED daedalus-fourier)
+# daedalus-decoder — frame-major UMA H.264 decoder.  Linked into the
+# shadow-mode path (env DAEDALUS_SHADOW_MODE=1) and inert otherwise.
+# Linked unconditionally to keep CMake configurations symmetrical
+# between production and shadow-mode runs.
+pkg_check_modules(DAEDALUS_DECODER REQUIRED daedalus-decoder)
+# Provides the Vulkan::Vulkan imported target that is linked after the
+# two daedalus libraries.
+find_package(Vulkan REQUIRED)
+
+# Daemon executable.  Every translation unit is listed explicitly; the
+# shadow decoder and the AV1 OBU synthesiser are compiled in
+# unconditionally.
 add_executable(daedalus_v4l2_daemon
 	src/main.c
 	src/ffmpeg_loader.c
 	src/log.c
 	src/parser.c
 	src/decoder.c
+	src/shadow_decoder.c
 	src/chardev_client.c
 	src/dmabuf_capture.c
 	src/bitstream_writer.c
 	src/h264_nal_synth.c
+	src/av1_obu_synth.c
 )

 target_include_directories(daedalus_v4l2_daemon
@@ -45,14 +66,51 @@ target_include_directories(daedalus_v4l2_daemon
 		src
 		${CMAKE_CURRENT_SOURCE_DIR}/../include
 		${FFMPEG_INCLUDE_DIRS}
+		${DAEDALUS_FOURIER_INCLUDE_DIRS}
+		${DAEDALUS_DECODER_INCLUDE_DIRS}
 )

 # dl for dlopen, pthread for future threading work.
+# Library search directories reported by pkg-config for the two
+# daedalus packages; the entries of the *_LIBRARIES variables used
+# below are looked up in these directories.
+target_link_directories(daedalus_v4l2_daemon
+	PRIVATE
+		${DAEDALUS_FOURIER_LIBRARY_DIRS}
+		${DAEDALUS_DECODER_LIBRARY_DIRS}
+)
+
 target_link_libraries(daedalus_v4l2_daemon
 	PRIVATE
 		dl
 		pthread
+		# Order matters for left-to-right linker resolution of
+		# static archives.  daedalus-decoder references symbols
+		# from daedalus-fourier; daedalus-fourier references
+		# vulkan symbols.  So: decoder, fourier, vulkan.
+		${DAEDALUS_DECODER_LIBRARIES}
+		${DAEDALUS_FOURIER_LIBRARIES}
+		Vulkan::Vulkan
 )

 install(TARGETS daedalus_v4l2_daemon
 	RUNTIME DESTINATION /usr/local/bin)
+
+# --- Unit tests (opt-in) -------------------------------------------------
+#
+# DAEDALUS_BUILD_TESTS=ON enables standalone test executables that run on
+# the build host without a V4L2 device, FFmpeg, or Vulkan (the sources
+# still include the Linux V4L2 uAPI header <linux/v4l2-controls.h> at
+# compile time).  Used by CI to gate bitstream synthesis modules against
+# regressions.
+
+# Opt-in switch for the host-side unit tests; OFF by default so a plain
+# daemon build is unaffected.
+option(DAEDALUS_BUILD_TESTS "Build daemon unit tests" OFF)
+
+if(DAEDALUS_BUILD_TESTS)
+	# Turn on CTest registration for this directory.
+	enable_testing()
+
+	# The test binary does not link FFmpeg / Vulkan / dl — it exercises
+	# the pure-C encoders against in-memory inputs, so only the
+	# synthesiser and the bit writer are compiled alongside the test.
+	add_executable(test_av1_obu_synth
+		src/test_av1_obu_synth.c
+		src/av1_obu_synth.c
+		src/bitstream_writer.c
+	)
+	target_include_directories(test_av1_obu_synth
+		PRIVATE
+			src
+	)
+
+	add_test(NAME av1_obu_synth COMMAND test_av1_obu_synth)
+endif()
@@ -0,0 +1,897 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/*
+ * av1_obu_synth.c — encode AV1 OBU bytes from V4L2 stateless controls.
+ *
+ * Spec references throughout are to the AOM AV1 Bitstream and Decoding
+ * Process Specification rev 1.0.0 with errata.  See §5.3.2 OBU header
+ * syntax, §5.5.1 sequence_header_obu syntax, §5.9.2 trailing_bits().
+ *
+ * Synthesis defaults — fields the V4L2 control surface doesn't carry
+ * are set to values that match the "common-case profile-0 4:2:0 8-bit"
+ * path the V4L2 stateless AV1 contract is overwhelmingly used for.
+ * Specifically:
+ *
+ *   - reduced_still_picture_header = 0 (full sequence-header form)
+ *   - timing_info_present_flag = 0
+ *   - decoder_model_info_present_flag = 0
+ *   - initial_display_delay_present_flag = 0
+ *   - operating_points_cnt_minus_1 = 0 (single operating point)
+ *   - operating_point_idc[0] = 0 (all temporal/spatial layers)
+ *   - seq_level_idx[0] = 13 (level 5.1 — supports up to 4K, well past
+ *     anything libva-v4l2-request is likely to drive; libavcodec is
+ *     lenient on level mismatches that don't constrain the frame size)
+ *   - seq_tier[0] = 0
+ *   - frame_id_numbers_present_flag = 0
+ *   - seq_choose_screen_content_tools = 1 (SELECT) so
+ *     seq_force_screen_content_tools = 2 (SELECT)
+ *   - seq_choose_integer_mv = 1 (SELECT) so seq_force_integer_mv = 2
+ *   - color_description_present_flag = 0 (V4L2 ctrl doesn't carry CICP)
+ *   - chroma_sample_position = 0 (CSP_UNKNOWN)
+ *
+ * If a V4L2 sequence control arrives with bit_depth / seq_profile /
+ * subsampling combinations the AV1 spec doesn't allow (e.g. profile 1
+ * with bit_depth 12), we return 0 to surface the mismatch loudly rather
+ * than silently encoding nonsense the libavcodec parser would reject.
+ */
+
+#include "av1_obu_synth.h"
+#include "bitstream_writer.h"
+
+#include <string.h>
+
+/* obu_type codes as written into bits 6..3 of the OBU header byte. */
+#define OBU_SEQUENCE_HEADER	 1
+#define OBU_TEMPORAL_DELIMITER	 2
+#define OBU_FRAME_HEADER	 3
+#define OBU_TILE_GROUP		 4
+#define OBU_FRAME		 6
+
+/* AV1 §3 ref-frame symbolic constants — values per the spec table.
+ * INTRA_FRAME is index 0 (used for intra-only); LAST_FRAME..ALTREF_FRAME
+ * are 1..7.  TOTAL_REFS_PER_FRAME = 8 (V4L2 mirrors this). */
+#define AV1_INTRA_FRAME		0
+#define AV1_LAST_FRAME		1
+#define AV1_NUM_REF_FRAMES	8	/* the DPB size */
+#define AV1_REFS_PER_FRAME	7	/* refs available to an inter frame */
+#define AV1_PRIMARY_REF_NONE	7
+
+/* Default operating-point level: 5.1 — supports any frame size up to
+ * 4K@60fps.  Well past anything the V4L2 path is realistically driven
+ * with on Pi 5; libavcodec doesn't enforce level against actual frame
+ * dims at decode time, it just uses the field to size some bitstream
+ * limits (max tile cols, etc.) that aren't load-bearing for stream
+ * conformance. */
+#define DEFAULT_SEQ_LEVEL_IDX	13
+
+/*
+ * leb128 (§4.10.5) — little-endian base-128 encoding of @v.  Every
+ * output byte carries seven value bits, least-significant group first;
+ * bit 7 is set on each byte except the last one.  At most @cap bytes
+ * are stored starting at @out[0].  Returns the number of bytes stored,
+ * or 0 when @cap is too small (bytes stored before the shortfall was
+ * detected are left in @out).  A uint32_t never needs more than 5
+ * bytes, which is inside AV1's 8-byte leb128 limit.
+ */
+static size_t leb128_put(uint32_t v, uint8_t *out, size_t cap)
+{
+	size_t n;
+
+	for (n = 0; n < cap; n++) {
+		uint8_t group = (uint8_t) (v & 0x7fu);
+
+		v >>= 7;
+		if (v == 0) {
+			/* final group: continuation bit stays clear */
+			out[n] = group;
+			return n + 1;
+		}
+		out[n] = (uint8_t) (group | 0x80u);
+	}
+
+	return 0;	/* ran out of room before the final group */
+}
+
+/*
+ * Number of bits needed to represent @x, with a floor of 1 (so both
+ * x == 0 and x == 1 yield 1).  Used to derive the width of the
+ * frame_width_bits_minus_1 / frame_height_bits_minus_1 coded fields
+ * from max_frame_width_minus_1 / max_frame_height_minus_1.  The spec
+ * stores (n_bits - 1) in f(4), so only results in [1, 16] are
+ * encodable; this function does not clamp — callers must reject
+ * anything larger.
+ */
+static int min_bits_for(uint32_t x)
+{
+	int bits = 1;
+
+	for (x >>= 1; x != 0; x >>= 1)
+		bits++;
+
+	return bits;
+}
+
+/*
+ * Derive subsampling_x / subsampling_y the way color_config() (§5.5.2)
+ * does.  Only profile 2 at 12-bit takes the values from the V4L2
+ * SUBSAMPLING_X / SUBSAMPLING_Y flags; every other legal combination
+ * has them forced by seq_profile, and in those branches the V4L2 flags
+ * are deliberately ignored (a producer that sets them inconsistently
+ * with seq_profile is buggy; the profile wins).
+ *
+ * Returns 1 and stores the result through @out_x / @out_y on success.
+ * Returns 0, leaving both outputs untouched, for combinations the spec
+ * does not allow (e.g. profile 1 with bit_depth 12, or an unknown
+ * profile).
+ */
+static int resolve_subsampling(uint8_t seq_profile, uint8_t bit_depth,
+			       uint32_t flags, bool monochrome,
+			       int *out_x, int *out_y)
+{
+	int sx, sy;
+
+	if (monochrome) {
+		sx = 1;
+		sy = 1;
+	} else if (seq_profile == 0 || seq_profile == 1) {
+		/* profile 0: 4:2:0, profile 1: 4:4:4 — both 8/10-bit only */
+		if (bit_depth != 8 && bit_depth != 10)
+			return 0;
+		sx = (seq_profile == 0) ? 1 : 0;
+		sy = sx;
+	} else if (seq_profile == 2) {
+		if (bit_depth == 12) {
+			/* 12-bit: any of 4:2:0 / 4:2:2 / 4:4:4, as flagged;
+			 * subsampling_y is only meaningful when x is set */
+			sx = (flags & V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X) ? 1 : 0;
+			sy = (sx && (flags & V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y)) ? 1 : 0;
+		} else if (bit_depth == 8 || bit_depth == 10) {
+			sx = 1;		/* forced 4:2:2 */
+			sy = 0;
+		} else {
+			return 0;
+		}
+	} else {
+		return 0;
+	}
+
+	*out_x = sx;
+	*out_y = sy;
+	return 1;
+}
+
+/*
+ * Build a complete Sequence Header OBU from the V4L2 sequence control.
+ * The payload is assembled in a 64-byte scratch buffer and then copied
+ * to @out behind the OBU header byte and the leb128 payload size.
+ *
+ * Returns the total number of bytes stored in @out, or 0 when @seq or
+ * @out is NULL, @out_cap is below 8, the seq_profile / bit_depth /
+ * monochrome combination is rejected, a maximum frame dimension needs
+ * more than 16 bits, or either the scratch buffer or @out is too small.
+ */
+size_t av1_synth_sequence_header_obu(const struct v4l2_ctrl_av1_sequence *seq,
+				     uint8_t *out, size_t out_cap)
+{
+	uint8_t rbsp[64];
+	struct bs_writer bs;
+	uint32_t flags;
+	uint8_t bit_depth;
+	uint8_t seq_profile;
+	bool still_picture, monochrome, enable_order_hint;
+	int high_bitdepth, twelve_bit;
+	int subsampling_x, subsampling_y;
+	int width_bits, height_bits;
+	size_t payload_len;
+	size_t w;
+
+	if (!seq || !out || out_cap < 8)
+		return 0;
+
+	flags		= seq->flags;
+	bit_depth	= seq->bit_depth;
+	seq_profile	= seq->seq_profile;
+	still_picture	= !!(flags & V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE);
+	monochrome	= !!(flags & V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME);
+	enable_order_hint = !!(flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT);
+
+	/* Sanity checks against the spec's allowed combinations. */
+	if (seq_profile > 2)
+		return 0;
+	if (bit_depth != 8 && bit_depth != 10 && bit_depth != 12)
+		return 0;
+	if (seq_profile == 1 && monochrome)
+		return 0;	/* profile 1 must be 4:4:4 colour */
+
+	high_bitdepth = (bit_depth > 8) ? 1 : 0;
+	twelve_bit    = (seq_profile == 2 && bit_depth == 12) ? 1 : 0;
+
+	if (!resolve_subsampling(seq_profile, bit_depth, flags, monochrome,
+				 &subsampling_x, &subsampling_y))
+		return 0;
+
+	width_bits  = min_bits_for((uint32_t) seq->max_frame_width_minus_1);
+	height_bits = min_bits_for((uint32_t) seq->max_frame_height_minus_1);
+	if (width_bits > 16 || height_bits > 16)
+		return 0;	/* spec encodes (n - 1) in f(4): n in [1, 16] */
+
+	bsw_init(&bs, rbsp, sizeof(rbsp));
+
+	/* --- sequence_header_obu --- §5.5.1 --- */
+
+	bsw_put_u(&bs, seq_profile, 3);
+	bsw_put_u(&bs, still_picture ? 1u : 0u, 1);
+	bsw_put_u(&bs, 0u, 1);				/* reduced_still_picture_header */
+
+	/* full-form path (reduced_still_picture_header == 0) */
+	bsw_put_u(&bs, 0u, 1);				/* timing_info_present_flag */
+	bsw_put_u(&bs, 0u, 1);				/* initial_display_delay_present_flag */
+	bsw_put_u(&bs, 0u, 5);				/* operating_points_cnt_minus_1 */
+	bsw_put_u(&bs, 0u, 12);				/* operating_point_idc[0] */
+	bsw_put_u(&bs, DEFAULT_SEQ_LEVEL_IDX, 5);	/* seq_level_idx[0] */
+	/* seq_tier is only coded for levels above index 7 */
+	if (DEFAULT_SEQ_LEVEL_IDX > 7)
+		bsw_put_u(&bs, 0u, 1);			/* seq_tier[0] */
+
+	bsw_put_u(&bs, (uint32_t)(width_bits  - 1), 4);	/* frame_width_bits_minus_1 */
+	bsw_put_u(&bs, (uint32_t)(height_bits - 1), 4);	/* frame_height_bits_minus_1 */
+	bsw_put_u(&bs, (uint32_t) seq->max_frame_width_minus_1,  width_bits);
+	bsw_put_u(&bs, (uint32_t) seq->max_frame_height_minus_1, height_bits);
+
+	bsw_put_u(&bs, 0u, 1);				/* frame_id_numbers_present_flag */
+
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK) ? 1u : 0u, 1);
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA) ? 1u : 0u, 1);
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER) ? 1u : 0u, 1);
+
+	/* non-still-picture block — V4L2 controls fill these in */
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND) ? 1u : 0u, 1);
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND) ? 1u : 0u, 1);
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION) ? 1u : 0u, 1);
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER) ? 1u : 0u, 1);
+	bsw_put_u(&bs, enable_order_hint ? 1u : 0u, 1);
+	if (enable_order_hint) {
+		bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP) ? 1u : 0u, 1);
+		bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS) ? 1u : 0u, 1);
+	}
+	bsw_put_u(&bs, 1u, 1);				/* seq_choose_screen_content_tools */
+	/* seq_force_screen_content_tools = SELECT (2), so no further bits */
+	/* seq_choose_integer_mv path:
+	 *   seq_force_screen_content_tools > 0, so we emit seq_choose_integer_mv = 1
+	 *   (SELECT) — which leaves seq_force_integer_mv = SELECT (2) without
+	 *   further bits. */
+	bsw_put_u(&bs, 1u, 1);				/* seq_choose_integer_mv */
+	if (enable_order_hint) {
+		/* out-of-range control values are clamped into [1, 8] so the
+		 * 3-bit minus-1 field is always well-formed */
+		uint8_t ohb = seq->order_hint_bits;
+		if (ohb < 1)  ohb = 1;
+		if (ohb > 8)  ohb = 8;
+		bsw_put_u(&bs, (uint32_t)(ohb - 1), 3);	/* order_hint_bits_minus_1 */
+	}
+
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES) ? 1u : 0u, 1);
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF) ? 1u : 0u, 1);
+	bsw_put_u(&bs, (flags & V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION) ? 1u : 0u, 1);
+
+	/* --- color_config() --- §5.5.2 --- */
+
+	bsw_put_u(&bs, high_bitdepth ? 1u : 0u, 1);
+	if (seq_profile == 2 && high_bitdepth)
+		bsw_put_u(&bs, twelve_bit ? 1u : 0u, 1);
+	if (seq_profile != 1)
+		bsw_put_u(&bs, monochrome ? 1u : 0u, 1);
+	bsw_put_u(&bs, 0u, 1);				/* color_description_present_flag */
+
+	if (monochrome) {
+		bsw_put_u(&bs,
+			  (flags & V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE) ? 1u : 0u, 1);
+		/* monochrome path: subsampling/chroma_sample_position/separate_uv_delta_q
+		 * are forced by the spec — no further bits emitted. */
+	} else {
+		bsw_put_u(&bs,
+			  (flags & V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE) ? 1u : 0u, 1);
+		/* subsampling encoding depends on seq_profile */
+		if (seq_profile == 2 && bit_depth == 12) {
+			bsw_put_u(&bs, subsampling_x ? 1u : 0u, 1);
+			if (subsampling_x)
+				bsw_put_u(&bs, subsampling_y ? 1u : 0u, 1);
+		}
+		/* profile 0 / profile 1 / profile 2 non-12-bit: subsampling is
+		 * forced by the spec, no bits emitted. */
+		if (subsampling_x && subsampling_y) {
+			/* chroma_sample_position f(2) — V4L2 ctrl doesn't carry
+			 * this; default CSP_UNKNOWN (0). */
+			bsw_put_u(&bs, 0u, 2);
+		}
+		bsw_put_u(&bs,
+			  (flags & V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q) ? 1u : 0u, 1);
+	}
+
+	/* film_grain_params_present + trailing_bits */
+	bsw_put_u(&bs,
+		  (flags & V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT) ? 1u : 0u, 1);
+
+	bsw_align_rbsp(&bs);
+	if (bsw_overflowed(&bs))
+		return 0;
+
+	payload_len = bsw_bytes(&bs);
+
+	/* --- assemble OBU: header byte | leb128(payload_len) | payload --- */
+
+	/* OBU header byte (§5.3.2):
+	 *   obu_forbidden_bit = 0      [bit 7]
+	 *   obu_type          = 1      [bits 6..3]  (OBU_SEQUENCE_HEADER)
+	 *   obu_extension_flag = 0     [bit 2]
+	 *   obu_has_size_field = 1     [bit 1]
+	 *   obu_reserved_1bit  = 0     [bit 0]
+	 * → 0_0001_0_1_0 = 0x0A
+	 */
+	w = 0;
+	if (w >= out_cap)
+		return 0;
+	out[w++] = (uint8_t)
+		((0u << 7) |
+		 ((OBU_SEQUENCE_HEADER & 0xfu) << 3) |
+		 (0u << 2) |
+		 (1u << 1) |
+		 (0u << 0));
+
+	{
+		size_t leb_n = leb128_put((uint32_t) payload_len,
+					   out + w, out_cap - w);
+		if (leb_n == 0)
+			return 0;
+		w += leb_n;
+	}
+
+	/* reject rather than truncate when the payload does not fit */
+	if (out_cap - w < payload_len)
+		return 0;
+	memcpy(out + w, rbsp, payload_len);
+	w += payload_len;
+
+	return w;
+}
+
+/* -------------------------------------------------------------------
+ * Shared OBU framing: one header byte, the leb128-coded payload size,
+ * then @payload_len bytes copied from @payload (which may be NULL when
+ * @payload_len is 0).  Returns the total number of bytes stored in
+ * @out, or 0 when @out_cap is below 2 or too small for the size field
+ * plus payload.
+ *
+ * Used by frame_header_obu and the temporal_delimiter helper; the
+ * sequence header above predates this factor-out and keeps its inline
+ * assembly so its memory footprint stays predictable.
+ * ----------------------------------------------------------------- */
+static size_t wrap_obu(uint8_t obu_type, const uint8_t *payload,
+		       size_t payload_len, uint8_t *out, size_t out_cap)
+{
+	/* obu_forbidden_bit=0 | obu_type<<3 | obu_extension_flag=0 |
+	 * obu_has_size_field=1 | obu_reserved_1bit=0 */
+	const uint8_t header = (uint8_t) (((obu_type & 0xfu) << 3) | (1u << 1));
+	size_t size_len;
+	size_t pos;
+
+	if (out_cap < 2)
+		return 0;
+	out[0] = header;
+
+	size_len = leb128_put((uint32_t) payload_len, out + 1, out_cap - 1);
+	if (size_len == 0)
+		return 0;
+	pos = 1 + size_len;
+
+	if (payload_len > out_cap - pos)
+		return 0;
+	if (payload_len != 0)
+		memcpy(out + pos, payload, payload_len);
+
+	return pos + payload_len;
+}
+
+/*
+ * Emit a Temporal Delimiter OBU: an OBU header with a zero-length
+ * payload, i.e. the two bytes 0x12 0x00.  Returns the number of bytes
+ * written, or 0 when @out_cap is below 2.  @out is not NULL-checked.
+ */
+size_t av1_synth_temporal_delimiter_obu(uint8_t *out, size_t out_cap)
+{
+	return wrap_obu(OBU_TEMPORAL_DELIMITER, NULL, 0, out, out_cap);
+}
+
+/* -------------------------------------------------------------------
+ * Frame Header OBU — §5.9
+ *
+ * The encoder is sectioned to mirror the spec.  Each subsection
+ * helper writes into the shared bs_writer; helpers that can meet an
+ * unsupported ("out of scope") configuration return 0, and the
+ * top-level function then returns 0 without emitting an OBU.  This
+ * keeps the spec-mirror linear and the failure modes diagnosable.
+ * ----------------------------------------------------------------- */
+
+/* MiCols / MiRows — frame dimension expressed in 4x4 "mi" units after
+ * rounding the dimension up to a multiple of 8 luma samples:
+ * 2 * ((dim + 7) >> 3). */
+static uint32_t mi_cols_for(uint32_t frame_width)
+{
+	return ((frame_width + 7u) >> 3) << 1;
+}
+/* Row counterpart of mi_cols_for(): 2 * ((frame_height + 7) >> 3). */
+static uint32_t mi_rows_for(uint32_t frame_height)
+{
+	const uint32_t blocks_8x8 = (frame_height + 7u) >> 3;
+
+	return blocks_8x8 * 2u;
+}
+
+/* tile_log2(blkSize, target) per AV1 §5.9.15 — smallest k such that
+ * (blkSize << k) >= target.
+ *
+ * The scaling is done in 64 bits so that large targets cannot overflow
+ * a signed int shift, and a non-positive @blk is treated as 1 so the
+ * loop always terminates.  The result is at most 31 for any int
+ * @target. */
+static int tile_log2_ge(int blk, int target)
+{
+	int64_t scaled = (blk > 0) ? blk : 1;
+	int k = 0;
+
+	while (k < 31 && scaled < target) {
+		scaled <<= 1;
+		k++;
+	}
+	return k;
+}
+
+/* read_delta_q() counterpart: a delta_coded bit, followed — only when
+ * the delta is non-zero — by the value as a 7-bit two's-complement
+ * su(1+6) field. */
+static void put_delta_q(struct bs_writer *bs, int8_t delta)
+{
+	if (delta == 0) {
+		bsw_put_u(bs, 0u, 1);
+		return;
+	}
+	bsw_put_u(bs, 1u, 1);
+	bsw_put_u(bs, (uint32_t) delta & 0x7fu, 7);
+}
+
+/*
+ * §5.9.12 quantization_params.
+ *
+ * Emits base_q_idx, the Y DC delta, the chroma deltas (only when the
+ * frame has more than one plane; the V deltas only when the sequence
+ * allows separate UV deltas and the frame sets DIFF_UV_DELTA), and the
+ * quantizer-matrix block.  qm_v is coded only for multi-plane frames
+ * with separate_uv_delta_q.
+ */
+static void write_quantization_params(struct bs_writer *bs,
+				      const struct v4l2_av1_quantization *q,
+				      bool num_planes_gt_1,
+				      bool separate_uv_delta_q)
+{
+	const bool diff_uv =
+		!!(q->flags & V4L2_AV1_QUANTIZATION_FLAG_DIFF_UV_DELTA);
+	const bool qmatrix =
+		!!(q->flags & V4L2_AV1_QUANTIZATION_FLAG_USING_QMATRIX);
+
+	bsw_put_u(bs, q->base_q_idx, 8);
+	put_delta_q(bs, (int8_t) q->delta_q_y_dc);
+
+	if (num_planes_gt_1) {
+		if (separate_uv_delta_q)
+			bsw_put_u(bs, diff_uv ? 1u : 0u, 1);
+		put_delta_q(bs, (int8_t) q->delta_q_u_dc);
+		put_delta_q(bs, (int8_t) q->delta_q_u_ac);
+		if (separate_uv_delta_q && diff_uv) {
+			put_delta_q(bs, (int8_t) q->delta_q_v_dc);
+			put_delta_q(bs, (int8_t) q->delta_q_v_ac);
+		}
+	}
+
+	bsw_put_u(bs, qmatrix ? 1u : 0u, 1);
+	if (!qmatrix)
+		return;
+
+	bsw_put_u(bs, q->qm_y, 4);
+	bsw_put_u(bs, q->qm_u, 4);
+	if (num_planes_gt_1 && separate_uv_delta_q)
+		bsw_put_u(bs, q->qm_v, 4);
+}
+
+/*
+ * §5.9.11 loop_filter_params.
+ *
+ * Nothing is coded when the frame is coded-lossless or uses intrabc
+ * (the levels are inferred).  Otherwise emits the luma levels, the
+ * chroma levels (multi-plane frames with a non-zero luma level only),
+ * the sharpness, and the delta block.
+ *
+ * When the V4L2 control flags a delta update, every ref / mode delta
+ * is emitted with its update bit set, carrying the value from the
+ * control.  The coded fields are absolute su(1+6) values, not
+ * differences, so no previous-frame state is needed and the decoder
+ * ends up with exactly the deltas V4L2 reports.
+ */
+static void write_loop_filter_params(struct bs_writer *bs,
+				     const struct v4l2_av1_loop_filter *lf,
+				     bool num_planes_gt_1,
+				     bool coded_lossless_or_allow_intrabc)
+{
+	bool delta_en, delta_upd;
+	int i;
+
+	if (coded_lossless_or_allow_intrabc)
+		return;
+
+	bsw_put_u(bs, lf->level[0], 6);
+	bsw_put_u(bs, lf->level[1], 6);
+	if (num_planes_gt_1 && (lf->level[0] || lf->level[1])) {
+		bsw_put_u(bs, lf->level[2], 6);
+		bsw_put_u(bs, lf->level[3], 6);
+	}
+	bsw_put_u(bs, lf->sharpness, 3);
+
+	/* loop_filter_delta_enabled */
+	delta_en = !!(lf->flags & V4L2_AV1_LOOP_FILTER_FLAG_DELTA_ENABLED);
+	bsw_put_u(bs, delta_en ? 1u : 0u, 1);
+	if (!delta_en)
+		return;
+
+	/* loop_filter_delta_update */
+	delta_upd = !!(lf->flags & V4L2_AV1_LOOP_FILTER_FLAG_DELTA_UPDATE);
+	bsw_put_u(bs, delta_upd ? 1u : 0u, 1);
+	if (!delta_upd)
+		return;
+
+	/* TOTAL_REFS_PER_FRAME ref deltas: update bit + 7-bit signed value */
+	for (i = 0; i < 8; i++) {
+		bsw_put_u(bs, 1u, 1);	/* update_ref_delta */
+		bsw_put_u(bs, (uint32_t) lf->ref_deltas[i] & 0x7fu, 7);
+	}
+	/* two mode deltas, same layout */
+	for (i = 0; i < 2; i++) {
+		bsw_put_u(bs, 1u, 1);	/* update_mode_delta */
+		bsw_put_u(bs, (uint32_t) lf->mode_deltas[i] & 0x7fu, 7);
+	}
+}
+
+/* Coded 2-bit form of a CDEF secondary strength.  The spec bumps a
+ * coded 3 to 4 after parsing, so a control that carries the
+ * post-parse value 4 must be written back as 3; 0..3 pass through.
+ * NOTE(review): whether producers hand over 3 or 4 is not visible
+ * here — this mapping is correct for both. */
+static uint32_t cdef_coded_sec_strength(uint8_t strength)
+{
+	return (strength > 3) ? 3u : (uint32_t) strength;
+}
+
+/*
+ * §5.9.19 cdef_params.
+ *
+ * Nothing is coded when CDEF is disabled in the sequence or the frame
+ * is coded-lossless / uses intrabc.  cdef_bits is limited to its 2-bit
+ * coded range so that the strength loop (1 << cdef_bits entries) never
+ * runs past the 8-entry strength arrays.
+ */
+static void write_cdef_params(struct bs_writer *bs,
+			      const struct v4l2_av1_cdef *cdef,
+			      bool num_planes_gt_1,
+			      bool enable_cdef,
+			      bool coded_lossless_or_intrabc)
+{
+	unsigned int bits, count, i;
+
+	if (!enable_cdef || coded_lossless_or_intrabc)
+		return;
+
+	bits = cdef->bits & 0x3u;
+	count = 1u << bits;
+
+	bsw_put_u(bs, cdef->damping_minus_3, 2);
+	bsw_put_u(bs, bits, 2);
+	for (i = 0; i < count; i++) {
+		bsw_put_u(bs, cdef->y_pri_strength[i] & 0xfu, 4);
+		bsw_put_u(bs, cdef_coded_sec_strength(cdef->y_sec_strength[i]), 2);
+		if (num_planes_gt_1) {
+			bsw_put_u(bs, cdef->uv_pri_strength[i] & 0xfu, 4);
+			bsw_put_u(bs, cdef_coded_sec_strength(cdef->uv_sec_strength[i]), 2);
+		}
+	}
+}
+
+/*
+ * §5.9.20 lr_params — only the "no restoration on any plane" case is
+ * supported.
+ *
+ * Returns 1 when nothing needs coding (restoration disabled in the
+ * sequence, or the frame is lossless / uses intrabc) or after emitting
+ * a 2-bit RESTORE_NONE for each of @num_planes planes.  Returns 0,
+ * without writing anything, when a plane requests a restoration filter.
+ * Plane 0 is always checked; planes 1 and 2 are checked whenever
+ * @num_planes is greater than 1.
+ */
+static int write_lr_params(struct bs_writer *bs,
+			   const struct v4l2_av1_loop_restoration *lr,
+			   int num_planes,
+			   bool enable_restoration,
+			   bool coded_lossless_or_intrabc)
+{
+	const int planes_to_check = (num_planes > 1) ? 3 : 1;
+	int plane;
+
+	if (!enable_restoration || coded_lossless_or_intrabc)
+		return 1;
+
+	for (plane = 0; plane < planes_to_check; plane++) {
+		if (lr->frame_restoration_type[plane] != V4L2_AV1_FRAME_RESTORE_NONE)
+			return 0;	/* out of scope */
+	}
+
+	for (plane = 0; plane < num_planes; plane++)
+		bsw_put_u(bs, 0u, 2);	/* lr_type = RESTORE_NONE */
+
+	return 1;
+}
+
+/*
+ * §5.9.15 tile_info — single-tile, uniform-spacing path only.
+ *
+ * Emits uniform_tile_spacing_flag = 1 and then terminates each
+ * increment loop immediately, giving TileColsLog2 = TileRowsLog2 = 0.
+ * An increment bit is only present in the bitstream while the running
+ * log2 is below its maximum, so the terminating 0 is written only when
+ * that maximum is non-zero (more than one superblock in that
+ * direction).  With a single tile, context_update_tile_id and
+ * tile_size_bytes_minus_1 are not coded.
+ *
+ * Returns 1 on success.  Returns 0, writing nothing, when the frame is
+ * too large to be a single tile: wider than MAX_TILE_WIDTH (4096 luma
+ * samples) or larger than MAX_TILE_AREA (4096 * 2304), measured in
+ * superblocks — the spec then forces a minimum tile log2 above 0.
+ */
+static int write_tile_info_single_tile(struct bs_writer *bs,
+				       uint32_t frame_width,
+				       uint32_t frame_height,
+				       bool use_128_sb)
+{
+	const int sb_shift = use_128_sb ? 5 : 4;	/* log2(superblock) in mi units */
+	const int sb_size_log2 = sb_shift + 2;		/* log2(superblock) in luma samples */
+	const uint32_t max_tile_width_sb = 4096u >> sb_size_log2;
+	const uint64_t max_tile_area_sb =
+		(uint64_t) (4096u * 2304u) >> (2 * sb_size_log2);
+	const uint32_t sb_round = (1u << sb_shift) - 1u;
+	uint32_t sb_cols = (mi_cols_for(frame_width) + sb_round) >> sb_shift;
+	uint32_t sb_rows = (mi_rows_for(frame_height) + sb_round) >> sb_shift;
+	int max_log2_cols, max_log2_rows;
+
+	/* minLog2TileCols / minLog2TileRows must both be 0 for one tile */
+	if (sb_cols > max_tile_width_sb)
+		return 0;
+	if ((uint64_t) sb_cols * sb_rows > max_tile_area_sb)
+		return 0;
+
+	/* maxLog2Tile{Cols,Rows} = tile_log2(1, Min(sb count, 64)) */
+	max_log2_cols = tile_log2_ge(1, (int) (sb_cols < 64u ? sb_cols : 64u));
+	max_log2_rows = tile_log2_ge(1, (int) (sb_rows < 64u ? sb_rows : 64u));
+
+	bsw_put_u(bs, 1u, 1);		/* uniform_tile_spacing_flag */
+	if (max_log2_cols > 0)
+		bsw_put_u(bs, 0u, 1);	/* increment_tile_cols_log2 = 0 */
+	if (max_log2_rows > 0)
+		bsw_put_u(bs, 0u, 1);	/* increment_tile_rows_log2 = 0 */
+
+	return 1;
+}
+
+/*
+ * frame_size() + superres_params() + render_size() (§5.9.5, §5.9.8,
+ * §5.9.6), which the spec emits back to back on every path this
+ * encoder supports.  coded_denom is only written when the sequence
+ * enables superres and the frame uses it.
+ */
+static void fh_put_frame_and_render_size(struct bs_writer *bs,
+					 const struct v4l2_ctrl_av1_frame *fr,
+					 bool frame_size_override,
+					 bool enable_superres,
+					 bool use_superres,
+					 int width_bits, int height_bits)
+{
+	bool render_differs;
+
+	if (frame_size_override) {
+		bsw_put_u(bs, fr->frame_width_minus_1, width_bits);
+		bsw_put_u(bs, fr->frame_height_minus_1, height_bits);
+	}
+
+	if (enable_superres) {
+		bsw_put_u(bs, use_superres ? 1u : 0u, 1);
+		if (use_superres) {
+			/* coded_denom = superres_denom - SUPERRES_DENOM_MIN(9) */
+			int denom = fr->superres_denom;
+
+			if (denom < 9)
+				denom = 9;
+			bsw_put_u(bs, (uint32_t)(denom - 9) & 0x7u, 3);
+		}
+	}
+
+	/* render_and_frame_size_different */
+	render_differs =
+		(fr->render_width_minus_1 != fr->frame_width_minus_1) ||
+		(fr->render_height_minus_1 != fr->frame_height_minus_1);
+	bsw_put_u(bs, render_differs ? 1u : 0u, 1);
+	if (render_differs) {
+		bsw_put_u(bs, fr->render_width_minus_1, 16);
+		bsw_put_u(bs, fr->render_height_minus_1, 16);
+	}
+}
+
+/*
+ * Build a Frame Header OBU (uncompressed_header(), §5.9.2) from the
+ * V4L2 sequence and frame controls.  Fields are written in exactly the
+ * order the spec parses them; every conditional below mirrors the
+ * corresponding condition in §5.9.2 under the sequence-header choices
+ * made by av1_synth_sequence_header_obu() (no reduced still picture
+ * header, no decoder model, no frame ids, screen-content tools and
+ * integer-MV both SELECT).
+ *
+ * Returns the total number of bytes stored in @out, or 0 on NULL
+ * arguments, @out_cap < 16, buffer overflow, or an out-of-scope
+ * configuration:
+ *   - switch frames, film grain, segmentation;
+ *   - loop restoration on any plane, non-identity global motion;
+ *   - frames that cannot be coded as a single tile;
+ *   - max frame dimensions that need more than 16 bits;
+ *   - error-resilient frames that must carry ref_order_hint[] — the
+ *     per-slot values are not available from the frame control.
+ *
+ * tile_info is always written as a single tile, regardless of the tile
+ * layout the control describes.
+ */
+size_t av1_synth_frame_header_obu(const struct v4l2_ctrl_av1_sequence *seq,
+				  const struct v4l2_ctrl_av1_frame *fr,
+				  uint8_t *out, size_t out_cap)
+{
+	uint8_t rbsp[256];
+	struct bs_writer bs;
+	const struct v4l2_av1_quantization *q;
+	uint32_t sf, ff;
+	uint8_t frame_type, refresh_frame_flags;
+	bool monochrome, enable_order_hint, enable_superres, use_superres;
+	bool frame_is_intra, show_frame, key_shown, error_resilient_mode;
+	bool disable_cdf_update, allow_screen_content_tools;
+	bool force_integer_mv, allow_intrabc, frame_size_override;
+	bool separate_uv_delta_q, diff_uv_delta, delta_q_present;
+	bool coded_lossless, all_lossless;
+	int num_planes, order_hint_bits, width_bits, height_bits;
+	int i;
+
+	if (!seq || !fr || !out || out_cap < 16)
+		return 0;
+
+	sf = seq->flags;
+	ff = fr->flags;
+	q = &fr->quantization;
+
+	/* --- configurations this encoder does not cover --- */
+	frame_type = fr->frame_type;
+	if (frame_type == V4L2_AV1_SWITCH_FRAME)
+		return 0;
+	if (sf & V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT)
+		return 0;	/* film grain coding deferred */
+	if (fr->segmentation.flags & V4L2_AV1_SEGMENTATION_FLAG_ENABLED)
+		return 0;
+
+	/* --- sequence-level state --- */
+	monochrome = !!(sf & V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME);
+	num_planes = monochrome ? 1 : 3;
+	enable_order_hint = !!(sf & V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT);
+	enable_superres = !!(sf & V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES);
+	separate_uv_delta_q = !!(sf & V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q);
+
+	/* Same [1, 8] clamp the sequence header applies, so both OBUs
+	 * agree on OrderHintBits. */
+	order_hint_bits = 0;
+	if (enable_order_hint) {
+		order_hint_bits = seq->order_hint_bits;
+		if (order_hint_bits < 1)
+			order_hint_bits = 1;
+		if (order_hint_bits > 8)
+			order_hint_bits = 8;
+	}
+
+	width_bits = min_bits_for((uint32_t) seq->max_frame_width_minus_1);
+	height_bits = min_bits_for((uint32_t) seq->max_frame_height_minus_1);
+	if (width_bits > 16 || height_bits > 16)
+		return 0;	/* not representable in the sequence header */
+
+	/* --- frame-level state, resolved to the values the decoder will
+	 *     actually see (inferred fields override the V4L2 flags) --- */
+	frame_is_intra = (frame_type == V4L2_AV1_KEY_FRAME ||
+			  frame_type == V4L2_AV1_INTRA_ONLY_FRAME);
+	show_frame = !!(ff & V4L2_AV1_FRAME_FLAG_SHOW_FRAME);
+	key_shown = (frame_type == V4L2_AV1_KEY_FRAME && show_frame);
+	/* inferred as 1 for shown key frames, coded otherwise */
+	error_resilient_mode = key_shown ||
+		!!(ff & V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE);
+	disable_cdf_update = !!(ff & V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE);
+	allow_screen_content_tools =
+		!!(ff & V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS);
+	/* only coded (and only non-zero) with screen content tools */
+	force_integer_mv = allow_screen_content_tools &&
+		!!(ff & V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV);
+	frame_size_override = !!(ff & V4L2_AV1_FRAME_FLAG_FRAME_SIZE_OVERRIDE);
+	use_superres = enable_superres &&
+		!!(ff & V4L2_AV1_FRAME_FLAG_USE_SUPERRES);
+	/* only coded on intra frames with screen content tools and no
+	 * superres scaling (UpscaledWidth == FrameWidth); 0 otherwise */
+	allow_intrabc = frame_is_intra && allow_screen_content_tools &&
+		!use_superres &&
+		!!(ff & V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC);
+	/* inferred as allFrames for shown key frames, coded otherwise */
+	refresh_frame_flags = key_shown ? 0xffu : fr->refresh_frame_flags;
+
+	/* ref_order_hint[] would have to be coded here; it is per DPB
+	 * slot and cannot be reconstructed, so refuse rather than emit
+	 * a header the parser would mis-read. */
+	if ((!frame_is_intra || refresh_frame_flags != 0xffu) &&
+	    error_resilient_mode && enable_order_hint)
+		return 0;
+
+	bsw_init(&bs, rbsp, sizeof(rbsp));
+
+	/* show_existing_frame: 0 (V4L2 doesn't surface the show-only path
+	 * — every fr ctrl describes a real decoded frame). */
+	bsw_put_u(&bs, 0u, 1);
+	bsw_put_u(&bs, (uint32_t) frame_type, 2);
+	bsw_put_u(&bs, show_frame ? 1u : 0u, 1);
+	/* no decoder model in the sequence header → no temporal_point_info */
+	if (!show_frame)
+		bsw_put_u(&bs,
+			  (ff & V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME) ? 1u : 0u, 1);
+	if (!key_shown)
+		bsw_put_u(&bs, error_resilient_mode ? 1u : 0u, 1);
+
+	bsw_put_u(&bs, disable_cdf_update ? 1u : 0u, 1);
+	/* seq_force_screen_content_tools == SELECT → coded per frame */
+	bsw_put_u(&bs, allow_screen_content_tools ? 1u : 0u, 1);
+	/* seq_force_integer_mv == SELECT → coded when tools are on */
+	if (allow_screen_content_tools)
+		bsw_put_u(&bs, force_integer_mv ? 1u : 0u, 1);
+
+	/* frame_id_numbers_present_flag = 0 in seq → no current_frame_id */
+
+	bsw_put_u(&bs, frame_size_override ? 1u : 0u, 1);
+
+	/* order_hint f(OrderHintBits); zero bits when order hints are off */
+	if (order_hint_bits > 0)
+		bsw_put_u(&bs, fr->order_hint, order_hint_bits);
+
+	if (!frame_is_intra && !error_resilient_mode)
+		bsw_put_u(&bs, fr->primary_ref_frame, 3);
+
+	if (!key_shown)
+		bsw_put_u(&bs, refresh_frame_flags, 8);
+
+	if (frame_is_intra) {
+		fh_put_frame_and_render_size(&bs, fr, frame_size_override,
+					     enable_superres, use_superres,
+					     width_bits, height_bits);
+		if (allow_screen_content_tools && !use_superres)
+			bsw_put_u(&bs, allow_intrabc ? 1u : 0u, 1);
+	} else {
+		/* frame_refs_short_signaling is present whenever order
+		 * hints are enabled; always 0 here (explicit indices). */
+		if (enable_order_hint)
+			bsw_put_u(&bs, 0u, 1);
+		for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+			int8_t idx = fr->ref_frame_idx[i];
+
+			if (idx < 0)
+				idx = 0;
+			bsw_put_u(&bs, (uint32_t)(idx & 0x7), 3);
+		}
+
+		/* frame_size_with_refs(): answer found_ref = 0 for every
+		 * reference so the explicit frame_size() / render_size()
+		 * that follow are what the decoder uses. */
+		if (frame_size_override && !error_resilient_mode) {
+			for (i = 0; i < AV1_REFS_PER_FRAME; i++)
+				bsw_put_u(&bs, 0u, 1);
+		}
+		fh_put_frame_and_render_size(&bs, fr, frame_size_override,
+					     enable_superres, use_superres,
+					     width_bits, height_bits);
+
+		/* inferred as 0 when force_integer_mv is set */
+		if (!force_integer_mv)
+			bsw_put_u(&bs,
+				  (ff & V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV) ? 1u : 0u,
+				  1);
+
+		/* read_interpolation_filter: is_filter_switchable + value */
+		{
+			int interp = fr->interpolation_filter;
+			bool switchable =
+				(interp == V4L2_AV1_INTERPOLATION_FILTER_SWITCHABLE);
+
+			bsw_put_u(&bs, switchable ? 1u : 0u, 1);
+			if (!switchable)
+				bsw_put_u(&bs, (uint32_t) interp & 0x3u, 2);
+		}
+
+		bsw_put_u(&bs,
+			  (ff & V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE) ? 1u : 0u,
+			  1);
+		if (!error_resilient_mode &&
+		    (sf & V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS))
+			bsw_put_u(&bs,
+				  (ff & V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS) ? 1u : 0u,
+				  1);
+	}
+
+	/* disable_frame_end_update_cdf: inferred as 1 when CDF updates
+	 * are disabled */
+	if (!disable_cdf_update)
+		bsw_put_u(&bs,
+			  (ff & V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF) ? 1u : 0u,
+			  1);
+
+	/* tile_info: single-tile path */
+	if (!write_tile_info_single_tile(&bs,
+					 fr->frame_width_minus_1 + 1,
+					 fr->frame_height_minus_1 + 1,
+					 !!(sf & V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK)))
+		return 0;
+
+	/* quantization_params */
+	write_quantization_params(&bs, q, num_planes > 1, separate_uv_delta_q);
+
+	/* segmentation_params: only the disabled form (checked above) */
+	bsw_put_u(&bs, 0u, 1);	/* segmentation_enabled */
+
+	/* delta_q_params: delta_q_present is only coded — and can only
+	 * be set — when base_q_idx is non-zero */
+	delta_q_present = (q->base_q_idx > 0) &&
+		!!(q->flags & V4L2_AV1_QUANTIZATION_FLAG_DELTA_Q_PRESENT);
+	if (q->base_q_idx > 0) {
+		bsw_put_u(&bs, delta_q_present ? 1u : 0u, 1);
+		if (delta_q_present)
+			bsw_put_u(&bs, q->delta_q_res & 0x3u, 2);
+	}
+
+	/* delta_lf_params */
+	if (delta_q_present && !allow_intrabc) {
+		bool delta_lf_present =
+			!!(fr->loop_filter.flags & V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_PRESENT);
+
+		bsw_put_u(&bs, delta_lf_present ? 1u : 0u, 1);
+		if (delta_lf_present) {
+			bsw_put_u(&bs, fr->loop_filter.delta_lf_res & 0x3u, 2);
+			bsw_put_u(&bs,
+				  (fr->loop_filter.flags & V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_MULTI)
+					? 1u : 0u, 1);
+		}
+	}
+
+	/* CodedLossless: with segmentation off there is a single qindex,
+	 * so the frame is lossless iff base_q_idx and every coded delta
+	 * are zero.  The V deltas only count when they are coded
+	 * separately; chroma deltas do not exist for monochrome. */
+	diff_uv_delta = (num_planes > 1) && separate_uv_delta_q &&
+		!!(q->flags & V4L2_AV1_QUANTIZATION_FLAG_DIFF_UV_DELTA);
+	coded_lossless = (q->base_q_idx == 0) && (q->delta_q_y_dc == 0);
+	if (coded_lossless && num_planes > 1) {
+		coded_lossless = (q->delta_q_u_dc == 0) && (q->delta_q_u_ac == 0);
+		if (coded_lossless && diff_uv_delta)
+			coded_lossless = (q->delta_q_v_dc == 0) &&
+					 (q->delta_q_v_ac == 0);
+	}
+	/* AllLossless additionally requires no superres scaling */
+	all_lossless = coded_lossless && !use_superres;
+
+	write_loop_filter_params(&bs, &fr->loop_filter, num_planes > 1,
+				 coded_lossless || allow_intrabc);
+	write_cdef_params(&bs, &fr->cdef, num_planes > 1,
+			  !!(sf & V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF),
+			  coded_lossless || allow_intrabc);
+	if (!write_lr_params(&bs, &fr->loop_restoration, num_planes,
+			     !!(sf & V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION),
+			     all_lossless || allow_intrabc))
+		return 0;
+
+	/* read_tx_mode (§5.9.21): a single tx_mode_select bit; 0 means
+	 * TX_MODE_LARGEST.  Lossless frames infer ONLY_4X4. */
+	if (!coded_lossless)
+		bsw_put_u(&bs,
+			  (fr->tx_mode == V4L2_AV1_TX_MODE_SELECT) ? 1u : 0u, 1);
+
+	/* frame_reference_mode (§5.9.23) */
+	if (!frame_is_intra)
+		bsw_put_u(&bs,
+			  (ff & V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT) ? 1u : 0u, 1);
+
+	/* skip_mode_params (§5.9.22) */
+	if (ff & V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED)
+		bsw_put_u(&bs,
+			  (ff & V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT) ? 1u : 0u, 1);
+
+	/* allow_warped_motion: coded between skip_mode_params and
+	 * reduced_tx_set for non-error-resilient inter frames when the
+	 * sequence enables warped motion */
+	if (!frame_is_intra && !error_resilient_mode &&
+	    (sf & V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION))
+		bsw_put_u(&bs,
+			  (ff & V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION) ? 1u : 0u, 1);
+
+	/* reduced_tx_set */
+	bsw_put_u(&bs, (ff & V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET) ? 1u : 0u, 1);
+
+	/* global_motion_params: §5.9.24 — IDENTITY for LAST..ALTREF */
+	if (!frame_is_intra) {
+		for (i = AV1_LAST_FRAME; i < AV1_NUM_REF_FRAMES; i++) {
+			if (fr->global_motion.type[i] != V4L2_AV1_WARP_MODEL_IDENTITY)
+				return 0;	/* out of scope */
+			bsw_put_u(&bs, 0u, 1);	/* is_global = 0 → identity */
+		}
+	}
+
+	/* film_grain_params: nothing coded — the sequence-level flag was
+	 * rejected at the top. */
+
+	bsw_align_rbsp(&bs);
+	if (bsw_overflowed(&bs))
+		return 0;
+
+	return wrap_obu(OBU_FRAME_HEADER, rbsp, bsw_bytes(&bs), out, out_cap);
+}
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/*
+ * av1_obu_synth.h — synthesise AV1 OBU bytes from the V4L2 stateless
+ * AV1 controls.
+ *
+ * V4L2 stateless AV1 (per drivers/media/v4l2-core/v4l2-h264.c-style
+ * contract) passes the OUTPUT buffer as bare tile-group bitstream and
+ * the sequence / frame-header information as structured controls
+ * (V4L2_CID_STATELESS_AV1_SEQUENCE, V4L2_CID_STATELESS_AV1_FRAME, ...).
+ * libavcodec's AV1 decoder is full-bitstream, so the daemon has to
+ * reconstruct the OBUs that the consumer parsed out and prepend them
+ * to the tile-group bytes before handing the assembled stream to
+ * libavcodec.
+ *
+ * This header covers Sequence Header (§5.5.1), Temporal Delimiter
+ * (§5.6), and Frame Header (§5.9) OBUs.  All share the same wire
+ * conventions:
+ *   - No emulation prevention (AV1 uses leb128 sized fields instead).
+ *   - obu_has_size_field = 1 in the OBU header byte.
+ *   - obu_extension_flag = 0 (no temporal_id / spatial_id encoding).
+ *   - trailing_bits() finalises the payload to a byte boundary the same
+ *     way H.264's rbsp_trailing_bits does — bsw_align_rbsp covers it.
+ *
+ * Synthesis decisions for fields V4L2 doesn't carry are documented in
+ * the .c file (search for "synthesis default").
+ */
+#ifndef DAEDALUS_AV1_OBU_SYNTH_H
+#define DAEDALUS_AV1_OBU_SYNTH_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <linux/v4l2-controls.h>
+
+/*
+ * Encode an AV1 Sequence Header OBU (header byte + leb128 size + RBSP)
+ * into @out.  Returns total bytes written, or 0 on overflow / malformed
+ * input (e.g. inconsistent bit_depth vs seq_profile).  @out_cap must
+ * be at least 32 bytes for any reasonable sequence header; 64 bytes
+ * is a generous upper bound.
+ *
+ * The caller is expected to emit a Temporal Delimiter OBU (2 bytes:
+ * 0x12 0x00) ahead of these bytes and before any Frame OBU so that
+ * libavcodec's AV1 parser sees a well-formed access unit; it is not
+ * produced here — use av1_synth_temporal_delimiter_obu() for that.
+ */
+size_t av1_synth_sequence_header_obu(const struct v4l2_ctrl_av1_sequence *seq,
+				     uint8_t *out, size_t out_cap);
+
+/*
+ * Encode an AV1 Temporal Delimiter OBU into @out.  Always exactly 2
+ * bytes: 0x12 (obu_type=TEMPORAL_DELIMITER, has_size_field=1) followed
+ * by 0x00 (leb128(0) — zero-payload).  Returns 2 on success, 0 if
+ * @out_cap < 2.
+ *
+ * Per AV1 spec §5.6 every temporal unit MUST start with a Temporal
+ * Delimiter OBU when temporal_delimiter_obus_present is implied — the
+ * libavcodec AV1 parser uses TD OBUs as access-unit boundaries when
+ * fed full-bitstream input.
+ */
+size_t av1_synth_temporal_delimiter_obu(uint8_t *out, size_t out_cap);
+
+/*
+ * Integration status (2026-05-23):
+ *
+ * The Sequence / Frame Header / Temporal Delimiter encoders declared
+ * here are standalone primitives.  NOT yet called from decoder.c — the
+ * AV1 decode hot path still passes the OUTPUT buffer straight to
+ * libavcodec, which only works if the V4L2 consumer happens to be
+ * sending a fully-OBU'd access unit (i.e. is not strictly following
+ * the V4L2 stateless AV1 "tile-group bytes only" contract).
+ *
+ * Wiring these primitives in requires a separate kernel-side change:
+ *
+ *   - extend daedalus_v4l2_proto.h with a `struct daedalus_av1_meta`
+ *     mirroring v4l2_ctrl_av1_sequence + v4l2_ctrl_av1_frame
+ *   - update kernel/daedalus_v4l2_main.c to capture
+ *     V4L2_CID_STATELESS_AV1_{SEQUENCE,FRAME} at device_run time and
+ *     ship the meta alongside the bitstream over the chardev
+ *   - update daemon/src/chardev_client.c to receive the meta
+ *   - update daemon/src/decoder.c to: synth TD + SH + FH OBUs, wrap
+ *     the OUTPUT bytes as an OBU_TILE_GROUP, concat in that order,
+ *     hand the assembled bitstream to libavcodec
+ *
+ * Tracked as a follow-on; see daedalus-v4l2 task notes.
+ */
+
+/*
+ * Encode an AV1 Frame Header OBU from the V4L2 stateless frame control
+ * (and the matching sequence control, which provides fields the
+ * frame-header encoder branches on per §5.9.1).
+ *
+ * Scope (this revision — libva-v4l2-request common-case path):
+ *   - Frame types KEY / INTER / INTRA_ONLY.  SWITCH frames return 0
+ *     (caller should fall back to libavcodec native parsing).
+ *   - segmentation_params() emits the "segmentation disabled" path
+ *     when V4L2_AV1_SEGMENTATION_FLAG_ENABLED is 0.  Enabled
+ *     segmentation returns 0.
+ *   - loop_restoration_params(): only RESTORE_NONE on all planes
+ *     supported.  Other restoration types return 0.
+ *   - global_motion: only IDENTITY warp model emitted.  Non-IDENTITY
+ *     entries return 0.
+ *   - film_grain_params(): treated as "not present" — only valid when
+ *     the sequence header has film_grain_params_present = 0.  If the
+ *     sequence claims film grain is present this revision returns 0
+ *     (the per-frame film-grain coding is a separate follow-on).
+ *
+ * Out-of-scope branches return 0 so the caller can surface a coverage
+ * warning and fall back to direct libavcodec parsing of the original
+ * bitstream where possible.
+ *
+ * @out_cap must be at least 128 bytes for any reasonable frame header;
+ * 256 bytes is a safe upper bound for the supported subset.
+ */
+size_t av1_synth_frame_header_obu(const struct v4l2_ctrl_av1_sequence *seq,
+				  const struct v4l2_ctrl_av1_frame *fr,
+				  uint8_t *out, size_t out_cap);
+
+#endif /* DAEDALUS_AV1_OBU_SYNTH_H */
@@ -133,288 +133,110 @@ static int send_response(struct chardev_client *cli, uint32_t type,
 	return rc;
 }

-/*
- * Register a new (src_pts → cookie) mapping in the pending table.
- * Reuses an existing slot for src_pts if one exists (defensive — the
- * kernel should never re-use the same src_pts for two live cookies,
- * but libva running against a test client without timestamps might
- * send all-zero src_pts; collapse them onto the latest cookie so the
- * 1:1-per-stream case keeps working).  Returns 0 on success, -ENOSPC
- * if the table is full.
- */
-static int pending_register(struct chardev_client *cli, uint64_t src_pts,
-			    uint32_t cookie,
-			    const struct daedalus_req_decode *req)
-{
-	int free_slot = -1;
-	int i;
-
-	for (i = 0; i < DAEDALUS_MAX_PENDING_COOKIES; i++) {
-		if (cli->pending[i].used && cli->pending[i].src_pts == src_pts) {
-			cli->pending[i].cookie = cookie;
-			cli->pending[i].cached_req = *req;
-			return 0;
-		}
-		if (!cli->pending[i].used && free_slot < 0)
-			free_slot = i;
-	}
-
-	if (free_slot < 0) {
-		log_err("pending: table full registering cookie=%u src_pts=%llu",
-			cookie, (unsigned long long) src_pts);
-		return -ENOSPC;
-	}
-
-	cli->pending[free_slot].used	    = 1;
-	cli->pending[free_slot].src_pts	    = src_pts;
-	cli->pending[free_slot].cookie	    = cookie;
-	cli->pending[free_slot].cached_req  = *req;
-	return 0;
-}
-
-/*
- * Look up the cookie + cached REQ_DECODE that originally introduced
- * @src_pts.  Returns 0 + populates @cookie_out / @req_out, or -ENOENT
- * if no match (likely a daemon bug or codec output we can't route).
- */
-static int pending_lookup(const struct chardev_client *cli,
-			  uint64_t src_pts,
-			  uint32_t *cookie_out,
-			  struct daedalus_req_decode *req_out)
-{
-	int i;
-
-	for (i = 0; i < DAEDALUS_MAX_PENDING_COOKIES; i++) {
-		if (cli->pending[i].used &&
-		    cli->pending[i].src_pts == src_pts) {
-			*cookie_out = cli->pending[i].cookie;
-			*req_out    = cli->pending[i].cached_req;
-			return 0;
-		}
-	}
-	return -ENOENT;
-}
-
-static void pending_release(struct chardev_client *cli, uint64_t src_pts)
-{
-	int i;
-
-	for (i = 0; i < DAEDALUS_MAX_PENDING_COOKIES; i++) {
-		if (cli->pending[i].used &&
-		    cli->pending[i].src_pts == src_pts) {
-			cli->pending[i].used = 0;
-			cli->pending[i].src_pts = 0;
-			cli->pending[i].cookie = 0;
-			return;
-		}
-	}
-}
-
-/*
- * Pack the daemon's current AVFrame into the CAPTURE buffer owned by
- * @owner_cookie, then ship RESP_FRAME with the flags caller asked for.
- * Returns 0 on success; -errno on GET_DMABUF / mmap failure (RESP is
- * still emitted so the kernel doesn't park the dst buffer forever).
- */
-static int deliver_frame_to_cookie(struct chardev_client *cli,
-				   uint32_t owner_cookie,
-				   const struct daedalus_req_decode *owner_req,
-				   struct daedalus_resp_frame *resp,
-				   uint32_t resp_flags)
-{
-	struct daedalus_capture_planes planes;
-	int orc;
-
-	orc = daedalus_capture_planes_open(cli->fd, owner_cookie, owner_req,
-					   &planes);
-	if (orc < 0) {
-		log_warn("drain: GET_DMABUF cookie=%u failed (%d); RESP metadata-only",
-			 owner_cookie, orc);
-	} else {
-		(void) daedalus_decoder_pack_current(cli->decoder, &planes,
-						     owner_req->capture_pix_fmt);
-		daedalus_capture_planes_close(&planes);
-	}
-
-	resp->flags |= resp_flags;
-	return send_response(cli, DAEDALUS_MSG_RESP_FRAME, owner_cookie,
-			     resp, sizeof(*resp));
-}
-
 static int handle_req_decode(struct chardev_client *cli,
 			     const struct daedalus_msg_hdr *hdr,
 			     const uint8_t *payload)
 {
 	struct daedalus_req_decode req;
+	struct daedalus_resp_frame resp;
+	struct daedalus_capture_planes planes;
 	const struct daedalus_h264_meta *h264_meta = NULL;
 	size_t meta_off, meta_len = 0;
-	int submit_status;
-	int src_consumed_emitted = 0;
 	int rc;
+	int decoded = 0;

 	if (hdr->payload_len < sizeof(req)) {
-		struct daedalus_resp_frame err = { 0 };
-
 		log_err("REQ_DECODE cookie=%u: payload too short %u < %zu",
 			hdr->cookie, hdr->payload_len, sizeof(req));
-		err.status = DAEDALUS_DECODE_ERR_RECV;
-		err.flags  = DAEDALUS_RESP_FLAG_HAS_PIXELS |
-			     DAEDALUS_RESP_FLAG_SRC_CONSUMED;
+		memset(&resp, 0, sizeof(resp));
+		resp.status = DAEDALUS_DECODE_ERR_RECV;
 		return send_response(cli, DAEDALUS_MSG_RESP_FRAME,
-				     hdr->cookie, &err, sizeof(err));
+				     hdr->cookie, &resp, sizeof(resp));
 	}
 	memcpy(&req, payload, sizeof(req));

+	/* Optional H.264 meta block follows req when the flag is set;
+	 * bitstream comes after meta. */
 	if (req.flags & DAEDALUS_REQ_FLAG_H264_META)
 		meta_len = sizeof(struct daedalus_h264_meta);
 	meta_off = sizeof(req);

 	if ((size_t) req.bitstream_len + sizeof(req) + meta_len !=
 	    hdr->payload_len) {
-		struct daedalus_resp_frame err = { 0 };
-
 		log_err("REQ_DECODE cookie=%u: bitstream_len %u + meta %zu inconsistent with payload_len %u",
 			hdr->cookie, req.bitstream_len, meta_len,
 			hdr->payload_len);
-		err.status = DAEDALUS_DECODE_ERR_RECV;
-		err.flags  = DAEDALUS_RESP_FLAG_HAS_PIXELS |
-			     DAEDALUS_RESP_FLAG_SRC_CONSUMED;
+		memset(&resp, 0, sizeof(resp));
+		resp.status = DAEDALUS_DECODE_ERR_RECV;
 		return send_response(cli, DAEDALUS_MSG_RESP_FRAME,
-				     hdr->cookie, &err, sizeof(err));
+				     hdr->cookie, &resp, sizeof(resp));
 	}
 	if (meta_len)
 		h264_meta = (const struct daedalus_h264_meta *)
 				(payload + meta_off);

-	log_info("REQ_DECODE cookie=%u codec=%u bitstream=%u bytes meta=%s capture=%ux%u %u planes src_pts=%llu",
+	log_info("REQ_DECODE cookie=%u codec=%u bitstream=%u bytes meta=%s capture=%ux%u %u planes",
 		 hdr->cookie, req.codec_id, req.bitstream_len,
 		 h264_meta ? "h264" : "none",
 		 req.capture_width, req.capture_height,
-		 req.capture_num_planes,
-		 (unsigned long long) req.src_pts);
+		 req.capture_num_planes);

 	/*
-	 * Register (src_pts → cookie) mapping BEFORE submit, so any drained
-	 * frame whose pts matches this REQ's src_pts (the steady-state
-	 * 1:1 path) can find its owner via pending_lookup below.  Out of
-	 * space here is fatal — we'd lose the routing identity for this
-	 * cookie's eventual frame.  Send an error RESP that releases both
-	 * src and dst so the V4L2 client moves on.
+	 * Degenerate-bitstream filter (issue #17): libva-v4l2-request-
+	 * fourier flushes a stub packet into the OUTPUT_MPLANE queue at
+	 * playback-pause boundaries.  The payload is shorter than any
+	 * parseable H.264 NAL (3-byte start code + 1-byte NAL header =
+	 * 4 bytes minimum); avcodec_send_packet returns
+	 * AVERROR_INVALIDDATA, which we used to propagate to the kernel
+	 * as a decode failure.  Firefox then marks H.264-via-VAAPI as
+	 * broken for the session and routes every subsequent frame to
+	 * libmozavcodec SW — pause never recovers to HW.
+	 *
+	 * Drop the request as a no-op decode and reply RESP_FRAME (status
+	 * DAEDALUS_DECODE_NO_FRAME) so libva's V4L2 state machine keeps its
+	 * surface pool alive.
 	 */
-	rc = pending_register(cli, req.src_pts, hdr->cookie, &req);
-	if (rc < 0) {
-		struct daedalus_resp_frame err = { 0 };
-
-		err.status = DAEDALUS_DECODE_ERR_SEND;
-		err.flags  = DAEDALUS_RESP_FLAG_HAS_PIXELS |
-			     DAEDALUS_RESP_FLAG_SRC_CONSUMED;
+	if (req.bitstream_len < 4) {
+		log_info("REQ_DECODE cookie=%u: tiny bitstream %u bytes — dropping as no-op (pause-time sentinel)",
+			 hdr->cookie, req.bitstream_len);
+		memset(&resp, 0, sizeof(resp));
+		resp.status = DAEDALUS_DECODE_NO_FRAME;
 		return send_response(cli, DAEDALUS_MSG_RESP_FRAME,
-				     hdr->cookie, &err, sizeof(err));
-	}
-
-	submit_status = daedalus_decoder_submit(cli->decoder, &req,
-						payload + meta_off + meta_len,
-						h264_meta);
-	if (submit_status != 0) {
-		/*
-		 * avcodec_send_packet failed before any frame could have
-		 * been queued for this src_pts.  Drop the pending entry
-		 * (no future drain will find a matching pts), and emit a
-		 * combined HAS_PIXELS|SRC_CONSUMED error RESP for this
-		 * cookie so the V4L2 client unblocks.
-		 */
-		struct daedalus_resp_frame err = { 0 };
-
-		pending_release(cli, req.src_pts);
-		err.status = (uint32_t) submit_status;
-		err.codec_id = req.codec_id;
-		err.flags  = DAEDALUS_RESP_FLAG_HAS_PIXELS |
-			     DAEDALUS_RESP_FLAG_SRC_CONSUMED;
-		err.output_src_pts = req.src_pts;
-		return send_response(cli, DAEDALUS_MSG_RESP_FRAME,
-				     hdr->cookie, &err, sizeof(err));
-	}
-
-	/*
-	 * Drain libavcodec for as many display-ordered frames as it can
-	 * emit right now.  Each frame's pts identifies which cookie's
-	 * CAPTURE buffer the pixels go in (see [[daedalus-v4l2#6]]).  In
-	 * steady state for VP9/AV1 (no reorder) the loop runs exactly
-	 * once, draining the just-submitted packet's own frame.  For
-	 * H.264 with B-frames the first drained frame may belong to an
-	 * EARLIER cookie's bitstream — that's the entire point.
-	 */
-	for (;;) {
-		struct daedalus_resp_frame resp;
-		uint32_t owner_cookie = 0;
-		struct daedalus_req_decode owner_req;
-		uint32_t flags;
-
-		rc = daedalus_decoder_drain_one(cli->decoder, req.codec_id,
-						&resp);
-		if (rc == -EAGAIN)
-			break;
-		if (rc != 0) {
-			/*
-			 * Hard codec error during drain.  resp->status is set.
-			 * Pin it to THIS REQ's cookie (we can't know whose
-			 * pts the failed frame would have had); set both
-			 * flags so the V4L2 client moves on.
-			 */
-			pending_release(cli, req.src_pts);
-			resp.flags = DAEDALUS_RESP_FLAG_HAS_PIXELS |
-				     DAEDALUS_RESP_FLAG_SRC_CONSUMED;
-			resp.output_src_pts = req.src_pts;
-			(void) send_response(cli, DAEDALUS_MSG_RESP_FRAME,
-					     hdr->cookie, &resp, sizeof(resp));
-			src_consumed_emitted = 1;
-			break;
-		}
-
-		if (pending_lookup(cli, resp.output_src_pts,
-				   &owner_cookie, &owner_req) != 0) {
-			/*
-			 * Frame's pts has no registered owner — implies a
-			 * daemon-side tracking bug or a codec output for a
-			 * packet we never registered (e.g. a B-frame that
-			 * was queued before the daemon caught up).  Drop the
-			 * frame; can't safely route it.
-			 */
-			log_warn("drain: no pending entry for output_src_pts=%llu (codec dropped a frame?)",
-				 (unsigned long long) resp.output_src_pts);
-			continue;
-		}
-
-		flags = DAEDALUS_RESP_FLAG_HAS_PIXELS;
-		if (owner_cookie == hdr->cookie) {
-			flags |= DAEDALUS_RESP_FLAG_SRC_CONSUMED;
-			src_consumed_emitted = 1;
-		}
-
-		(void) deliver_frame_to_cookie(cli, owner_cookie, &owner_req,
-					       &resp, flags);
-		pending_release(cli, resp.output_src_pts);
-	}
-
-	/*
-	 * If the drain loop didn't already SRC_CONSUMED this REQ's cookie
-	 * (libavcodec held the frame for display-order reorder — the
-	 * pixels will arrive in a future drain), emit a standalone
-	 * SRC_CONSUMED RESP now.  Kernel releases src_buf + runs
-	 * job_finish; dst_buf parked until the matching HAS_PIXELS
-	 * shows up later.
-	 */
-	if (!src_consumed_emitted) {
-		struct daedalus_resp_frame resp = { 0 };
-
-		resp.status = DAEDALUS_DECODE_OK;
-		resp.codec_id = req.codec_id;
-		resp.flags = DAEDALUS_RESP_FLAG_SRC_CONSUMED;
-		(void) send_response(cli, DAEDALUS_MSG_RESP_FRAME,
 				     hdr->cookie, &resp, sizeof(resp));
 	}

-	return 0;
+	/*
+	 * Open dmabuf-fds for every CAPTURE plane and mmap them.
+	 * If this fails we still attempt the decode (so the kernel
+	 * gets a structured error response) — but we pass NULL
+	 * planes so pixels aren't written anywhere.
+	 */
+	rc = daedalus_capture_planes_open(cli->fd, hdr->cookie, &req,
+					  &planes);
+	if (rc < 0) {
+		log_warn("REQ_DECODE cookie=%u: GET_DMABUF/mmap failed (%d); decode metadata-only",
+			 hdr->cookie, rc);
+		/* planes is already zeroed by capture_planes_open */
+	}
+
+	rc = daedalus_decoder_run_request(cli->decoder, &req,
+					  payload + meta_off + meta_len,
+					  h264_meta,
+					  &resp,
+					  planes.nr ? &planes : NULL);
+	decoded = (rc >= 0);
+
+	daedalus_capture_planes_close(&planes);
+
+	if (!decoded)
+		return rc;
+
+	/*
+	 * RESP_FRAME is metadata-only in Phase 8.6 — pixels already
+	 * live in the V4L2 client's CAPTURE buffer via the dmabuf
+	 * the daemon wrote to in pack_nv12_to_planes.
+	 */
+	return send_response(cli, DAEDALUS_MSG_RESP_FRAME, hdr->cookie,
+			     &resp, sizeof(resp));
 }

 static int handle_ping(struct chardev_client *cli,
@@ -18,44 +18,18 @@
 struct ffmpeg_loader;
 struct daedalus_decoder;

-/*
- * Per-inflight (cookie, src_pts) tracking for the H.264 B-frame
- * display-reorder fix (daedalus-v4l2#6).  When the daemon drains a
- * frame from libavcodec, frame->pts (= src_pts of the OUTPUT bitstream
- * that contained the frame's slices) identifies which cookie's CAPTURE
- * buffer the pixels belong in — distinct from the cookie of the REQ
- * that triggered the receive_frame call.  Mapping is small (bounded
- * by the V4L2 client's buffer pool depth, typically ≤24) so a linear
- * array beats a hashtable for cache-locality.
- *
- * cached_req carries the capture geometry (num_planes, plane sizes,
- * strides, pix_fmt) so a later drain — which may target this cookie
- * from a DIFFERENT REQ's drain loop — can call GET_DMABUF + open
- * planes with the original REQ's parameters.
- */
-#define DAEDALUS_MAX_PENDING_COOKIES	64
-
-struct chardev_pending_cookie {
-	int				 used;
-	uint64_t			 src_pts;
-	uint32_t			 cookie;
-	struct daedalus_req_decode	 cached_req;
-};
-
 /**
 * struct chardev_client - daemon-side chardev state
 * @fd:		open /dev/daedalus-v4l2 descriptor (-1 if not open)
 * @loader:	dlopen'd FFmpeg loader (borrowed; not owned)
 * @decoder:	per-codec AVCodecContext cache (owned)
 * @stop_flag:	set non-zero from a signal handler to break the loop
- * @pending:	pts → cookie lookup table for split SRC/DST RESPs
 */
 struct chardev_client {
 	int				 fd;
 	struct ffmpeg_loader		*loader;
 	struct daedalus_decoder		*decoder;
 	volatile sig_atomic_t		*stop_flag;
-	struct chardev_pending_cookie	 pending[DAEDALUS_MAX_PENDING_COOKIES];
 };

 /**
@@ -6,16 +6,60 @@
 #include "ffmpeg_loader.h"
 #include "h264_nal_synth.h"
 #include "log.h"
+#include "shadow_decoder.h"

 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
+#include <time.h>

 #include <linux/videodev2.h>

 #include <libavcodec/avcodec.h>
 #include <libavutil/pixfmt.h>

+/*
+ * Per-codec running stats — daedalus-v4l2#11 step 1.  Establishes
+ * baseline observability before any daedalus-fourier kernel
+ * substitution lands, so we can see what each substitution actually
+ * shifted.  Per-frame `decoder: OK` line now carries decode_us; a
+ * "decoder stats" summary line lands every DAEDALUS_STATS_EVERY OK
+ * frames with throughput + per-frame budget aggregates.
+ *
+ * Counters are static (process-local) and unsynchronised — the
+ * daemon's chardev event loop is single-threaded, so no atomics or
+ * locking needed.  Reset when codec_id changes (different stream).
+ */
+#define DAEDALUS_STATS_EVERY	60u
+
+struct daedalus_decode_stats {
+	uint32_t codec_id;	/* codec the counters belong to; they reset when it changes */
+	uint64_t frames;	/* presumably OK frames counted so far — confirm at update site */
+	uint64_t total_decode_ns;	/* presumably summed per-frame decode time, ns — confirm */
+	uint64_t total_bitstream_bytes;	/* presumably summed input bitstream size — confirm */
+	uint64_t total_mbs;	/* derived from frame WxH; H.264-style 16x16 */
+	struct timespec window_start;	/* NOTE(review): looks like start of the current summary window — confirm */
+};
+
+static struct daedalus_decode_stats g_stats;
+
+/*
+ * Elapsed time from *a to *b, in nanoseconds (b - a).
+ *
+ * The seconds and nanoseconds differences are each cast to uint64_t
+ * before being combined.  When b->tv_nsec < a->tv_nsec the nanosecond
+ * term is negative and wraps on the cast; the unsigned addition then
+ * wraps back, so the sum is still the correct delta modulo 2^64.
+ * If *b is earlier than *a the result is a very large value, not a
+ * negative one — callers must pass timestamps in order.
+ */
+static inline uint64_t timespec_delta_ns(const struct timespec *a,
+					 const struct timespec *b)
+{
+	return (uint64_t)(b->tv_sec - a->tv_sec) * 1000000000ull +
+	       (uint64_t)(b->tv_nsec - a->tv_nsec);
+}
+
+/*
+ * Map a DAEDALUS_CODEC_* identifier to a short lowercase name
+ * ("vp9", "av1", "h264").  Any other value yields "?".  The returned
+ * pointer refers to a string literal: never NULL, must not be freed.
+ */
+static const char *codec_id_name(uint32_t cid)
+{
+	switch (cid) {
+	case DAEDALUS_CODEC_VP9:	 return "vp9";
+	case DAEDALUS_CODEC_AV1:	 return "av1";
+	case DAEDALUS_CODEC_H264: return "h264";
+	default:		 return "?";
+	}
+}
+
 /*
 * FNV-1a 32-bit hash.  Used as a compact digest of the decoded
 * frame's YUV planes so the kernel can verify "the daemon produced
@@ -67,6 +111,13 @@ int daedalus_decoder_init(struct daedalus_decoder *dec,
 		loader->av_packet_free(&dec->pkt);
 		return -ENOMEM;
 	}
+	/*
+	 * Returns NULL when DAEDALUS_SHADOW_MODE != "1" or the loaded
+	 * libavcodec lacks the per-MB inspection callback.  Both are
+	 * the normal production state — the rest of decoder.c is
+	 * shadow-aware via NULL-safe shadow_decoder_* entry points.
+	 */
+	dec->shadow = shadow_decoder_create(loader);
 	return 0;
 }

@@ -74,6 +125,8 @@ void daedalus_decoder_cleanup(struct daedalus_decoder *dec)
 {
 	if (!dec || !dec->loader)
 		return;
+	if (dec->shadow)
+		shadow_decoder_destroy(dec->shadow);
 	if (dec->ctx_vp9)
 		dec->loader->avcodec_free_context(&dec->ctx_vp9);
 	if (dec->ctx_av1)
@@ -132,6 +185,32 @@ static int decoder_open_codec(struct daedalus_decoder *dec, uint32_t codec_id,
 	ctx = fm->avcodec_alloc_context3(codec);
 	if (!ctx)
 		return -ENOMEM;
+
+	/*
+	 * H.264-only: force libavcodec to emit frames in DECODE order
+	 * (one frame per send_packet, no internal display-order reorder
+	 * queue).  V4L2 stateless decoder protocol expects each OUTPUT
+	 * bitstream packet to produce one CAPTURE buffer with that
+	 * packet's slice-decoded pixels — regardless of display order.
+	 * ffmpeg-vaapi's H.264 decoder (which is what consumes our
+	 * CAPTURE buffers via libva-v4l2-request-fourier) does its own
+	 * POC-based display reorder upstream, so producing decode-order
+	 * output is correct.
+	 *
+	 * AV_CODEC_FLAG_LOW_DELAY forces `low_delay = 1` inside
+	 * libavcodec's H.264 decoder — `h264_select_output_frame` emits
+	 * the just-decoded picture immediately instead of holding it
+	 * for the display-order DPB output queue.  DPB management for
+	 * reference frames (short_ref / long_ref) is unaffected; B-frame
+	 * decoding correctness is preserved.
+	 *
+	 * Closes daedalus-v4l2#11 part (2).  Skipped for VP9 / AV1 —
+	 * those formats don't internally reorder, so the flag would be
+	 * a no-op and add no value.
+	 */
+	if (codec_id == DAEDALUS_CODEC_H264)
+		ctx->flags |= AV_CODEC_FLAG_LOW_DELAY;
+
 	rc = fm->avcodec_open2(ctx, codec, NULL);
 	if (rc < 0) {
 		log_err("decoder: avcodec_open2 failed: %d", rc);
@@ -142,6 +221,16 @@ static int decoder_open_codec(struct daedalus_decoder *dec, uint32_t codec_id,
 	*cache = ctx;
 	*out = ctx;
 	log_info("decoder: opened %s context", codec->name);
+
+	/*
+	 * Shadow-mode hook on H.264 only: install the per-MB inspection
+	 * callback once the AVCodecContext is open.  NULL-safe — when
+	 * shadow mode is disabled (the normal production case) this
+	 * does nothing.
+	 */
+	if (codec_id == DAEDALUS_CODEC_H264)
+		shadow_decoder_install_cb(dec->shadow, ctx);
+
 	return 0;
 }

@@ -204,6 +293,20 @@ static int pack_p010_to_plane(struct AVFrame *fr,
 	if (!base)
 		return -EINVAL;

+	/* Bounds-check (see pack_nv12_single comment).  P010 stores 16
+	 * bits per sample on both Y and CbCr planes; stride is in bytes
+	 * and already accounts for the 2× expansion. */
+	{
+		size_t y_size_chk = (size_t) stride * (size_t) h;
+		size_t required = y_size_chk + (size_t) stride * (size_t) ch;
+		if (planes->size[0] < required) {
+			log_warn("pack_p010: frame %dx%d (stride=%u required=%zu) "
+				 "exceeds CAPTURE plane[0] size %zu — skipping pack",
+				 w, h, stride, required, planes->size[0]);
+			return -EOVERFLOW;
+		}
+	}
+
 	dst_y  = base;
 	y_size = (size_t) stride * (size_t) h;
 	dst_uv = base + y_size;
@@ -251,7 +354,7 @@ static int pack_nv12_single_to_plane(struct AVFrame *fr,
 	uint8_t *base;
 	uint32_t stride;
 	uint8_t *dst_y, *dst_uv;
-	size_t y_size;
+	size_t y_size, required;

 	if (!desc || !planes || planes->nr < 1)
 		return -EINVAL;
@@ -270,8 +373,27 @@ static int pack_nv12_single_to_plane(struct AVFrame *fr,
 	if (!base)
 		return -EINVAL;

+	/*
+	 * Bounds-check before any write — the V4L2 client's CAPTURE
+	 * dmabuf may have been sized for a smaller frame than what
+	 * libavcodec just decoded (e.g. YouTube DASH stepping
+	 * resolution mid-stream — libva is supposed to handle the
+	 * SOURCE_CHANGE event with STREAMOFF + S_FMT + REQBUFS but
+	 * sometimes a stale request slips through carrying the old
+	 * buffer size).  Writing the chroma interleave loop into an
+	 * undersized mapping faults the daemon with SIGSEGV mid-frame.
+	 * Bail loudly with a warn instead.
+	 */
+	y_size   = (size_t) stride * (size_t) h;
+	required = y_size + (size_t) stride * (size_t) ch;
+	if (planes->size[0] < required) {
+		log_warn("pack_nv12_single: frame %dx%d (stride=%u required=%zu) "
+			 "exceeds CAPTURE plane[0] size %zu — skipping pack",
+			 w, h, stride, required, planes->size[0]);
+		return -EOVERFLOW;
+	}
+
 	dst_y  = base;
-	y_size = (size_t) stride * (size_t) h;
 	dst_uv = base + y_size;

 	for (y = 0; y < h; y++)
@@ -326,6 +448,24 @@ static int pack_nv12_to_planes(struct AVFrame *fr,
 	if (!dst_y || !dst_uv)
 		return -EINVAL;

+	/*
+	 * Bounds-check both planes against the mapped dmabuf size.  See
+	 * pack_nv12_single_to_plane comment for the resolution-change-
+	 * mid-stream crash story this protects against.
+	 */
+	{
+		size_t y_required  = (size_t) dst_y_stride  * (size_t) h;
+		size_t uv_required = (size_t) dst_uv_stride * (size_t) ch;
+		if (planes->size[0] < y_required ||
+		    planes->size[1] < uv_required) {
+			log_warn("pack_nv12_2plane: frame %dx%d "
+				 "(y=%zu/%zu uv=%zu/%zu) exceeds CAPTURE — skipping pack",
+				 w, h, y_required, planes->size[0],
+				 uv_required, planes->size[1]);
+			return -EOVERFLOW;
+		}
+	}
+
 	/* Y plane copy — strip source stride padding. */
 	for (y = 0; y < h; y++)
 		memcpy(dst_y + (size_t) y * dst_y_stride,
@@ -348,30 +488,31 @@ static int pack_nv12_to_planes(struct AVFrame *fr,
 	return 0;
 }

-/*
- * Per-codec assemble + send_packet.  Returns 0 on success, or one
- * of DAEDALUS_DECODE_ERR_* on failure (errors here propagate via
- * the caller's RESP_FRAME status field — they are NOT logged as a
- * silent skip).  pkt->pts is stamped from req->src_pts so the
- * resulting frame->pts comes back identifiable on the drain side.
- */
-int daedalus_decoder_submit(struct daedalus_decoder *dec,
-			    const struct daedalus_req_decode *req,
-			    const uint8_t *bitstream,
-			    const struct daedalus_h264_meta *h264_meta)
+int daedalus_decoder_run_request(struct daedalus_decoder *dec,
+				 const struct daedalus_req_decode *req,
+				 const uint8_t *bitstream,
+				 const struct daedalus_h264_meta *h264_meta,
+				 struct daedalus_resp_frame *resp,
+				 const struct daedalus_capture_planes *planes)
 {
 	struct ffmpeg_loader *fm = dec->loader;
 	struct AVCodecContext *ctx = NULL;
 	uint8_t *assembled = NULL;
 	size_t assembled_len = 0;
 	int rc;
-	int status = 0;
+
+	memset(resp, 0, sizeof(*resp));
+	resp->codec_id = req->codec_id;

 	rc = decoder_open_codec(dec, req->codec_id, &ctx);
-	if (rc == -ENOSYS)
-		return DAEDALUS_DECODE_ERR_CODEC;
-	if (rc < 0)
-		return DAEDALUS_DECODE_ERR_OPEN;
+	if (rc == -ENOSYS) {
+		resp->status = DAEDALUS_DECODE_ERR_CODEC;
+		goto out;
+	}
+	if (rc < 0) {
+		resp->status = DAEDALUS_DECODE_ERR_OPEN;
+		goto out;
+	}

 	fm->av_packet_unref(dec->pkt);

@@ -396,14 +537,14 @@ int daedalus_decoder_submit(struct daedalus_decoder *dec,
 		if (sps_len == 0 || pps_len == 0) {
 			log_err("decoder: SPS/PPS NAL synth failed (sps=%zu pps=%zu)",
 				sps_len, pps_len);
-			status = DAEDALUS_DECODE_ERR_SEND;
+			resp->status = DAEDALUS_DECODE_ERR_SEND;
 			goto out;
 		}

 		assembled_len = sps_len + pps_len + req->bitstream_len;
 		assembled = malloc(assembled_len + AV_INPUT_BUFFER_PADDING_SIZE);
 		if (!assembled) {
-			status = DAEDALUS_DECODE_ERR_SEND;
+			resp->status = DAEDALUS_DECODE_ERR_SEND;
 			goto out;
 		}
 		memcpy(assembled, sps_nal, sps_len);
@@ -441,161 +582,207 @@ int daedalus_decoder_submit(struct daedalus_decoder *dec,
 	}

 	/*
-	 * Stamp pkt->pts from REQ_DECODE's src_pts (the V4L2 OUTPUT
-	 * buffer's vb2 timestamp captured by the kernel at device_run
-	 * time).  libavcodec carries pkt->pts forward to frame->pts on
-	 * the receive_frame side — even after display-order reordering
-	 * inside the H.264 DPB — which lets the chardev_client identify
-	 * which cookie's CAPTURE buffer a drained frame's pixels belong
-	 * in.  Without this stamp, every drained frame would look like
-	 * it came from the current REQ; pairs of B/P would swap places
-	 * in the visible output (daedalus-v4l2#6).
+	 * Time send_packet+receive_frame for the per-frame `decoder: OK`
+	 * line + the periodic stats summary.  Includes only the
+	 * libavcodec round-trip — not the bitstream packing, SPS/PPS
+	 * synth, or pack-to-planes work (those are accounted for
+	 * separately in the request's overall handle time).
 	 */
-	dec->pkt->pts = (int64_t) req->src_pts;
+	struct timespec t_decode_start, t_decode_end;
+	uint64_t decode_ns = 0;
+	clock_gettime(CLOCK_MONOTONIC, &t_decode_start);

 	rc = fm->avcodec_send_packet(ctx, dec->pkt);
 	if (rc < 0) {
 		log_err("decoder: avcodec_send_packet failed: %d", rc);
-		status = DAEDALUS_DECODE_ERR_SEND;
+		resp->status = DAEDALUS_DECODE_ERR_SEND;
 		goto out;
 	}

-out:
-	free(assembled);
-	(void) assembled_len;
-	return status;
-}
-
-/*
- * Pull the next display-ordered frame out of libavcodec's DPB.
- * Returns 0 if a frame was returned (dec->frame holds it and resp
- * is populated with metadata + output_src_pts == frame->pts),
- * -EAGAIN if libavcodec needs more input, or DAEDALUS_DECODE_ERR_*
- * on a hard codec error.  Caller may immediately invoke
- * daedalus_decoder_pack_current() to copy this frame's pixels into
- * a CAPTURE buffer's mapped planes, then call drain_one again for
- * any further frames in the DPB.
- */
-int daedalus_decoder_drain_one(struct daedalus_decoder *dec,
-			       uint32_t codec_id,
-			       struct daedalus_resp_frame *resp)
-{
-	struct ffmpeg_loader *fm = dec->loader;
-	struct AVCodecContext *ctx = NULL;
-	struct AVFrame *fr;
-	const AVPixFmtDescriptor *desc;
-	uint32_t h, luma_len = 0, chroma_len = 0;
-	int rc;
-
-	memset(resp, 0, sizeof(*resp));
-	resp->codec_id = codec_id;
-
-	rc = decoder_open_codec(dec, codec_id, &ctx);
-	if (rc == -ENOSYS) {
-		resp->status = DAEDALUS_DECODE_ERR_CODEC;
-		return DAEDALUS_DECODE_ERR_CODEC;
-	}
-	if (rc < 0) {
-		resp->status = DAEDALUS_DECODE_ERR_OPEN;
-		return DAEDALUS_DECODE_ERR_OPEN;
-	}
-
 	fm->av_frame_unref(dec->frame);
 	rc = fm->avcodec_receive_frame(ctx, dec->frame);
-	if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF)
-		return -EAGAIN;
+	clock_gettime(CLOCK_MONOTONIC, &t_decode_end);
+	decode_ns = timespec_delta_ns(&t_decode_start, &t_decode_end);
+	if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
+		log_debug("decoder: no frame ready yet (rc=%d, %lu us)",
+			  rc, (unsigned long)(decode_ns / 1000));
+		resp->status = DAEDALUS_DECODE_NO_FRAME;
+		goto out;
+	}
 	if (rc < 0) {
 		log_err("decoder: avcodec_receive_frame failed: %d", rc);
 		resp->status = DAEDALUS_DECODE_ERR_RECV;
-		return DAEDALUS_DECODE_ERR_RECV;
+		goto out;
 	}

-	fr = dec->frame;
-	desc = fm->av_pix_fmt_desc_get(fr->format);
-	h = fnv1a32_init();
+	/*
+	 * Shadow-mode frame-boundary hook.  H.264-only — the per-MB
+	 * callback is only registered for H.264, so on VP9/AV1 frames
+	 * shadow->mbs_this_frame stays zero anyway, but keeping the
+	 * codec gate here makes the log lines easier to read.
+	 * NULL-safe.
+	 */
+	if (req->codec_id == DAEDALUS_CODEC_H264)
+		shadow_decoder_on_frame(dec->shadow, dec->frame);

-	resp->status		= DAEDALUS_DECODE_OK;
-	resp->width		= (uint32_t) fr->width;
-	resp->height		= (uint32_t) fr->height;
-	resp->pix_fmt		= fr->format;
-	resp->output_src_pts	= (uint64_t) fr->pts;
+	{
+		struct AVFrame *fr = dec->frame;
+		const AVPixFmtDescriptor *desc =
+			fm->av_pix_fmt_desc_get(fr->format);
+		uint32_t h = fnv1a32_init();
+		uint32_t luma_len = 0, chroma_len = 0;

-	if (!desc) {
-		log_warn("decoder: no descriptor for pix_fmt %d", fr->format);
-	} else {
-		int p, max_plane = 0;
-		int i;
+		resp->status	= DAEDALUS_DECODE_OK;
+		resp->width	= (uint32_t) fr->width;
+		resp->height	= (uint32_t) fr->height;
+		resp->pix_fmt	= fr->format;

-		for (i = 0; i < desc->nb_components; i++) {
-			if (desc->comp[i].plane > max_plane)
-				max_plane = desc->comp[i].plane;
-		}
+		/*
+		 * Walk every plane reported by the AVPixFmtDescriptor.
+		 * For each component, byte width = ((plane_w *
+		 * step_minus1) >> 0) — but the descriptor only tells
+		 * us which plane each component sits in, not the
+		 * plane's byte stride per pixel.  In practice for the
+		 * formats we care about (YUV420P, YUV422P, YUV444P,
+		 * GBRP, NV12), each plane has exactly one component
+		 * at 1 byte/sample.  Hash each plane at
+		 * (width >> log2_chroma_w) × (height >> log2_chroma_h)
+		 * for chroma planes, full-size for plane 0.
+		 *
+		 * This generalises cleanly to anything 8-bit-per-
+		 * sample-per-plane; 10/12-bit (P010, YUV420P10LE) will
+		 * need depth handling when Phase 8.6 brings HDR
+		 * content into play.
+		 */
+		if (!desc) {
+			log_warn("decoder: no descriptor for pix_fmt %d",
+				 fr->format);
+		} else {
+			int p, max_plane = 0;
+			int i;

-		for (p = 0; p <= max_plane; p++) {
-			int pw, ph;
-			if (!fr->data[p] || !fr->linesize[p])
-				continue;
-			if (p == 0) {
-				pw = fr->width;
-				ph = fr->height;
-				luma_len += (uint32_t) pw * (uint32_t) ph;
-			} else {
-				pw = AV_CEIL_RSHIFT(fr->width,
-						    desc->log2_chroma_w);
-				ph = AV_CEIL_RSHIFT(fr->height,
-						    desc->log2_chroma_h);
-				chroma_len += (uint32_t) pw * (uint32_t) ph;
+			for (i = 0; i < desc->nb_components; i++) {
+				if (desc->comp[i].plane > max_plane)
+					max_plane = desc->comp[i].plane;
 			}
-			h = fnv1a32_plane(h, fr->data[p], pw, ph,
-					  fr->linesize[p]);
+
+			for (p = 0; p <= max_plane; p++) {
+				int pw, ph;
+				if (!fr->data[p] || !fr->linesize[p])
+					continue;
+				if (p == 0) {
+					pw = fr->width;
+					ph = fr->height;
+					luma_len += (uint32_t) pw *
+						    (uint32_t) ph;
+				} else {
+					pw = AV_CEIL_RSHIFT(fr->width,
+							    desc->log2_chroma_w);
+					ph = AV_CEIL_RSHIFT(fr->height,
+							    desc->log2_chroma_h);
+					chroma_len += (uint32_t) pw *
+						      (uint32_t) ph;
+				}
+				h = fnv1a32_plane(h, fr->data[p], pw, ph,
+						  fr->linesize[p]);
+			}
+		}
+
+		resp->luma_len	 = luma_len;
+		resp->chroma_len = chroma_len;
+		resp->fnv1a_yuv	 = h;
+
+		/*
+		 * Pack pixels directly into the mapped CAPTURE dmabuf planes.
+		 * Dispatch on the V4L2 fourcc the kernel negotiated:
+		 *   V4L2_PIX_FMT_NV12M (default, 8-bit, 2 planes)
+		 *   V4L2_PIX_FMT_NV12  (8-bit, single plane)
+		 *   V4L2_PIX_FMT_P010  (10-bit HDR, 1 plane)
+		 */
+		if (planes && planes->nr >= 1) {
+			int prc = 0;
+			switch (req->capture_pix_fmt) {
+			case V4L2_PIX_FMT_NV12M:
+				prc = pack_nv12_to_planes(fr, desc, planes);
+				break;
+			case V4L2_PIX_FMT_NV12:
+				prc = pack_nv12_single_to_plane(fr, desc, planes);
+				break;
+			case V4L2_PIX_FMT_P010:
+				prc = pack_p010_to_plane(fr, desc, planes);
+				break;
+			default:
+				log_warn("decoder: unsupported capture fourcc 0x%08x",
+					 req->capture_pix_fmt);
+				prc = -EINVAL;
+				break;
+			}
+			if (prc < 0)
+				log_warn("decoder: pack failed (pix_fmt=%d cap_fourcc=0x%08x) — kernel will see metadata only",
+					 fr->format, req->capture_pix_fmt);
+		}
+
+		log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u decode_us=%lu",
+			 fr->width, fr->height, fr->format,
+			 desc ? desc->name : "?",
+			 h, luma_len, chroma_len,
+			 (unsigned long)(decode_ns / 1000));
+
+		/*
+		 * Periodic stats summary (every DAEDALUS_STATS_EVERY frames).
+		 * Reset window on codec change.  Gives observable baseline
+		 * for the daedalus-v4l2#11 substitution arc: fps, average
+		 * decode_us, MB/s throughput, bitstream B/MB.  Compare
+		 * against daedalus-fourier README's per-kernel NEON
+		 * baselines (e.g. H.264 IDCT 4x4 = 175 Mblock/s) to gauge
+		 * which substitutions are worth pursuing.
+		 */
+		if (g_stats.codec_id != req->codec_id) {
+			g_stats.codec_id = req->codec_id;
+			g_stats.frames = 0;
+			g_stats.total_decode_ns = 0;
+			g_stats.total_bitstream_bytes = 0;
+			g_stats.total_mbs = 0;
+			clock_gettime(CLOCK_MONOTONIC, &g_stats.window_start);
+		}
+		g_stats.frames++;
+		g_stats.total_decode_ns += decode_ns;
+		g_stats.total_bitstream_bytes += req->bitstream_len;
+		g_stats.total_mbs += (uint64_t)((fr->width  + 15) / 16) *
+				     (uint64_t)((fr->height + 15) / 16);
+
+		if (g_stats.frames % DAEDALUS_STATS_EVERY == 0) {
+			struct timespec t_now;
+			clock_gettime(CLOCK_MONOTONIC, &t_now);
+			uint64_t window_ns =
+				timespec_delta_ns(&g_stats.window_start, &t_now);
+			double window_s = (double)window_ns / 1e9;
+			double fps = window_s > 0 ?
+				(double)g_stats.frames / window_s : 0.0;
+			double avg_decode_us = g_stats.frames > 0 ?
+				(double)g_stats.total_decode_ns /
+				(double)g_stats.frames / 1000.0 : 0.0;
+			double mb_per_s = window_s > 0 ?
+				(double)g_stats.total_mbs / window_s : 0.0;
+			double bs_b_per_mb = g_stats.total_mbs > 0 ?
+				(double)g_stats.total_bitstream_bytes /
+				(double)g_stats.total_mbs : 0.0;
+
+			log_info("decoder stats: codec=%s "
+				 "frames=%llu window=%.2fs fps=%.2f "
+				 "avg_decode_us=%.1f mbs_per_s=%.0f "
+				 "bs_b_per_mb=%.2f",
+				 codec_id_name(g_stats.codec_id),
+				 (unsigned long long)g_stats.frames,
+				 window_s, fps, avg_decode_us,
+				 mb_per_s, bs_b_per_mb);
 		}
 	}

-	resp->luma_len	 = luma_len;
-	resp->chroma_len = chroma_len;
-	resp->fnv1a_yuv	 = h;
-
-	log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u src_pts=%llu",
-		 fr->width, fr->height, fr->format,
-		 desc ? desc->name : "?",
-		 h, luma_len, chroma_len,
-		 (unsigned long long) fr->pts);
+	fm->av_frame_unref(dec->frame);

+out:
+	free(assembled);
+	(void) assembled_len;
 	return 0;
 }
-
-int daedalus_decoder_pack_current(struct daedalus_decoder *dec,
-				  const struct daedalus_capture_planes *planes,
-				  uint32_t capture_pix_fmt)
-{
-	struct ffmpeg_loader *fm = dec->loader;
-	struct AVFrame *fr = dec->frame;
-	const AVPixFmtDescriptor *desc;
-	int prc;
-
-	if (!planes || planes->nr < 1 || !fr || !fr->width || !fr->height)
-		return -EINVAL;
-
-	desc = fm->av_pix_fmt_desc_get(fr->format);
-
-	switch (capture_pix_fmt) {
-	case V4L2_PIX_FMT_NV12M:
-		prc = pack_nv12_to_planes(fr, desc, planes);
-		break;
-	case V4L2_PIX_FMT_NV12:
-		prc = pack_nv12_single_to_plane(fr, desc, planes);
-		break;
-	case V4L2_PIX_FMT_P010:
-		prc = pack_p010_to_plane(fr, desc, planes);
-		break;
-	default:
-		log_warn("decoder: unsupported capture fourcc 0x%08x",
-			 capture_pix_fmt);
-		prc = -EINVAL;
-		break;
-	}
-	if (prc < 0)
-		log_warn("decoder: pack failed (pix_fmt=%d cap_fourcc=0x%08x)",
-			 fr->format, capture_pix_fmt);
-	return prc;
-}
@@ -21,6 +21,7 @@ struct ffmpeg_loader;
 struct AVCodecContext;
 struct AVPacket;
 struct AVFrame;
+struct shadow_decoder;

 /**
 * struct daedalus_decoder - per-daemon decoder state
@@ -31,6 +32,10 @@ struct AVFrame;
 * @ctx_h264:	lazily-opened H.264 AVCodecContext
 * @pkt:	shared AVPacket reused across requests
 * @frame:	shared AVFrame reused across requests
+ * @shadow:	env-gated daedalus-decoder shadow path; NULL when
+ *		DAEDALUS_SHADOW_MODE != "1" or libavcodec lacks the
+ *		per-MB inspection callback.  Production path doesn't
+ *		care; all shadow_decoder_* entry points are NULL-safe.
 */
 struct daedalus_decoder {
 	struct ffmpeg_loader	*loader;
@@ -39,6 +44,7 @@ struct daedalus_decoder {
 	struct AVCodecContext	*ctx_h264;
 	struct AVPacket		*pkt;
 	struct AVFrame		*frame;
+	struct shadow_decoder	*shadow;
 };

 /**
@@ -56,68 +62,33 @@ int  daedalus_decoder_init(struct daedalus_decoder *dec,
 void daedalus_decoder_cleanup(struct daedalus_decoder *dec);

 /**
- * daedalus_decoder_submit - send one REQ_DECODE's bitstream into libavcodec
+ * daedalus_decoder_run_request - decode one REQ_DECODE payload
 * @dec:	initialised decoder
- * @req:	REQ_DECODE prefix (from the wire); src_pts is stamped on
- *		the AVPacket so libavcodec returns frame->pts == src_pts
- *		when it eventually outputs the matching frame in display
- *		order (daedalus-v4l2#6).
+ * @req:	REQ_DECODE prefix (from the wire)
 * @bitstream:	bitstream blob (req->bitstream_len bytes)
 * @h264_meta:	optional H.264 SPS/PPS metadata; non-NULL only when
 *		codec_id == H264 and the kernel set DAEDALUS_REQ_FLAG_
- *		H264_META.  See decoder.c for the AnnexB synthesis.
+ *		H264_META.  Used to synthesise the AnnexB SPS+PPS NALs
+ *		libavcodec needs before any slice (libva-v4l2-request
+ *		passes only the slice in @bitstream per the V4L2
+ *		stateless API contract).  NULL for VP9/AV1 paths.
+ * @resp:	caller-allocated RESP_FRAME output (zeroed by callee)
+ * @planes:	mapped CAPTURE planes (Phase 8.6 dmabuf path).  If
+ *		NULL or planes->nr == 0, the decoder runs but
+ *		writes no pixels — caller still gets dims + digest.
 *
- * Calls avcodec_send_packet on the codec's per-codec AVCodecContext.
- * Returns 0 on success; one of DAEDALUS_DECODE_ERR_* on failure
- * (which the caller should propagate as the RESP_FRAME status for
- * the cookie of this REQ).  Does NOT call avcodec_receive_frame —
- * use daedalus_decoder_drain_one for that.
+ * Populates @resp with the decode outcome and writes decoded
+ * pixels (NV12 layout: Y to plane 0, interleaved CbCr to plane
+ * 1) directly into the mapped dmabuf planes.  Always returns
+ * 0; decode-level failures are reported via @resp->status so
+ * the kernel sees a structured response rather than a dropped
+ * request.
 */
-int  daedalus_decoder_submit(struct daedalus_decoder *dec,
-			     const struct daedalus_req_decode *req,
-			     const uint8_t *bitstream,
-			     const struct daedalus_h264_meta *h264_meta);
-
-/**
- * daedalus_decoder_drain_one - pop the next display-ordered frame, if any
- * @dec:	initialised decoder
- * @codec_id:	which codec context to drain (matches the REQ that just
- *		called submit).  VP9/AV1/H264 use independent contexts.
- * @resp:	caller-allocated RESP_FRAME output (zeroed by callee).
- *		On a successful drain (return 0), resp's status / width /
- *		height / pix_fmt / luma_len / chroma_len / fnv1a_yuv /
- *		output_src_pts are populated; flags is left at 0 (caller
- *		adds HAS_PIXELS / SRC_CONSUMED).  On EAGAIN, resp is
- *		zeroed.
- *
- * Return: 0 on a frame returned, -EAGAIN if libavcodec needs more
- * input (display-order frame held inside DPB), <0 on a hard codec
- * error (resp->status set).
- *
- * After a successful drain, the dec's internal AVFrame holds the
- * decoded picture.  Caller may immediately call
- * daedalus_decoder_pack_current(planes) to write that picture into
- * a CAPTURE buffer's dmabuf-mapped planes.  Subsequent calls to
- * drain_one (without another submit) try to pull additional frames
- * from libavcodec's DPB.
- */
-int  daedalus_decoder_drain_one(struct daedalus_decoder *dec,
-				uint32_t codec_id,
-				struct daedalus_resp_frame *resp);
-
-/**
- * daedalus_decoder_pack_current - pack the last drained frame into planes
- * @dec:	initialised decoder; must have a frame from drain_one
- * @planes:	mapped CAPTURE planes (open via GET_DMABUF using the
- *		cookie that owns the frame's output_src_pts).
- * @capture_pix_fmt: V4L2 fourcc on the CAPTURE side (NV12M, NV12,
- *		P010).
- *
- * Return: 0 on success, <0 on a pack failure (kernel sees only the
- * metadata, not pixels — typical when a format isn't wired yet).
- */
-int  daedalus_decoder_pack_current(struct daedalus_decoder *dec,
-				   const struct daedalus_capture_planes *planes,
-				   uint32_t capture_pix_fmt);
+int  daedalus_decoder_run_request(struct daedalus_decoder *dec,
+				  const struct daedalus_req_decode *req,
+				  const uint8_t *bitstream,
+				  const struct daedalus_h264_meta *h264_meta,
+				  struct daedalus_resp_frame *resp,
+				  const struct daedalus_capture_planes *planes);

 #endif /* DAEDALUS_V4L2_DECODER_H */
@@ -11,14 +11,31 @@
 #include <dlfcn.h>

 /*
- * SONAME versions match Debian Trixie / FFmpeg 7.1.3 today.  If
- * the system FFmpeg changes major, the daemon needs a rebuild;
- * we could add fallback paths (.so.60, .so.59, ...) but for
- * Phase 8.3 the pinned version is fine.
+ * SONAME versions match the Kwiboo ffmpeg-v4l2-request-fourier
+ * fork (FFmpeg 8.1) installed at the /opt/fourier prefix.  The
+ * fourier campaign's ld.so.conf.d/fourier.conf entry resolves
+ * these sonames from /opt/fourier/lib via the ld cache, so
+ * dlopen-by-soname works without LD_LIBRARY_PATH wrappers.
+ *
+ * Switched from Debian-stock soname 61/61/59 (FFmpeg 7.1.3) on
+ * 2026-05-21 to land daedalus-fourier kernel substitution into
+ * the production decode path via patches in the Kwiboo fork
+ * (see daedalus-v4l2#11 substitution arc): we own the fork
+ * source in marfrit-packages, so we can layer NEON-DSP
+ * substitution patches there for libavcodec/aarch64/h264dsp_init
+ * → daedalus_recipe_dispatch_* thunks.  The Debian-stock 7.1.3
+ * is built outside the marfrit-packages source tree, which
+ * would have made layering substitution patches awkward.
+ *
+ * Note: libavutil bumps soname 59 → 60 between FFmpeg 7.1 and
+ * 8.1; libavformat + libavcodec each bump 61 → 62.  The public
+ * API surface the daemon uses (avcodec_send_packet /
+ * receive_frame / AVCodecContext flags / AVFrame fields) is
+ * stable across the bump.
 */
-#define LIBAVFORMAT_SONAME	"libavformat.so.61"
-#define LIBAVCODEC_SONAME	"libavcodec.so.61"
-#define LIBAVUTIL_SONAME	"libavutil.so.59"
+#define LIBAVFORMAT_SONAME	"libavformat.so.62"
+#define LIBAVCODEC_SONAME	"libavcodec.so.62"
+#define LIBAVUTIL_SONAME	"libavutil.so.60"

 /*
 * Resolve a symbol from a dlopen'd handle.  Logs the failure
@@ -92,6 +109,24 @@ int ffmpeg_loader_init(struct ffmpeg_loader *loader)
 	RESOLVE(libavutil, LIBAVUTIL_SONAME, av_version_info);
 	RESOLVE(libavutil, LIBAVUTIL_SONAME, av_pix_fmt_desc_get);

+	/*
+	 * Optional symbols.  Resolved NULL-tolerantly — stock libavcodec
+	 * does not export these; the marfrit-packages
+	 * ffmpeg-v4l2-request-fourier fork does (patches 0016/0017).
+	 * Callers MUST NULL-check before invoking.  Clear any stale
+	 * dlerror() the previous lookups left behind so we read a clean
+	 * status here.
+	 */
+	(void) dlerror();
+	*(void **) &loader->ff_h264_set_mb_inspect_cb =
+		dlsym(loader->libavcodec, "ff_h264_set_mb_inspect_cb");
+	if (!loader->ff_h264_set_mb_inspect_cb) {
+		log_info("libavcodec lacks ff_h264_set_mb_inspect_cb "
+			 "(stock build, no daedalus-fourier 0016 patch) "
+			 "— shadow-mode unavailable");
+		(void) dlerror();	/* discard the not-found message */
+	}
+
 	{
 		unsigned int v = loader->avformat_version();
 		log_info("FFmpeg loaded: %s (libavformat %u.%u.%u)",
@@ -35,6 +35,14 @@
 #include <libavutil/avutil.h>
 #include <libavutil/pixdesc.h>

+/*
+ * Forward declaration must precede ff_h264_set_mb_inspect_cb's
+ * function-pointer signature below — otherwise the compiler treats
+ * `struct H264Context` as a parameter-scope declaration and the type
+ * is incompatible with later uses in shadow_decoder.c.
+ */
+struct H264Context;	/* opaque outside libavcodec */
+
 /**
 * struct ffmpeg_loader - resolved FFmpeg API entry points
 * @libavformat:	dlopen handle (close in cleanup)
@@ -88,6 +96,27 @@ struct ffmpeg_loader {
 	const char *(*av_get_media_type_string)(enum AVMediaType);
 	const char *(*av_version_info)(void);
 	const AVPixFmtDescriptor *(*av_pix_fmt_desc_get)(enum AVPixelFormat);
+
+	/*
+	 * Optional libavcodec symbols.  NULL when the loaded
+	 * libavcodec.so doesn't carry the corresponding marfrit-packages
+	 * patch.  Callers must NULL-check before invoking.
+	 *
+	 * ff_h264_set_mb_inspect_cb — marfrit-packages patch 0016.
+	 * Registers a per-MB callback that fires at the end of
+	 * ff_h264_hl_decode_mb.  Used by daedalus-v4l2's shadow-mode
+	 * path to drive daedalus-decoder's frame-major dispatch
+	 * alongside libavcodec's reference decode.  H264Context stays
+	 * opaque to the daemon — extraction of its private fields needs
+	 * the patched FFmpeg source-tree headers (see the CLI in
+	 * daedalus-decoder/tools/daedalus_decode_h264.c) and is
+	 * deferred to PR-Q3a.2.
+	 */
+	void  (*ff_h264_set_mb_inspect_cb)(struct AVCodecContext *avctx,
+					    void (*cb)(void *opaque,
+						       const struct H264Context *h,
+						       int mb_x, int mb_y),
+					    void *opaque);
 };

 /**
@@ -22,6 +22,8 @@

 #include <libavutil/log.h>

+#include <daedalus.h>
+
 static volatile sig_atomic_t g_terminate = 0;

 static void on_signal(int sig)
@@ -120,6 +122,26 @@ int main(int argc, char **argv)
 	/* Mute FFmpeg's own chattiness unless the user asked. */
 	fm.av_log_set_level(verbose ? AV_LOG_INFO : AV_LOG_WARNING);

+	/*
+	 * Initialise daedalus-fourier early so we can log substrate
+	 * availability up front.  daedalus_ctx_create_no_qpu() skips
+	 * the V3D Vulkan probe — we're not dispatching any kernels
+	 * yet, this is just the linkage sanity check + a marker in the
+	 * journal that the binary is wired against the right
+	 * daedalus-fourier version.  Future work (per daedalus-v4l2#11)
+	 * promotes to daedalus_ctx_create() once shader-path resolution
+	 * is wired through the public API.
+	 */
+	daedalus_ctx *df_ctx = daedalus_ctx_create_no_qpu();
+	if (df_ctx) {
+		log_info("daedalus-fourier: linked, ctx alive (no_qpu mode; "
+			 "has_qpu=%d)",
+			 daedalus_ctx_has_qpu(df_ctx));
+	} else {
+		log_warn("daedalus-fourier: ctx_create_no_qpu returned NULL "
+			 "(out of memory?) — continuing without backend kernels");
+	}
+
 	int rc;
 	const char *cmd = argv[i++];
 	if (strcmp(cmd, "parse") == 0) {
@@ -132,6 +154,8 @@ int main(int argc, char **argv)
 		rc = 2;
 	}

+	if (df_ctx)
+		daedalus_ctx_destroy(df_ctx);
 	ffmpeg_loader_cleanup(&fm);
 	log_cleanup();
 	return rc;
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/*
+ * shadow_decoder.c — env-gated parallel daedalus-decoder wiring.
+ *
+ * PR-Q3a.1 scope: prove the toolchain.
+ *
+ *   1. DAEDALUS_SHADOW_MODE=1 + libavcodec carries marfrit-packages
+ *      0016 (ff_h264_set_mb_inspect_cb) → shadow path active.
+ *   2. Per-MB callback fires on every macroblock libavcodec emits.
+ *      We only count the firings here.
+ *   3. Frame boundary creates a daedalus_decoder context lazily
+ *      (sized from the first AVFrame); destroy + recreate on
+ *      resolution change.
+ *   4. Per-frame log line surfaces MB count + has_qpu state.
+ *
+ * No daedalus_decoder_append_mb / flush_frame calls yet — that
+ * needs H264Context introspection which depends on the patched
+ * FFmpeg source-tree headers (DAEDALUS_FFMPEG_SRC) and lands in
+ * PR-Q3a.2.  This module's job here is to confirm the link
+ * survives, the callback resolves, the context creates, and
+ * tearing the path back down doesn't perturb the production
+ * AVFrame → V4L2 pipeline.
+ */
+#include "shadow_decoder.h"
+
+#include "ffmpeg_loader.h"
+#include "log.h"
+
+#include <libavcodec/avcodec.h>
+#include <libavutil/frame.h>
+
+#include <daedalus_decoder.h>
+
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Shadow-path state.  Allocated (zeroed) by shadow_decoder_create()
+ * and released by shadow_decoder_destroy().
+ */
+struct shadow_decoder {
+	struct ffmpeg_loader *loader;	/* supplies ff_h264_set_mb_inspect_cb */
+	daedalus_decoder     *dec;	/* lazily created on first frame */
+	int                   ctx_w;	/* coded-frame width at last create */
+	int                   ctx_h;	/* coded-frame height at last create */
+	uint64_t              mbs_this_frame;	/* bumped by the per-MB callback, zeroed per frame */
+	uint64_t              total_frames;	/* frames seen by shadow_decoder_on_frame */
+	uint64_t              total_mbs;	/* running sum of per-frame MB counts */
+};
+
+/*
+ * Per-macroblock inspection callback registered through
+ * ff_h264_set_mb_inspect_cb.  @opaque is the shadow_decoder handed over
+ * at install time; the H264Context and the MB coordinates are ignored —
+ * this stage only tallies how often the callback fires.
+ */
+static void shadow_mb_inspect(void *opaque,
+			      const struct H264Context *h __attribute__((unused)),
+			      int mb_x __attribute__((unused)),
+			      int mb_y __attribute__((unused)))
+{
+	struct shadow_decoder *shadow = opaque;
+
+	shadow->mbs_this_frame += 1;
+}
+
+/*
+ * Allocate the shadow state, but only when DAEDALUS_SHADOW_MODE is
+ * exactly "1" and the loader resolved ff_h264_set_mb_inspect_cb.
+ * Returns NULL in every other case (env unset/other value, missing
+ * symbol, allocation failure).
+ */
+struct shadow_decoder *shadow_decoder_create(struct ffmpeg_loader *loader)
+{
+	struct shadow_decoder *shadow;
+	const char *mode = getenv("DAEDALUS_SHADOW_MODE");
+	int requested = mode && strcmp(mode, "1") == 0;
+
+	if (!requested)
+		return NULL;
+
+	if (loader == NULL || loader->ff_h264_set_mb_inspect_cb == NULL) {
+		log_warn("shadow_decoder: DAEDALUS_SHADOW_MODE=1 set but "
+			 "libavcodec lacks ff_h264_set_mb_inspect_cb — disabled");
+		return NULL;
+	}
+
+	shadow = calloc(1, sizeof(struct shadow_decoder));
+	if (shadow == NULL) {
+		log_err("shadow_decoder: out of memory");
+		return NULL;
+	}
+
+	shadow->loader = loader;
+	log_info("shadow_decoder: enabled (DAEDALUS_SHADOW_MODE=1, "
+		 "daedalus-decoder version %s)",
+		 daedalus_decoder_version());
+	return shadow;
+}
+
+/*
+ * Release the shadow state: destroy the daedalus_decoder context if one
+ * was created, log the lifetime frame/MB totals, then free the handle.
+ * A NULL @sh is a no-op.
+ */
+void shadow_decoder_destroy(struct shadow_decoder *sh)
+{
+	if (sh == NULL)
+		return;
+
+	if (sh->dec != NULL)
+		daedalus_decoder_destroy(sh->dec);
+
+	unsigned long long frames = (unsigned long long) sh->total_frames;
+	unsigned long long mbs = (unsigned long long) sh->total_mbs;
+
+	log_info("shadow_decoder: shutdown — observed %llu frames / %llu MBs",
+		 frames, mbs);
+	free(sh);
+}
+
+/*
+ * Register shadow_mb_inspect on @avctx with @sh as its opaque pointer.
+ * Does nothing when either argument is NULL.
+ */
+void shadow_decoder_install_cb(struct shadow_decoder *sh,
+			       struct AVCodecContext *avctx)
+{
+	if (sh == NULL || avctx == NULL)
+		return;
+
+	/*
+	 * A non-NULL handle implies shadow_decoder_create() already saw a
+	 * resolved ff_h264_set_mb_inspect_cb, so it is not re-checked here.
+	 */
+	struct ffmpeg_loader *ldr = sh->loader;
+
+	ldr->ff_h264_set_mb_inspect_cb(avctx, shadow_mb_inspect, sh);
+	log_info("shadow_decoder: per-MB callback installed on H.264 ctx");
+}
+
+/*
+ * Ensure the daedalus_decoder context matches the frame's dimensions.
+ * Rounds up to the H.264 macroblock grid (16-pixel multiples) — the
+ * coded picture is always 16-aligned even when the displayed crop
+ * isn't.
+ *
+ * @sh:	shadow state (non-NULL)
+ * @w:	frame width in pixels
+ * @h:	frame height in pixels
+ *
+ * Returns 0 when a context for the rounded-up geometry is available,
+ * -1 otherwise (ctx left NULL; caller continues without shadow
+ * dispatch).  A failed create is remembered for its geometry: it is
+ * retried only when the dimensions change, so the "skipped this
+ * stream" warning is emitted once per geometry rather than once per
+ * frame.
+ */
+static int shadow_ensure_ctx(struct shadow_decoder *sh, int w, int h)
+{
+	int coded_w = (w + 15) & ~15;
+	int coded_h = (h + 15) & ~15;
+
+	/*
+	 * Same geometry as the previous attempt: reuse its outcome.
+	 * ctx_w / ctx_h are recorded on failure as well, so sh->dec tells
+	 * us whether that attempt succeeded.
+	 */
+	if (sh->ctx_w == coded_w && sh->ctx_h == coded_h)
+		return sh->dec ? 0 : -1;
+
+	if (sh->dec) {
+		daedalus_decoder_destroy(sh->dec);
+		sh->dec = NULL;
+	}
+
+	/* Record the geometry before creating so a failure is sticky. */
+	sh->ctx_w = coded_w;
+	sh->ctx_h = coded_h;
+
+	sh->dec = daedalus_decoder_create(coded_w, coded_h);
+	if (!sh->dec) {
+		log_warn("shadow_decoder: daedalus_decoder_create(%dx%d) "
+			 "failed — shadow dispatch skipped this stream",
+			 coded_w, coded_h);
+		return -1;
+	}
+	log_info("shadow_decoder: ctx ready (%dx%d coded, has_qpu=%d)",
+		 coded_w, coded_h, daedalus_decoder_has_qpu(sh->dec));
+	return 0;
+}
+
+/*
+ * Frame-boundary hook.  Makes sure the shadow context matches the
+ * frame's size (failure is ignored here), folds the per-frame MB tally
+ * into the lifetime totals, logs observed vs. expected MB counts, and
+ * clears the per-frame tally.  No-op when @sh or @fr is NULL.
+ */
+void shadow_decoder_on_frame(struct shadow_decoder *sh,
+			     const struct AVFrame *fr)
+{
+	uint64_t mb_cols, mb_rows, observed;
+
+	if (sh == NULL || fr == NULL)
+		return;
+
+	(void) shadow_ensure_ctx(sh, fr->width, fr->height);
+
+	observed = sh->mbs_this_frame;
+	sh->total_frames += 1;
+	sh->total_mbs += observed;
+
+	/* Expected count: one MB per 16x16 cell, partial cells included. */
+	mb_cols = (uint64_t) ((fr->width + 15) >> 4);
+	mb_rows = (uint64_t) ((fr->height + 15) >> 4);
+
+	log_info("shadow_decoder: frame #%llu %dx%d — %llu MBs observed "
+		 "(expected %llu)",
+		 (unsigned long long) sh->total_frames,
+		 fr->width, fr->height,
+		 (unsigned long long) observed,
+		 (unsigned long long) (mb_cols * mb_rows));
+
+	sh->mbs_this_frame = 0;
+}
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/*
+ * shadow_decoder.h — env-gated parallel daedalus-decoder path.
+ *
+ * When the daemon is launched with DAEDALUS_SHADOW_MODE=1, shadow_decoder
+ * runs alongside libavcodec's normal H.264 decode: a per-MB inspection
+ * callback fires for every macroblock libavcodec emits, and a frame-
+ * boundary hook lets the shadow path observe and (in future PRs)
+ * dispatch the same frame's worth of work through daedalus-decoder's
+ * frame-major UMA pipeline.  Production output (AVFrame → V4L2 NV12)
+ * is unchanged regardless of this module's state.
+ *
+ * PR-Q3a.1 scope: wiring only.  The callback counts MBs and the per-
+ * frame hook logs the count.  No daedalus-decoder dispatch yet; that
+ * lands in PR-Q3a.2 along with the H264Context-introspection path
+ * gated on the patched FFmpeg source-tree headers.
+ *
+ * Disabled state (env unset or libavcodec lacks ff_h264_set_mb_inspect_cb)
+ * is a hard NULL — shadow_decoder_create() returns NULL, all other
+ * entry points are safe with NULL and become no-ops.
+ *
+ * The daedalus-decoder context, when active, is created lazily on the
+ * first observed frame (dimensions come from libavcodec's AVFrame, not
+ * from the SPS — keeps init independent of stream-header bring-up
+ * order) and re-created on resolution change.
+ */
+#ifndef DAEDALUS_V4L2_SHADOW_DECODER_H
+#define DAEDALUS_V4L2_SHADOW_DECODER_H
+
+#include <stdint.h>
+
+struct ffmpeg_loader;
+struct AVCodecContext;
+struct AVFrame;
+struct shadow_decoder;
+
+/**
+ * shadow_decoder_create - allocate shadow state if env-enabled
+ * @loader:	borrowed FFmpeg loader (must outlive the returned ctx)
+ *
+ * Probes DAEDALUS_SHADOW_MODE env var and the loader's optional
+ * ff_h264_set_mb_inspect_cb pointer.  Returns NULL when shadow mode
+ * is disabled or unsupported; that's the normal production state.
+ * Returns a usable handle otherwise.  Caller owns the handle and must
+ * call shadow_decoder_destroy.
+ */
+struct shadow_decoder *shadow_decoder_create(struct ffmpeg_loader *loader);
+
+/**
+ * shadow_decoder_destroy - tear down.  Safe with NULL.
+ */
+void shadow_decoder_destroy(struct shadow_decoder *sh);
+
+/**
+ * shadow_decoder_install_cb - install the per-MB inspection callback
+ *					   on a freshly-opened H.264 AVCodecContext
+ *
+ * Safe with NULL @sh (NOP).  Should be called once per H.264 codec
+ * open; repeated calls just reinstall and are harmless.
+ */
+void shadow_decoder_install_cb(struct shadow_decoder *sh,
+			       struct AVCodecContext *avctx);
+
+/**
+ * shadow_decoder_on_frame - per-frame boundary hook
+ *
+ * Called after avcodec_receive_frame returns a frame.  Logs the per-
+ * frame MB counter, resets it, and (in future PRs) drives
+ * daedalus_decoder_flush_frame + the AVFrame-vs-shadow diff.  Safe
+ * with NULL @sh.
+ */
+void shadow_decoder_on_frame(struct shadow_decoder *sh,
+			     const struct AVFrame *fr);
+
+#endif /* DAEDALUS_V4L2_SHADOW_DECODER_H */
@@ -0,0 +1,479 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/*
+ * test_av1_obu_synth — standalone unit test for av1_synth_sequence_header_obu.
+ *
+ * Builds as an opt-in executable target (test_av1_obu_synth) gated on
+ * -DDAEDALUS_BUILD_TESTS=ON.  Runs by default in the CI build matrix
+ * to gate the OBU encoder against regressions.
+ *
+ * Each test case sets up a struct v4l2_ctrl_av1_sequence with known
+ * field values, calls the synthesiser, then walks the output bit by
+ * bit against a hand-computed expected encoding.  The bit-walker uses
+ * the same reader semantics as bitstream_writer: MSB-first within each
+ * byte, with the OBU header byte / leb128 size at byte-aligned
+ * positions and the RBSP payload starting at the byte right after.
+ */
+
+#include "av1_obu_synth.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <linux/v4l2-controls.h>
+
+/*
+ * MSB-first bit reader over a byte stream.
+ *
+ * The read cursor is the pair (pos_bytes, pos_bit).  overflow is set
+ * once a read is attempted past the end of the buffer.
+ */
+struct br {
+	const uint8_t *buf;	/* stream being read */
+	size_t bytes;		/* length of buf, in bytes */
+	size_t pos_bytes;	/* index of the byte the cursor is in */
+	int pos_bit;		/* bit offset inside that byte; 0 is the MSB */
+	int overflow;		/* set to 1 by br_get on a read past the end */
+};
+
+/*
+ * Attach the reader @b to @buf (@bytes long) and rewind it: the cursor
+ * goes to the first bit of the first byte and the overflow flag is
+ * cleared.
+ */
+static void br_init(struct br *b, const uint8_t *buf, size_t bytes)
+{
+	/* Members not named here (cursor, overflow) are zero-initialised. */
+	*b = (struct br){
+		.buf = buf,
+		.bytes = bytes,
+	};
+}
+
+/*
+ * Read the next @n bits from @b, most-significant bit first, and return
+ * them right-aligned.  If the stream runs out part-way through, the
+ * overflow flag is set and 0 is returned; bits consumed before the end
+ * was hit stay consumed.
+ */
+static uint32_t br_get(struct br *b, int n)
+{
+	uint32_t acc = 0;
+	int left;
+
+	for (left = n; left > 0; left--) {
+		unsigned int shift;
+
+		if (b->pos_bytes >= b->bytes) {
+			b->overflow = 1;
+			return 0;
+		}
+
+		/* pos_bit 0 selects bit 7 of the current byte. */
+		shift = 7u - (unsigned int)b->pos_bit;
+		acc = (acc << 1) | ((b->buf[b->pos_bytes] >> shift) & 1u);
+
+		if (++b->pos_bit == 8) {
+			b->pos_bit = 0;
+			b->pos_bytes++;
+		}
+	}
+	return acc;
+}
+
+/*
+ * Advance the cursor to the start of the next byte, discarding any
+ * unread bits of the current one.  A cursor that already sits on a byte
+ * boundary is left where it is.
+ */
+static void br_byte_align(struct br *b)
+{
+	if (b->pos_bit == 0)
+		return;
+
+	b->pos_bytes++;
+	b->pos_bit = 0;
+}
+
+/*
+ * CHECK(cond, fmt, ...) - fail the current test case unless @cond holds.
+ *
+ * On failure prints "FAIL <function>:<line>: " followed by the
+ * printf-style message to stderr, then returns 1 from the *enclosing*
+ * function — so it may only be used inside functions returning int.
+ * At least a format string must be supplied after @cond.
+ */
+#define CHECK(cond, ...) do {				\
+	if (!(cond)) {					\
+		fprintf(stderr, "FAIL %s:%d: ",		\
+			__func__, __LINE__);		\
+		fprintf(stderr, __VA_ARGS__);		\
+		fputc('\n', stderr);			\
+		return 1;				\
+	}						\
+} while (0)
+
+/*
+ * CHECK_EQ(actual, expected, name) - fail the current test case unless
+ * the two values compare equal.
+ *
+ * Each operand is evaluated exactly once.  On mismatch prints
+ * "FAIL <function>:<line> <name>: got <actual>, expected <expected>"
+ * to stderr and returns 1 from the *enclosing* function.
+ *
+ * NOTE(review): both operands are narrowed to uint32_t before the
+ * comparison, so size_t arguments wider than 32 bits would be compared
+ * (and printed) truncated.
+ */
+#define CHECK_EQ(actual, expected, name) do {		\
+	uint32_t _a = (uint32_t)(actual);		\
+	uint32_t _e = (uint32_t)(expected);		\
+	if (_a != _e) {					\
+		fprintf(stderr, "FAIL %s:%d %s: "	\
+			"got %u, expected %u\n",	\
+			__func__, __LINE__, (name),	\
+			_a, _e);			\
+		return 1;				\
+	}						\
+} while (0)
+
+/*
+ * Case 1: 1080p, profile 0 (4:2:0), 8-bit, color_range studio,
+ * order_hint enabled with 7 bits, CDEF + restoration on, no film grain.
+ * Covers the most common decode path libva-v4l2-request drives on
+ * the daedalus daemon.
+ *
+ * Returns 0 on success, 1 on the first mismatch (via CHECK/CHECK_EQ).
+ */
+static int test_profile0_1080p(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	uint8_t out[64];
+	size_t n;
+	size_t payload_len;
+	struct br br;
+
+	memset(&seq, 0, sizeof(seq));
+	seq.seq_profile			= 0;
+	seq.order_hint_bits		= 7;
+	seq.bit_depth			= 8;
+	seq.max_frame_width_minus_1	= 1919;	/* 1920 */
+	seq.max_frame_height_minus_1	= 1079;	/* 1080 */
+	seq.flags =
+		V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK |
+		V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT |
+		V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF |
+		V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION;
+	/* COLOR_RANGE flag unset = studio swing (limited range, =0 in spec) */
+
+	n = av1_synth_sequence_header_obu(&seq, out, sizeof(out));
+	CHECK(n > 0 && n <= sizeof(out), "synthesiser returned %zu bytes", n);
+
+	/* OBU header byte: 0x0A (obu_type=1, has_size_field=1). */
+	CHECK_EQ(out[0], 0x0A, "OBU header byte");
+
+	/* leb128 size — payload fits in 1 byte for sub-128-byte payloads. */
+	CHECK(n >= 2, "OBU has size field byte");
+	CHECK((out[1] & 0x80) == 0, "leb128 single-byte form (no continuation)");
+	payload_len = out[1] & 0x7fu;
+	CHECK_EQ(n, 2 + payload_len, "total length matches header+leb+payload");
+
+	/* Walk payload bits; CHECK_EQ evaluates each br_get exactly once. */
+	br_init(&br, out + 2, n - 2);
+
+	CHECK_EQ(br_get(&br, 3), 0, "seq_profile");
+	CHECK_EQ(br_get(&br, 1), 0, "still_picture");
+	CHECK_EQ(br_get(&br, 1), 0, "reduced_still_picture_header");
+	CHECK_EQ(br_get(&br, 1), 0, "timing_info_present_flag");
+	CHECK_EQ(br_get(&br, 1), 0, "initial_display_delay_present_flag");
+	CHECK_EQ(br_get(&br, 5), 0, "operating_points_cnt_minus_1");
+	CHECK_EQ(br_get(&br, 12), 0, "operating_point_idc[0]");
+	CHECK_EQ(br_get(&br, 5), 13, "seq_level_idx[0]");
+	CHECK_EQ(br_get(&br, 1), 0, "seq_tier[0]");
+
+	/* min_bits_for(1919) = 11; encoded value = 11 - 1 = 10 */
+	CHECK_EQ(br_get(&br, 4), 10, "frame_width_bits_minus_1");
+	/* min_bits_for(1079) = 11; same value */
+	CHECK_EQ(br_get(&br, 4), 10, "frame_height_bits_minus_1");
+	CHECK_EQ(br_get(&br, 11), 1919, "max_frame_width_minus_1");
+	CHECK_EQ(br_get(&br, 11), 1079, "max_frame_height_minus_1");
+
+	CHECK_EQ(br_get(&br, 1), 0, "frame_id_numbers_present_flag");
+	CHECK_EQ(br_get(&br, 1), 1, "use_128x128_superblock");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_filter_intra");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_intra_edge_filter");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_interintra_compound");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_masked_compound");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_warped_motion");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_dual_filter");
+	CHECK_EQ(br_get(&br, 1), 1, "enable_order_hint");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_jnt_comp");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_ref_frame_mvs");
+	/* Label uses the AV1 syntax-element name so failures map to the spec. */
+	CHECK_EQ(br_get(&br, 1), 1, "seq_choose_screen_content_tools");
+	CHECK_EQ(br_get(&br, 1), 1, "seq_choose_integer_mv");
+	/* order_hint_bits=7 → order_hint_bits_minus_1 = 6 */
+	CHECK_EQ(br_get(&br, 3), 6, "order_hint_bits_minus_1");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_superres");
+	CHECK_EQ(br_get(&br, 1), 1, "enable_cdef");
+	CHECK_EQ(br_get(&br, 1), 1, "enable_restoration");
+
+	/* color_config: high_bitdepth=0 (8-bit), monochrome=0,
+	 * color_description_present=0, color_range=0, subsampling forced (no bits),
+	 * chroma_sample_position=0 (2 bits when subsampling_x && subsampling_y),
+	 * separate_uv_delta_q=0. */
+	CHECK_EQ(br_get(&br, 1), 0, "high_bitdepth");
+	CHECK_EQ(br_get(&br, 1), 0, "monochrome");
+	CHECK_EQ(br_get(&br, 1), 0, "color_description_present_flag");
+	CHECK_EQ(br_get(&br, 1), 0, "color_range");
+	CHECK_EQ(br_get(&br, 2), 0, "chroma_sample_position");
+	CHECK_EQ(br_get(&br, 1), 0, "separate_uv_delta_q");
+	CHECK_EQ(br_get(&br, 1), 0, "film_grain_params_present");
+
+	/* trailing_bits — single '1' then zero-fill */
+	CHECK_EQ(br_get(&br, 1), 1, "trailing_bits stop_one");
+	br_byte_align(&br);
+
+	CHECK(!br.overflow, "no bit-reader overflow");
+	CHECK_EQ(br.pos_bytes, n - 2, "consumed exactly the payload");
+
+	printf("  profile0 1080p 8-bit: OK (%zu bytes)\n", n);
+	return 0;
+}
+
+/*
+ * Case 2: profile 0, 10-bit, 4:2:0, monochrome.
+ * Exercises the high_bitdepth + monochrome short-form color_config path.
+ *
+ * Returns 0 on success, 1 on the first mismatch (via CHECK/CHECK_EQ).
+ */
+static int test_profile0_monochrome_10bit(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	uint8_t out[64];
+	size_t n;
+	struct br br;
+
+	memset(&seq, 0, sizeof(seq));
+	seq.seq_profile			= 0;
+	seq.order_hint_bits		= 0;
+	seq.bit_depth			= 10;
+	seq.max_frame_width_minus_1	= 1279;
+	seq.max_frame_height_minus_1	= 719;
+	seq.flags = V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME;
+
+	n = av1_synth_sequence_header_obu(&seq, out, sizeof(out));
+	/*
+	 * Require header + size byte and stay inside out[]: with n == 1 the
+	 * "n - 2" below would wrap around and let the bit reader run far
+	 * past the buffer.
+	 */
+	CHECK(n >= 2 && n <= sizeof(out), "synthesiser returned %zu bytes", n);
+	CHECK_EQ(out[0], 0x0A, "OBU header byte");
+	CHECK((out[1] & 0x80) == 0, "leb128 single-byte form (no continuation)");
+	CHECK_EQ(n, 2 + (size_t)(out[1] & 0x7fu),
+		 "total length matches header+leb+payload");
+
+	br_init(&br, out + 2, n - 2);
+	CHECK_EQ(br_get(&br, 3), 0, "seq_profile");
+	CHECK_EQ(br_get(&br, 1), 0, "still_picture");
+	CHECK_EQ(br_get(&br, 1), 0, "reduced_still_picture_header");
+	CHECK_EQ(br_get(&br, 1), 0, "timing_info_present_flag");
+	CHECK_EQ(br_get(&br, 1), 0, "initial_display_delay_present_flag");
+	CHECK_EQ(br_get(&br, 5), 0, "operating_points_cnt_minus_1");
+	CHECK_EQ(br_get(&br, 12), 0, "operating_point_idc[0]");
+	CHECK_EQ(br_get(&br, 5), 13, "seq_level_idx[0]");
+	CHECK_EQ(br_get(&br, 1), 0, "seq_tier[0]");
+	/* 1279 fits in 11 bits → width_bits_minus_1 = 10 */
+	CHECK_EQ(br_get(&br, 4), 10, "frame_width_bits_minus_1");
+	/* 719 fits in 10 bits → height_bits_minus_1 = 9 */
+	CHECK_EQ(br_get(&br, 4), 9, "frame_height_bits_minus_1");
+	CHECK_EQ(br_get(&br, 11), 1279, "max_frame_width_minus_1");
+	CHECK_EQ(br_get(&br, 10), 719, "max_frame_height_minus_1");
+	CHECK_EQ(br_get(&br, 1), 0, "frame_id_numbers_present_flag");
+	CHECK_EQ(br_get(&br, 1), 0, "use_128x128_superblock");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_filter_intra");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_intra_edge_filter");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_interintra_compound");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_masked_compound");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_warped_motion");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_dual_filter");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_order_hint");
+	/* enable_order_hint=0 → no jnt_comp / ref_frame_mvs / order_hint_bits */
+	/* Label uses the AV1 syntax-element name so failures map to the spec. */
+	CHECK_EQ(br_get(&br, 1), 1, "seq_choose_screen_content_tools");
+	CHECK_EQ(br_get(&br, 1), 1, "seq_choose_integer_mv");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_superres");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_cdef");
+	CHECK_EQ(br_get(&br, 1), 0, "enable_restoration");
+
+	/* color_config: high_bitdepth=1 (10-bit), seq_profile==0 so no twelve_bit,
+	 * monochrome=1, color_description_present=0, color_range=0.
+	 * Monochrome short-form: no subsampling/chroma_sample_position/uv_delta_q bits. */
+	CHECK_EQ(br_get(&br, 1), 1, "high_bitdepth");
+	CHECK_EQ(br_get(&br, 1), 1, "monochrome");
+	CHECK_EQ(br_get(&br, 1), 0, "color_description_present_flag");
+	CHECK_EQ(br_get(&br, 1), 0, "color_range");
+
+	CHECK_EQ(br_get(&br, 1), 0, "film_grain_params_present");
+
+	/* trailing_bits — single '1' then zero-fill to the byte boundary */
+	CHECK_EQ(br_get(&br, 1), 1, "trailing_bits stop_one");
+	br_byte_align(&br);
+
+	CHECK(!br.overflow, "no overflow");
+	CHECK_EQ(br.pos_bytes, n - 2, "consumed exactly the payload");
+
+	printf("  profile0 monochrome 10-bit: OK (%zu bytes)\n", n);
+	return 0;
+}
+
+/*
+ * Case 3: a seq_profile / bit_depth pair the synthesiser must refuse.
+ * Profile 1 combined with 12-bit depth is expected to yield 0 bytes.
+ */
+static int test_reject_invalid_profile_bitdepth(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	uint8_t out[64];
+
+	memset(&seq, 0, sizeof(seq));
+	seq.max_frame_width_minus_1	= 1919;
+	seq.max_frame_height_minus_1	= 1079;
+	seq.seq_profile			= 1;	/* 4:4:4 only */
+	seq.bit_depth			= 12;	/* not allowed with profile 1 */
+
+	CHECK_EQ(av1_synth_sequence_header_obu(&seq, out, sizeof(out)), 0,
+		 "expected 0 (rejected) for profile1+12bit");
+
+	printf("  reject profile1+12bit: OK\n");
+	return 0;
+}
+
+/*
+ * Case 4: output capacity too small for the OBU.  The synthesiser is
+ * expected to report 0 bytes for a 4-byte buffer.
+ */
+static int test_overflow(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	uint8_t tiny[4];	/* deliberately too small */
+
+	memset(&seq, 0, sizeof(seq));
+	seq.max_frame_width_minus_1	= 1919;
+	seq.max_frame_height_minus_1	= 1079;
+	seq.bit_depth			= 8;
+	seq.seq_profile			= 0;
+
+	CHECK_EQ(av1_synth_sequence_header_obu(&seq, tiny, sizeof(tiny)), 0,
+		 "expected 0 (overflow) for tiny out buffer");
+
+	printf("  out_cap overflow: OK\n");
+	return 0;
+}
+
+/*
+ * Case 5: the Temporal Delimiter OBU is exactly the two bytes
+ * 0x12 0x00.  The buffer is pre-filled with 0xff so an unwritten byte
+ * cannot pass for the expected 0x00.
+ */
+static int test_temporal_delimiter(void)
+{
+	uint8_t out[4] = { 0xff, 0xff, 0xff, 0xff };
+
+	CHECK_EQ(av1_synth_temporal_delimiter_obu(out, sizeof(out)), 2,
+		 "TD length");
+	CHECK_EQ(out[0], 0x12, "TD obu header byte (obu_type=2, has_size=1)");
+	CHECK_EQ(out[1], 0x00, "TD leb128 size = 0");
+
+	printf("  temporal delimiter: OK\n");
+	return 0;
+}
+
+/* Test fixtures for Frame Header cases. */
+
+/*
+ * Fill @seq with the 1080p / profile 0 / 8-bit sequence used by the
+ * frame-header cases: 128x128 superblocks, 7-bit order hints, CDEF and
+ * loop restoration enabled; everything else zero.
+ */
+static void mk_seq_1080p_p0(struct v4l2_ctrl_av1_sequence *seq)
+{
+	memset(seq, 0, sizeof(*seq));
+
+	seq->flags |= V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK;
+	seq->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT;
+	seq->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF;
+	seq->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION;
+
+	seq->max_frame_width_minus_1	= 1919;
+	seq->max_frame_height_minus_1	= 1079;
+	seq->bit_depth			= 8;
+	seq->order_hint_bits		= 7;
+	seq->seq_profile		= 0;
+}
+
+/*
+ * Fill @fr with a shown 1080p KEY frame: no primary reference,
+ * base_q_idx 60, every loop-filter level 16, no CDEF bits, loop
+ * restoration off on all three planes, TX_MODE_SELECT.
+ */
+static void mk_frame_key_1080p(struct v4l2_ctrl_av1_frame *fr)
+{
+	unsigned int idx;
+
+	memset(fr, 0, sizeof(*fr));
+
+	fr->flags			= V4L2_AV1_FRAME_FLAG_SHOW_FRAME;
+	fr->frame_type			= V4L2_AV1_KEY_FRAME;
+	fr->primary_ref_frame		= 7;	/* PRIMARY_REF_NONE */
+
+	/* Coded and render sizes are both 1920x1080. */
+	fr->frame_width_minus_1		= 1919;
+	fr->frame_height_minus_1	= 1079;
+	fr->render_width_minus_1	= 1919;
+	fr->render_height_minus_1	= 1079;
+
+	fr->quantization.base_q_idx	= 60;
+	for (idx = 0; idx < 4; idx++)
+		fr->loop_filter.level[idx] = 16;
+	fr->cdef.bits			= 0;
+	for (idx = 0; idx < 3; idx++)
+		fr->loop_restoration.frame_restoration_type[idx] =
+			V4L2_AV1_FRAME_RESTORE_NONE;
+
+	fr->interpolation_filter	= 0;
+	fr->tx_mode			= V4L2_AV1_TX_MODE_SELECT;
+}
+
+/*
+ * Case 6: KEY frame at 1080p — happy path, structural smoke.
+ * Verifies the OBU framing and the first few uncompressed-header bits
+ * only; the rest of the payload is not walked.
+ */
+static int test_frame_header_key_1080p(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	struct v4l2_ctrl_av1_frame fr;
+	uint8_t out[256];
+	struct br br;
+	size_t n;
+
+	mk_seq_1080p_p0(&seq);
+	mk_frame_key_1080p(&fr);
+	n = av1_synth_frame_header_obu(&seq, &fr, out, sizeof(out));
+	CHECK(n > 0 && n <= sizeof(out), "FH synth returned %zu", n);
+
+	/*
+	 * Header byte for obu_type=3 (FRAME_HEADER) with has_size_field=1:
+	 * 0_0011_0_1_0 = 0x1A, followed by a single-byte leb128 size.
+	 */
+	CHECK_EQ(out[0], 0x1A, "FH obu header byte");
+	CHECK((out[1] & 0x80) == 0, "leb128 single byte");
+	CHECK_EQ(n, 2 + (size_t)(out[1] & 0x7f), "total = header+leb+payload");
+
+	/* Leading bits of the payload. */
+	br_init(&br, out + 2, n - 2);
+	CHECK_EQ(br_get(&br, 1), 0, "show_existing_frame");
+	CHECK_EQ(br_get(&br, 2), 0, "frame_type=KEY");
+	CHECK_EQ(br_get(&br, 1), 1, "show_frame");
+	CHECK_EQ(br_get(&br, 1), 0, "disable_cdf_update");
+	CHECK_EQ(br_get(&br, 1), 0, "allow_screen_content_tools");
+
+	printf("  KEY frame 1080p: OK (%zu bytes)\n", n);
+	return 0;
+}
+
+/*
+ * Case 7: INTER frame — coverage smoke.  Starts from the KEY fixture,
+ * turns it into an INTER frame with reference_select set, and checks
+ * only that an OBU with the FRAME_HEADER header byte comes back.
+ */
+static int test_frame_header_inter(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	struct v4l2_ctrl_av1_frame fr;
+	uint8_t out[256];
+	size_t n;
+	int ref = 0;
+
+	mk_seq_1080p_p0(&seq);
+	mk_frame_key_1080p(&fr);
+
+	fr.flags |= V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT;
+	fr.frame_type = V4L2_AV1_INTER_FRAME;
+	fr.primary_ref_frame = 0;
+	while (ref < V4L2_AV1_REFS_PER_FRAME) {
+		fr.ref_frame_idx[ref] = (int8_t)(ref & 7);
+		ref++;
+	}
+
+	n = av1_synth_frame_header_obu(&seq, &fr, out, sizeof(out));
+	CHECK(n > 0, "INTER FH synth returned %zu", n);
+	CHECK_EQ(out[0], 0x1A, "FH obu header");
+
+	printf("  INTER frame: OK (%zu bytes)\n", n);
+	return 0;
+}
+
+/*
+ * Case 8: a SWITCH frame must be refused — the synthesiser is expected
+ * to return 0 bytes.
+ */
+static int test_frame_header_switch_rejected(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	struct v4l2_ctrl_av1_frame fr;
+	uint8_t out[256];
+
+	mk_seq_1080p_p0(&seq);
+	mk_frame_key_1080p(&fr);
+	fr.frame_type = V4L2_AV1_SWITCH_FRAME;
+
+	CHECK_EQ(av1_synth_frame_header_obu(&seq, &fr, out, sizeof(out)), 0,
+		 "SWITCH frame should be out of scope");
+
+	printf("  SWITCH frame rejected: OK\n");
+	return 0;
+}
+
+/*
+ * Case 9: a frame with segmentation enabled must be refused — the
+ * synthesiser is expected to return 0 bytes.
+ */
+static int test_frame_header_segmentation_rejected(void)
+{
+	struct v4l2_ctrl_av1_sequence seq;
+	struct v4l2_ctrl_av1_frame fr;
+	uint8_t out[256];
+
+	mk_seq_1080p_p0(&seq);
+	mk_frame_key_1080p(&fr);
+	fr.segmentation.flags = V4L2_AV1_SEGMENTATION_FLAG_ENABLED;
+
+	CHECK_EQ(av1_synth_frame_header_obu(&seq, &fr, out, sizeof(out)), 0,
+		 "segmentation-enabled should be out of scope");
+
+	printf("  segmentation enabled rejected: OK\n");
+	return 0;
+}
+
+/*
+ * Test driver.  Runs every case in order — a failure does not stop the
+ * remaining cases — printing a banner ahead of each group.  Exit status
+ * is 0 when all cases pass and 1 otherwise.
+ */
+int main(void)
+{
+	static const struct {
+		const char *banner;	/* group heading, or NULL for none */
+		int (*run)(void);	/* returns 0 on pass, 1 on fail */
+	} cases[] = {
+		{ "=== av1_synth_sequence_header_obu ===\n",
+		  test_profile0_1080p },
+		{ NULL, test_profile0_monochrome_10bit },
+		{ NULL, test_reject_invalid_profile_bitdepth },
+		{ NULL, test_overflow },
+		{ "=== av1_synth_temporal_delimiter_obu ===\n",
+		  test_temporal_delimiter },
+		{ "=== av1_synth_frame_header_obu ===\n",
+		  test_frame_header_key_1080p },
+		{ NULL, test_frame_header_inter },
+		{ NULL, test_frame_header_switch_rejected },
+		{ NULL, test_frame_header_segmentation_rejected },
+	};
+	size_t idx;
+	int failed = 0;
+
+	for (idx = 0; idx < sizeof(cases) / sizeof(cases[0]); idx++) {
+		if (cases[idx].banner)
+			printf("%s", cases[idx].banner);
+		failed |= cases[idx].run();
+	}
+
+	if (!failed) {
+		printf("AV1 OBU synth tests PASSED\n");
+		return 0;
+	}
+	fprintf(stderr, "AV1 OBU synth tests FAILED\n");
+	return 1;
+}
@@ -28,12 +28,7 @@
 #include <linux/v4l2-controls.h>

 #define DAEDALUS_PROTO_MAGIC		0x44303456u	/* 'D04V' */
-#define DAEDALUS_PROTO_VERSION		1u		/* pre-1.0; bumped for
-							 * REQ_DECODE.src_pts +
-							 * RESP_FRAME.flags +
-							 * RESP_FRAME.output_src_pts
-							 * (H.264 B-frame reorder fix,
-							 * daedalus-v4l2#6). */
+#define DAEDALUS_PROTO_VERSION		0u		/* pre-1.0 */

 /*
 * Wire-protocol message types.
@@ -76,7 +71,18 @@ struct daedalus_msg_hdr {
 	__u32 reserved;
 };

-#define DAEDALUS_PROTO_MAX_PAYLOAD	(64u * 1024u)	/* 64 KiB */
+/*
+ * Wire-protocol payload cap.  Sized to comfortably hold real-world
+ * H.264 / VP9 / AV1 access-unit bitstreams:
+ *   - 720p H.264 worst-case I-frame:  ~200 KiB
+ *   - 1080p H.264 worst-case I-frame: ~500 KiB
+ *   - 4K H.264 worst-case I-frame:    ~2 MiB (would need a bump)
+ * 1 MiB is the conservative end of what cedrus / rkvdec / hantro
+ * report as OUTPUT_MPLANE sizeimage.  Allocations (chardev kmalloc
+ * / kmemdup, daemon read buffer, vb2 plane backing) are sized per-
+ * payload at runtime; this only sets the ceiling.  Issue #19.
+ */
+#define DAEDALUS_PROTO_MAX_PAYLOAD	(1024u * 1024u)	/* 1 MiB */

 /* -- REQ_DECODE / RESP_FRAME payload structures ---------------------- */

@@ -147,17 +153,6 @@ struct daedalus_req_decode {
 	__u32 capture_plane_size[3];
 	__u32 capture_plane_stride[3];
 	__u32 flags;
-	__u32 reserved0;	/* explicit pad to 8-byte align src_pts */
-	/*
-	 * The V4L2 OUTPUT (bitstream) buffer's vb2 timestamp at submission
-	 * time.  The daemon sets pkt->pts = src_pts before
-	 * avcodec_send_packet so libavcodec's display-ordered
-	 * receive_frame can return frame->pts == src_pts of the bitstream
-	 * the frame's slices belong to.  Decouples kernel cookie (decode
-	 * order, in-kernel identity) from display order — required for
-	 * H.264 B-frame correctness (daedalus-v4l2#6).
-	 */
-	__u64 src_pts;
 };

 /**
@@ -224,31 +219,6 @@ enum daedalus_decode_status {
 * Fixed size — keeps wire parsing simple.  No variable-length
 * pixel data in Phase 8.4; dmabuf in Phase 8.5 carries that.
 */
-/**
- * DAEDALUS_RESP_FLAG_HAS_PIXELS - this RESP delivers a decoded frame's
- *   pixels.  The owning CAPTURE buffer is identified by output_src_pts
- *   (matched against an in-flight item's src_pts on the kernel side),
- *   NOT by the chardev message header's cookie.  Required since
- *   libavcodec's H.264 decoder reorders to display order — the cookie
- *   the daemon just received the REQ on may not be the cookie whose
- *   bitstream produced the frame just popped from receive_frame.
- *
- * DAEDALUS_RESP_FLAG_SRC_CONSUMED - the chardev header's cookie's
- *   OUTPUT bitstream buffer is done from the daemon's perspective
- *   (libavcodec has accepted the slice data via avcodec_send_packet).
- *   Kernel releases src_buf for the cookie and runs job_finish so the
- *   m2m scheduler can dispatch the next REQ.  Independent of any
- *   pixel delivery — the dst_buf paired with this cookie may still
- *   be parked, awaiting a future RESP with HAS_PIXELS + matching
- *   output_src_pts.
- *
- * Both flags may be set in a single message (steady-state path with
- * no codec reorder lag — the just-sent packet immediately yielded a
- * frame whose pts == this REQ's src_pts).
- */
-#define DAEDALUS_RESP_FLAG_HAS_PIXELS	0x00000001u
-#define DAEDALUS_RESP_FLAG_SRC_CONSUMED	0x00000002u
-
 struct daedalus_resp_frame {
 	__u32 status;
 	__u32 codec_id;
@@ -258,16 +228,7 @@ struct daedalus_resp_frame {
 	__u32 luma_len;
 	__u32 chroma_len;
 	__u32 fnv1a_yuv;
-	__u32 flags;		/* bitmask of DAEDALUS_RESP_FLAG_* */
-	__u32 reserved0;	/* explicit pad to 8-byte align output_src_pts */
-	/*
-	 * Set when DAEDALUS_RESP_FLAG_HAS_PIXELS is in flags.  Identifies
-	 * which OUTPUT bitstream's slices produced the pixels in this
-	 * RESP — kernel completes the CAPTURE buffer whose inflight item
-	 * has src_pts == output_src_pts.  Ignored when HAS_PIXELS is
-	 * clear.
-	 */
-	__u64 output_src_pts;
+	__u32 reserved;
 };

 /* -- chardev ioctl ABI ----------------------------------------------- */
@@ -167,6 +167,26 @@ static int daedalus_chardev_release(struct inode *inode, struct file *file)
 	}
 	mutex_unlock(&dev->req_lock);

+	/*
+	 * Drain the V4L2-side in-flight list before the daemon goes
+	 * away.  Any REQ_DECODE we already sent to the daemon won't
+	 * get a matching RESP_FRAME — without this drain,
+	 * v4l2_m2m_cancel_job() in the V4L2 consumer's close() path
+	 * (or in vb2's STREAMOFF path) blocks forever waiting for a
+	 * job_finish that will never arrive, and the consumer becomes
+	 * unkillable D-state.  Issue #146.
+	 *
+	 * Done AFTER draining the request queue: any REQ_DECODE still
+	 * sitting in dev->req_queue is by definition not yet "in
+	 * flight" (the kernel never released it to the daemon), so it
+	 * doesn't need the m2m-job-finish dance — freeing the message
+	 * is sufficient.  The inflight list holds entries the kernel
+	 * already committed to (added in device_run after the message
+	 * was queued or written), which is exactly what needs to be
+	 * failed back to vb2 here.
+	 */
+	daedalus_drain_inflight_on_disconnect();
+
 	mutex_lock(&dev->open_lock);
 	dev->opened = 0;
 	mutex_unlock(&dev->open_lock);
@@ -611,28 +611,8 @@ struct daedalus_inflight {
 	struct list_head	list;
 	u32			cookie;
 	struct daedalus_ctx	*ctx;
-	/*
-	 * src_buf / dst_buf decouple in the daedalus-v4l2#6 reorder fix.
-	 * src_buf is cleared (NULL'd) when DAEDALUS_RESP_FLAG_SRC_CONSUMED
-	 * arrives — that signals libavcodec has accepted the bitstream
-	 * even if no display-order frame is ready yet.  dst_buf is cleared
-	 * when DAEDALUS_RESP_FLAG_HAS_PIXELS arrives — the daemon has
-	 * written pixels into this CAPTURE buffer.  When both are NULL
-	 * the inflight entry is removed and freed.
-	 */
 	struct vb2_v4l2_buffer	*src_buf;
 	struct vb2_v4l2_buffer	*dst_buf;
-	/*
-	 * src_buf->vb2_buf.timestamp captured at device_run time.
-	 * Mirrored into REQ_DECODE.src_pts so the daemon can set
-	 * pkt->pts = src_pts on avcodec_send_packet, and read back
-	 * frame->pts to identify which OUTPUT bitstream produced the
-	 * current display-order frame.  Kept here so the kernel can
-	 * stamp dst_buf.timestamp explicitly at HAS_PIXELS time even
-	 * though V4L2_BUF_FLAG_TIMESTAMP_COPY's automatic src->dst
-	 * pairing no longer applies (src/dst lifecycles decoupled).
-	 */
-	u64			src_pts;
 	/*
 	 * Captured media_request the src_buf was bound to (if any).
 	 * Set by device_run from src_buf->vb2_buf.req_obj.req;
@@ -643,22 +623,16 @@ struct daedalus_inflight {
 	struct media_request	*req;
 };

-/*
- * Peek (don't remove).  The split-completion path may receive
- * multiple RESP_FRAME messages on a single inflight item (one for
- * SRC_CONSUMED, one for HAS_PIXELS — possibly separated in time if
- * libavcodec held the picture for display reorder).  Caller removes
- * the entry only when both src_buf and dst_buf have been cleared
- * from inside the inflight lock.
- */
 static struct daedalus_inflight *
-daedalus_inflight_peek_locked(struct daedalus_dev *dev, u32 cookie)
+daedalus_inflight_pop_locked(struct daedalus_dev *dev, u32 cookie)
 {
 	struct daedalus_inflight *e;

 	list_for_each_entry(e, &dev->inflight, list) {
-		if (e->cookie == cookie)
+		if (e->cookie == cookie) {
+			list_del(&e->list);
 			return e;
+		}
 	}
 	return NULL;
 }
@@ -731,7 +705,6 @@ static void daedalus_device_run(void *priv)
 	size_t blen, payload_len;
 	u32 cookie;
 	int ret;
-	bool claimed = false;	/* src/dst removed from m2m rdy_queue */

 	src_buf = v4l2_m2m_next_src_buf(ctx->m2m_ctx);
 	dst_buf = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);
@@ -822,17 +795,6 @@ static void daedalus_device_run(void *priv)

 		req->codec_id	   = cid;
 		req->bitstream_len = (u32) blen;
-		/*
-		 * Ferry the OUTPUT buffer's vb2 timestamp through to the
-		 * daemon for the H.264 B-frame display-reorder fix
-		 * (daedalus-v4l2#6).  Daemon sets pkt->pts = src_pts before
-		 * avcodec_send_packet; libavcodec stamps frame->pts with
-		 * the same value when it eventually outputs the frame in
-		 * display order, letting the daemon route HAS_PIXELS RESPs
-		 * to the correct cookie even when libavcodec's display
-		 * order disagrees with V4L2's decode submission order.
-		 */
-		req->src_pts	   = (u64) src_buf->vb2_buf.timestamp;
 		req->capture_width	  = ctx->dst_fmt.width;
 		req->capture_height	  = ctx->dst_fmt.height;
 		req->capture_pix_fmt	  = ctx->dst_fmt.pixelformat;
@@ -857,34 +819,11 @@ static void daedalus_device_run(void *priv)
 	inf = kzalloc(sizeof(*inf), GFP_KERNEL);
 	if (!inf)
 		goto fail_buf_error;
-
-	/*
-	 * Take both buffers off the m2m ready-queue HERE — before the
-	 * inflight list grows.  Once src_consumed releases the src side
-	 * and the m2m scheduler can dispatch the next device_run, the
-	 * NEW device_run mustn't see this dst_buf (which we're still
-	 * holding for a future HAS_PIXELS).  Without this claim,
-	 * v4l2_m2m_next_dst_buf at the next device_run returns the same
-	 * parked dst_buf, two inflight entries reference it, and the
-	 * later HAS_PIXELS triggers a list_del on an already-removed
-	 * vb2_buffer → kernel panic (observed on Pi CM5 hard reboot
-	 * during mpv vaapi-copy playback of 720p H.264, 2026-05-21).
-	 *
-	 * Both helpers are inline list_del+counter-decrement under the
-	 * q_ctx rdy_spinlock — safe to call from device_run on the
-	 * buffer we just peeked via next_*_buf above.  Mirrors the
-	 * amphion vdec/venc pattern.
-	 */
-	v4l2_m2m_src_buf_remove_by_buf(ctx->m2m_ctx, src_buf);
-	v4l2_m2m_dst_buf_remove_by_buf(ctx->m2m_ctx, dst_buf);
-	claimed = true;
-
 	cookie		= daedalus_next_cookie();
 	inf->cookie	= cookie;
 	inf->ctx	= ctx;
 	inf->src_buf	= src_buf;
 	inf->dst_buf	= dst_buf;
-	inf->src_pts	= req->src_pts;
 	/*
 	 * Capture the bound media_request (if any) so the
 	 * completion path can call v4l2_ctrl_request_complete +
@@ -932,13 +871,11 @@ static void daedalus_device_run(void *priv)

 fail_buf_error:
 	if (src_buf) {
-		if (!claimed)
-			v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
+		v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
 		v4l2_m2m_buf_done(src_buf, VB2_BUF_STATE_ERROR);
 	}
 	if (dst_buf) {
-		if (!claimed)
-			v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
+		v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
 		v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_ERROR);
 	}
 	kfree(req);
@@ -952,185 +889,179 @@ static const struct v4l2_m2m_ops daedalus_m2m_ops = {

 /* -- chardev RESP_FRAME → buf_done bridge ---------------------------- */

-/*
- * Pack the daemon's pixel delivery into the inflight item's CAPTURE
- * buffer.  Called from daedalus_complete_resp_frame on the
- * HAS_PIXELS branch, after the lock has been dropped (vb2 ops may
- * sleep / take their own locks).  The dst_buf reference was
- * snapshotted under the inflight lock and cleared from the entry,
- * so no other RESP can race for this buffer.
- *
- * pixels_len == 0 → dmabuf path (Phase 8.6+); the daemon mmap'd the
- * CAPTURE plane via GET_DMABUF and wrote pixels in place; we just
- * set the plane payloads.  pixels_len > 0 → legacy Phase 8.5 inline
- * NV12 path; we memcpy from the chardev payload.
- */
-static void daedalus_pack_pixels_into_dst(struct vb2_v4l2_buffer *dst_buf,
-					  const struct daedalus_resp_frame *fr,
-					  const u8 *pixels, size_t pixels_len)
-{
-	struct vb2_buffer *vb = &dst_buf->vb2_buf;
-	void *dst_y, *dst_uv;
-	u32 y_size, uv_size;
-	unsigned int p;
-
-	if (pixels_len) {
-		y_size = min_t(u32, fr->luma_len,
-			       (u32) vb2_plane_size(vb, 0));
-		uv_size = vb->num_planes > 1 ?
-			min_t(u32, fr->chroma_len,
-			      (u32) vb2_plane_size(vb, 1)) : 0;
-		dst_y  = vb2_plane_vaddr(vb, 0);
-		dst_uv = vb->num_planes > 1 ?
-			vb2_plane_vaddr(vb, 1) : NULL;
-		if (dst_y && y_size && pixels_len >= y_size)
-			memcpy(dst_y, pixels, y_size);
-		else
-			y_size = 0;
-		if (dst_uv && uv_size &&
-		    pixels_len >= y_size + uv_size)
-			memcpy(dst_uv, pixels + y_size, uv_size);
-		else
-			uv_size = 0;
-		vb2_set_plane_payload(vb, 0, y_size);
-		if (vb->num_planes > 1)
-			vb2_set_plane_payload(vb, 1, uv_size);
-	} else {
-		for (p = 0; p < vb->num_planes; p++)
-			vb2_set_plane_payload(vb, p,
-					      vb2_plane_size(vb, p));
-	}
-}
-
 void daedalus_complete_resp_frame(u32 cookie,
 				  const struct daedalus_resp_frame *fr,
 				  const u8 *pixels, size_t pixels_len)
 {
 	struct daedalus_dev *dev = g_daedalus_dev;
 	struct daedalus_inflight *inf;
-	struct daedalus_ctx *ctx = NULL;
-	struct vb2_v4l2_buffer *src_to_complete = NULL;
-	struct vb2_v4l2_buffer *dst_to_complete = NULL;
-	struct media_request *req_to_complete = NULL;
 	enum vb2_buffer_state state;
-	u64 dst_timestamp = 0;
-	bool entry_freed = false;
-	bool has_pixels, src_consumed;
+	void *dst_y, *dst_uv;
+	u32 y_size, uv_size;

 	if (!dev)
 		return;

-	state = (fr->status == DAEDALUS_DECODE_OK)
-			? VB2_BUF_STATE_DONE : VB2_BUF_STATE_ERROR;
-	has_pixels   = !!(fr->flags & DAEDALUS_RESP_FLAG_HAS_PIXELS);
-	src_consumed = !!(fr->flags & DAEDALUS_RESP_FLAG_SRC_CONSUMED);
-
-	if (!has_pixels && !src_consumed) {
-		pr_warn_ratelimited(
-			"daedalus_v4l2: RESP_FRAME cookie=%u with neither HAS_PIXELS nor SRC_CONSUMED — ignoring\n",
-			cookie);
-		return;
-	}
-
 	mutex_lock(&dev->inflight_lock);
-	inf = daedalus_inflight_peek_locked(dev, cookie);
+	inf = daedalus_inflight_pop_locked(dev, cookie);
+	mutex_unlock(&dev->inflight_lock);
 	if (!inf) {
-		mutex_unlock(&dev->inflight_lock);
 		pr_warn_ratelimited(
 			"daedalus_v4l2: RESP_FRAME for unknown cookie=%u\n",
 			cookie);
 		return;
 	}

-	ctx = inf->ctx;
+	state = (fr->status == DAEDALUS_DECODE_OK)
+			? VB2_BUF_STATE_DONE : VB2_BUF_STATE_ERROR;

 	/*
-	 * Snapshot what this RESP completes and clear the matching
-	 * fields on the inflight item, so concurrent RESPs (e.g. a
-	 * later HAS_PIXELS arriving on the same cookie after this
-	 * SRC_CONSUMED clears src_buf) see the correct residual
-	 * state.  Actual vb2 buf_done calls happen below the lock.
+	 * Two routes the daemon can take, both supported:
 	 *
-	 * Sanity check on output_src_pts only when HAS_PIXELS is
-	 * set — the daemon's output_src_pts should equal this
-	 * inflight's stored src_pts, since the daemon routes pixels
-	 * to the cookie of the OUTPUT bitstream that contained the
-	 * frame's slices (which is what we stored at device_run time).
-	 * Surface a mismatch loudly — indicates daemon-side pts→cookie
-	 * mapping bug, not silent data corruption.
+	 *   (a) dmabuf path (Phase 8.6+) — daemon called
+	 *       DAEDALUS_IOC_GET_DMABUF, mmap'd the CAPTURE buffer,
+	 *       wrote pixels in place.  RESP_FRAME carries metadata
+	 *       only (pixels_len == 0).  Each plane's payload is
+	 *       the full plane size (the daemon wrote everything
+	 *       the format requires).
+	 *
+	 *   (b) Phase 8.5 inline path — daemon shipped raw NV12 in
+	 *       the chardev payload (≤ 64 KiB cap).  We memcpy
+	 *       into the vb2 buffer.  Plane payloads come from
+	 *       the daemon's NV12 luma/chroma counts.
 	 */
-	if (has_pixels) {
-		if (fr->output_src_pts != inf->src_pts)
-			pr_warn_ratelimited(
-				"daedalus_v4l2: RESP HAS_PIXELS cookie=%u output_src_pts=%llu but inflight.src_pts=%llu — daemon dispatch bug?\n",
-				cookie,
-				(unsigned long long) fr->output_src_pts,
-				(unsigned long long) inf->src_pts);
+	if (state == VB2_BUF_STATE_DONE) {
+		struct vb2_buffer *vb = &inf->dst_buf->vb2_buf;
+		unsigned int p;

-		dst_to_complete = inf->dst_buf;
-		dst_timestamp = inf->src_pts;
-		inf->dst_buf = NULL;
+		if (pixels_len) {
+			/* (b) inline NV12 copy — legacy 2-plane only */
+			y_size = min_t(u32, fr->luma_len,
+				       (u32) vb2_plane_size(vb, 0));
+			uv_size = vb->num_planes > 1 ?
+				min_t(u32, fr->chroma_len,
+				      (u32) vb2_plane_size(vb, 1)) : 0;
+			dst_y  = vb2_plane_vaddr(vb, 0);
+			dst_uv = vb->num_planes > 1 ?
+				vb2_plane_vaddr(vb, 1) : NULL;
+			if (dst_y && y_size && pixels_len >= y_size)
+				memcpy(dst_y, pixels, y_size);
+			else
+				y_size = 0;
+			if (dst_uv && uv_size &&
+			    pixels_len >= y_size + uv_size)
+				memcpy(dst_uv, pixels + y_size, uv_size);
+			else
+				uv_size = 0;
+			vb2_set_plane_payload(vb, 0, y_size);
+			if (vb->num_planes > 1)
+				vb2_set_plane_payload(vb, 1, uv_size);
+		} else {
+			/* (a) dmabuf path: daemon filled each plane in
+			 * place, so payload == full vb2 plane size. */
+			for (p = 0; p < vb->num_planes; p++)
+				vb2_set_plane_payload(vb, p,
+						      vb2_plane_size(vb, p));
+		}
 	}

-	if (src_consumed) {
-		src_to_complete = inf->src_buf;
-		req_to_complete = inf->req;
-		inf->src_buf = NULL;
-		inf->req = NULL;
-	}
+	/*
+	 * Phase 8.14: if the src_buf was bound to a media_request
+	 * (libva-driven decode path), complete the per-request
+	 * control state BEFORE buf_done_and_job_finish.  vb2-core's
+	 * buf_done unbinds the buffer's req_obj on its own, but the
+	 * control object stays bound until v4l2_ctrl_request_complete
+	 * runs — only after BOTH objects unbind does the request
+	 * transition to MEDIA_REQUEST_STATE_COMPLETE and wake any
+	 * userspace poll on the request fd.
+	 *
+	 * For non-request flows (test_m2m_stream direct QBUF) inf->req
+	 * is NULL, so the guarded call below is skipped entirely.
+	 */
+	if (inf->req)
+		v4l2_ctrl_request_complete(inf->req, &inf->ctx->hdl);

-	if (!inf->src_buf && !inf->dst_buf) {
-		list_del(&inf->list);
-		entry_freed = true;
-	}
+	/*
+	 * Use the buf_done_and_job_finish helper rather than plain
+	 * buf_done + job_finish: the helper pops the buffers off
+	 * the m2m queue before marking them done, otherwise the
+	 * scheduler immediately re-runs device_run on the same
+	 * still-queued src buffer.  Caught during Phase 8.5 first
+	 * run — second REQ_DECODE with identical bitstream + oops
+	 * in stop_streaming when the test client tore down.
+	 */
+	v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev, inf->ctx->m2m_ctx,
+					 state);
+
+	/*
+	 * Release our reference taken in device_run; safe to do
+	 * AFTER buf_done_and_job_finish (which dropped the vb2
+	 * reference) because we still hold this one.  If the
+	 * refcount hits zero here, media-core releases the request.
+	 */
+	if (inf->req)
+		media_request_put(inf->req);
+
+	kfree(inf);
+}
+
+/* -- daemon disconnect drain ----------------------------------------- */
+
+void daedalus_drain_inflight_on_disconnect(void)
+{
+	struct daedalus_dev *dev = g_daedalus_dev;
+	struct daedalus_inflight *inf, *tmp;
+	LIST_HEAD(local);
+
+	if (!dev)
+		return;
+
+	/*
+	 * Splice the in-flight list onto a local list under the lock,
+	 * then process each entry with the lock dropped — every
+	 * v4l2_m2m_buf_done_and_job_finish call may itself try to
+	 * re-enter device_run via the scheduler (which would need to
+	 * walk dev->inflight again on a future REQ_DECODE), and
+	 * v4l2_m2m_buf_done can sleep via vb2's buffer-done dispatch.
+	 * Holding inflight_lock across either is a deadlock invitation.
+	 */
+	mutex_lock(&dev->inflight_lock);
+	list_splice_init(&dev->inflight, &local);
 	mutex_unlock(&dev->inflight_lock);

-	/*
-	 * Complete the CAPTURE side first (when applicable).  vb2-core's
-	 * V4L2_BUF_FLAG_TIMESTAMP_COPY semantics no longer auto-copy
-	 * src→dst timestamps because src and dst are no longer paired
-	 * 1:1 in m2m's view — stamp dst explicitly from the inflight's
-	 * stored src_pts (= the OUTPUT vb2_buf.timestamp captured at
-	 * device_run).  The V4L2 client gets the same display-PTS it
-	 * originally set on the OUTPUT side.
-	 */
-	if (dst_to_complete) {
-		if (state == VB2_BUF_STATE_DONE)
-			daedalus_pack_pixels_into_dst(dst_to_complete, fr,
-						      pixels, pixels_len);
-		dst_to_complete->vb2_buf.timestamp = dst_timestamp;
+	list_for_each_entry_safe(inf, tmp, &local, list) {
+		list_del(&inf->list);
+
+		v4l2_warn(&dev->v4l2_dev,
+			  "draining inflight cookie=%u (daemon disconnect)\n",
+			  inf->cookie);
+
 		/*
-		 * The buffer was already removed from m2m's rdy_queue at
-		 * device_run time (see the "Take both buffers off ..."
-		 * block).  Just call buf_done here — calling
-		 * v4l2_m2m_dst_buf_remove_by_buf again would list_del a
-		 * list_head that's no longer linked, smashing the list.
+		 * Complete the per-request control state before
+		 * buf_done_and_job_finish, same ordering as the success
+		 * path in daedalus_complete_resp_frame().  For non-request
+		 * flows inf->req is NULL, so the guarded call below is
+		 * skipped.
 		 */
-		v4l2_m2m_buf_done(dst_to_complete, state);
-	}
+		if (inf->req)
+			v4l2_ctrl_request_complete(inf->req, &inf->ctx->hdl);

-	/*
-	 * Complete the OUTPUT side: release the bound media_request's
-	 * controls (libva-driven path), drop our request reference taken
-	 * in device_run, mark src done, then job_finish so the m2m
-	 * scheduler can dispatch the next pending REQ on this ctx.  The
-	 * dst_buf for this cookie may still be parked (HAS_PIXELS hasn't
-	 * arrived yet — libavcodec is holding the frame for display-
-	 * order release).  That's fine: the next device_run picks a
-	 * different next_dst_buf out of the CAPTURE queue and proceeds.
-	 */
-	if (src_to_complete) {
-		if (req_to_complete)
-			v4l2_ctrl_request_complete(req_to_complete, &ctx->hdl);
-		/* Already off the rdy_queue (see device_run claim) — buf_done only. */
-		v4l2_m2m_buf_done(src_to_complete, state);
-		if (req_to_complete)
-			media_request_put(req_to_complete);
-		v4l2_m2m_job_finish(dev->m2m_dev, ctx->m2m_ctx);
-	}
+		/*
+		 * Mark both buffers ERROR and clear the m2m scheduler's
+		 * job_running flag.  This is what unsticks
+		 * v4l2_m2m_cancel_job() inside the consumer's close()
+		 * path; without it, the consumer hangs in TASK_UNINTERRUPTIBLE
+		 * forever (issue #146).
+		 */
+		v4l2_m2m_buf_done_and_job_finish(dev->m2m_dev,
+						 inf->ctx->m2m_ctx,
+						 VB2_BUF_STATE_ERROR);
+
+		if (inf->req)
+			media_request_put(inf->req);

-	if (entry_freed)
 		kfree(inf);
+	}
 }

 /* -- v4l2_ioctl_ops -------------------------------------------------- */
@@ -103,4 +103,27 @@ void daedalus_complete_resp_frame(u32 cookie,
 int  daedalus_export_capture_dmabuf(u32 cookie, u32 plane, u32 flags,
 				    int *out_fd);

+/**
+ * daedalus_drain_inflight_on_disconnect() - fail all in-flight m2m jobs
+ *
+ * Called from daedalus_chardev_release() when the daemon disconnects
+ * (graceful close, SIGKILL, daemon crash — anything that triggers
+ * chardev release).  Walks the in-flight list and, for every entry,
+ * marks both src+dst buffers VB2_BUF_STATE_ERROR and calls
+ * v4l2_m2m_buf_done_and_job_finish() to clear the m2m scheduler's
+ * "job_running" flag.
+ *
+ * Without this, v4l2_m2m_cancel_job() (called from
+ * v4l2_m2m_ctx_release() during the consumer's close() / task exit)
+ * blocks forever waiting for a job_finish that the dead daemon will
+ * never send — the consumer enters TASK_UNINTERRUPTIBLE and survives
+ * SIGKILL until reboot.  See issue #146 for the full trace.
+ *
+ * Safe to call with an empty in-flight list; no-op in that case.
+ * Context: Process context only — must NOT be called from atomic
+ * context.  Takes inflight_lock (a sleeping mutex) and calls
+ * v4l2_m2m_buf_done_and_job_finish(), which the driver treats as
+ * possibly sleeping via vb2 buffer-done dispatch.
+ */
+void daedalus_drain_inflight_on_disconnect(void);
+
 #endif /* DAEDALUS_V4L2_MAIN_H */