Merge pull request 'picture, request_pool: transparent OUTPUT-pool resize on bitstream overrun (#15 )' (#16 ) from claude-noether/libva-v4l2-request-fourier:noether/output-pool-resize-issue-15 into master

Reviewed-on: marfrit/libva-v4l2-request-fourier#16
picture, request_pool: transparent OUTPUT-pool resize on bitstream overrun
2026-05-21 11:23:08 +00:00 · 2026-05-21 13:11:55 +02:00 · 2026-05-21 10:17:15 +00:00 · 2026-05-21 12:14:48 +02:00 · 2026-05-20 19:14:49 +00:00 · 2026-05-20 21:13:07 +02:00
16 changed files with 1044 additions and 25 deletions
@@ -4,3 +4,14 @@ option(
    value : '',
    description: 'Path to sanitized Linux Kernel headers'
 )
+
+option(
+    'daedalus_v4l2',
+    type : 'boolean',
+    value : true,
+    description: 'Enable probe + dispatch for the out-of-tree daedalus_v4l2 ' +
+                 'stateless decoder shim (Pi 5 / CM5 daemon-backed VP9/AV1/H264). ' +
+                 'Default true; disable on platforms where the daedalus_v4l2 ' +
+                 'kernel module will never be present to slim the probe array.'
+)
+
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * AV1 codec dispatcher.  Populates V4L2_CID_STATELESS_AV1_SEQUENCE
+ * (struct v4l2_ctrl_av1_sequence) from VAAPI's VADecPictureParameterBufferAV1.
+ *
+ * Why a single SEQUENCE control and not the full V4L2_CID_STATELESS_AV1_*
+ * family (FRAME, TILE_GROUP_ENTRY, FILM_GRAIN):
+ *
+ *   - The daedalus_v4l2 daemon path consumes the OUTPUT bitstream
+ *     directly via libavcodec/libdav1d.  libdav1d needs a complete OBU
+ *     stream that includes the sequence header — ffmpeg-vaapi strips the
+ *     sequence header on the client side (its parser is split across
+ *     VAPictureParameterBufferAV1 + slice payload, with OBU_SEQUENCE_HEADER
+ *     consumed and not re-emitted), so the daemon side has to synthesise
+ *     it from the SEQUENCE ctrl.  The other AV1 ctrls (FRAME / TILE /
+ *     FILM_GRAIN) are not needed for that synthesis — the OBU_FRAME_HEADER
+ *     + OBU_TILE_GROUP that libdav1d also needs are still in the slice
+ *     bitstream.
+ *
+ *   - The vpu981 (RK3588 dedicated AV1 hantro) hardware path doesn't
+ *     consult these controls either — vpu981's driver parses the AV1
+ *     bitstream directly.  So setting only SEQUENCE is correct for both
+ *     destination decoders.
+ *
+ * Reference: marfrit/libva-v4l2-request-fourier issue #11
+ *            (DAEMON-PPS-style sequence-header re-synthesis on the daemon
+ *            side, paralleling the H.264 SPS/PPS work in DAEMON-PPS).
+ *            kernel uAPI: <linux/v4l2-controls.h> @ 2891-2919.
+ *            VAAPI:       <va/va_dec_av1.h> typedef
+ *                         VADecPictureParameterBufferAV1.
+ */
+
+#include "av1.h"
+
+#include "v4l2.h"
+#include "utils.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/v4l2-controls.h>
+#include <linux/videodev2.h>
+
+/*
+ * VADecPictureParameterBufferAV1 reaches us transitively via surface.h →
+ * va_backend.h → va.h → va_dec_av1.h (va_dec_av1.h alone won't compile
+ * standalone — it needs va.h's VA_PADDING_LOW / va_deprecated machinery).
+ */
+
+/* Compile-time UAPI shift guard, sibling to vp9.c's pattern. */
+_Static_assert(sizeof(struct v4l2_ctrl_av1_sequence) == 12,
+	       "v4l2_ctrl_av1_sequence size mismatch — kernel UAPI changed");
+
+/*
+ * Map VAAPI bit_depth_idx (0/1/2 → 8/10/12) to the kernel ctrl's plain
+ * uint8_t bit_depth field.  ffmpeg-vaapi sets idx from the bitstream
+ * BitDepth value, so this is an exact inverse of AV1 spec 5.5.2.
+ */
+static uint8_t av1_bit_depth_from_idx(uint8_t idx)
+{
+	switch (idx) {
+	case 0:  return 8;
+	case 1:  return 10;
+	case 2:  return 12;
+	default:
+		/* Spec-illegal index: not passed through, falls back to 8-bit. */
+		return 8;
+	}
+}
+
+/*
+ * Submit V4L2_CID_STATELESS_AV1_SEQUENCE for the frame held by
+ * surface_object, built from the VADecPictureParameterBufferAV1 stored in
+ * surface_object->params.av1.picture.
+ *
+ * The control is set on driver_data->video_fd as part of the surface's
+ * media request (surface_object->request_fd).  context is unused.
+ *
+ * Returns 0 on success and a negative value when v4l2_set_controls fails.
+ * codec_set_controls tests the result with `rc < 0`, so the failure value
+ * has to be negative: VA_STATUS_ERROR_OPERATION_FAILED is a positive
+ * VAStatus code and would be mistaken for success by that check.
+ */
+int av1_set_controls(struct request_data *driver_data,
+		     struct object_context *context,
+		     struct object_surface *surface_object)
+{
+	VADecPictureParameterBufferAV1 *picture =
+		&surface_object->params.av1.picture;
+	struct v4l2_ctrl_av1_sequence sequence;
+	struct v4l2_ext_control ctrls[1];
+	int rc;
+
+	(void)context;
+
+	memset(&sequence, 0, sizeof sequence);
+
+	/*
+	 * Scalar mapping.  Names align with kernel uAPI; off-by-one and
+	 * idx→value translations are annotated.
+	 */
+	sequence.seq_profile = picture->profile;
+	sequence.bit_depth = av1_bit_depth_from_idx(picture->bit_depth_idx);
+	sequence.max_frame_width_minus_1 = picture->frame_width_minus1;
+	sequence.max_frame_height_minus_1 = picture->frame_height_minus1;
+
+	/*
+	 * AV1 defines OrderHintBits as order_hint_bits_minus_1 + 1 only when
+	 * enable_order_hint is set, and as 0 otherwise.  The memset above
+	 * already supplies the 0, so only the enabled case is written.
+	 */
+	if (picture->seq_info_fields.fields.enable_order_hint)
+		sequence.order_hint_bits =
+			(uint8_t)(picture->order_hint_bits_minus_1 + 1u);
+
+	/*
+	 * Sequence-header flag mapping.  VAAPI exposes most of these directly
+	 * in seq_info_fields.fields.*; the ones that don't have a 1:1 mirror
+	 * (V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION, _ENABLE_REF_FRAME_MVS,
+	 * _ENABLE_SUPERRES, _ENABLE_RESTORATION, _SEPARATE_UV_DELTA_Q) live in
+	 * VAAPI's per-frame pic_info_fields rather than the sequence struct.
+	 * For SEQUENCE-control purposes we treat them as best-effort
+	 * unobservable from libva and leave the corresponding bits clear; the
+	 * daedalus daemon's OBU synthesiser (issue #11 daemon track) carries
+	 * the SEQUENCE bytes verbatim, so per-frame consumers (libdav1d) will
+	 * still see the full bitstream truth for those toggles via the
+	 * OBU_FRAME stream already in the slice buffer.  See feedback memory
+	 * `feedback_vaapi_blind_to_some_hevc_sps_fields` for the precedent.
+	 */
+	if (picture->seq_info_fields.fields.still_picture)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE;
+	if (picture->seq_info_fields.fields.use_128x128_superblock)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK;
+	if (picture->seq_info_fields.fields.enable_filter_intra)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA;
+	if (picture->seq_info_fields.fields.enable_intra_edge_filter)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER;
+	if (picture->seq_info_fields.fields.enable_interintra_compound)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND;
+	if (picture->seq_info_fields.fields.enable_masked_compound)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND;
+	if (picture->seq_info_fields.fields.enable_dual_filter)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER;
+	if (picture->seq_info_fields.fields.enable_order_hint)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT;
+	if (picture->seq_info_fields.fields.enable_jnt_comp)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP;
+	if (picture->seq_info_fields.fields.enable_cdef)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF;
+	if (picture->seq_info_fields.fields.mono_chrome)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME;
+	if (picture->seq_info_fields.fields.color_range)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE;
+	if (picture->seq_info_fields.fields.subsampling_x)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X;
+	if (picture->seq_info_fields.fields.subsampling_y)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y;
+	if (picture->seq_info_fields.fields.film_grain_params_present)
+		sequence.flags |= V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT;
+
+	/* Single-control batched submission. */
+	memset(ctrls, 0, sizeof ctrls);
+	ctrls[0].id   = V4L2_CID_STATELESS_AV1_SEQUENCE;
+	ctrls[0].ptr  = &sequence;
+	ctrls[0].size = sizeof sequence;
+
+	rc = v4l2_set_controls(driver_data->video_fd,
+			       surface_object->request_fd,
+			       ctrls, 1);
+	if (rc < 0)
+		return -1;	/* negative so the caller's `rc < 0` check fires */
+
+	return 0;		/* same value as VA_STATUS_SUCCESS */
+}
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * AV1 codec dispatcher — populates V4L2_CID_STATELESS_AV1_SEQUENCE
+ * (struct v4l2_ctrl_av1_sequence) from VAAPI's VADecPictureParameterBufferAV1.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _AV1_H_
+#define _AV1_H_
+
+#include "context.h"
+#include "request.h"
+#include "surface.h"
+
+int av1_set_controls(struct request_data *driver_data,
+		     struct object_context *context,
+		     struct object_surface *surface);
+
+#endif /* _AV1_H_ */
@@ -46,6 +46,20 @@ unsigned int pixelformat_for_profile(VAProfile profile)
 		return V4L2_PIX_FMT_VP8_FRAME;
 	case VAProfileVP9Profile0:
 		return V4L2_PIX_FMT_VP9_FRAME;
+	case VAProfileAV1Profile0:
+		/*
+		 * ampere-av1-enablement Phase 2: AV1 Profile 0 routes to
+		 * vpu981 (RK3588's dedicated AV1 hantro). Per-codec ctrl
+		 * dispatch is only partially wired: av1_set_controls sends
+		 * V4L2_CID_STATELESS_AV1_SEQUENCE, the other
+		 * V4L2_CID_STATELESS_AV1_* controls are not submitted yet.
+		 * The av1-iter1 operator branch has Phase 3 bit-exact
+		 * bring-up underway; this commit gives master the
+		 * enumeration + fd-routing layer (vainfo lists the profile,
+		 * RequestCreateConfig succeeds) so consumers like
+		 * ffmpeg-vaapi at least see VAProfileAV1Profile0 today.
+		 */
+		return V4L2_PIX_FMT_AV1_FRAME;
 	default:
 		return 0;
 	}
@@ -81,6 +81,16 @@ VAStatus RequestCreateConfig(VADriverContextP context, VAProfile profile,
 		// cap is V4L2_MPEG_VIDEO_VP9_PROFILE_0). Do not add a case for
 		// VAProfileVP9Profile2 — kernel will reject.
 		break;
+	case VAProfileAV1Profile0:
+		// ampere-av1-enablement Phase 2: AV1 Profile 0 routes to
+		// vpu981 (RK3588 dedicated AV1 hantro instance). Decode-side
+		// ctrl dispatch is only partially wired: av1_set_controls
+		// (av1.{c,h}) submits V4L2_CID_STATELESS_AV1_SEQUENCE; the
+		// remaining V4L2_CID_STATELESS_AV1_* controls are not sent
+		// until the full dispatch is ported from the av1-iter1
+		// operator branch (where Phase 3-5 has 3/10 frames
+		// bit-exact already).
+		break;
 	default:
 		return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 	}
@@ -162,13 +172,20 @@ VAStatus RequestDestroyConfig(VADriverContextP context, VAConfigID config_id)
 static bool any_fd_supports_output_format(struct request_data *driver_data,
 					  unsigned int fmt)
 {
-	int fds[3] = {
+	int fds[6] = {
 		driver_data->video_fd,
 		driver_data->video_fd_rkvdec,
 		driver_data->video_fd_hantro,
+		driver_data->video_fd_rpi_hevc_dec,  /* iter40 */
+		driver_data->video_fd_vpu981,        /* ampere-av1 Phase 2 */
+#ifdef HAVE_DAEDALUS_V4L2
+		driver_data->video_fd_daedalus,      /* LIBVA-1: H.264/VP9/AV1 */
+#else
+		-1,
+#endif
 	};
 	int i;
-	for (i = 0; i < 3; i++) {
+	for (i = 0; i < 6; i++) {
 		if (fds[i] < 0) continue;
 		if (v4l2_find_format(fds[i], V4L2_BUF_TYPE_VIDEO_OUTPUT, fmt))
 			return true;
@@ -249,6 +266,17 @@ VAStatus RequestQueryConfigProfiles(VADriverContextP context,
 	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
 		profiles[index++] = VAProfileVP9Profile0;

+	/*
+	 * ampere-av1-enablement Phase 2: AV1 Profile 0 advertised when
+	 * vpu981 (RK3588 dedicated AV1 hantro) is probed. MAX_PROFILES
+	 * bumped to 14 in request.h to safely fit even if iter39 Option
+	 * B is reverted (Hi10P + Main10 back in enumeration → 13 total
+	 * with AV1, the `< MAX - 1` guard then needs MAX ≥ 14).
+	 */
+	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_AV1_FRAME);
+	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
+		profiles[index++] = VAProfileAV1Profile0;
+
 	*profiles_count = index;

 	return VA_STATUS_SUCCESS;
@@ -272,6 +300,7 @@ VAStatus RequestQueryConfigEntrypoints(VADriverContextP context,
 	case VAProfileHEVCMain10:
 	case VAProfileVP8Version0_3:
 	case VAProfileVP9Profile0:
+	case VAProfileAV1Profile0:
 		entrypoints[0] = VAEntrypointVLD;
 		*entrypoints_count = 1;
 		break;
@@ -537,7 +537,9 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	 */
 	rc = request_pool_init(&driver_data->output_pool,
 			       driver_data->video_fd, driver_data->media_fd,
-			       output_type, 16);
+			       output_type, 16, pixelformat,
+			       (unsigned int)picture_width,
+			       (unsigned int)picture_height);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_ALLOCATION_FAILED;
 		goto error;
@@ -827,10 +827,63 @@ int h264_set_controls(struct request_data *driver_data,

 	dpb_update(context, &surface->params.h264.picture);

+	/*
+	 * Dump the raw VAAPI fields at the libva boundary so issue #8
+	 * follow-up can disambiguate "ffmpeg-vaapi didn't populate" from
+	 * "downstream consumer (daedalus_v4l2 wire protocol) corrupted the
+	 * value". One-line; safe to leave in — costs a single printf per frame.
+	 */
+	request_log("h264_set_controls: VAProfile=%d seq_fields=0x%08x pic_fields=0x%08x num_ref_frames=%u bit_depth_luma_m8=%u bit_depth_chroma_m8=%u w_mbs_m1=%u h_mbs_m1=%u\n",
+		    (int)profile,
+		    surface->params.h264.picture.seq_fields.value,
+		    surface->params.h264.picture.pic_fields.value,
+		    surface->params.h264.picture.num_ref_frames,
+		    surface->params.h264.picture.bit_depth_luma_minus8,
+		    surface->params.h264.picture.bit_depth_chroma_minus8,
+		    surface->params.h264.picture.picture_width_in_mbs_minus1,
+		    surface->params.h264.picture.picture_height_in_mbs_minus1);
+
 	h264_va_picture_to_v4l2(driver_data, context, surface,
 				&surface->params.h264.picture,
 				&decode, &pps, &sps);

+	/*
+	 * max_num_ref_frames fallback. Some VAAPI clients (older ffmpeg-vaapi
+	 * paths, some daedalus_v4l2 consumers) leave VAPicture->num_ref_frames
+	 * at zero. Hardware decoders tolerate; libavcodec-via-daedalus enforces
+	 * sps.max_num_ref_frames strictly and rejects every frame.
+	 *
+	 * Count valid DPB entries first (the bitstream-true reference count we
+	 * can see); fall back to a per-profile spec minimum if even that is 0.
+	 * See marfrit/libva-v4l2-request-fourier issue #8.
+	 */
+	if (sps.max_num_ref_frames == 0) {
+		unsigned int valid = 0;
+		unsigned int i;
+		for (i = 0; i < 16; i++) {
+			const VAPictureH264 *ref =
+				&surface->params.h264.picture.ReferenceFrames[i];
+			if (!(ref->flags & VA_PICTURE_H264_INVALID))
+				valid++;
+		}
+		if (valid > 0) {
+			sps.max_num_ref_frames = (uint8_t)valid;
+		} else {
+			switch (profile) {
+			case VAProfileH264ConstrainedBaseline:
+				sps.max_num_ref_frames = 1;
+				break;
+			case VAProfileH264Main:
+			case VAProfileH264High:
+			case VAProfileH264MultiviewHigh:
+			case VAProfileH264StereoHigh:
+			default:
+				sps.max_num_ref_frames = 4;
+				break;
+			}
+		}
+	}
+
 	/*
 	 * Populate the scaling matrix unconditionally: from VAAPI's
 	 * VAIQMatrixBufferH264 when the consumer sent one this frame
@@ -22,6 +22,9 @@

 autoconf_data = configuration_data()
 autoconf_data.set('VA_DRIVER_INIT_FUNC', va_driver_init_func)
+if get_option('daedalus_v4l2')
+	autoconf_data.set('HAVE_DAEDALUS_V4L2', 1)
+endif

 autoconf = configure_file(
 	output: 'autoconfig.h',
@@ -50,6 +53,7 @@ sources = [
 	'h265.c',
 	'vp8.c',
 	'vp9.c',
+	'av1.c',
 	'codec.c',
 	'nv15.c',
 	'nv12_col128.c',
@@ -36,6 +36,8 @@
 #include "mpeg2.h"
 #include "vp8.h"
 #include "vp9.h"
+#include "av1.h"
+#include "request_pool.h"

 #include <assert.h>
 #include <stdio.h>
@@ -54,6 +56,159 @@

 #include "autoconfig.h"

+/*
+ * iter#15 — issue #15: ensure the in-flight surface's OUTPUT mmap can
+ * hold `need` bytes in total (the accumulated slices_size plus whatever
+ * the caller is about to append); if not, grow the pool transparently
+ * via request_pool_resize.
+ *
+ * Parameters:
+ *   driver_data     owns the OUTPUT pool (driver_data->output_pool).
+ *   surface_object  surface whose source_{index,data,size} / request_fd
+ *                   mirror the borrowed pool slot; re-mirrored on resize.
+ *   need            required total capacity in bytes, NOT a delta.
+ *
+ * Sequence on overflow:
+ *   0. Reject requests above the 1 GiB cap before touching the pool, so
+ *      an unsatisfiable request leaves pool and surface untouched.
+ *   1. Snapshot the surface's accumulated bytes to a temp heap buffer.
+ *   2. Release the surface's OUTPUT pool slot back to FREE (resize
+ *      requires no slot be borrowed).
+ *   3. Compute new sizeimage = roundup(need * 2, 4 KiB), capped at 1 GiB.
+ *      Because need > source_size here, this is also more than double
+ *      the current capacity, so repeated overruns grow geometrically.
+ *   4. Call request_pool_resize.
+ *   5. Re-acquire a pool slot (the new pool has fresh indices and fds).
+ *   6. Re-mirror surface_object->source_{index,data,size} and
+ *      request_fd from the new slot.
+ *   7. Restore the saved bytes via memcpy into the new mmap.
+ *
+ * Returns VA_STATUS_SUCCESS on clean resize (or no resize needed) and
+ * VA_STATUS_ERROR_ALLOCATION_FAILED on heap-alloc / V4L2 / kernel
+ * failure — the libva client falls back to surface re-creation as
+ * before the resize hook landed.
+ *
+ * NOTE on inline-Sync invariant: RequestEndPicture calls
+ * RequestSyncSurface inline, so when codec_store_buffer runs no other
+ * pool slot is borrowed across libva-driver-API entry points. The
+ * temporary release-then-reacquire of the in-flight slot here keeps
+ * that invariant intact across the resize.
+ */
+static VAStatus
+codec_store_buffer_ensure_capacity(struct request_data *driver_data,
+				   struct object_surface *surface_object,
+				   size_t need)
+{
+	/* 1 GiB hard cap — V4L2 sizeimage is u32. */
+	const size_t max_sizeimage = 0x40000000u;
+	/* Round sizeimage up to a 4 KiB page boundary. */
+	const size_t page_mask = 0xFFFu;
+	VAStatus status = VA_STATUS_ERROR_ALLOCATION_FAILED;
+	struct request_pool_slot *slot;
+	uint8_t *save_buf = NULL;
+	size_t save_size;
+	size_t want_sizeimage;
+	unsigned int new_sizeimage;
+	int new_index;
+	int rc;
+
+	if (need <= surface_object->source_size)
+		return VA_STATUS_SUCCESS;
+
+	/*
+	 * A request above the cap can never be satisfied.  Fail before the
+	 * slot is released and the pool torn down, so the surface keeps its
+	 * current, still-valid mapping.  This also bounds `need * 2` below,
+	 * which therefore cannot wrap even where size_t is 32 bits wide.
+	 */
+	if (need > max_sizeimage) {
+		request_log("codec_store_buffer_ensure_capacity: required %zu bytes exceeds the %zu-byte OUTPUT buffer cap\n",
+			    need, max_sizeimage);
+		return VA_STATUS_ERROR_ALLOCATION_FAILED;
+	}
+
+	save_size = surface_object->slices_size;
+	if (save_size > 0) {
+		save_buf = malloc(save_size);
+		if (save_buf == NULL) {
+			request_log("codec_store_buffer_ensure_capacity: malloc(%zu) for resize-save failed\n",
+				    save_size);
+			return VA_STATUS_ERROR_ALLOCATION_FAILED;
+		}
+		memcpy(save_buf, surface_object->source_data, save_size);
+	}
+
+	/*
+	 * Temporarily release the in-flight slot. The slot's V4L2 buffer
+	 * has NOT been QBUF'd yet (QBUF lives in RequestEndPicture, after
+	 * this codec_store_buffer call), so the release is a clean
+	 * busy=false flip; no kernel state is in question.  The slot's
+	 * stale request_fd does not need to be saved — the resize closes
+	 * every slot's fd and the post-resize acquire below re-mirrors a
+	 * fresh slot's request_fd into surface_object->request_fd.
+	 */
+	request_pool_release(&driver_data->output_pool,
+			     surface_object->source_index);
+
+	/*
+	 * Geometric growth: twice the required total, so a single resize
+	 * covers the triggering append plus comfortable headroom for the
+	 * rest of this frame.  need > source_size at this point, so this is
+	 * automatically more than twice the current capacity as well.
+	 * Clamp to the cap, then round up to a page boundary so the
+	 * kernel's own alignment doesn't waste pages (the cap itself is
+	 * page-aligned, so rounding cannot push past it).
+	 */
+	want_sizeimage = need * 2;
+	if (want_sizeimage > max_sizeimage)
+		want_sizeimage = max_sizeimage;
+	want_sizeimage = (want_sizeimage + page_mask) & ~page_mask;
+	new_sizeimage = (unsigned int)want_sizeimage;
+
+	request_log("codec_store_buffer: OUTPUT-pool resize (need %zu > cap %u → new_sizeimage %u)\n",
+		    need, surface_object->source_size, new_sizeimage);
+
+	rc = request_pool_resize(&driver_data->output_pool, new_sizeimage);
+	if (rc < 0) {
+		/*
+		 * Resize failed. The original slot was already released
+		 * above, so surface_object->source_data is now pointing
+		 * at a FREE-but-still-borrowable mmap. Take a slot again
+		 * so EndPicture / DestroyContext unwind paths see a
+		 * borrowed slot, as they did before the release.
+		 *
+		 * If the resize aborted early (pre-STREAMOFF), the slots
+		 * are intact and this undoes the temporary release. If
+		 * it aborted later (post-teardown), the slot's data/size
+		 * were zeroed in place by request_pool_resize and the
+		 * re-acquire flips busy=true on a dead slot — still
+		 * safe, because the caller will return
+		 * ERROR_ALLOCATION_FAILED and the libva consumer
+		 * destroys the surface/context.  The returned index is
+		 * deliberately ignored; the surface mirror is unchanged.
+		 */
+		(void)request_pool_acquire(&driver_data->output_pool);
+		goto out;
+	}
+
+	new_index = request_pool_acquire(&driver_data->output_pool);
+	if (new_index < 0)
+		goto out;
+
+	slot = request_pool_slot(&driver_data->output_pool,
+				 (unsigned int)new_index);
+	if (slot == NULL) {
+		request_pool_release(&driver_data->output_pool,
+				     (unsigned int)new_index);
+		goto out;
+	}
+
+	surface_object->source_index = slot->index;
+	surface_object->source_data = slot->data;
+	surface_object->source_size = slot->size;
+	surface_object->request_fd = slot->request_fd;
+
+	/*
+	 * Put the accumulated bytes back whenever they fit, including when
+	 * the kernel granted less than `need`: slices_size then still
+	 * describes real data in the new mapping rather than garbage.
+	 */
+	if (save_buf != NULL && save_size <= surface_object->source_size)
+		memcpy(surface_object->source_data, save_buf, save_size);
+
+	if (need > surface_object->source_size) {
+		/*
+		 * Kernel rounded the new sizeimage down below what we
+		 * needed — drivers may clamp at their per-codec ceiling.
+		 * Don't corrupt memory; surface the error to libva.
+		 */
+		request_log("codec_store_buffer_ensure_capacity: kernel returned sizeimage %u < required %zu\n",
+			    surface_object->source_size, need);
+		goto out;
+	}
+
+	status = VA_STATUS_SUCCESS;
+
+out:
+	/* free(NULL) is a no-op, so this covers the nothing-saved case too. */
+	free(save_buf);
+	return status;
+}
+
 static VAStatus codec_store_buffer(struct request_data *driver_data,
 				   struct object_context *context,
 				   VAProfile profile,
@@ -61,16 +216,36 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 				   struct object_buffer *buffer_object)
 {
 	switch (buffer_object->type) {
-	case VASliceDataBufferType:
+	case VASliceDataBufferType: {
 		/*
 		 * Since there is no guarantee that the allocation
 		 * order is the same as the submission order (via
 		 * RenderPicture), we can't use a V4L2 buffer directly
 		 * and have to copy from a regular buffer.
+		 *
+		 * Capacity guard (issue #13 + #15): surface_object->source_data
+		 * points at an OUTPUT-pool mmap of size source_size, negotiated
+		 * at S_FMT time. A stream-level resolution upshift can produce
+		 * a slice larger than this allocation. Each append site below
+		 * computes the post-append running total and calls
+		 * codec_store_buffer_ensure_capacity, which transparently grows
+		 * the OUTPUT pool (request_pool_resize) so the existing memcpy
+		 * has room. The hard error path (VA_STATUS_ERROR_ALLOCATION_FAILED)
+		 * fires if either the heap save buffer or the kernel-side grow
+		 * fails — at which point libavcodec recreates the surface.
 		 */
+		size_t need;
+		VAStatus ensure_rc;
+
 		if (context->h264_start_code) {
 			static const char start_code[3] = { 0x00, 0x00, 0x01 };

+			need = (size_t)surface_object->slices_size +
+			       sizeof(start_code);
+			ensure_rc = codec_store_buffer_ensure_capacity(
+				driver_data, surface_object, need);
+			if (ensure_rc != VA_STATUS_SUCCESS)
+				return ensure_rc;
 			memcpy(surface_object->source_data +
 			       surface_object->slices_size,
 			       start_code, sizeof(start_code));
@@ -104,19 +279,32 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			unsigned int header_size =
 				surface_object->params.vp8.picture.pic_fields.bits.key_frame == 0 ?
 					10 : 3;
+			need = (size_t)surface_object->slices_size + header_size;
+			ensure_rc = codec_store_buffer_ensure_capacity(
+				driver_data, surface_object, need);
+			if (ensure_rc != VA_STATUS_SUCCESS)
+				return ensure_rc;
 			memset(surface_object->source_data +
 			       surface_object->slices_size,
 			       0, header_size);
 			surface_object->slices_size += header_size;
 		}
-		memcpy(surface_object->source_data +
-			       surface_object->slices_size,
-		       buffer_object->data,
-		       buffer_object->size * buffer_object->count);
-		surface_object->slices_size +=
-			buffer_object->size * buffer_object->count;
+		{
+			size_t payload = (size_t)buffer_object->size *
+					 buffer_object->count;
+			need = (size_t)surface_object->slices_size + payload;
+			ensure_rc = codec_store_buffer_ensure_capacity(
+				driver_data, surface_object, need);
+			if (ensure_rc != VA_STATUS_SUCCESS)
+				return ensure_rc;
+			memcpy(surface_object->source_data +
+				       surface_object->slices_size,
+			       buffer_object->data, payload);
+			surface_object->slices_size += payload;
+		}
 		surface_object->slices_count++;
 		break;
+	}

 	case VAPictureParameterBufferType:
 		switch (profile) {
@@ -157,6 +345,12 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			       sizeof(surface_object->params.vp9.picture));
 			break;

+		case VAProfileAV1Profile0:
+			memcpy(&surface_object->params.av1.picture,
+			       buffer_object->data,
+			       sizeof(surface_object->params.av1.picture));
+			break;
+
 		default:
 			break;
 		}
@@ -318,6 +512,26 @@ static VAStatus codec_set_controls(struct request_data *driver_data,
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;

+	case VAProfileAV1Profile0:
+		/*
+		 * Populates V4L2_CID_STATELESS_AV1_SEQUENCE from
+		 * VAPictureParameterBufferAV1.  The daedalus_v4l2 daemon
+		 * (issue #11 daemon track) synthesises an OBU_SEQUENCE_HEADER
+		 * from this ctrl and prepends it to the slice bitstream
+		 * before handing it to libavcodec/libdav1d, which otherwise
+		 * cannot parse the (sequence-header-stripped) OUTPUT buffer
+		 * that ffmpeg-vaapi delivers.
+		 *
+		 * On the RK3588 vpu981 hardware path the same SEQUENCE ctrl
+		 * is harmless: vpu981's driver parses the OBU stream
+		 * directly and ignores the ctrl payload, so no per-decoder
+		 * gating is required here.
+		 */
+		rc = av1_set_controls(driver_data, context, surface_object);
+		if (rc < 0)
+			return VA_STATUS_ERROR_OPERATION_FAILED;
+		break;
+
 	default:
 		return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 	}
@@ -94,6 +94,9 @@ static const char * const known_decoder_drivers[] = {
 	"rkvdec",
 	"hantro-vpu",
 	"rpi-hevc-dec",  /* iter40: Pi 5 / CM5 stateless HEVC */
+#ifdef HAVE_DAEDALUS_V4L2
+	"daedalus_v4l2", /* phase 8.10: Pi 5 daemon-backed VP9/AV1/H264 */
+#endif
 	"cedrus",
 	"sun4i_csi",
 	NULL
@@ -409,6 +412,16 @@ char request_device_kind_for_profile(VAProfile profile)
 	case VAProfileMPEG2Main:
 	case VAProfileVP8Version0_3:
 		return 'h';
+	case VAProfileAV1Profile0:
+		/*
+		 * ampere-av1-enablement Phase 2: RK3588 vpu981 dedicated
+		 * AV1 hantro instance. 'a' kind dispatches to
+		 * driver_data->video_fd_vpu981. On hosts without the AV1
+		 * instance the fd stays -1 and RequestQueryConfigProfiles
+		 * never enumerates AV1, so this branch is unreachable for
+		 * non-RK3588 hosts.
+		 */
+		return 'a';
 	default:
 		return '?';
 	}
@@ -448,6 +461,44 @@ int request_switch_device_for_profile(struct request_data *driver_data,
 		kind = 'p';
 	}

+#ifdef HAVE_DAEDALUS_V4L2
+	/*
+	 * LIBVA-1: VP9/AV1/H.264 → daedalus_v4l2 when the daemon-backed
+	 * decoder fd is open.  Pi 5 has no rkvdec (those profiles map to
+	 * 'r' by default → video_fd_rkvdec = -1 → "stay on whatever's
+	 * active" fallback would put H.264 frames on rpi-hevc-dec's fd
+	 * and S_FMT would fail).  Re-route to the daedalus daemon instead.
+	 *
+	 * HEVC stays on 'p' (rpi-hevc-dec is HEVC-only — daedalus would
+	 * accept it via FFmpeg, but rpi-hevc-dec has the GPU-backed
+	 * hardware path so it's the right choice on this SoC).
+	 *
+	 * AV1 'a' kind (RK3588 vpu981) wins ONLY if vpu981 was probed.
+	 * On a Pi 5 the vpu981 slot stays -1, so we still route AV1 to
+	 * daedalus here.  Check video_fd_vpu981 to preserve the RK3588
+	 * priority for that case.
+	 */
+	if (driver_data->video_fd_daedalus >= 0 &&
+	    driver_data->media_fd_daedalus >= 0) {
+		switch (profile) {
+		case VAProfileH264Main:
+		case VAProfileH264High:
+		case VAProfileH264ConstrainedBaseline:
+		case VAProfileH264MultiviewHigh:
+		case VAProfileH264StereoHigh:
+		case VAProfileVP9Profile0:
+			kind = 'd';
+			break;
+		case VAProfileAV1Profile0:
+			if (driver_data->video_fd_vpu981 < 0)
+				kind = 'd';
+			break;
+		default:
+			break;
+		}
+	}
+#endif
+
 	if (kind == 'r') {
 		target_video = driver_data->video_fd_rkvdec;
 		target_media = driver_data->media_fd_rkvdec;
@@ -457,6 +508,14 @@ int request_switch_device_for_profile(struct request_data *driver_data,
 	} else if (kind == 'p') {
 		target_video = driver_data->video_fd_rpi_hevc_dec;
 		target_media = driver_data->media_fd_rpi_hevc_dec;
+	} else if (kind == 'a') {
+		target_video = driver_data->video_fd_vpu981;
+		target_media = driver_data->media_fd_vpu981;
+#ifdef HAVE_DAEDALUS_V4L2
+	} else if (kind == 'd') {
+		target_video = driver_data->video_fd_daedalus;
+		target_media = driver_data->media_fd_daedalus;
+#endif
 	} else {
 		return -1;
 	}
@@ -646,6 +705,10 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 	driver_data->media_fd_hantro = -1;
 	driver_data->video_fd_rpi_hevc_dec = -1;
 	driver_data->media_fd_rpi_hevc_dec = -1;
+	driver_data->video_fd_daedalus = -1;
+	driver_data->media_fd_daedalus = -1;
+	driver_data->video_fd_vpu981 = -1;
+	driver_data->media_fd_vpu981 = -1;

 	/*
 	 * iter38: probe BOTH rkvdec and hantro-vpu so a single libva session
@@ -677,14 +740,35 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 				driver_data->video_fd_hantro = video_fd;
 				driver_data->media_fd_hantro = media_fd;
 			} else if (strcmp(info.driver, "rpi-hevc-dec") == 0) {
-				/* iter40: Pi 5 / CM5 — sole decoder is rpi-hevc-dec.
-				 * No alt driver to probe; the rkvdec / hantro slots
-				 * stay -1 and HEVC routes to 'p' via
-				 * request_device_kind_for_profile. */
+				/* iter40 + LIBVA-1: Pi 5 / CM5.  rpi-hevc-dec is
+				 * HEVC-only.  If daedalus_v4l2 is ALSO loaded (Pi 5
+				 * mixed deployment — out-of-tree daemon-backed
+				 * decoder for VP9/AV1/H264), pick it up as the alt
+				 * so VP9/AV1/H264 have somewhere to land. */
 				primary_driver = "rpi-hevc-dec";
+#ifdef HAVE_DAEDALUS_V4L2
+				alt_driver = "daedalus_v4l2";
+#else
 				alt_driver = NULL;
+#endif
 				driver_data->video_fd_rpi_hevc_dec = video_fd;
 				driver_data->media_fd_rpi_hevc_dec = media_fd;
+#ifdef HAVE_DAEDALUS_V4L2
+			} else if (strcmp(info.driver, "daedalus_v4l2") == 0) {
+				/* phase 8.10 + LIBVA-1: Pi 5 daemon-backed decoder.
+				 * VP9 / AV1 / H.264 route through it via the 'd'
+				 * kind below.  On a mixed-driver box where
+				 * rpi-hevc-dec is ALSO loaded, pick it up as the
+				 * alt so HEVC has somewhere to land too — find_
+				 * codec_device's known_decoder_drivers[] order
+				 * normally puts rpi-hevc-dec first (we hit the
+				 * other branch in practice), but symmetric handling
+				 * keeps us correct if probe order ever flips. */
+				primary_driver = "daedalus_v4l2";
+				alt_driver = "rpi-hevc-dec";
+				driver_data->video_fd_daedalus = video_fd;
+				driver_data->media_fd_daedalus = media_fd;
+#endif
 			}
 		}

@@ -696,15 +780,38 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 				int alt_v = open(alt_video, O_RDWR | O_NONBLOCK);
 				int alt_m = (alt_v >= 0) ? open(alt_media, O_RDWR | O_NONBLOCK) : -1;
 				if (alt_v >= 0 && alt_m >= 0) {
+					/* Dispatch into the matching per-driver slot.
+					 * iter38 only had rkvdec/hantro pairs; iter40 +
+					 * LIBVA-1 extended this to rpi-hevc-dec and
+					 * daedalus_v4l2 for the Pi 5 mixed-decoder
+					 * deployment. */
 					if (strcmp(alt_driver, "rkvdec") == 0) {
 						driver_data->video_fd_rkvdec = alt_v;
 						driver_data->media_fd_rkvdec = alt_m;
-					} else {
+					} else if (strcmp(alt_driver, "hantro-vpu") == 0) {
 						driver_data->video_fd_hantro = alt_v;
 						driver_data->media_fd_hantro = alt_m;
+					} else if (strcmp(alt_driver, "rpi-hevc-dec") == 0) {
+						driver_data->video_fd_rpi_hevc_dec = alt_v;
+						driver_data->media_fd_rpi_hevc_dec = alt_m;
+#ifdef HAVE_DAEDALUS_V4L2
+					} else if (strcmp(alt_driver, "daedalus_v4l2") == 0) {
+						driver_data->video_fd_daedalus = alt_v;
+						driver_data->media_fd_daedalus = alt_m;
+#endif
+					} else {
+						/* Shouldn't happen — primary_driver branches
+						 * above only set alt_driver to one of the
+						 * names handled here.  Close and move on. */
+						close(alt_v);
+						close(alt_m);
+						alt_v = -1;
+						alt_m = -1;
+					}
+					if (alt_v >= 0) {
+						request_log("iter38: also opened %s decoder at %s + %s\n",
+							    alt_driver, alt_video, alt_media);
 					}
-					request_log("iter38: also opened %s decoder at %s + %s\n",
-						    alt_driver, alt_video, alt_media);
 				} else {
 					if (alt_v >= 0) close(alt_v);
 					if (alt_m >= 0) close(alt_m);
@@ -712,6 +819,61 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 			}
 		}
 		(void)primary_driver;
+
+		/*
+		 * ampere-av1-enablement Phase 2: walk hantro-vpu media nodes
+		 * for a SECOND one that advertises V4L2_PIX_FMT_AV1_FRAME
+		 * (AV1F) as OUTPUT pixfmt. RK3588 has 3 hantro-vpu instances
+		 * (legacy MPEG2/VP8 decoder, vepu121 encoder, vpu981 AV1
+		 * decoder) all reporting driver="hantro-vpu" / model="hantro-
+		 * vpu" — so OUTPUT-format probe is the only reliable
+		 * disambiguator that doesn't depend on parsing card-name
+		 * strings (which are DTS-dependent). First match wins.
+		 *
+		 * On non-RK3588 hosts the slot stays -1; RequestQueryConfig
+		 * Profiles' AV1 push then no-ops because any_fd_supports_
+		 * output_format() returns false for AV1F.
+		 */
+		{
+			int i;
+			char path[32], av1_video[32];
+
+			for (i = 0; i < 16; i++) {
+				int mfd, vfd;
+				struct media_device_info info;
+
+				snprintf(path, sizeof path, "/dev/media%d", i);
+				mfd = open(path, O_RDWR | O_NONBLOCK);
+				if (mfd < 0) continue;
+				memset(&info, 0, sizeof info);
+				if (ioctl(mfd, MEDIA_IOC_DEVICE_INFO, &info) != 0 ||
+				    strcmp(info.driver, "hantro-vpu") != 0) {
+					close(mfd);
+					continue;
+				}
+				if (find_decoder_video_node_via_topology(
+					    mfd, av1_video, sizeof av1_video) != 0) {
+					close(mfd);
+					continue;
+				}
+				vfd = open(av1_video, O_RDWR | O_NONBLOCK);
+				if (vfd < 0) {
+					close(mfd);
+					continue;
+				}
+				if (!v4l2_find_format(vfd, V4L2_BUF_TYPE_VIDEO_OUTPUT, V4L2_PIX_FMT_AV1_FRAME) &&
+				    !v4l2_find_format(vfd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, V4L2_PIX_FMT_AV1_FRAME)) {
+					close(vfd);
+					close(mfd);
+					continue;
+				}
+				driver_data->video_fd_vpu981 = vfd;
+				driver_data->media_fd_vpu981 = mfd;
+				request_log("ampere-av1: vpu981 AV1 decoder at %s + %s\n",
+					    av1_video, path);
+				break;
+			}
+		}
 	}

 	/*
@@ -737,6 +899,14 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 			    driver_data->video_fd_rpi_hevc_dec,
 			    driver_data->media_fd_rpi_hevc_dec);
 	}
+#ifdef HAVE_DAEDALUS_V4L2
+	if (driver_data->video_fd_daedalus >= 0) {
+		request_log("phase 8.10: opened daedalus_v4l2 at video_fd=%d "
+			    "media_fd=%d (Pi 5 daemon-backed VP9/AV1/H264)\n",
+			    driver_data->video_fd_daedalus,
+			    driver_data->media_fd_daedalus);
+	}
+#endif

 	status = VA_STATUS_SUCCESS;
 	goto complete;
@@ -784,6 +954,20 @@ VAStatus RequestTerminate(VADriverContextP context)
 		close(driver_data->video_fd_hantro);
 	if (driver_data->media_fd_hantro >= 0)
 		close(driver_data->media_fd_hantro);
+	if (driver_data->video_fd_rpi_hevc_dec >= 0)
+		close(driver_data->video_fd_rpi_hevc_dec);
+	if (driver_data->media_fd_rpi_hevc_dec >= 0)
+		close(driver_data->media_fd_rpi_hevc_dec);
+	if (driver_data->video_fd_vpu981 >= 0)
+		close(driver_data->video_fd_vpu981);
+	if (driver_data->media_fd_vpu981 >= 0)
+		close(driver_data->media_fd_vpu981);
+#ifdef HAVE_DAEDALUS_V4L2
+	if (driver_data->video_fd_daedalus >= 0)
+		close(driver_data->video_fd_daedalus);
+	if (driver_data->media_fd_daedalus >= 0)
+		close(driver_data->media_fd_daedalus);
+#endif
 	/* Fall back to direct close if neither alt fd captured the active
 	 * pair (env-override path). */
 	if (driver_data->video_fd_rkvdec < 0 && driver_data->video_fd_hantro < 0) {
@@ -42,7 +42,16 @@

 #define V4L2_REQUEST_STR_VENDOR			"v4l2-request"

-#define V4L2_REQUEST_MAX_PROFILES		13
+/*
+ * Sized for max-possible enumeration with iter39 Option B reverted:
+ * MPEG2(2) + H264(6 incl. Hi10P) + HEVC(2 incl. Main10) + VP8 + VP9 + AV1 = 13.
+ * The per-group guards use `if (... && index < (MAX_PROFILES - N))` where N
+ * is the push-group size, so MAX must be ≥ total+1 — 14 here. Bumping
+ * defensively now so a future re-enable of Hi10P/Main10 doesn't silently
+ * drop AV1 through the off-by-one trap that ate ampere-av1's enumeration
+ * for a week (see issue marfrit/libva-v4l2-request-fourier#2).
+ */
+#define V4L2_REQUEST_MAX_PROFILES		14
 #define V4L2_REQUEST_MAX_ENTRYPOINTS		5
 #define V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES	10
 #define V4L2_REQUEST_MAX_IMAGE_FORMATS		10
@@ -87,6 +96,39 @@ struct request_data {
 	 */
 	int video_fd_rpi_hevc_dec;
 	int media_fd_rpi_hevc_dec;
+	/*
+	 * phase 8.10: fifth multi-device-probe slot for daedalus_v4l2 — the
+	 * out-of-tree V4L2 stateless decoder shim that forwards bitstream
+	 * to a userspace daemon (daedalus-v4l2 sibling repo). Daemon does
+	 * FFmpeg-software decode for VP9 / AV1 / H.264 and ships pixels
+	 * back via dmabuf into the CAPTURE buffer.  Picked up via the
+	 * same media-controller probe + known_decoder_drivers[] entry
+	 * pattern as iter40 rpi-hevc-dec.  Stays -1 on hosts without the
+	 * daedalus module loaded; HEVC routes to rpi-hevc-dec as before.
+	 *
+	 * Fields are unconditional (8 bytes per session) so the struct
+	 * layout is stable regardless of meson option.  The active
+	 * probe + dispatch code in request.c is gated by
+	 * HAVE_DAEDALUS_V4L2; when disabled the fields stay at their
+	 * -1 init and no codepath touches them.
+	 */
+	int video_fd_daedalus;
+	int media_fd_daedalus;
+	/*
+	 * ampere-av1-enablement Phase 2: fourth multi-device-probe slot
+	 * for vpu981 (RK3588's dedicated AV1 hantro instance, kernel
+	 * card="rockchip,rk3588-av1-vpu-dec", driver name "hantro-vpu" —
+	 * shared with the legacy MPEG-2/VP8/H.264 hantro). Discriminated
+	 * by V4L2_PIX_FMT_AV1_FRAME (AV1F) OUTPUT-pixfmt capability since
+	 * the driver name alone is ambiguous on RK3588. Stays -1 on hosts
+	 * without the AV1 vpu-dec.
+	 *
+	 * Named "vpu981" for consistency with the in-progress av1-iter1
+	 * operator branch (Phase 3-5 bit-exact AV1 work — when that lands
+	 * these fields receive the actual decode dispatch wiring).
+	 */
+	int video_fd_vpu981;
+	int media_fd_vpu981;

 	/*
 	 * iter2 (ampere-kernel-decoders campaign) — per-fd probe result
@@ -21,7 +21,10 @@
 #include "v4l2.h"

 int request_pool_init(struct request_pool *pool, int video_fd, int media_fd,
-		      unsigned int output_type, unsigned int count)
+		      unsigned int output_type, unsigned int count,
+		      unsigned int pixelformat,
+		      unsigned int picture_width,
+		      unsigned int picture_height)
 {
 	unsigned int index_base;
 	unsigned int length;
@@ -43,6 +46,16 @@ int request_pool_init(struct request_pool *pool, int video_fd, int media_fd,
 	pool->next = 0;
 	pool->media_fd = media_fd;	/* iter7: kept for force_release re-alloc */

+	/*
+	 * iter#15: cache the S_FMT params so request_pool_resize can
+	 * re-issue S_FMT with a sizeimage hint override on overrun.
+	 */
+	pool->video_fd = video_fd;
+	pool->output_type = output_type;
+	pool->pixelformat = pixelformat;
+	pool->picture_width = picture_width;
+	pool->picture_height = picture_height;
+
 	for (i = 0; i < count; i++)
 		pool->slots[i].request_fd = -1;

@@ -94,6 +107,118 @@ error:
 	return -1;
 }

+/*
+ * Re-allocate every OUTPUT slot of an initialised pool so that each
+ * slot can hold at least new_sizeimage_min bytes of bitstream.
+ *
+ * Sequence: STREAMOFF, munmap + close request fd on every slot,
+ * REQBUFS(0), S_FMT with the cached pixelformat / dimensions and the
+ * requested sizeimage, CREATE_BUFS for the original slot count, then
+ * per slot QUERYBUF + mmap + media_request_alloc, and finally STREAMON.
+ *
+ * pool:              pool previously set up by request_pool_init.
+ * new_sizeimage_min: minimum per-slot byte size required by the caller.
+ *
+ * Returns 0 when every slot has been rebuilt with a mapping of at
+ * least new_sizeimage_min bytes and the queue is streaming again.
+ * Returns -1 if the pool is NULL / uninitialised / empty, if any slot
+ * is still busy (nothing is touched in those cases), or if any step
+ * fails.  On a failure after STREAMOFF the queue is left stopped and
+ * slots that were not rebuilt stay unmapped (data NULL, size 0); the
+ * function does not roll back to the previous allocation.
+ */
+int request_pool_resize(struct request_pool *pool,
+			unsigned int new_sizeimage_min)
+{
+	unsigned int index_base;
+	unsigned int length;
+	unsigned int offset;
+	unsigned int saved_count;
+	unsigned int i;
+	int rc;
+
+	if (pool == NULL || !pool->initialized || pool->count == 0)
+		return -1;
+
+	/*
+	 * Pre-condition guard: no slot may be borrowed when we tear the
+	 * pool down. The caller in codec_store_buffer temporarily releases
+	 * the current in-flight surface's slot before invoking us; the
+	 * inline-Sync-in-EndPicture pattern guarantees no other slot is
+	 * borrowed elsewhere in the driver. Bail loudly if anyone breaks
+	 * that invariant rather than corrupting in-flight V4L2 state.
+	 */
+	for (i = 0; i < pool->count; i++) {
+		if (pool->slots[i].busy) {
+			request_log("request_pool_resize: slot %u still busy — "
+				    "caller must release before resize\n", i);
+			return -1;
+		}
+	}
+
+	saved_count = pool->count;
+
+	/* STREAMOFF the OUTPUT queue so REQBUFS(0) is accepted. */
+	rc = v4l2_set_stream(pool->video_fd, pool->output_type, false);
+	if (rc < 0)
+		return -1;
+
+	/*
+	 * Tear down every slot: munmap, close per-slot request_fd. Each
+	 * slot's data / size / request_fd is reset as it is released, so
+	 * an early return below never leaves a stale pointer or fd behind.
+	 */
+	for (i = 0; i < pool->count; i++) {
+		if (pool->slots[i].data != NULL && pool->slots[i].size > 0) {
+			munmap(pool->slots[i].data, pool->slots[i].size);
+			pool->slots[i].data = NULL;
+			pool->slots[i].size = 0;
+		}
+		if (pool->slots[i].request_fd >= 0) {
+			close(pool->slots[i].request_fd);
+			pool->slots[i].request_fd = -1;
+		}
+	}
+
+	/*
+	 * Release the V4L2 OUTPUT buffer indices. REQBUFS(0) is the only
+	 * way to ask the kernel to free buffers so CREATE_BUFS can re-
+	 * allocate with a new per-buffer sizeimage.
+	 */
+	rc = v4l2_request_buffers(pool->video_fd, pool->output_type, 0);
+	if (rc < 0)
+		return -1;
+
+	/*
+	 * Re-issue S_FMT with the cached dimensions but a larger
+	 * sizeimage. The kernel may adjust the value (driver-specific
+	 * page / alignment / maximum rules); the size actually granted
+	 * is read back per slot via v4l2_query_buffer below and checked
+	 * against new_sizeimage_min there.
+	 */
+	rc = v4l2_set_format_sizeimage(pool->video_fd, pool->output_type,
+				       pool->pixelformat,
+				       pool->picture_width,
+				       pool->picture_height,
+				       new_sizeimage_min);
+	if (rc < 0)
+		return -1;
+
+	rc = v4l2_create_buffers(pool->video_fd, pool->output_type,
+				 saved_count, &index_base);
+	if (rc < 0)
+		return -1;
+
+	for (i = 0; i < saved_count; i++) {
+		pool->slots[i].index = index_base + i;
+		pool->slots[i].busy = false;
+
+		rc = v4l2_query_buffer(pool->video_fd, pool->output_type,
+				       pool->slots[i].index,
+				       &length, &offset, 1);
+		if (rc < 0)
+			return -1;
+
+		/*
+		 * A driver is free to clamp the sizeimage hint. If the
+		 * buffer it handed back is still smaller than what the
+		 * caller needs, reporting success would send the caller
+		 * straight back into the same overrun — fail instead.
+		 */
+		if (length < new_sizeimage_min) {
+			request_log("request_pool_resize: slot %u is %u bytes, "
+				    "need at least %u\n",
+				    i, length, new_sizeimage_min);
+			return -1;
+		}
+
+		pool->slots[i].data = mmap(NULL, length,
+					   PROT_READ | PROT_WRITE,
+					   MAP_SHARED, pool->video_fd, offset);
+		if (pool->slots[i].data == MAP_FAILED) {
+			/* Keep the "unmapped" encoding consistent: NULL, not MAP_FAILED. */
+			pool->slots[i].data = NULL;
+			return -1;
+		}
+		pool->slots[i].size = length;
+
+		pool->slots[i].request_fd = media_request_alloc(pool->media_fd);
+		if (pool->slots[i].request_fd < 0)
+			return -1;
+	}
+
+	rc = v4l2_set_stream(pool->video_fd, pool->output_type, true);
+	if (rc < 0)
+		return -1;
+
+	/* Every slot is free again; restart the allocation cursor. */
+	pool->next = 0;
+	return 0;
+}
+
+
 void request_pool_destroy(struct request_pool *pool)
 {
 	unsigned int i;
@@ -52,16 +52,71 @@ struct request_pool {
 	int				 media_fd;	/* iter7: kept for
 							 * force_release re-alloc */
 	bool				 initialized;
+
+	/*
+	 * iter#15: cached S_FMT params from request_pool_init, so
+	 * request_pool_resize can re-S_FMT the OUTPUT queue with a new
+	 * sizeimage override on a mid-session resolution upshift overrun
+	 * without the caller having to re-thread these through six call
+	 * sites. video_fd is also cached so the resize is fully
+	 * self-contained — request_pool_resize takes only the pool and
+	 * the new sizeimage hint.
+	 */
+	int				 video_fd;
+	unsigned int			 output_type;
+	unsigned int			 pixelformat;
+	unsigned int			 picture_width;
+	unsigned int			 picture_height;
 };

 /*
 * Allocate count OUTPUT buffers via VIDIOC_CREATE_BUFS, query and mmap
 * each, populate pool->slots[]. Caller must have already done
- * VIDIOC_S_FMT on the OUTPUT queue. Returns 0 on success, -1 on
- * failure.
+ * VIDIOC_S_FMT on the OUTPUT queue. The S_FMT params (pixelformat,
+ * picture_width, picture_height) are stashed on the pool so that
+ * request_pool_resize can re-issue S_FMT with the same dimensions but
+ * a larger sizeimage hint. Returns 0 on success, -1 on failure.
 */
 int request_pool_init(struct request_pool *pool, int video_fd, int media_fd,
-		      unsigned int output_type, unsigned int count);
+		      unsigned int output_type, unsigned int count,
+		      unsigned int pixelformat,
+		      unsigned int picture_width,
+		      unsigned int picture_height);
+
+/*
+ * iter#15: grow the OUTPUT pool's per-slot sizeimage in place.
+ *
+ * Issued from codec_store_buffer when an Annex-B start code / VP8
+ * header pad / slice payload won't fit in the current
+ * surface->source_size — i.e. the stream's per-frame bitstream budget
+ * has outgrown the OUTPUT pool slot's mmap (typical cause: SPS-driven
+ * resolution upshift mid-session).
+ *
+ * Steps:
+ *   1. STREAMOFF the OUTPUT queue.
+ *   2. munmap every slot, close every per-slot media-request fd.
+ *   3. VIDIOC_REQBUFS(count=0) to release the V4L2 buffer indices.
+ *   4. S_FMT with the cached pixelformat / picture_width /
+ *      picture_height but a sizeimage hint of new_sizeimage_min.
+ *   5. CREATE_BUFS with the original slot count.
+ *   6. Per-slot: query buffer length, mmap, alloc fresh request_fd.
+ *   7. STREAMON.
+ *
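+ * Returns 0 on success, -1 on failure (caller falls back to
+ * VA_STATUS_ERROR_ALLOCATION_FAILED — the libva consumer recreates
+ * the surface at the new resolution).
+ *
+ * A failure is not rolled back.  If the pre-condition check rejects
+ * the call nothing is touched, but once step 1 has run a -1 return
+ * can leave the OUTPUT queue stopped and some or all slots unmapped
+ * and without a request fd.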
+ *
+ * Pre-condition: NO pool slot is currently borrowed (busy=false on
+ * every slot) AND no buffer is in-flight on the OUTPUT queue. The
+ * inline-Sync-in-EndPicture pattern (RequestEndPicture calls
+ * RequestSyncSurface before returning) makes this trivially true at
+ * codec_store_buffer time for the only-supported single-context
+ * single-render-surface flow: the in-flight surface's slot is the
+ * sole borrowed slot, and the resize caller temporarily releases it
+ * before calling here.
+ */
+int request_pool_resize(struct request_pool *pool,
+			unsigned int new_sizeimage_min);

 /*
 * Munmap all slots and free the slots array. Idempotent.
@@ -122,6 +122,18 @@ struct object_surface {
 			VADecPictureParameterBufferVP9 picture;
 			VASliceParameterBufferVP9 slice;
 		} vp9;
+		struct {
+			/*
+			 * AV1 picture parameter buffer.  Slice params are
+			 * intentionally absent — the daedalus daemon track
+			 * (issue #11) consumes the slice OBU bytes directly
+			 * from the OUTPUT bitstream and synthesises only the
+			 * sequence-header OBU from V4L2_CID_STATELESS_AV1_
+			 * SEQUENCE.  No per-tile-group struct→OBU re-synthesis
+			 * required from libva today.
+			 */
+			VADecPictureParameterBufferAV1 picture;
+		} av1;
 	} params;

 	int request_fd;
@@ -113,6 +113,28 @@ static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
 	}
 }

+/*
+ * Build a VIDIOC_S_FMT payload that carries an explicit sizeimage.
+ *
+ * The structure is cleared first; type, width, height, pixelformat and
+ * sizeimage are then stored in either the multi-planar member (plane 0)
+ * or the single-planar member, as selected by v4l2_type_is_mplane(type).
+ */
+static void v4l2_setup_format_sizeimage(struct v4l2_format *format,
+					unsigned int type,
+					unsigned int width, unsigned int height,
+					unsigned int pixelformat,
+					unsigned int sizeimage)
+{
+	struct v4l2_pix_format_mplane *mplane;
+	struct v4l2_pix_format *single;
+
+	memset(format, 0, sizeof(*format));
+	format->type = type;
+
+	if (!v4l2_type_is_mplane(type)) {
+		single = &format->fmt.pix;
+		single->pixelformat = pixelformat;
+		single->width = width;
+		single->height = height;
+		single->sizeimage = sizeimage;
+		return;
+	}
+
+	mplane = &format->fmt.pix_mp;
+	mplane->pixelformat = pixelformat;
+	mplane->width = width;
+	mplane->height = height;
+	mplane->plane_fmt[0].sizeimage = sizeimage;
+}
+
+
 bool v4l2_find_format(int video_fd, unsigned int type, unsigned int pixelformat)
 {
 	struct v4l2_fmtdesc fmtdesc;
@@ -172,6 +194,30 @@ int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
 	return 0;
 }

+/*
+ * Apply a format to a queue with a caller-supplied sizeimage.
+ *
+ * A sizeimage of 0 means "no override": the call is forwarded to
+ * v4l2_set_format unchanged.  Otherwise the format is built with
+ * v4l2_setup_format_sizeimage and submitted through VIDIOC_S_FMT.
+ *
+ * Returns 0 on success and -1 (after logging) when the ioctl fails;
+ * for sizeimage == 0 the result of v4l2_set_format is returned as is.
+ */
+int v4l2_set_format_sizeimage(int video_fd, unsigned int type,
+			      unsigned int pixelformat,
+			      unsigned int width, unsigned int height,
+			      unsigned int sizeimage)
+{
+	struct v4l2_format format;
+
+	if (sizeimage != 0) {
+		v4l2_setup_format_sizeimage(&format, type, width, height,
+					    pixelformat, sizeimage);
+
+		if (ioctl(video_fd, VIDIOC_S_FMT, &format) >= 0)
+			return 0;
+
+		request_log("Unable to set format (sizeimage=%u) for type %d: %s\n",
+			    sizeimage, type, strerror(errno));
+		return -1;
+	}
+
+	return v4l2_set_format(video_fd, type, pixelformat, width, height);
+}
+
+
 int v4l2_get_format(int video_fd, unsigned int type, unsigned int *width,
 		    unsigned int *height, unsigned int *bytesperline,
 		    unsigned int *sizes, unsigned int *planes_count)
@@ -476,12 +522,35 @@ int v4l2_set_controls(int video_fd, int request_fd,
 		      struct v4l2_ext_control *control_array,
 		      unsigned int num_controls)
 {
+	struct v4l2_ext_controls controls;
 	int rc;

-	rc = v4l2_ioctl_controls(video_fd, request_fd, VIDIOC_S_EXT_CTRLS,
-				 control_array, num_controls);
+	memset(&controls, 0, sizeof(controls));
+	controls.controls = control_array;
+	controls.count = num_controls;
+	if (request_fd >= 0) {
+		controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
+		controls.request_fd = request_fd;
+	}
+
+	rc = ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &controls);
 	if (rc < 0) {
-		request_log("Unable to set control(s): %s\n", strerror(errno));
+		/* error_idx is the index of the first failing control;
+		 * if it equals count, the ioctl itself failed (not a
+		 * specific control payload).  Useful for triaging
+		 * which V4L2_CID_STATELESS_* the kernel rejected. */
+		if (controls.error_idx < num_controls)
+			request_log("Unable to set control(s): %s "
+				    "(error_idx=%u/%u failing_ctrl_id=0x%x size=%u)\n",
+				    strerror(errno),
+				    controls.error_idx, controls.count,
+				    control_array[controls.error_idx].id,
+				    control_array[controls.error_idx].size);
+		else
+			request_log("Unable to set control(s): %s "
+				    "(error_idx=%u/%u ioctl-level)\n",
+				    strerror(errno),
+				    controls.error_idx, controls.count);
 		return -1;
 	}

@@ -36,6 +36,17 @@ bool v4l2_find_format(int video_fd, unsigned int type,
 		      unsigned int pixelformat);
 int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
 		    unsigned int width, unsigned int height);
+/*
+ * Same as v4l2_set_format but explicitly overrides the OUTPUT
+ * sizeimage hint. Pass sizeimage=0 to get the v4l2_set_format default
+ * (SOURCE_SIZE_MAX for OUTPUT, 0 for CAPTURE). Used by
+ * request_pool_resize on a mid-session bitstream-budget overrun to
+ * grow the OUTPUT pool slots past the SOURCE_SIZE_MAX floor.
+ */
+int v4l2_set_format_sizeimage(int video_fd, unsigned int type,
+			      unsigned int pixelformat,
+			      unsigned int width, unsigned int height,
+			      unsigned int sizeimage);
 int v4l2_get_format(int video_fd, unsigned int type, unsigned int *width,
 		    unsigned int *height, unsigned int *bytesperline,
 		    unsigned int *sizes, unsigned int *planes_count);