ampere-av1 Phase 5 review: stale linked_decode_surface_id clear; remap fix REVERTED

Two of three Phase 5 sonnet-architect review amendments addressed. Amendment 4 (kept): clear surface_object->linked_decode_surface_id at BeginPicture after the iter2 Fix 3 release. Prevents stale-link borrows in copy_surface_to_image when ffmpeg-vaapi recycles a former display surface as a decode target. No-op for non-AV1 codecs (link field stays VA_INVALID_SURFACE for them throughout). Amendment 1 (reverted): reviewer proposed remap_lr_type table {NONE, SWITCHABLE, WIENER, SGRPROJ} per Kwiboo's permutation, arguing AV1 spec FrameRestoreType wire encoding differs from V4L2_AV1_FRAME_RESTORE_* enum order. Applied the proposed table empirically → regressed ALL tests (allintra 10/10 → 0/10, test_av1 bit-exact → DIFF). Reverted to identity mapping. Either VAAPI's yframe_restoration_type is already in V4L2-enum order, or vpu981 interprets the V4L2 enum values via a mapping that differs from the uAPI header documentation. Per [[feedback_review_empirical_over_theoretical]] empirical PASS wins; updated the code comment to capture the investigation outcome so the next session has the context. Amendment 5 (SEPARATE_UV_DELTA_Q sequence flag missing): noted but not actionable — VAAPI doesn't expose color_config.separate_uv_delta_q. Will need bitstream-side info to surface. Not blocking current tests. Verification on ampere: test_av1.ivf: bit-exact PASS sha 029ee72c214b37c1 av1-1-b8-02-allintra.ivf: 10/10 PASS (no regression) av1_larger.ivf: 3/10 PASS (no regression) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
ampere-av1 Phase 3 finding: iter2 Fix 3 release is NOT the divergence cause
2026-05-17 12:19:19 +00:00 · 2026-05-17 12:12:23 +00:00 · 2026-05-17 10:55:07 +00:00 · 2026-05-17 10:45:31 +00:00 · 2026-05-17 10:31:24 +00:00 · 2026-05-17 10:28:32 +00:00
60 changed files with 19956 additions and 1128 deletions
@@ -0,0 +1,9 @@
+# STUDY.md → moved
+
+The Phase 0 / Phase 2 substrate that previously lived here has been transformed into the campaign-level Phase 0 document at:
+
+- [`../phase0_findings.md`](../phase0_findings.md)
+
+That document also points at the remaining open questions for Phase 1 lock and the verification gate at Phase 7. Read it together with the campaign README at [`../README.md`](../README.md).
+
+The git commit that this file points back to (the last commit while STUDY.md still held the substrate content) is `e0acc33` — `git show e0acc33:STUDY.md` recovers the historical content if needed.
@@ -1,197 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * These are the H.264 state controls for use with stateless H.264
- * codec drivers.
- *
- * It turns out that these structs are not stable yet and will undergo
- * more changes. So keep them private until they are stable and ready to
- * become part of the official public API.
- */
-
-#ifndef _H264_CTRLS_H_
-#define _H264_CTRLS_H_
-
-#include <linux/videodev2.h>
-
-/* Our pixel format isn't stable at the moment */
-#define V4L2_PIX_FMT_H264_SLICE_RAW v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
-
-/*
- * This is put insanely high to avoid conflicting with controls that
- * would be added during the phase where those controls are not
- * stable. It should be fixed eventually.
- */
-#define V4L2_CID_MPEG_VIDEO_H264_SPS		(V4L2_CID_MPEG_BASE+1000)
-#define V4L2_CID_MPEG_VIDEO_H264_PPS		(V4L2_CID_MPEG_BASE+1001)
-#define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX	(V4L2_CID_MPEG_BASE+1002)
-#define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS	(V4L2_CID_MPEG_BASE+1003)
-#define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS	(V4L2_CID_MPEG_BASE+1004)
-
-/* enum v4l2_ctrl_type type values */
-#define V4L2_CTRL_TYPE_H264_SPS			0x0110
-#define V4L2_CTRL_TYPE_H264_PPS			0x0111
-#define V4L2_CTRL_TYPE_H264_SCALING_MATRIX	0x0112
-#define V4L2_CTRL_TYPE_H264_SLICE_PARAMS	0x0113
-#define V4L2_CTRL_TYPE_H264_DECODE_PARAMS	0x0114
-
-#define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG			0x01
-#define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG			0x02
-#define V4L2_H264_SPS_CONSTRAINT_SET2_FLAG			0x04
-#define V4L2_H264_SPS_CONSTRAINT_SET3_FLAG			0x08
-#define V4L2_H264_SPS_CONSTRAINT_SET4_FLAG			0x10
-#define V4L2_H264_SPS_CONSTRAINT_SET5_FLAG			0x20
-
-#define V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE		0x01
-#define V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS	0x02
-#define V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO		0x04
-#define V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED	0x08
-#define V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY			0x10
-#define V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD		0x20
-#define V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE			0x40
-
-struct v4l2_ctrl_h264_sps {
-	__u8 profile_idc;
-	__u8 constraint_set_flags;
-	__u8 level_idc;
-	__u8 seq_parameter_set_id;
-	__u8 chroma_format_idc;
-	__u8 bit_depth_luma_minus8;
-	__u8 bit_depth_chroma_minus8;
-	__u8 log2_max_frame_num_minus4;
-	__u8 pic_order_cnt_type;
-	__u8 log2_max_pic_order_cnt_lsb_minus4;
-	__u8 max_num_ref_frames;
-	__u8 num_ref_frames_in_pic_order_cnt_cycle;
-	__s32 offset_for_ref_frame[255];
-	__s32 offset_for_non_ref_pic;
-	__s32 offset_for_top_to_bottom_field;
-	__u16 pic_width_in_mbs_minus1;
-	__u16 pic_height_in_map_units_minus1;
-	__u32 flags;
-};
-
-#define V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE				0x0001
-#define V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT	0x0002
-#define V4L2_H264_PPS_FLAG_WEIGHTED_PRED				0x0004
-#define V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT		0x0008
-#define V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED			0x0010
-#define V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT			0x0020
-#define V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE				0x0040
-#define V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT			0x0080
-
-struct v4l2_ctrl_h264_pps {
-	__u8 pic_parameter_set_id;
-	__u8 seq_parameter_set_id;
-	__u8 num_slice_groups_minus1;
-	__u8 num_ref_idx_l0_default_active_minus1;
-	__u8 num_ref_idx_l1_default_active_minus1;
-	__u8 weighted_bipred_idc;
-	__s8 pic_init_qp_minus26;
-	__s8 pic_init_qs_minus26;
-	__s8 chroma_qp_index_offset;
-	__s8 second_chroma_qp_index_offset;
-	__u16 flags;
-};
-
-struct v4l2_ctrl_h264_scaling_matrix {
-	__u8 scaling_list_4x4[6][16];
-	__u8 scaling_list_8x8[6][64];
-};
-
-struct v4l2_h264_weight_factors {
-	__s16 luma_weight[32];
-	__s16 luma_offset[32];
-	__s16 chroma_weight[32][2];
-	__s16 chroma_offset[32][2];
-};
-
-struct v4l2_h264_pred_weight_table {
-	__u16 luma_log2_weight_denom;
-	__u16 chroma_log2_weight_denom;
-	struct v4l2_h264_weight_factors weight_factors[2];
-};
-
-#define V4L2_H264_SLICE_TYPE_P				0
-#define V4L2_H264_SLICE_TYPE_B				1
-#define V4L2_H264_SLICE_TYPE_I				2
-#define V4L2_H264_SLICE_TYPE_SP				3
-#define V4L2_H264_SLICE_TYPE_SI				4
-
-#define V4L2_H264_SLICE_FLAG_FIELD_PIC			0x01
-#define V4L2_H264_SLICE_FLAG_BOTTOM_FIELD		0x02
-#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED	0x04
-#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH		0x08
-
-struct v4l2_ctrl_h264_slice_params {
-	/* Size in bytes, including header */
-	__u32 size;
-	/* Offset in bits to slice_data() from the beginning of this slice. */
-	__u32 header_bit_size;
-
-	__u16 first_mb_in_slice;
-	__u8 slice_type;
-	__u8 pic_parameter_set_id;
-	__u8 colour_plane_id;
-	__u8 redundant_pic_cnt;
-	__u16 frame_num;
-	__u16 idr_pic_id;
-	__u16 pic_order_cnt_lsb;
-	__s32 delta_pic_order_cnt_bottom;
-	__s32 delta_pic_order_cnt0;
-	__s32 delta_pic_order_cnt1;
-
-	struct v4l2_h264_pred_weight_table pred_weight_table;
-	/* Size in bits of dec_ref_pic_marking() syntax element. */
-	__u32 dec_ref_pic_marking_bit_size;
-	/* Size in bits of pic order count syntax. */
-	__u32 pic_order_cnt_bit_size;
-
-	__u8 cabac_init_idc;
-	__s8 slice_qp_delta;
-	__s8 slice_qs_delta;
-	__u8 disable_deblocking_filter_idc;
-	__s8 slice_alpha_c0_offset_div2;
-	__s8 slice_beta_offset_div2;
-	__u8 num_ref_idx_l0_active_minus1;
-	__u8 num_ref_idx_l1_active_minus1;
-	__u32 slice_group_change_cycle;
-
-	/*
-	 * Entries on each list are indices into
-	 * v4l2_ctrl_h264_decode_params.dpb[].
-	 */
-	__u8 ref_pic_list0[32];
-	__u8 ref_pic_list1[32];
-
-	__u32 flags;
-};
-
-#define V4L2_H264_DPB_ENTRY_FLAG_VALID		0x01
-#define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE		0x02
-#define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM	0x04
-
-struct v4l2_h264_dpb_entry {
-	__u64 reference_ts;
-	__u16 frame_num;
-	__u16 pic_num;
-	/* Note that field is indicated by v4l2_buffer.field */
-	__s32 top_field_order_cnt;
-	__s32 bottom_field_order_cnt;
-	__u32 flags; /* V4L2_H264_DPB_ENTRY_FLAG_* */
-};
-
-#define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC	0x01
-
-struct v4l2_ctrl_h264_decode_params {
-	struct v4l2_h264_dpb_entry dpb[16];
-	__u16 num_slices;
-	__u16 nal_ref_idc;
-	__u8 ref_pic_list_p0[32];
-	__u8 ref_pic_list_b0[32];
-	__u8 ref_pic_list_b1[32];
-	__s32 top_field_order_cnt;
-	__s32 bottom_field_order_cnt;
-	__u32 flags; /* V4L2_H264_DECODE_PARAM_FLAG_* */
-};
-
-#endif
@@ -1,185 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*
- * These are the HEVC state controls for use with stateless HEVC
- * codec drivers.
- *
- * It turns out that these structs are not stable yet and will undergo
- * more changes. So keep them private until they are stable and ready to
- * become part of the official public API.
- */
-
-#ifndef _HEVC_CTRLS_H_
-#define _HEVC_CTRLS_H_
-
-/* The pixel format isn't stable at the moment and will likely be renamed. */
-#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-
-#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_MPEG_BASE + 1008)
-#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_MPEG_BASE + 1009)
-#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_MPEG_BASE + 1010)
-
-/* enum v4l2_ctrl_type type values */
-#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-
-#define V4L2_HEVC_SLICE_TYPE_B	0
-#define V4L2_HEVC_SLICE_TYPE_P	1
-#define V4L2_HEVC_SLICE_TYPE_I	2
-
-/* The controls are not stable at the moment and will likely be reworked. */
-struct v4l2_ctrl_hevc_sps {
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-	__u8	chroma_format_idc;
-	__u8	separate_colour_plane_flag;
-	__u16	pic_width_in_luma_samples;
-	__u16	pic_height_in_luma_samples;
-	__u8	bit_depth_luma_minus8;
-	__u8	bit_depth_chroma_minus8;
-	__u8	log2_max_pic_order_cnt_lsb_minus4;
-	__u8	sps_max_dec_pic_buffering_minus1;
-	__u8	sps_max_num_reorder_pics;
-	__u8	sps_max_latency_increase_plus1;
-	__u8	log2_min_luma_coding_block_size_minus3;
-	__u8	log2_diff_max_min_luma_coding_block_size;
-	__u8	log2_min_luma_transform_block_size_minus2;
-	__u8	log2_diff_max_min_luma_transform_block_size;
-	__u8	max_transform_hierarchy_depth_inter;
-	__u8	max_transform_hierarchy_depth_intra;
-	__u8	scaling_list_enabled_flag;
-	__u8	amp_enabled_flag;
-	__u8	sample_adaptive_offset_enabled_flag;
-	__u8	pcm_enabled_flag;
-	__u8	pcm_sample_bit_depth_luma_minus1;
-	__u8	pcm_sample_bit_depth_chroma_minus1;
-	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-	__u8	pcm_loop_filter_disabled_flag;
-	__u8	num_short_term_ref_pic_sets;
-	__u8	long_term_ref_pics_present_flag;
-	__u8	num_long_term_ref_pics_sps;
-	__u8	sps_temporal_mvp_enabled_flag;
-	__u8	strong_intra_smoothing_enabled_flag;
-};
-
-struct v4l2_ctrl_hevc_pps {
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-	__u8	dependent_slice_segment_flag;
-	__u8	output_flag_present_flag;
-	__u8	num_extra_slice_header_bits;
-	__u8	sign_data_hiding_enabled_flag;
-	__u8	cabac_init_present_flag;
-	__s8	init_qp_minus26;
-	__u8	constrained_intra_pred_flag;
-	__u8	transform_skip_enabled_flag;
-	__u8	cu_qp_delta_enabled_flag;
-	__u8	diff_cu_qp_delta_depth;
-	__s8	pps_cb_qp_offset;
-	__s8	pps_cr_qp_offset;
-	__u8	pps_slice_chroma_qp_offsets_present_flag;
-	__u8	weighted_pred_flag;
-	__u8	weighted_bipred_flag;
-	__u8	transquant_bypass_enabled_flag;
-	__u8	tiles_enabled_flag;
-	__u8	entropy_coding_sync_enabled_flag;
-	__u8	num_tile_columns_minus1;
-	__u8	num_tile_rows_minus1;
-	__u8	column_width_minus1[20];
-	__u8	row_height_minus1[22];
-	__u8	loop_filter_across_tiles_enabled_flag;
-	__u8	pps_loop_filter_across_slices_enabled_flag;
-	__u8	deblocking_filter_override_enabled_flag;
-	__u8	pps_disable_deblocking_filter_flag;
-	__s8	pps_beta_offset_div2;
-	__s8	pps_tc_offset_div2;
-	__u8	lists_modification_present_flag;
-	__u8	log2_parallel_merge_level_minus2;
-	__u8	slice_segment_header_extension_present_flag;
-	__u8	padding;
-};
-
-#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
-#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
-#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
-
-#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-
-struct v4l2_hevc_dpb_entry {
-	__u64	timestamp;
-	__u8	rps;
-	__u8	field_pic;
-	__u16	pic_order_cnt[2];
-	__u8	padding[2];
-};
-
-struct v4l2_hevc_pred_weight_table {
-	__u8	luma_log2_weight_denom;
-	__s8	delta_chroma_log2_weight_denom;
-
-	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-
-	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-
-	__u8	padding[2];
-};
-
-struct v4l2_ctrl_hevc_slice_params {
-	__u32	bit_size;
-	__u32	data_bit_offset;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-	__u8	nal_unit_type;
-	__u8	nuh_temporal_id_plus1;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-	__u8	slice_type;
-	__u8	colour_plane_id;
-	__u16	slice_pic_order_cnt;
-	__u8	slice_sao_luma_flag;
-	__u8	slice_sao_chroma_flag;
-	__u8	slice_temporal_mvp_enabled_flag;
-	__u8	num_ref_idx_l0_active_minus1;
-	__u8	num_ref_idx_l1_active_minus1;
-	__u8	mvd_l1_zero_flag;
-	__u8	cabac_init_flag;
-	__u8	collocated_from_l0_flag;
-	__u8	collocated_ref_idx;
-	__u8	five_minus_max_num_merge_cand;
-	__u8	use_integer_mv_flag;
-	__s8	slice_qp_delta;
-	__s8	slice_cb_qp_offset;
-	__s8	slice_cr_qp_offset;
-	__s8	slice_act_y_qp_offset;
-	__s8	slice_act_cb_qp_offset;
-	__s8	slice_act_cr_qp_offset;
-	__u8	slice_deblocking_filter_disabled_flag;
-	__s8	slice_beta_offset_div2;
-	__s8	slice_tc_offset_div2;
-	__u8	slice_loop_filter_across_slices_enabled_flag;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-	__u8	pic_struct;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-	struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u8	num_active_dpb_entries;
-	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-
-	__u8	num_rps_poc_st_curr_before;
-	__u8	num_rps_poc_st_curr_after;
-	__u8	num_rps_poc_lt_curr;
-
-	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-	struct v4l2_hevc_pred_weight_table pred_weight_table;
-
-	__u8	padding[2];
-};
-
+/* Fourier-local override: the HEVC controls have been upstream since
+ * linux-media 6.6, so this header no longer carries its own copies of the
+ * struct definitions and simply defers to the kernel's
+ * linux/v4l2-controls.h. Keeping local duplicates causes redefinition
+ * errors when building against newer linux-api-headers. */
+#ifndef _LIBVA_V4L2_REQUEST_HEVC_CTRLS_H
+#define _LIBVA_V4L2_REQUEST_HEVC_CTRLS_H
+#include <linux/v4l2-controls.h>
 #endif /* _LIBVA_V4L2_REQUEST_HEVC_CTRLS_H */
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * These are the MPEG2 state controls for use with stateless MPEG-2
- * codec drivers.
- *
- * It turns out that these structs are not stable yet and will undergo
- * more changes. So keep them private until they are stable and ready to
- * become part of the official public API.
- */
-
-#ifndef _MPEG2_CTRLS_H_
-#define _MPEG2_CTRLS_H_
-
-#define V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS		(V4L2_CID_MPEG_BASE+250)
-#define V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION		(V4L2_CID_MPEG_BASE+251)
-
-/* enum v4l2_ctrl_type type values */
-#define V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS 0x0103
-#define	V4L2_CTRL_TYPE_MPEG2_QUANTIZATION 0x0104
-
-#define V4L2_MPEG2_PICTURE_CODING_TYPE_I	1
-#define V4L2_MPEG2_PICTURE_CODING_TYPE_P	2
-#define V4L2_MPEG2_PICTURE_CODING_TYPE_B	3
-#define V4L2_MPEG2_PICTURE_CODING_TYPE_D	4
-
-struct v4l2_mpeg2_sequence {
-	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */
-	__u16	horizontal_size;
-	__u16	vertical_size;
-	__u32	vbv_buffer_size;
-
-	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */
-	__u16	profile_and_level_indication;
-	__u8	progressive_sequence;
-	__u8	chroma_format;
-};
-
-struct v4l2_mpeg2_picture {
-	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */
-	__u8	picture_coding_type;
-
-	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */
-	__u8	f_code[2][2];
-	__u8	intra_dc_precision;
-	__u8	picture_structure;
-	__u8	top_field_first;
-	__u8	frame_pred_frame_dct;
-	__u8	concealment_motion_vectors;
-	__u8	q_scale_type;
-	__u8	intra_vlc_format;
-	__u8	alternate_scan;
-	__u8	repeat_first_field;
-	__u16	progressive_frame;
-};
-
-struct v4l2_ctrl_mpeg2_slice_params {
-	__u32	bit_size;
-	__u32	data_bit_offset;
-	__u64	backward_ref_ts;
-	__u64	forward_ref_ts;
-
-	struct v4l2_mpeg2_sequence sequence;
-	struct v4l2_mpeg2_picture picture;
-
-	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */
-	__u32	quantiser_scale_code;
-};
-
-struct v4l2_ctrl_mpeg2_quantization {
-	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Quant matrix extension */
-	__u8	load_intra_quantiser_matrix;
-	__u8	load_non_intra_quantiser_matrix;
-	__u8	load_chroma_intra_quantiser_matrix;
-	__u8	load_chroma_non_intra_quantiser_matrix;
-
-	__u8	intra_quantiser_matrix[64];
-	__u8	non_intra_quantiser_matrix[64];
-	__u8	chroma_intra_quantiser_matrix[64];
-	__u8	chroma_non_intra_quantiser_matrix[64];
-};
-
-#endif
@@ -0,0 +1,689 @@
+/*
+ * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
+ *
+ * ampere-av1-enablement Phase 2.1: AV1 codec dispatcher for libva-v4l2-
+ * request-fourier. Translates VAAPI AV1 picture/slice parameter buffers
+ * into V4L2 stateless AV1 controls (V4L2_CID_STATELESS_AV1_*) for the
+ * Rockchip vpu981 hardware on RK3588.
+ *
+ * Reference: Kwiboo/FFmpeg v4l2-request-n8.1:libavcodec/v4l2_request_av1.c
+ * (636 LoC; reads from FFmpeg's AV1RawSequenceHeader + AV1RawFrameHeader).
+ * VAAPI exposes the same AV1 spec semantics through different struct
+ * shapes: sequence-level fields are folded into VADecPictureParameterBufferAV1
+ * (no separate sequence buffer); per-frame fields live in the same struct.
+ *
+ * F1/F2/F3 risk mitigations per phase1_plan_v2 §"General fill_frame
+ * implementation risks":
+ *   F1 tile_info.mi_col/row_starts sentinel = 2 * ((frame_width + 7) >> 3)
+ *      mirrors Kwiboo lines 238/244 exactly.
+ *   F2 superres_denom: VAAPI exposes superres_scale_denominator directly
+ *      and per spec it's already 8 when use_superres=0. No offset math
+ *      needed (Kwiboo does it because FFmpeg stores raw coded_denom).
+ *   F3 loop_restoration_size[] gated on USES_LR flag mirrors Kwiboo
+ *      lines 281-287 exactly.
+ *
+ * V4L2 controls (4 per frame, batched in one VIDIOC_S_EXT_CTRLS):
+ *   1. V4L2_CID_STATELESS_AV1_SEQUENCE
+ *   2. V4L2_CID_STATELESS_AV1_FRAME
+ *   3. V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY[] (DYNAMIC_ARRAY)
+ *   4. V4L2_CID_STATELESS_AV1_FILM_GRAIN (conditional on driver_data->
+ *      has_av1_film_grain probe)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "av1.h"
+
+#include "context.h"
+#include "object_heap.h"
+#include "request.h"
+#include "surface.h"
+#include "utils.h"
+#include "v4l2.h"
+
+#include <va/va.h>
+
+#include <linux/videodev2.h>
+#include <linux/v4l2-controls.h>
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Sanity asserts to catch kernel uAPI drift. If these fire, the kernel
+ * headers on the build machine are out of sync with what the running
+ * driver expects — silent register-misalignment bugs result. Cross-compile
+ * hazard per Janet v3 amendment: native-arm64 builds only (boltzmann +
+ * ampere); no cross from x86 against ARM kernel headers. */
+_Static_assert(sizeof(struct v4l2_ctrl_av1_tile_group_entry) == 16,
+	       "v4l2_ctrl_av1_tile_group_entry size drift — recheck uAPI");
+
+/* Per AV1 spec, when use_superres=0 the superres denominator is 8.
+ * VAAPI's superres_scale_denominator already encodes this directly
+ * (per va_dec_av1.h: "When use_superres=0, superres_scale_denominator
+ * must be 8"). Kwiboo's AV1_SUPERRES_DENOM_MIN+coded_denom math is
+ * not needed when reading from VAAPI. */
+#define AV1_SUPERRES_NUM 8
+
+/* AV1 spec maxima used for V4L2 array sizing and as loop bounds in
+ * av1_fill_frame():
+ *   MAX_SEGMENTS x SEG_LVL_MAX  dimensions of the segmentation
+ *                               feature_mask / feature_data walk
+ *   SEG_LVL_REF_FRAME           first feature index that, when enabled,
+ *                               sets the SEG_ID_PRE_SKIP flag
+ *   NUM_REF_FRAMES              number of loop-filter ref_deltas copied
+ *   TOTAL_REFS_PER_FRAME        global-motion slots; slot 0 is the
+ *                               INTRA_FRAME slot and is skipped
+ */
+#define BACKEND_AV1_MAX_SEGMENTS	8
+#define BACKEND_AV1_SEG_LVL_MAX		8
+#define BACKEND_AV1_SEG_LVL_REF_FRAME	5
+#define BACKEND_AV1_NUM_REF_FRAMES	8
+#define BACKEND_AV1_TOTAL_REFS_PER_FRAME 8
+#define BACKEND_AV1_REFS_PER_FRAME	7
+
+/* ===== fill_sequence =====
+ * Build the V4L2 AV1 sequence control from the VAAPI picture parameters.
+ *
+ *   picture  VAAPI AV1 picture parameter buffer; only read here.
+ *   ctrl     output control; zeroed first, then filled in.
+ *
+ * Most flags are copied one-to-one from picture->seq_info_fields. The
+ * exceptions, each explained inline below:
+ *   - max_frame_{width,height}_minus_1 come from the current frame size;
+ *   - ENABLE_WARPED_MOTION, ENABLE_REF_FRAME_MVS and ENABLE_RESTORATION
+ *     are always set;
+ *   - ENABLE_SUPERRES follows the per-frame use_superres bit.
+ */
+static void av1_fill_sequence(VADecPictureParameterBufferAV1 *picture,
+			      struct v4l2_ctrl_av1_sequence *ctrl)
+{
+	uint8_t bit_depth;
+
+	memset(ctrl, 0, sizeof(*ctrl)); /* every field/flag not set below stays 0 */
+
+	switch (picture->bit_depth_idx) { /* index -> bits per sample */
+	case 0: bit_depth = 8; break;
+	case 1: bit_depth = 10; break;
+	case 2: bit_depth = 12; break;
+	default: bit_depth = 8; break; /* unrecognised index: fall back to 8-bit */
+	}
+
+	ctrl->seq_profile = picture->profile;
+	ctrl->order_hint_bits = picture->seq_info_fields.fields.enable_order_hint ?
+				(picture->order_hint_bits_minus_1 + 1) : 0; /* 0 when order hints are off */
+	ctrl->bit_depth = bit_depth;
+	/* VAAPI does NOT separately expose max_frame_{width,height}_minus_1
+	 * (sequence-level). Use the current frame size as a proxy. Correct
+	 * for fixed-size sequences (the 208/352/1080p test vectors); for a
+	 * stream whose frame size varies, the value reported here changes
+	 * with each frame instead of staying at the sequence maximum. */
+	ctrl->max_frame_width_minus_1 = picture->frame_width_minus1;
+	ctrl->max_frame_height_minus_1 = picture->frame_height_minus1;
+
+	if (picture->seq_info_fields.fields.still_picture)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE;
+	if (picture->seq_info_fields.fields.use_128x128_superblock)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK;
+	if (picture->seq_info_fields.fields.enable_filter_intra)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA;
+	if (picture->seq_info_fields.fields.enable_intra_edge_filter)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER;
+	if (picture->seq_info_fields.fields.enable_interintra_compound)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND;
+	if (picture->seq_info_fields.fields.enable_masked_compound)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND;
+	/* VAAPI doesn't expose enable_warped_motion as a sequence flag;
+	 * per-frame allow_warped_motion gates it. Conservative: set true so
+	 * per-frame flag is honored. */
+	ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION;
+	if (picture->seq_info_fields.fields.enable_dual_filter)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER;
+	if (picture->seq_info_fields.fields.enable_order_hint)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT;
+	if (picture->seq_info_fields.fields.enable_jnt_comp)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP;
+	/* enable_ref_frame_mvs / enable_restoration not exposed at sequence
+	 * level — conservative set-true (kdirect also sets these for the
+	 * test streams; gating doesn't matter because per-frame flags
+	 * govern actual use). ENABLE_RESTORATION itself is set further
+	 * down, after the CDEF flag. */
+	ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS;
+	/* enable_superres: gate on the current frame's use_superres so the
+	 * SEQUENCE flag matches the bitstream-derived value. Empirical
+	 * strace diff vs kdirect: kdirect clears this for streams that
+	 * never use superres; we were unconditionally setting it true. */
+	if (picture->pic_info_fields.bits.use_superres)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES;
+	if (picture->seq_info_fields.fields.enable_cdef)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF;
+	ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION; /* always on; see ref_frame_mvs note above */
+	if (picture->seq_info_fields.fields.mono_chrome)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME;
+	if (picture->seq_info_fields.fields.color_range)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE;
+	if (picture->seq_info_fields.fields.subsampling_x)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X;
+	if (picture->seq_info_fields.fields.subsampling_y)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y;
+	if (picture->seq_info_fields.fields.film_grain_params_present)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT;
+}
+
+/* ===== fill_frame ===== */
+static void av1_fill_frame(VADecPictureParameterBufferAV1 *picture,
+			   struct v4l2_ctrl_av1_frame *ctrl)
+{
+	unsigned int i, j;
+
+	memset(ctrl, 0, sizeof(*ctrl));
+
+	/* ---- tile_info ---- */
+	ctrl->tile_info.context_update_tile_id = picture->context_update_tile_id;
+	ctrl->tile_info.tile_cols = picture->tile_cols;
+	ctrl->tile_info.tile_rows = picture->tile_rows;
+	if (picture->tile_cols > 1 || picture->tile_rows > 1)
+		ctrl->tile_info.tile_size_bytes = 4;
+	else
+		ctrl->tile_info.tile_size_bytes = 0;
+
+	if (picture->pic_info_fields.bits.uniform_tile_spacing_flag)
+		ctrl->tile_info.flags |= V4L2_AV1_TILE_INFO_FLAG_UNIFORM_TILE_SPACING;
+
+	/* F1: mi_col/row_starts[]: prefix-sum from width_in_sbs_minus_1[]+1
+	 * (Kwiboo reads tile_start_col_sb[] directly; VAAPI doesn't expose
+	 * starts, only widths — reconstruct via accumulation). Plus the
+	 * sentinel at index tile_cols/tile_rows. */
+	{
+		uint16_t cum = 0;
+		for (i = 0; i < picture->tile_cols && i < 63; i++) {
+			ctrl->tile_info.mi_col_starts[i] = cum;
+			ctrl->tile_info.width_in_sbs_minus_1[i] =
+				picture->width_in_sbs_minus_1[i];
+			cum = (uint16_t)(cum + picture->width_in_sbs_minus_1[i] + 1);
+		}
+		ctrl->tile_info.mi_col_starts[picture->tile_cols] =
+			2 * ((picture->frame_width_minus1 + 1 + 7) >> 3);
+	}
+	{
+		uint16_t cum = 0;
+		for (i = 0; i < picture->tile_rows && i < 63; i++) {
+			ctrl->tile_info.mi_row_starts[i] = cum;
+			ctrl->tile_info.height_in_sbs_minus_1[i] =
+				picture->height_in_sbs_minus_1[i];
+			cum = (uint16_t)(cum + picture->height_in_sbs_minus_1[i] + 1);
+		}
+		ctrl->tile_info.mi_row_starts[picture->tile_rows] =
+			2 * ((picture->frame_height_minus1 + 1 + 7) >> 3);
+	}
+
+	/* ---- quantization ---- */
+	ctrl->quantization.base_q_idx = picture->base_qindex;
+	ctrl->quantization.delta_q_y_dc = picture->y_dc_delta_q;
+	ctrl->quantization.delta_q_u_dc = picture->u_dc_delta_q;
+	ctrl->quantization.delta_q_u_ac = picture->u_ac_delta_q;
+	ctrl->quantization.delta_q_v_dc = picture->v_dc_delta_q;
+	ctrl->quantization.delta_q_v_ac = picture->v_ac_delta_q;
+	ctrl->quantization.qm_y = picture->qmatrix_fields.bits.qm_y;
+	ctrl->quantization.qm_u = picture->qmatrix_fields.bits.qm_u;
+	ctrl->quantization.qm_v = picture->qmatrix_fields.bits.qm_v;
+	ctrl->quantization.delta_q_res =
+		picture->mode_control_fields.bits.log2_delta_q_res;
+
+	if (picture->u_dc_delta_q != picture->v_dc_delta_q ||
+	    picture->u_ac_delta_q != picture->v_ac_delta_q)
+		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_DIFF_UV_DELTA;
+	if (picture->qmatrix_fields.bits.using_qmatrix)
+		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_USING_QMATRIX;
+	if (picture->mode_control_fields.bits.delta_q_present_flag)
+		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_DELTA_Q_PRESENT;
+
+	/* ---- segmentation ---- */
+	if (picture->seg_info.segment_info_fields.bits.enabled)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_ENABLED;
+	if (picture->seg_info.segment_info_fields.bits.update_map)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_UPDATE_MAP;
+	if (picture->seg_info.segment_info_fields.bits.temporal_update)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_TEMPORAL_UPDATE;
+	if (picture->seg_info.segment_info_fields.bits.update_data)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_UPDATE_DATA;
+
+	for (i = 0; i < BACKEND_AV1_MAX_SEGMENTS; i++) {
+		for (j = 0; j < BACKEND_AV1_SEG_LVL_MAX; j++) {
+			if (picture->seg_info.feature_mask[i] & (1 << j)) {
+				ctrl->segmentation.feature_enabled[i] |=
+					V4L2_AV1_SEGMENT_FEATURE_ENABLED(j);
+				ctrl->segmentation.last_active_seg_id = i;
+				if (j >= BACKEND_AV1_SEG_LVL_REF_FRAME)
+					ctrl->segmentation.flags |=
+					    V4L2_AV1_SEGMENTATION_FLAG_SEG_ID_PRE_SKIP;
+			}
+			ctrl->segmentation.feature_data[i][j] =
+				picture->seg_info.feature_data[i][j];
+		}
+	}
+
+	/* ---- loop_filter ---- */
+	ctrl->loop_filter.level[0] = picture->filter_level[0];
+	ctrl->loop_filter.level[1] = picture->filter_level[1];
+	ctrl->loop_filter.level[2] = picture->filter_level_u;
+	ctrl->loop_filter.level[3] = picture->filter_level_v;
+	ctrl->loop_filter.sharpness =
+		picture->loop_filter_info_fields.bits.sharpness_level;
+	ctrl->loop_filter.mode_deltas[0] = picture->mode_deltas[0];
+	ctrl->loop_filter.mode_deltas[1] = picture->mode_deltas[1];
+	ctrl->loop_filter.delta_lf_res =
+		picture->mode_control_fields.bits.log2_delta_lf_res;
+	for (i = 0; i < BACKEND_AV1_NUM_REF_FRAMES; i++)
+		ctrl->loop_filter.ref_deltas[i] = picture->ref_deltas[i];
+
+	if (picture->loop_filter_info_fields.bits.mode_ref_delta_enabled)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_ENABLED;
+	if (picture->loop_filter_info_fields.bits.mode_ref_delta_update)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_UPDATE;
+	if (picture->mode_control_fields.bits.delta_lf_present_flag)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_PRESENT;
+	if (picture->mode_control_fields.bits.delta_lf_multi)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_MULTI;
+
+	/* ---- cdef ---- */
+	ctrl->cdef.damping_minus_3 = picture->cdef_damping_minus_3;
+	ctrl->cdef.bits = picture->cdef_bits;
+	for (i = 0; i < (unsigned)(1 << picture->cdef_bits) && i < 8; i++) {
+		uint8_t y = picture->cdef_y_strengths[i];
+		uint8_t uv = picture->cdef_uv_strengths[i];
+		ctrl->cdef.y_pri_strength[i] = (y >> 2) & 0x0F;
+		ctrl->cdef.y_sec_strength[i] = y & 0x03;
+		ctrl->cdef.uv_pri_strength[i] = (uv >> 2) & 0x0F;
+		ctrl->cdef.uv_sec_strength[i] = uv & 0x03;
+	}
+
+	/* ---- loop_restoration ---- (F3)
+	 * Phase 5 review Amendment 1 was REVERTED. The reviewer proposed
+	 * remap = {NONE, SWITCHABLE, WIENER, SGRPROJ} (Kwiboo's table)
+	 * based on AV1 spec FrameRestoreType wire encoding
+	 * {NONE=0, SWITCHABLE=1, WIENER=2, SGRPROJ=3} differing from V4L2's
+	 * {NONE=0, WIENER=1, SGRPROJ=2, SWITCHABLE=3}. Empirically applying
+	 * that permutation regressed ALL tests (allintra 10/10 → 0/10) —
+	 * so either VAAPI's yframe_restoration_type is NOT the raw spec
+	 * value (already-remapped to V4L2 enum semantics?), or vpu981
+	 * interprets the V4L2 enum values via a different mapping than
+	 * the V4L2 uAPI header documents. Per
+	 * [[feedback_review_empirical_over_theoretical]] keep the
+	 * identity mapping that empirically works; revisit if a
+	 * restoration-using fixture surfaces a real decode bug.
+	 */
+	{
+		uint8_t remap[4] = {
+			V4L2_AV1_FRAME_RESTORE_NONE,
+			V4L2_AV1_FRAME_RESTORE_WIENER,
+			V4L2_AV1_FRAME_RESTORE_SGRPROJ,
+			V4L2_AV1_FRAME_RESTORE_SWITCHABLE,
+		};
+		uint8_t y_t = picture->loop_restoration_fields.bits.yframe_restoration_type & 3;
+		uint8_t cb_t = picture->loop_restoration_fields.bits.cbframe_restoration_type & 3;
+		uint8_t cr_t = picture->loop_restoration_fields.bits.crframe_restoration_type & 3;
+		bool uses_lr = false;
+
+		ctrl->loop_restoration.frame_restoration_type[0] = remap[y_t];
+		ctrl->loop_restoration.frame_restoration_type[1] = remap[cb_t];
+		ctrl->loop_restoration.frame_restoration_type[2] = remap[cr_t];
+		if (y_t != 0)
+			uses_lr = true;
+		if (cb_t != 0 || cr_t != 0) {
+			uses_lr = true;
+			ctrl->loop_restoration.flags |=
+				V4L2_AV1_LOOP_RESTORATION_FLAG_USES_CHROMA_LR;
+		}
+
+		ctrl->loop_restoration.lr_unit_shift =
+			picture->loop_restoration_fields.bits.lr_unit_shift;
+		ctrl->loop_restoration.lr_uv_shift =
+			picture->loop_restoration_fields.bits.lr_uv_shift;
+
+		if (uses_lr) {
+			uint8_t shift = picture->loop_restoration_fields.bits.lr_unit_shift;
+			uint8_t uv_shift = picture->loop_restoration_fields.bits.lr_uv_shift;
+			ctrl->loop_restoration.flags |=
+				V4L2_AV1_LOOP_RESTORATION_FLAG_USES_LR;
+			ctrl->loop_restoration.loop_restoration_size[0] =
+				1 << (6 + shift);
+			ctrl->loop_restoration.loop_restoration_size[1] =
+				1 << (6 + shift - uv_shift);
+			ctrl->loop_restoration.loop_restoration_size[2] =
+				1 << (6 + shift - uv_shift);
+		}
+	}
+
+	/* ---- global_motion ---- */
+	for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++) {
+		if (i == 0)
+			continue; /* INTRA_FRAME slot — no warp */
+		ctrl->global_motion.type[i] = picture->wm[i - 1].wmtype;
+		for (j = 0; j < 6; j++)
+			ctrl->global_motion.params[i][j] = picture->wm[i - 1].wmmat[j];
+		if (picture->wm[i - 1].invalid)
+			ctrl->global_motion.invalid |=
+				V4L2_AV1_GLOBAL_MOTION_IS_INVALID(i);
+		switch (picture->wm[i - 1].wmtype) {
+		case 1:
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_TRANSLATION;
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
+			break;
+		case 2:
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_ROT_ZOOM;
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
+			break;
+		case 3:
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* ---- reference frames + order hints ---- */
+	/* reference_frame_ts[] is filled by the orchestrator (av1_set_controls)
+	 * which has driver_data for the SURFACE() lookup. order_hints[] is not
+	 * exposed per-ref by VAAPI, so it is zeroed here; the orchestrator
+	 * then overwrites it from the av1_order_hint cached on each reference
+	 * surface. ref_frame_idx[7] is the index map from spec-defined ref
+	 * slots (LAST..ALTREF) into ref_frame_map[8] (the surface IDs). */
+	for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++)
+		ctrl->order_hints[i] = 0;
+	for (i = 0; i < BACKEND_AV1_REFS_PER_FRAME; i++)
+		ctrl->ref_frame_idx[i] = picture->ref_frame_idx[i];
+
+	/* F2: superres_denom direct from VAAPI; fallback to AV1_SUPERRES_NUM
+	 * if zero (spec violation but defensive). */
+	ctrl->superres_denom = picture->superres_scale_denominator
+		? picture->superres_scale_denominator : AV1_SUPERRES_NUM;
+
+	ctrl->skip_mode_frame[0] = 0;
+	ctrl->skip_mode_frame[1] = 0;
+	ctrl->primary_ref_frame = picture->primary_ref_frame;
+	ctrl->frame_type = picture->pic_info_fields.bits.frame_type;
+	ctrl->order_hint = picture->order_hint;
+	ctrl->upscaled_width = picture->frame_width_minus1 + 1;
+	ctrl->interpolation_filter = picture->interp_filter;
+	ctrl->tx_mode = picture->mode_control_fields.bits.tx_mode;
+	ctrl->frame_width_minus_1 = picture->frame_width_minus1;
+	ctrl->frame_height_minus_1 = picture->frame_height_minus1;
+	ctrl->render_width_minus_1 = picture->frame_width_minus1;
+	ctrl->render_height_minus_1 = picture->frame_height_minus1;
+	ctrl->current_frame_id = 0;
+	/* Phase 3: VAAPI doesn't expose refresh_frame_flags. For KEY/SWITCH
+	 * frames the AV1 spec mandates 0xff (refresh all DPB slots). For
+	 * inter frames we default to 0xff too — simple P-frame chains will
+	 * naturally rotate through slots without a precise per-slot value.
+	 * If the stream needs precise control, this needs SPS-side parsing.
+	 * Empirical diff vs kdirect shows kdirect always sends 0xff here. */
+	ctrl->refresh_frame_flags = 0xff;
+
+	/* ---- frame flags ---- */
+	if (picture->pic_info_fields.bits.show_frame)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SHOW_FRAME;
+	if (picture->pic_info_fields.bits.showable_frame)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME;
+	if (picture->pic_info_fields.bits.error_resilient_mode)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE;
+	if (picture->pic_info_fields.bits.disable_cdf_update)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE;
+	if (picture->pic_info_fields.bits.allow_screen_content_tools)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS;
+	if (picture->pic_info_fields.bits.force_integer_mv)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV;
+	if (picture->pic_info_fields.bits.allow_intrabc)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC;
+	if (picture->pic_info_fields.bits.use_superres)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_USE_SUPERRES;
+	if (picture->pic_info_fields.bits.allow_high_precision_mv)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV;
+	if (picture->pic_info_fields.bits.is_motion_mode_switchable)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE;
+	if (picture->pic_info_fields.bits.use_ref_frame_mvs)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS;
+	if (picture->pic_info_fields.bits.disable_frame_end_update_cdf)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF;
+	if (picture->pic_info_fields.bits.allow_warped_motion)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION;
+	if (picture->mode_control_fields.bits.reference_select)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT;
+	if (picture->mode_control_fields.bits.reduced_tx_set_used)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET;
+	if (picture->mode_control_fields.bits.skip_mode_present) {
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED;
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT;
+	}
+}
+
+/* ===== fill_film_grain ===== */
+/*
+ * Build the V4L2 film-grain control from the VAAPI film_grain_info
+ * embedded in the picture parameter buffer.
+ *
+ * @picture: VAAPI AV1 picture parameters; only film_grain_info is read.
+ * @ctrl:    control to fill; zeroed first.
+ *
+ * Scalar parameters and flags are always translated. The scaling-point
+ * tables and AR coefficients are copied only when apply_grain is set.
+ * The point counts handed to the kernel are clamped to the number of
+ * entries this function copies (14 luma, 10 per chroma plane), so a
+ * bogus count can never advertise more points than were filled in.
+ */
+static void av1_fill_film_grain(VADecPictureParameterBufferAV1 *picture,
+				struct v4l2_ctrl_av1_film_grain *ctrl)
+{
+	enum {
+		FG_MAX_Y_POINTS = 14,
+		FG_MAX_UV_POINTS = 10,
+		FG_NUM_Y_AR_COEFFS = 24,
+		FG_NUM_UV_AR_COEFFS = 25,
+	};
+	VAFilmGrainStructAV1 *fg = &picture->film_grain_info;
+	unsigned int num_y = fg->num_y_points;
+	unsigned int num_cb = fg->num_cb_points;
+	unsigned int num_cr = fg->num_cr_points;
+	unsigned int i;
+
+	/* Keep the advertised counts in step with the copy loops below. */
+	if (num_y > FG_MAX_Y_POINTS)
+		num_y = FG_MAX_Y_POINTS;
+	if (num_cb > FG_MAX_UV_POINTS)
+		num_cb = FG_MAX_UV_POINTS;
+	if (num_cr > FG_MAX_UV_POINTS)
+		num_cr = FG_MAX_UV_POINTS;
+
+	memset(ctrl, 0, sizeof(*ctrl));
+
+	ctrl->cr_mult = fg->cr_mult;
+	ctrl->grain_seed = fg->grain_seed;
+	/* VAAPI doesn't expose film_grain_params_ref_idx (the reuse-from-
+	 * previous-frame index). Leave zero — only consulted when
+	 * update_grain=0, which VAAPI also doesn't expose. */
+	ctrl->film_grain_params_ref_idx = 0;
+	ctrl->num_y_points = num_y;
+	ctrl->num_cb_points = num_cb;
+	ctrl->num_cr_points = num_cr;
+	ctrl->grain_scaling_minus_8 =
+		fg->film_grain_info_fields.bits.grain_scaling_minus_8;
+	ctrl->ar_coeff_lag = fg->film_grain_info_fields.bits.ar_coeff_lag;
+	ctrl->ar_coeff_shift_minus_6 =
+		fg->film_grain_info_fields.bits.ar_coeff_shift_minus_6;
+	ctrl->grain_scale_shift =
+		fg->film_grain_info_fields.bits.grain_scale_shift;
+	ctrl->cb_mult = fg->cb_mult;
+	ctrl->cb_luma_mult = fg->cb_luma_mult;
+	ctrl->cr_luma_mult = fg->cr_luma_mult;
+	ctrl->cb_offset = fg->cb_offset;
+	ctrl->cr_offset = fg->cr_offset;
+
+	if (fg->film_grain_info_fields.bits.apply_grain) {
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN;
+		/* kdirect strace diff confirmed: V4L2_AV1_FILM_GRAIN_FLAG_
+		 * UPDATE_GRAIN must be set when apply_grain=1 (kdirect's
+		 * flags byte is 0x0B = APPLY|UPDATE|...). VAAPI's
+		 * VAFilmGrainStructAV1 doesn't expose update_grain
+		 * separately. Default to UPDATE=1 (use submitted params,
+		 * not reuse from non-existent prior film_grain ref). The
+		 * earlier segfault we saw with this flag was unmasked by
+		 * the link-NULL deref (now fixed via linked_decode_surface);
+		 * not caused by UPDATE_GRAIN itself. */
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_UPDATE_GRAIN;
+	}
+	if (fg->film_grain_info_fields.bits.chroma_scaling_from_luma)
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_CHROMA_SCALING_FROM_LUMA;
+	if (fg->film_grain_info_fields.bits.overlap_flag)
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_OVERLAP;
+	if (fg->film_grain_info_fields.bits.clip_to_restricted_range)
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_CLIP_TO_RESTRICTED_RANGE;
+
+	/* Tables are only meaningful when grain is actually applied. */
+	if (!fg->film_grain_info_fields.bits.apply_grain)
+		return;
+
+	for (i = 0; i < num_y; i++) {
+		ctrl->point_y_value[i] = fg->point_y_value[i];
+		ctrl->point_y_scaling[i] = fg->point_y_scaling[i];
+	}
+	for (i = 0; i < num_cb; i++) {
+		ctrl->point_cb_value[i] = fg->point_cb_value[i];
+		ctrl->point_cb_scaling[i] = fg->point_cb_scaling[i];
+	}
+	for (i = 0; i < num_cr; i++) {
+		ctrl->point_cr_value[i] = fg->point_cr_value[i];
+		ctrl->point_cr_scaling[i] = fg->point_cr_scaling[i];
+	}
+	/* The V4L2 fields are named *_plus_128: bias each VAAPI coefficient
+	 * by 128 before storing it as an unsigned byte. */
+	for (i = 0; i < FG_NUM_Y_AR_COEFFS; i++)
+		ctrl->ar_coeffs_y_plus_128[i] = (uint8_t)(fg->ar_coeffs_y[i] + 128);
+	for (i = 0; i < FG_NUM_UV_AR_COEFFS; i++) {
+		ctrl->ar_coeffs_cb_plus_128[i] = (uint8_t)(fg->ar_coeffs_cb[i] + 128);
+		ctrl->ar_coeffs_cr_plus_128[i] = (uint8_t)(fg->ar_coeffs_cr[i] + 128);
+	}
+}
+
+/* ===== orchestrator ===== */
+/*
+ * Translate the AV1 parameters stored on @surface_object into V4L2
+ * stateless controls and submit them on the surface's request.
+ *
+ * @driver_data:    backend state; supplies the surface heap, video_fd
+ *                  and the has_av1_film_grain capability flag.
+ * @context:        unused.
+ * @surface_object: decode target holding params.av1.* and request_fd.
+ *
+ * Returns 0 on success, -1 if the tile-entry array cannot be allocated
+ * or the control submission fails.
+ */
+int av1_set_controls(struct request_data *driver_data,
+		     struct object_context *context,
+		     struct object_surface *surface_object)
+{
+	VADecPictureParameterBufferAV1 *pic =
+		&surface_object->params.av1.picture;
+	struct v4l2_ctrl_av1_sequence seq_ctrl;
+	struct v4l2_ctrl_av1_frame frame_ctrl;
+	struct v4l2_ctrl_av1_film_grain grain_ctrl;
+	struct v4l2_ctrl_av1_tile_group_entry *entries;
+	struct v4l2_ext_control ext_ctrls[4];
+	unsigned int ctrl_count = 0;
+	unsigned int tile_count;
+	unsigned int entry_capacity;
+	unsigned int slot;
+	int ret;
+
+	(void)context;
+
+	/*
+	 * Film-grain link. With apply_grain=1 ffmpeg-vaapi uses a display
+	 * surface (current_display_picture) distinct from the decode
+	 * surface (current_frame), while vpu981 writes the grained output
+	 * into the decode CAPTURE buffer. Point the display surface back at
+	 * the decode surface so copy_surface_to_image can borrow its
+	 * destination_data[] when vaGetImage targets the display surface.
+	 */
+	if (picture_has_separate_display:
+	    0) {
+	}
+	if (pic->current_display_picture != VA_INVALID_SURFACE &&
+	    pic->current_display_picture != pic->current_frame) {
+		struct object_surface *display =
+			SURFACE(driver_data, pic->current_display_picture);
+
+		if (display != NULL)
+			display->linked_decode_surface_id = pic->current_frame;
+	}
+
+	tile_count = surface_object->params.av1.num_tile_group_entries;
+	if (tile_count > AV1_MAX_TILES)
+		tile_count = AV1_MAX_TILES;
+
+	/* The dynamic-array control must never be submitted with size 0
+	 * (Janet v2 Q1 amendment — kernel UB), so allocate at least one
+	 * zeroed entry. */
+	entry_capacity = tile_count;
+	if (entry_capacity == 0)
+		entry_capacity = 1;
+	entries = calloc(entry_capacity, sizeof(*entries));
+	if (entries == NULL)
+		return -1;
+
+	for (slot = 0; slot < tile_count; slot++) {
+		const VASliceParameterBufferAV1 *tile =
+			&surface_object->params.av1.tile_group_entries[slot];
+		struct v4l2_ctrl_av1_tile_group_entry *entry = &entries[slot];
+
+		entry->tile_offset = tile->slice_data_offset;
+		entry->tile_size = tile->slice_data_size;
+		entry->tile_row = (uint8_t)tile->tile_row;
+		entry->tile_col = (uint8_t)tile->tile_column;
+	}
+
+	av1_fill_sequence(pic, &seq_ctrl);
+	av1_fill_frame(pic, &frame_ctrl);
+
+	/*
+	 * Wire reference_frame_ts[] and order_hints[]. VAAPI hands us
+	 * ref_frame_map[] as surface IDs; the kernel matches references by
+	 * the timestamp recorded on each surface. Iteration is by DPB slot:
+	 * the ref-name reindex via ref_frame_idx[] was tried and regressed
+	 * (3/10 → 1/10), pending kernel-side disambiguation.
+	 *
+	 * Slots whose surface is invalid or unknown are skipped and keep
+	 * whatever av1_fill_frame() left in them. A reference whose own
+	 * timestamp is zero but which is linked to a decode surface takes
+	 * both timestamp and order hint from that decode surface.
+	 */
+	for (slot = 0; slot < BACKEND_AV1_TOTAL_REFS_PER_FRAME; slot++) {
+		struct object_surface *ref;
+		struct object_surface *hint_source;
+		uint64_t ref_ts;
+
+		if (pic->ref_frame_map[slot] == VA_INVALID_SURFACE)
+			continue;
+		ref = SURFACE(driver_data, pic->ref_frame_map[slot]);
+		if (ref == NULL)
+			continue;
+
+		ref_ts = v4l2_timeval_to_ns(&ref->timestamp);
+		hint_source = ref;
+		if (ref_ts == 0 &&
+		    ref->linked_decode_surface_id != VA_INVALID_SURFACE) {
+			hint_source = SURFACE(driver_data,
+					      ref->linked_decode_surface_id);
+			if (hint_source != NULL)
+				ref_ts = v4l2_timeval_to_ns(&hint_source->timestamp);
+		}
+		/* A dangling link leaves the slot's order hint untouched. */
+		if (hint_source != NULL)
+			frame_ctrl.order_hints[slot] = hint_source->av1_order_hint;
+		frame_ctrl.reference_frame_ts[slot] = ref_ts;
+	}
+
+	/* Remember this frame's order_hint on the decode surface, and on
+	 * the linked display surface if there is one, so that later frames
+	 * referencing either can populate order_hints[]. */
+	surface_object->av1_order_hint = pic->order_hint;
+	if (pic->current_display_picture != VA_INVALID_SURFACE &&
+	    pic->current_display_picture != pic->current_frame) {
+		struct object_surface *display =
+			SURFACE(driver_data, pic->current_display_picture);
+
+		if (display != NULL)
+			display->av1_order_hint = pic->order_hint;
+	}
+
+	memset(ext_ctrls, 0, sizeof(ext_ctrls));
+
+	ext_ctrls[ctrl_count].id = V4L2_CID_STATELESS_AV1_SEQUENCE;
+	ext_ctrls[ctrl_count].ptr = &seq_ctrl;
+	ext_ctrls[ctrl_count].size = sizeof(seq_ctrl);
+	ctrl_count++;
+
+	ext_ctrls[ctrl_count].id = V4L2_CID_STATELESS_AV1_FRAME;
+	ext_ctrls[ctrl_count].ptr = &frame_ctrl;
+	ext_ctrls[ctrl_count].size = sizeof(frame_ctrl);
+	ctrl_count++;
+
+	ext_ctrls[ctrl_count].id = V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY;
+	ext_ctrls[ctrl_count].ptr = entries;
+	ext_ctrls[ctrl_count].size = sizeof(*entries) * entry_capacity;
+	ctrl_count++;
+
+	/* The film-grain control is only built and sent when the driver
+	 * was detected as supporting it. */
+	if (driver_data->has_av1_film_grain) {
+		av1_fill_film_grain(pic, &grain_ctrl);
+
+		ext_ctrls[ctrl_count].id = V4L2_CID_STATELESS_AV1_FILM_GRAIN;
+		ext_ctrls[ctrl_count].ptr = &grain_ctrl;
+		ext_ctrls[ctrl_count].size = sizeof(grain_ctrl);
+		ctrl_count++;
+	}
+
+	ret = v4l2_set_controls(driver_data->video_fd,
+				surface_object->request_fd,
+				ext_ctrls, ctrl_count);
+
+	free(entries);
+
+	if (ret >= 0)
+		return 0;
+
+	request_log("ampere-av1: VIDIOC_S_EXT_CTRLS failed rc=%d\n", ret);
+	return -1;
+}
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
+ *
+ * ampere-av1-enablement Phase 2: AV1 codec dispatcher header for libva-
+ * v4l2-request-fourier. Mirrors vp9.h shape — single set_controls entry
+ * point that translates surface->params.av1.* VAAPI structures into a
+ * batch of V4L2_CID_STATELESS_AV1_{SEQUENCE,FRAME,TILE_GROUP_ENTRY,
+ * FILM_GRAIN} controls + the underlying request_fd / OUTPUT plane setup.
+ *
+ * V4L2 target: V4L2_PIX_FMT_AV1_FRAME on the vpu981 hantro instance
+ * (RK3588's dedicated AV1 decoder).
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _AV1_H_
+#define _AV1_H_
+
+#include "context.h"
+#include "request.h"
+#include "surface.h"
+
+/*
+ * av1_set_controls — build the V4L2 stateless AV1 controls (sequence,
+ * frame, tile group entries and, when the driver supports it, film
+ * grain) from surface->params.av1 and submit them on the surface's
+ * request_fd.
+ *
+ * driver_data: backend state (surface heap, video_fd, capability flags).
+ * context:     currently unused by the implementation.
+ * surface:     decode target carrying the VAAPI AV1 parameters.
+ *
+ * Returns 0 on success, -1 on allocation or control-submission failure.
+ */
+int av1_set_controls(struct request_data *driver_data,
+		     struct object_context *context,
+		     struct object_surface *surface);
+
+#endif /* _AV1_H_ */
@@ -55,12 +55,14 @@ VAStatus RequestCreateBuffer(VADriverContextP context, VAContextID context_id,
 	VAStatus status;
 	VABufferID id;

+
 	switch (type) {
 	case VAPictureParameterBufferType:
 	case VAIQMatrixBufferType:
 	case VASliceParameterBufferType:
 	case VASliceDataBufferType:
 	case VAImageBufferType:
+	case VAProbabilityBufferType:
 		break;

 	default:
@@ -109,6 +111,7 @@ complete:

 VAStatus RequestDestroyBuffer(VADriverContextP context, VABufferID buffer_id)
 {
+
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;

@@ -128,6 +131,7 @@ VAStatus RequestDestroyBuffer(VADriverContextP context, VABufferID buffer_id)
 VAStatus RequestMapBuffer(VADriverContextP context, VABufferID buffer_id,
 			  void **data_map)
 {
+
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;

@@ -143,6 +147,7 @@ VAStatus RequestMapBuffer(VADriverContextP context, VABufferID buffer_id,

 VAStatus RequestUnmapBuffer(VADriverContextP context, VABufferID buffer_id)
 {
+
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;

@@ -245,6 +250,7 @@ VAStatus RequestAcquireBufferHandle(VADriverContextP context,
 VAStatus RequestReleaseBufferHandle(VADriverContextP context,
 	VABufferID buffer_id)
 {
+
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
 	int export_fd;
@@ -0,0 +1,303 @@
+/*
+ * Iteration 2 Fix 3: cap_pool implementation.
+ *
+ * Design rationale + limitations: see cap_pool.h docblock.
+ *
+ * Concurrency model:
+ *   - All public functions take pool->lock at entry, release at exit.
+ *   - cap_pool_acquire scans the slots while holding the lock; the
+ *     hold time stays short because the scan is bounded by
+ *     pool->count (MIN_CAP_POOL = 24 typical).
+ *   - The slot pointer returned by acquire / mark_decoded /
+ *     mark_exported / release is stable across the call (lock is
+ *     dropped before return) but the slot's state may change between
+ *     calls. Callers MUST NOT cache slot pointers across sleep/I/O --
+ *     they should treat slot pointers as opaque references valid only
+ *     for the immediate operation.
+ *
+ *   In practice, our caller pattern is:
+ *     surface_object->current_slot = cap_pool_acquire(...);
+ *     v4l2_queue_buffer(slot->v4l2_index, ...);
+ *     // later, in SyncSurface for the same surface:
+ *     v4l2_dequeue_buffer(surface_object->current_slot->v4l2_index, ...);
+ *     cap_pool_mark_decoded(pool, surface_object->current_slot);
+ *
+ *   surface_object->current_slot is the persistent reference; the
+ *   slot's V4L2 index is stable for the slot's lifetime. The state
+ *   field IS read by other threads (acquire scans for FREE) — those
+ *   reads are safe because:
+ *     - acquire holds the lock during the scan
+ *     - mark_decoded/mark_exported/release also hold the lock
+ *   So state transitions are serialized.
+ */
+
+#include "cap_pool.h"
+#include "v4l2.h"
+#include "utils.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <linux/videodev2.h>
+
+/* Current CLOCK_MONOTONIC reading in nanoseconds; 0 if the clock cannot
+ * be read. Used as the LRU stamp for pool slots. */
+static uint64_t monotonic_ns(void)
+{
+	struct timespec now;
+	uint64_t whole_seconds_ns;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &now) < 0)
+		return 0;
+
+	whole_seconds_ns = (uint64_t)now.tv_sec * 1000000000ull;
+	return whole_seconds_ns + (uint64_t)now.tv_nsec;
+}
+
+/*
+ * cap_pool_init — create `count` CAPTURE buffers, mmap every plane of
+ * each buffer and mark all slots FREE.
+ *
+ * pool:         pool to initialise; overwritten (zeroed) on entry.
+ * video_fd:     V4L2 device the buffers are created on.
+ * capture_type: V4L2 buffer type of the CAPTURE queue.
+ * count:        number of slots; must be non-zero.
+ * v4l2_buffers_count_per_slot:
+ *               planes to query and mmap per slot; must not exceed
+ *               VIDEO_MAX_PLANES, the size of the per-slot arrays.
+ *
+ * Returns 0 on success. Returns -EINVAL for bad arguments, -ENOMEM if
+ * the slot array cannot be allocated, the negated pthread error if the
+ * mutex cannot be created, the v4l2_create_buffers error code, or -EIO
+ * if querying or mapping any buffer fails. On every failure after the
+ * argument checks the pool is left fully zeroed (initialized == false,
+ * slots == NULL) so no stale pointer survives.
+ */
+int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type,
+		  unsigned int count, unsigned int v4l2_buffers_count_per_slot)
+{
+	unsigned int index_base;
+	unsigned int i, j;
+	int rc;
+
+	if (pool == NULL || count == 0)
+		return -EINVAL;
+
+	/* slot->map[], map_lengths[] and map_offsets[] each hold
+	 * VIDEO_MAX_PLANES entries; a larger plane count would make the
+	 * loops below write past them. */
+	if (v4l2_buffers_count_per_slot > VIDEO_MAX_PLANES)
+		return -EINVAL;
+
+	memset(pool, 0, sizeof(*pool));
+
+	rc = pthread_mutex_init(&pool->lock, NULL);
+	if (rc != 0) {
+		memset(pool, 0, sizeof(*pool));
+		return -rc;
+	}
+
+	pool->slots = calloc(count, sizeof(*pool->slots));
+	if (pool->slots == NULL) {
+		pthread_mutex_destroy(&pool->lock);
+		memset(pool, 0, sizeof(*pool));
+		return -ENOMEM;
+	}
+	pool->count = count;
+
+	rc = v4l2_create_buffers(video_fd, capture_type, count, &index_base);
+	if (rc < 0) {
+		free(pool->slots);
+		pthread_mutex_destroy(&pool->lock);
+		/* Don't leave a dangling slots pointer / stale count behind. */
+		memset(pool, 0, sizeof(*pool));
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		struct cap_pool_slot *slot = &pool->slots[i];
+
+		slot->v4l2_index = index_base + i;
+		slot->buffers_count = v4l2_buffers_count_per_slot;
+		slot->state = CAP_SLOT_FREE;
+		slot->our_export_fd = -1;
+		slot->last_used_at_ns = 0;	/* never used → highest LRU priority */
+		slot->bound_to_surface_id = -1;
+
+		rc = v4l2_query_buffer(video_fd, capture_type, slot->v4l2_index,
+				       slot->map_lengths, slot->map_offsets,
+				       v4l2_buffers_count_per_slot);
+		if (rc < 0) {
+			request_log("cap_pool_init: query_buffer failed for "
+				    "slot %u (v4l2_index=%u)\n",
+				    i, slot->v4l2_index);
+			goto error_cleanup;
+		}
+
+		for (j = 0; j < v4l2_buffers_count_per_slot; j++) {
+			slot->map[j] = mmap(NULL, slot->map_lengths[j],
+					    PROT_READ | PROT_WRITE, MAP_SHARED,
+					    video_fd, slot->map_offsets[j]);
+			if (slot->map[j] == MAP_FAILED) {
+				request_log("cap_pool_init: mmap failed for "
+					    "slot %u plane %u\n", i, j);
+				/* NULL marks "nothing to unmap" for cleanup. */
+				slot->map[j] = NULL;
+				goto error_cleanup;
+			}
+		}
+	}
+
+	pool->initialized = true;
+	request_log("cap_pool_init: %u slots ready (v4l2_index=%u..%u, "
+		    "%u plane(s) per slot)\n",
+		    count, index_base, index_base + count - 1,
+		    v4l2_buffers_count_per_slot);
+	return 0;
+
+error_cleanup:
+	/* Slots never reached still have NULL maps from calloc, so walking
+	 * the whole array only unmaps what was actually mapped. */
+	for (i = 0; i < count; i++) {
+		struct cap_pool_slot *slot = &pool->slots[i];
+		for (j = 0; j < v4l2_buffers_count_per_slot; j++) {
+			if (slot->map[j] != NULL && slot->map[j] != MAP_FAILED)
+				munmap(slot->map[j], slot->map_lengths[j]);
+		}
+	}
+	(void)v4l2_request_buffers(video_fd, capture_type, 0);
+	free(pool->slots);
+	pthread_mutex_destroy(&pool->lock);
+	memset(pool, 0, sizeof(*pool));
+	return -EIO;
+}
+
+/*
+ * cap_pool_destroy — tear the pool down: close export fds the pool
+ * still owns, unmap every mapped plane, drop the CAPTURE buffers with
+ * a zero-count buffer request, then free the slot array and mark the
+ * pool uninitialised. A NULL or uninitialised pool is a no-op.
+ */
+void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type)
+{
+	unsigned int slot_idx;
+	unsigned int plane;
+
+	if (pool == NULL || !pool->initialized)
+		return;
+
+	pthread_mutex_lock(&pool->lock);
+
+	for (slot_idx = 0; slot_idx < pool->count; slot_idx++) {
+		struct cap_pool_slot *slot = pool->slots + slot_idx;
+
+		if (slot->our_export_fd >= 0) {
+			close(slot->our_export_fd);
+			slot->our_export_fd = -1;
+		}
+
+		for (plane = 0; plane < slot->buffers_count; plane++) {
+			void *mapping = slot->map[plane];
+
+			if (mapping == NULL || mapping == MAP_FAILED)
+				continue;
+			munmap(mapping, slot->map_lengths[plane]);
+			slot->map[plane] = NULL;
+		}
+	}
+
+	/* Best effort: the result of releasing the buffers is ignored. */
+	(void)v4l2_request_buffers(video_fd, capture_type, 0);
+
+	pthread_mutex_unlock(&pool->lock);
+	pthread_mutex_destroy(&pool->lock);
+
+	free(pool->slots);
+	pool->slots = NULL;
+	pool->count = 0;
+	pool->initialized = false;
+}
+
+/*
+ * Return the slot in state `wanted` with the smallest last_used_at_ns,
+ * or NULL if no slot is in that state. Caller must hold pool->lock.
+ */
+static struct cap_pool_slot *cap_pool_oldest_in_state(struct cap_pool *pool,
+						      enum cap_slot_state wanted)
+{
+	struct cap_pool_slot *oldest = NULL;
+	uint64_t oldest_ts = UINT64_MAX;
+	unsigned int idx;
+
+	for (idx = 0; idx < pool->count; idx++) {
+		struct cap_pool_slot *candidate = &pool->slots[idx];
+
+		if (candidate->state == wanted &&
+		    candidate->last_used_at_ns < oldest_ts) {
+			oldest = candidate;
+			oldest_ts = candidate->last_used_at_ns;
+		}
+	}
+	return oldest;
+}
+
+/*
+ * cap_pool_acquire — hand out a slot for a new decode on `surface_id`.
+ *
+ * Preference order:
+ *   1. the least-recently-used FREE slot;
+ *   2. otherwise the least-recently-used EXPORTED slot, force-recycled
+ *      (our export fd is closed). This is the documented Option A race
+ *      window: a consumer may still hold its own fd to the buffer while
+ *      new content is decoded into it.
+ * IN_DECODE and DECODED slots are never taken here; a DECODED slot
+ * only becomes available again through cap_pool_release.
+ *
+ * The returned slot is in IN_DECODE state, bound to `surface_id`, with
+ * its LRU stamp refreshed. Returns NULL if the pool is NULL or
+ * uninitialised, or if no FREE or EXPORTED slot exists.
+ */
+struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id)
+{
+	struct cap_pool_slot *chosen;
+
+	if (pool == NULL || !pool->initialized)
+		return NULL;
+
+	pthread_mutex_lock(&pool->lock);
+
+	chosen = cap_pool_oldest_in_state(pool, CAP_SLOT_FREE);
+	if (chosen == NULL) {
+		/* Pool exhausted: fall back to the oldest EXPORTED slot. */
+		chosen = cap_pool_oldest_in_state(pool, CAP_SLOT_EXPORTED);
+		if (chosen != NULL) {
+			request_log("cap_pool_acquire: pool exhausted, "
+				    "force-recycling EXPORTED slot v4l2_index=%u "
+				    "(consumer race window may open)\n",
+				    chosen->v4l2_index);
+			if (chosen->our_export_fd >= 0) {
+				close(chosen->our_export_fd);
+				chosen->our_export_fd = -1;
+			}
+		}
+	}
+
+	if (chosen == NULL) {
+		pthread_mutex_unlock(&pool->lock);
+		request_log("cap_pool_acquire: no slot available "
+			    "(pool->count=%u, all slots IN_DECODE/DECODED?)\n",
+			    pool->count);
+		return NULL;
+	}
+
+	chosen->state = CAP_SLOT_IN_DECODE;
+	chosen->bound_to_surface_id = surface_id;
+	chosen->last_used_at_ns = monotonic_ns();
+
+	pthread_mutex_unlock(&pool->lock);
+	return chosen;
+}
+
+/*
+ * cap_pool_mark_decoded — move `slot` to DECODED and refresh its LRU
+ * stamp, under the pool lock.
+ *
+ * No-op when `pool` or `slot` is NULL, or when the pool is not
+ * initialised. The latter matches cap_pool_acquire/cap_pool_destroy and
+ * avoids locking a destroyed mutex or writing to a slot array that
+ * cap_pool_destroy has already freed.
+ */
+void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot)
+{
+	if (pool == NULL || !pool->initialized || slot == NULL)
+		return;
+	pthread_mutex_lock(&pool->lock);
+	slot->state = CAP_SLOT_DECODED;
+	slot->last_used_at_ns = monotonic_ns();
+	pthread_mutex_unlock(&pool->lock);
+}
+
+/*
+ * cap_pool_mark_exported — record `our_fd` as the pool-owned export fd
+ * of `slot`, move the slot to EXPORTED and refresh its LRU stamp, under
+ * the pool lock. A different fd already stored on the slot is closed
+ * first.
+ *
+ * No-op when `pool` or `slot` is NULL, or when the pool is not
+ * initialised (matches cap_pool_acquire/cap_pool_destroy; avoids
+ * locking a destroyed mutex or touching a freed slot array). On those
+ * early returns `our_fd` is neither stored nor closed.
+ */
+void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd)
+{
+	if (pool == NULL || !pool->initialized || slot == NULL)
+		return;
+	pthread_mutex_lock(&pool->lock);
+	if (slot->our_export_fd >= 0 && slot->our_export_fd != our_fd) {
+		/*
+		 * Double-Export: a previous EXPBUF'd fd existed. Close
+		 * the old one. Consumer's old fd remains valid via
+		 * dma_buf refcount. Documented in surface.c export path.
+		 */
+		close(slot->our_export_fd);
+	}
+	slot->our_export_fd = our_fd;
+	slot->state = CAP_SLOT_EXPORTED;
+	slot->last_used_at_ns = monotonic_ns();
+	pthread_mutex_unlock(&pool->lock);
+}
+
+/*
+ * cap_pool_release — return `slot` to FREE under the pool lock: close
+ * the pool-owned export fd if there is one, clear the surface binding
+ * and refresh the LRU stamp.
+ *
+ * No-op when `pool` or `slot` is NULL, or when the pool is not
+ * initialised. The initialised check matches cap_pool_acquire and
+ * cap_pool_destroy; it keeps a release that arrives after
+ * cap_pool_destroy from locking a destroyed mutex and writing into the
+ * freed slot array.
+ */
+void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot)
+{
+	if (pool == NULL || !pool->initialized || slot == NULL)
+		return;
+	pthread_mutex_lock(&pool->lock);
+	if (slot->our_export_fd >= 0) {
+		close(slot->our_export_fd);
+		slot->our_export_fd = -1;
+	}
+	slot->state = CAP_SLOT_FREE;
+	slot->bound_to_surface_id = -1;
+	slot->last_used_at_ns = monotonic_ns();
+	pthread_mutex_unlock(&pool->lock);
+}
@@ -0,0 +1,156 @@
+/*
+ * Iteration 2 Fix 3: decoupled CAPTURE buffer pool with LRU recycling.
+ *
+ * Background — the bug this fixes:
+ *
+ *   Pre-iteration-2, each VAAPI surface was permanently 1:1 bound to a
+ *   V4L2 CAPTURE buffer index at vaCreateSurfaces2 time. Each decode
+ *   cycle re-QBUF'd that same physical buffer for the same surface ID.
+ *   When mpv reused a surface for a new decode while the compositor
+ *   still held an EXPBUF'd dma_buf fd to the prior frame's content,
+ *   the kernel wrote new decode output into the SAME physical memory
+ *   the compositor was reading from — visible as stutter / "back and
+ *   forth" frame swap during mpv --hwdec=vaapi --vo=gpu playback.
+ *
+ *   V4L2 does not enforce the constraint (it lets QBUF re-queue a
+ *   buffer regardless of dma_buf refcount on EXPBUF'd fds). userspace
+ *   must coordinate.
+ *
+ * Architecture (Sonnet Phase 5 review for iter2):
+ *
+ *   Pool of N CAPTURE buffers (N >= max(surfaces_count, MIN_CAP_POOL)).
+ *   Each slot has a state in {FREE, IN_DECODE, DECODED, EXPORTED}.
+ *   Surfaces are no longer permanently bound; each vaBeginPicture
+ *   acquires a FREE slot, binds it to the current decode, transitions
+ *   it through IN_DECODE → DECODED → optionally EXPORTED.
+ *
+ *   The DECODED state captures the window between SyncSurface DQBUF
+ *   and either ExportSurfaceHandle (DMA-BUF path) or DeriveImage
+ *   (vaapi-copy path). LRU recycling picks FREE slots first and falls
+ *   back to EXPORTED slots only when none is FREE; it never considers
+ *   DECODED (or IN_DECODE) slots, so they cannot be claimed by a
+ *   concurrent decode while the consumer is still using the bound
+ *   surface's content.
+ *
+ *   Concurrency: a pthread_mutex_t protects pool state. VAAPI is
+ *   re-entrant for multi-threaded consumers (mpv may BeginPicture/
+ *   SyncSurface from one thread and ExportSurfaceHandle from
+ *   another).
+ *
+ * Limitations (deferred to iteration 3+):
+ *
+ *   - Option-A statistical mitigation, not a correct fix. The race
+ *     window narrows from "constant" to "only when pool is exhausted
+ *     and force-recycle of oldest EXPORTED slot fires." For typical
+ *     mpv 16-surface playback with MIN_CAP_POOL=24, this never fires
+ *     in practice (Sonnet review iter2 question 3). For pathological
+ *     workloads (paused-with-video-still-visible, multi-stream),
+ *     race windows still possible. Iteration 3 may revisit with
+ *     V4L2_MEMORY_DMABUF + userspace allocation.
+ *
+ *   - LRU "force-recycle" still has the race in the worst case.
+ *     Closing OUR EXPBUF fd does not close the consumer's dup — the
+ *     consumer's fd keeps the dma_buf alive but the V4L2 layer will
+ *     happily write new data into the underlying physical memory on
+ *     re-QBUF. There is no public V4L2 API to query dma_buf refcount.
+ *
+ *   - Multi-context concurrent use (two libva contexts open
+ *     simultaneously, e.g. Firefox playing two videos in different
+ *     tabs through separate RDD instances): not addressed. Each
+ *     context gets its own pool, but there's only one V4L2 device.
+ */
+
+#ifndef _CAP_POOL_H_
+#define _CAP_POOL_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <pthread.h>
+
+#include <linux/videodev2.h>	/* for VIDEO_MAX_PLANES */
+
+#define MIN_CAP_POOL 24
+
+/*
+ * Lifecycle state of one pool slot. cap_pool_acquire moves a slot to
+ * IN_DECODE, cap_pool_mark_decoded to DECODED, cap_pool_mark_exported
+ * to EXPORTED and cap_pool_release back to FREE. FREE is 0 so a
+ * zero-filled slot starts out FREE.
+ */
+enum cap_slot_state {
+	CAP_SLOT_FREE = 0,	/* available for a new decode acquisition */
+	CAP_SLOT_IN_DECODE,	/* QBUF'd to V4L2, kernel owns */
+	CAP_SLOT_DECODED,	/* DQBUF'd, valid pixel content; mapped by surface */
+	CAP_SLOT_EXPORTED,	/* EXPBUF'd; consumer holds a dma_buf fd */
+};
+
+/*
+ * One CAPTURE buffer managed by the pool. The per-plane arrays are
+ * sized VIDEO_MAX_PLANES; only the first buffers_count entries are
+ * populated. state, our_export_fd, last_used_at_ns and
+ * bound_to_surface_id are modified by the cap_pool_* functions while
+ * holding the owning pool's lock.
+ */
+struct cap_pool_slot {
+	unsigned int		v4l2_index;			/* V4L2 buffer index */
+	void			*map[VIDEO_MAX_PLANES];		/* mmap pointers */
+	unsigned int		map_lengths[VIDEO_MAX_PLANES];
+	unsigned int		map_offsets[VIDEO_MAX_PLANES];
+	unsigned int		buffers_count;			/* V4L2 buffers per logical NV12 (1 for single-plane MPLANE) */
+	enum cap_slot_state	state;
+	int			our_export_fd;			/* -1 if not exported; close on FREE transition */
+	uint64_t		last_used_at_ns;		/* CLOCK_MONOTONIC when last touched (LRU) */
+	int			bound_to_surface_id;		/* -1 if not bound; informational */
+};
+
+/*
+ * The pool itself: an array of `count` slots guarded by `lock`.
+ * `initialized` is set by a successful cap_pool_init and cleared by
+ * cap_pool_destroy; cap_pool_acquire and cap_pool_destroy treat a pool
+ * with initialized == false as unusable.
+ */
+struct cap_pool {
+	struct cap_pool_slot	*slots;
+	unsigned int		count;		/* allocated slot count */
+	pthread_mutex_t		lock;
+	bool			initialized;
+};
+
+/*
+ * cap_pool_init — allocate a pool of `count` CAPTURE buffers via
+ * v4l2_create_buffers, mmap each buffer's planes, init slot states
+ * to FREE. `count` is used as given (it is not clamped here) and
+ * must be non-zero.
+ *
+ * Returns 0 on success, negative errno on failure.
+ */
+int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type,
+		  unsigned int count, unsigned int v4l2_buffers_count_per_slot);
+
+/*
+ * cap_pool_destroy — close any outstanding our_export_fds, munmap all
+ * planes, REQBUFS(0), free slots. Safe to call on a non-initialized
+ * pool (no-op).
+ *
+ * Note: closing our_export_fd does not invalidate any consumer-held
+ * dup'd fds — the kernel keeps the dma_buf alive while any fd refs
+ * it. munmap on our side is independent of the consumer's mmap (each
+ * mmap of a dma_buf is a distinct VMA).
+ */
+void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type);
+
+/*
+ * cap_pool_acquire — find a FREE slot with the oldest last_used_at_ns
+ * (LRU). If no FREE slot is available, force-recycle the oldest
+ * EXPORTED slot (close our_export_fd, demote to IN_DECODE for the
+ * caller). Returns NULL only if no slots can be recycled at all
+ * (catastrophic — pool too small).
+ *
+ * The returned slot is in IN_DECODE state. Caller QBUFs it and
+ * transitions to DECODED via cap_pool_mark_decoded after DQBUF.
+ */
+struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id);
+
+/*
+ * cap_pool_mark_decoded — IN_DECODE → DECODED. Touches last_used_at_ns.
+ * Called from RequestSyncSurface after successful DQBUF.
+ */
+void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot);
+
+/*
+ * cap_pool_mark_exported — DECODED → EXPORTED. Stores `our_fd` so the
+ * pool owns OUR copy of the EXPBUF'd fd; the consumer received a
+ * dup'd / equivalent fd via the descriptor. last_used_at_ns is
+ * touched again so EXPORTED slots are recycled in LRU order.
+ *
+ * Called from RequestExportSurfaceHandle after VIDIOC_EXPBUF.
+ */
+void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd);
+
+/*
+ * cap_pool_release — explicitly return a slot to FREE (close our
+ * export fd if any). Called from RequestDestroySurfaces and from
+ * RequestBeginPicture when re-acquiring (the surface's previous slot
+ * is released first, then a new one acquired).
+ */
+void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot);
+
+#endif /* _CAP_POOL_H_ */
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codec.h"
+
+#include <linux/videodev2.h>
+
+unsigned int pixelformat_for_profile(VAProfile profile)
+{
+	/* VAProfile -> V4L2 OUTPUT-queue FOURCC; unlisted profiles yield 0. */
+	static const struct { VAProfile profile; unsigned int fourcc; } map[] = {
+		{ VAProfileMPEG2Simple, V4L2_PIX_FMT_MPEG2_SLICE },
+		{ VAProfileMPEG2Main, V4L2_PIX_FMT_MPEG2_SLICE },
+		{ VAProfileH264Main, V4L2_PIX_FMT_H264_SLICE },
+		{ VAProfileH264High, V4L2_PIX_FMT_H264_SLICE },
+		{ VAProfileH264ConstrainedBaseline, V4L2_PIX_FMT_H264_SLICE },
+		{ VAProfileH264MultiviewHigh, V4L2_PIX_FMT_H264_SLICE },
+		{ VAProfileH264StereoHigh, V4L2_PIX_FMT_H264_SLICE },
+		{ VAProfileHEVCMain, V4L2_PIX_FMT_HEVC_SLICE },
+		{ VAProfileVP8Version0_3, V4L2_PIX_FMT_VP8_FRAME },
+		{ VAProfileVP9Profile0, V4L2_PIX_FMT_VP9_FRAME },
+		{ VAProfileAV1Profile0, V4L2_PIX_FMT_AV1_FRAME },
+	};
+	unsigned int k;
+
+	for (k = 0; k < sizeof(map) / sizeof(map[0]); k++)
+		if (map[k].profile == profile)
+			return map[k].fourcc;
+
+	return 0;
+}
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _CODEC_H_
+#define _CODEC_H_
+
+#include <va/va.h>
+
+/**
+ * pixelformat_for_profile - map a VA-API VAProfile to its V4L2 OUTPUT-side
+ *                            pixel format FOURCC.
+ *
+ * @profile: VAProfile enum value as passed to vaCreateConfig.
+ *
+ * Returns the V4L2_PIX_FMT_* constant that the V4L2 device's OUTPUT_MPLANE
+ * (bitstream-input) queue should be set to in order for the kernel
+ * stateless decoder to dispatch to the right codec_mode. Used at
+ * RequestCreateConfig to populate object_config->pixelformat, and read
+ * from there at RequestCreateContext when committing the OUTPUT format
+ * to the V4L2 device.
+ *
+ * Returns 0 for an unhandled profile; caller is expected to either
+ * fall back to a safe default or refuse to proceed.
+ */
+unsigned int pixelformat_for_profile(VAProfile profile);
+
+#endif /* _CODEC_H_ */
@@ -34,10 +34,9 @@

 #include <linux/videodev2.h>

-#include <mpeg2-ctrls.h>
-#include <h264-ctrls.h>
 #include <hevc-ctrls.h>

+#include "codec.h"
 #include "utils.h"
 #include "v4l2.h"

@@ -54,22 +53,55 @@ VAStatus RequestCreateConfig(VADriverContextP context, VAProfile profile,
 	int i, index;

 	switch (profile) {
-	case VAProfileMPEG2Simple:
-	case VAProfileMPEG2Main:
+	
 	case VAProfileH264Main:
 	case VAProfileH264High:
 	case VAProfileH264ConstrainedBaseline:
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
-		case VAProfileHEVCMain:
-		if (entrypoint != VAEntrypointVLD)
-			return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT;
+		// FIXME
+		break;
+	case VAProfileMPEG2Simple:
+	case VAProfileMPEG2Main:
+		// fresnel-fourier iter1: MPEG-2 enabled. Same shape as H.264
+		// above — no profile-specific config validation in the libva
+		// backend; validation happens at vaCreateContext / control
+		// submission time.
+		break;
+	case VAProfileHEVCMain:
+		// fresnel-fourier iter2: HEVC enabled. Same shape as H.264/
+		// MPEG-2 above — no profile-specific config validation in the
+		// libva backend; validation happens at vaCreateContext / control
+		// submission time.
+		break;
+	case VAProfileVP8Version0_3:
+		// fresnel-fourier iter3: VP8 enabled. Same shape as iter1+iter2
+		// above — no profile-specific config validation in the libva
+		// backend; validation happens at vaCreateContext / control
+		// submission time.
+		break;
+	case VAProfileVP9Profile0:
+		// fresnel-fourier iter4: VP9 Profile 0 enabled on rkvdec.
+		// Same shape — no profile-specific validation here.
+		break;
+	case VAProfileAV1Profile0:
+		// ampere-av1-enablement: AV1 Profile 0 enabled on vpu981.
+		// Same shape — no profile-specific validation here.
 		break;
-
 	default:
 		return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 	}

+	/* VLD is the only entrypoint RequestQueryConfigEntrypoints reports. */
+	if (entrypoint != VAEntrypointVLD)
+		return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT;
+	/*
+	 * iter38: retarget the active V4L2 device to the decoder (rkvdec or
+	 * hantro-vpu on RK3399) serving this profile; no-op if already active.
+	 * A switch tears down pools + video_format cache for CreateContext.
+	 */
+	(void)request_switch_device_for_profile(driver_data, profile);
+
 	if (attributes_count > V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES)
 		attributes_count = V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES;

@@ -80,6 +112,16 @@ VAStatus RequestCreateConfig(VADriverContextP context, VAProfile profile,

 	config_object->profile = profile;
 	config_object->entrypoint = entrypoint;
+	/*
+	 * iter5b-β: cache the V4L2 OUTPUT-side FOURCC for this profile so
+	 * context.c::RequestCreateContext can read it without re-running
+	 * the profile→pixelformat mapping. Wires up the previously-dead
+	 * pixelformat field at config.h:46. The switch above already
+	 * rejected unsupported profiles via VA_STATUS_ERROR_UNSUPPORTED_PROFILE,
+	 * so pixelformat_for_profile here returns non-zero for every
+	 * profile that reaches this assignment.
+	 */
+	config_object->pixelformat = pixelformat_for_profile(profile);
 	config_object->attributes[0].type = VAConfigAttribRTFormat;
 	config_object->attributes[0].value = VA_RT_FORMAT_YUV420;
 	config_object->attributes_count = 1;
@@ -111,6 +153,31 @@ VAStatus RequestDestroyConfig(VADriverContextP context, VAConfigID config_id)
 	return VA_STATUS_SUCCESS;
 }

+/*
+ * iter38: check whether `fmt` is supported on any of the open V4L2 device
+ * fds (the active video_fd plus the rkvdec, hantro and vpu981 alt fds).
+ * Tries both VIDEO_OUTPUT and VIDEO_OUTPUT_MPLANE.
+ */
+static bool any_fd_supports_output_format(struct request_data *driver_data,
+					  unsigned int fmt)
+{
+	const int candidates[] = {
+		driver_data->video_fd, driver_data->video_fd_rkvdec,
+		driver_data->video_fd_hantro, driver_data->video_fd_vpu981,
+	};
+	unsigned int slot;
+
+	for (slot = 0; slot < sizeof(candidates) / sizeof(candidates[0]); slot++) {
+		int fd = candidates[slot];
+		/* Negative fds are skipped; else probe OUTPUT, then OUTPUT_MPLANE. */
+		if (fd >= 0 &&
+		    (v4l2_find_format(fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, fmt) ||
+		     v4l2_find_format(fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, fmt)))
+			return true;
+	}
+	return false;
+}
+
 VAStatus RequestQueryConfigProfiles(VADriverContextP context,
 				    VAProfile *profiles, int *profiles_count)
 {
@@ -118,18 +185,14 @@ VAStatus RequestQueryConfigProfiles(VADriverContextP context,
 	unsigned int index = 0;
 	bool found;

-	found = v4l2_find_format(driver_data->video_fd,
-				 V4L2_BUF_TYPE_VIDEO_OUTPUT,
-				 V4L2_PIX_FMT_MPEG2_SLICE);
-	if (found && index < (V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES - 2)) {
+	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_MPEG2_SLICE);
+	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 2)) {
 		profiles[index++] = VAProfileMPEG2Simple;
 		profiles[index++] = VAProfileMPEG2Main;
 	}

-	found = v4l2_find_format(driver_data->video_fd,
-				 V4L2_BUF_TYPE_VIDEO_OUTPUT,
-				 V4L2_PIX_FMT_H264_SLICE_RAW);
-	if (found && index < (V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES - 5)) {
+	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_H264_SLICE);
+	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 5)) {
 		profiles[index++] = VAProfileH264Main;
 		profiles[index++] = VAProfileH264High;
 		profiles[index++] = VAProfileH264ConstrainedBaseline;
@@ -137,12 +200,29 @@ VAStatus RequestQueryConfigProfiles(VADriverContextP context,
 		profiles[index++] = VAProfileH264StereoHigh;
 	}

-	found = v4l2_find_format(driver_data->video_fd,
-				 V4L2_BUF_TYPE_VIDEO_OUTPUT,
-				 V4L2_PIX_FMT_HEVC_SLICE);
-	if (found && index < (V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES - 1))
+	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_HEVC_SLICE);
+	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
 		profiles[index++] = VAProfileHEVCMain;

+	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_VP8_FRAME);
+	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
+		profiles[index++] = VAProfileVP8Version0_3;
+
+	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_VP9_FRAME);
+	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
+		profiles[index++] = VAProfileVP9Profile0;
+
+	/*
+	 * ampere-av1-enablement: AV1 routes to vpu981 (advertised via the
+	 * new video_fd_vpu981 slot). AV1 is the last entry of a table that
+	 * V4L2_REQUEST_MAX_PROFILES=11 makes EXACTLY full, so one free slot
+	 * (index < MAX) is all it needs. Future profile additions require
+	 * bumping that constant + verifying libva consumers' profiles[] sizing.
+	 */
+	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_AV1_FRAME);
+	if (found && index < V4L2_REQUEST_MAX_PROFILES)
+		profiles[index++] = VAProfileAV1Profile0;
+
 	*profiles_count = index;

 	return VA_STATUS_SUCCESS;
@@ -162,6 +242,9 @@ VAStatus RequestQueryConfigEntrypoints(VADriverContextP context,
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
 	case VAProfileHEVCMain:
+	case VAProfileVP8Version0_3:
+	case VAProfileVP9Profile0:
+	case VAProfileAV1Profile0:
 		entrypoints[0] = VAEntrypointVLD;
 		*entrypoints_count = 1;
 		break;
@@ -43,6 +43,7 @@ struct object_config {
 	VAEntrypoint entrypoint;
 	VAConfigAttrib attributes[V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES];
 	int attributes_count;
+	unsigned int pixelformat;
 };

 VAStatus RequestCreateConfig(VADriverContextP context, VAProfile profile,
@@ -29,6 +29,7 @@
 #include "request.h"
 #include "surface.h"

+#include <errno.h>
 #include <stdlib.h>
 #include <string.h>

@@ -39,8 +40,6 @@

 #include <linux/videodev2.h>

-#include <mpeg2-ctrls.h>
-#include <h264-ctrls.h>
 #include <hevc-ctrls.h>

 #include "utils.h"
@@ -55,35 +54,323 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 {
 	struct request_data *driver_data = context->pDriverData;
 	struct object_config *config_object;
-	struct object_surface *surface_object;
 	struct object_context *context_object = NULL;
 	struct video_format *video_format;
-	unsigned int length;
-	unsigned int offset;
-	void *source_data = MAP_FAILED;
+	unsigned int destination_sizes[VIDEO_MAX_PLANES];
+	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
+	unsigned int destination_planes_count;
+	unsigned int format_width, format_height;
+	unsigned int pixelformat;
 	VASurfaceID *ids = NULL;
 	VAContextID id;
 	VAStatus status;
 	unsigned int output_type, capture_type;
-	unsigned int pixelformat;
-	unsigned int index_base;
-	unsigned int index;
-	unsigned int i;
+	unsigned int j;
+	bool found;
 	int rc;

-	video_format = driver_data->video_format;
-	if (video_format == NULL)
-		return VA_STATUS_ERROR_OPERATION_FAILED;
-
-	output_type = v4l2_type_video_output(video_format->v4l2_mplane);
-	capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);
-
+	/*
+	 * iter5b-β: CreateContext owns the V4L2 OUTPUT-side device-format
+	 * lifecycle (S_FMT, CAPTURE-format probe, cap_pool_init, per-surface
+	 * destination_* fill). Pre-β these lived in CreateSurfaces2 with a
+	 * resolution-change gate; β moves them here because (a) config_id
+	 * is known so the right OUTPUT pixel format can be derived from
+	 * the bound profile, and (b) STREAMON happens at the end of this
+	 * function, so the queue is never streaming when we do S_FMT.
+	 *
+	 * DestroyContext is the only per-session teardown site under β
+	 * (no in-CreateSurfaces2 teardown branch). It STREAMOFFs both
+	 * queues, calls request_pool_destroy + cap_pool_destroy, and
+	 * REQBUFS(0) — leaving the V4L2 device in a clean slate for the
+	 * next CreateContext.
+	 */
 	config_object = CONFIG(driver_data, config_id);
 	if (config_object == NULL) {
 		status = VA_STATUS_ERROR_INVALID_CONFIG;
 		goto error;
 	}

+	pixelformat = config_object->pixelformat;
+	if (pixelformat == 0) {
+		/*
+		 * Defensive: CreateConfig rejects unhandled profiles, so
+		 * pixelformat is always non-zero by the time we get here.
+		 * Belt-and-suspenders.
+		 */
+		status = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
+		goto error;
+	}
+
+	/*
+	 * Probe the CAPTURE-side V4L2 format. video_format is a static
+	 * pointer into video.c's formats[]; it stays valid for the life of
+	 * the driver_data and is cached across CreateContext cycles. The
+	 * probe doesn't require any prior S_FMT — v4l2_find_format
+	 * enumerates the device's supported formats directly.
+	 */
+	if (!driver_data->video_format) {
+		video_format = NULL;
+		found = v4l2_find_format(driver_data->video_fd,
+					 V4L2_BUF_TYPE_VIDEO_CAPTURE,
+					 V4L2_PIX_FMT_SUNXI_TILED_NV12);
+		if (found)
+			video_format = video_format_find(V4L2_PIX_FMT_SUNXI_TILED_NV12);
+
+		found = v4l2_find_format(driver_data->video_fd,
+					 V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
+					 V4L2_PIX_FMT_NV12);
+		if (found)
+			video_format = video_format_find(V4L2_PIX_FMT_NV12);
+
+		if (video_format == NULL) {
+			status = VA_STATUS_ERROR_OPERATION_FAILED;
+			goto error;
+		}
+
+		driver_data->video_format = video_format;
+	}
+	video_format = driver_data->video_format;
+
+	output_type = v4l2_type_video_output(video_format->v4l2_mplane);
+	capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);
+
+	/*
+	 * Commit the OUTPUT pixel format. picture_width/picture_height
+	 * are the kernel-facing dimensions for this decode session. With
+	 * profile-derived pixelformat, hantro's CAPTURE-format derivation
+	 * dispatches to the right codec_mode (pre-β hardcoded H264_SLICE
+	 * meant hantro silently substituted MPEG2_DECODER for HEVC/VP8/VP9
+	 * → all-zero CAPTURE; rkvdec silently dropped HEVC/VP9 → same
+	 * outcome).
+	 */
+	rc = v4l2_set_format(driver_data->video_fd, output_type, pixelformat,
+			     picture_width, picture_height);
+	if (rc < 0) {
+		status = VA_STATUS_ERROR_OPERATION_FAILED;
+		goto error;
+	}
+
+	/*
+	 * iter15 α-19: explicit S_FMT on CAPTURE for rkvdec.
+	 *
+	 * Original iter5b-β comment: "Do NOT VIDIOC_S_FMT on CAPTURE — hantro
+	 * reads the SPS from OUTPUT to set CAPTURE shape internally."
+	 *
+	 * Empirical finding at iter15 Phase 3 (2026-05-14): kdirect (ffmpeg-
+	 * v4l2request) does S_FMT on CAPTURE side after S_FMT(OUTPUT),
+	 * then CREATE_BUFS for CAPTURE. libva's old G_FMT-only path skipped
+	 * the S_FMT call. For hantro this was deliberate (works); for rkvdec
+	 * (HEVC + H.264 + VP9 on RK3399) the absence of explicit S_FMT puts
+	 * the driver into a state where it does NOT commit the chosen NV12
+	 * pixel format properly — and the resulting decode silently writes
+	 * garbage or zero for HEVC + H.264 (Bug 4 + Bug 5).
+	 *
+	 * Per [[feedback-per-driver-kludge-gating]]: this driver-specific
+	 * difference should be gated on driver_kind. For now use a single
+	 * always-on S_FMT call as the safe move: kdirect proves S_FMT
+	 * CAPTURE works on both hantro AND rkvdec (it's the reference path).
+	 * The iter5b-β comment is preserved-but-amended below.
+	 *
+	 * Sequence: S_FMT OUTPUT (above) → S_FMT CAPTURE (this) → G_FMT
+	 * CAPTURE (sanity read-back, matches what S_FMT committed).
+	 */
+	{
+		unsigned int capture_pixelformat = V4L2_PIX_FMT_NV12;
+		rc = v4l2_set_format(driver_data->video_fd, capture_type,
+				     capture_pixelformat, picture_width,
+				     picture_height);
+		if (rc < 0) {
+			/* Non-fatal: if the kernel rejects S_FMT CAPTURE (some
+			 * older hantro variants), fall through to G_FMT. */
+			request_log("iter15 α-19: S_FMT CAPTURE failed (continuing): %s\n",
+				    strerror(errno));
+		}
+	}
+
+	rc = v4l2_get_format(driver_data->video_fd, capture_type, &format_width,
+			     &format_height, destination_bytesperlines,
+			     destination_sizes, NULL);
+	if (rc < 0) {
+		status = VA_STATUS_ERROR_OPERATION_FAILED;
+		goto error;
+	}
+
+	/*
+	 * iter25 α-25: synthetic-SPS injection to pre-seed ctx->image_fmt
+	 * before CAPTURE buffer allocation.
+	 *
+	 * Root cause (iter17→iter24 kernel-printk chain): rkvdec_s_ctrl for
+	 * HEVC_SPS / H264_SPS calls get_image_fmt() and, if the resolved
+	 * image_fmt differs from the cached ctx->image_fmt (default
+	 * RKVDEC_IMG_FMT_ANY), tries to reset the CAPTURE format. The reset
+	 * returns -EBUSY when vb2_is_busy(CAPTURE_queue) — i.e. any CAPTURE
+	 * buffer is allocated.
+	 *
+	 * libva (iter5b-β CAPTURE pool) pre-allocates 24 CAPTURE buffers
+	 * via cap_pool_init below — before any per-frame S_EXT_CTRLS
+	 * arrives. So the first real HEVC_SPS at decode time fails with
+	 * -EBUSY in try_or_set_cluster, breaks v4l2_ctrl_request_setup's
+	 * outer loop, and leaves ctx->ctrl_hdl[SPS..DECODE_PARAMS] at all-
+	 * zero contents. rkvdec_hevc_run reads zero, hardware sees w=0
+	 * h=0, decoded CAPTURE is all-zero (Bug 5 + Bug 4).
+	 *
+	 * Fix: while CAPTURE is still empty (before cap_pool_init), inject
+	 * a synthetic SPS containing the profile's chroma + bit_depth so
+	 * rkvdec_s_ctrl resolves image_fmt and updates ctx->image_fmt
+	 * before vb2_is_busy can return true. From then on, per-frame
+	 * SPS submissions with matching profile parameters see
+	 * image_fmt_changed=false → skip reset → commit succeeds.
+	 *
+	 * Gated by config->profile: only HEVC and H.264 paths set
+	 * get_image_fmt in their rkvdec coded_fmt_desc->ops; VP9 / MPEG-2 /
+	 * VP8 are unaffected (rkvdec_s_ctrl returns 0 immediately when
+	 * get_image_fmt is NULL, or those codecs are routed to hantro).
+	 *
+	 * Failure is best-effort: if the kernel returns -EBUSY/-EINVAL here
+	 * (e.g. driver doesn't expose the control on this DT path), we fall
+	 * through and may still hit the original bug for that codec — but
+	 * the device-init DECODE_MODE + START_CODE block below ALSO uses
+	 * void-cast best-effort, so this is consistent with prior pattern.
+	 */
+	{
+		switch (config_object->profile) {
+		case VAProfileHEVCMain: {
+			struct v4l2_ctrl_hevc_sps dummy_sps;
+			struct v4l2_ext_control dummy_ctrl;
+
+			memset(&dummy_sps, 0, sizeof(dummy_sps));
+			dummy_sps.chroma_format_idc = 1; /* 4:2:0 */
+			dummy_sps.bit_depth_luma_minus8 = 0; /* 8-bit */
+			dummy_sps.bit_depth_chroma_minus8 = 0;
+			dummy_sps.pic_width_in_luma_samples = picture_width;
+			dummy_sps.pic_height_in_luma_samples = picture_height;
+
+			dummy_ctrl.id = V4L2_CID_STATELESS_HEVC_SPS;
+			dummy_ctrl.ptr = &dummy_sps;
+			dummy_ctrl.size = sizeof(dummy_sps);
+			(void)v4l2_set_controls(driver_data->video_fd, -1,
+						&dummy_ctrl, 1);
+			break;
+		}
+		case VAProfileH264Main:
+		case VAProfileH264High:
+		case VAProfileH264ConstrainedBaseline:
+		case VAProfileH264MultiviewHigh:
+		case VAProfileH264StereoHigh: {
+			struct v4l2_ctrl_h264_sps dummy_sps;
+			struct v4l2_ext_control dummy_ctrl;
+
+			memset(&dummy_sps, 0, sizeof(dummy_sps));
+			dummy_sps.chroma_format_idc = 1; /* 4:2:0 */
+			dummy_sps.bit_depth_luma_minus8 = 0;
+			dummy_sps.bit_depth_chroma_minus8 = 0;
+			dummy_sps.pic_width_in_mbs_minus1 =
+				(picture_width + 15) / 16 - 1;
+			dummy_sps.pic_height_in_map_units_minus1 =
+				(picture_height + 15) / 16 - 1;
+			dummy_sps.profile_idc = 100; /* High */
+			dummy_sps.level_idc = 41;
+			/*
+			 * FRAME_MBS_ONLY required: rkvdec_h264_validate_sps
+			 * doubles height for non-frame-mbs-only streams to
+			 * compute frame-height from field-height. Without
+			 * this flag, dummy with (height_in_map_units+1)*16
+			 * = 1088 doubles to 2176 > coded_fmt 1080 → -EINVAL.
+			 */
+			dummy_sps.flags = V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY;
+
+			dummy_ctrl.id = V4L2_CID_STATELESS_H264_SPS;
+			dummy_ctrl.ptr = &dummy_sps;
+			dummy_ctrl.size = sizeof(dummy_sps);
+			(void)v4l2_set_controls(driver_data->video_fd, -1,
+						&dummy_ctrl, 1);
+			break;
+		}
+		default:
+			break;
+		}
+	}
+
+	destination_planes_count = video_format->planes_count;
+
+	/*
+	 * Initialize the CAPTURE buffer pool (cap_pool). Pool size =
+	 * max(surfaces_count, MIN_CAP_POOL). The headroom gives LRU
+	 * recycling enough margin to never reuse a buffer within the
+	 * consumer's compositor-hold window for typical playback
+	 * patterns. cap_pool_init does the V4L2 CREATE_BUFS + per-slot
+	 * mmap.
+	 *
+	 * `pool->initialized` is reset to false by cap_pool_destroy in
+	 * DestroyContext; subsequent CreateContext re-inits at the new
+	 * resolution.
+	 */
+	if (!driver_data->capture_pool.initialized) {
+		unsigned int pool_count = surfaces_count > MIN_CAP_POOL ?
+					  surfaces_count : MIN_CAP_POOL;
+		rc = cap_pool_init(&driver_data->capture_pool,
+				   driver_data->video_fd, capture_type,
+				   pool_count, video_format->v4l2_buffers_count);
+		if (rc < 0) {
+			status = VA_STATUS_ERROR_ALLOCATION_FAILED;
+			goto error;
+		}
+	}
+
+	/*
+	 * Compute format-uniform destination_* values. Same for all
+	 * surfaces of this format; written once per surface, never
+	 * changed by BeginPicture's slot acquisition.
+	 */
+	if (video_format->v4l2_buffers_count == 1) {
+		destination_sizes[0] = destination_bytesperlines[0] *
+				       format_height;
+		for (j = 1; j < destination_planes_count; j++)
+			destination_sizes[j] = destination_sizes[0] / 2;
+	}
+
+	/*
+	 * iter5b-β Commit D: cache the format-uniform CAPTURE geometry
+	 * in driver_data. CreateSurfaces2 calls AFTER this CreateContext
+	 * (ffmpeg vaapi-copy late-surface-allocation case) will lazy-fill
+	 * via surface_fill_format_uniform(); the surface_heap walk below
+	 * fills surfaces that pre-existed when CreateContext fired.
+	 */
+	driver_data->fmt_planes_count = destination_planes_count;
+	driver_data->fmt_buffers_count = video_format->v4l2_buffers_count;
+	driver_data->fmt_format_height = format_height;
+	for (j = 0; j < destination_planes_count; j++) {
+		driver_data->fmt_sizes[j] = destination_sizes[j];
+		driver_data->fmt_bytesperlines[j] =
+			destination_bytesperlines[j];
+	}
+	driver_data->fmt_valid = true;
+
+	/*
+	 * Walk the surface_heap (not just surfaces_ids[]) to populate
+	 * destination_* on every existing surface. Pre-Commit-D we walked
+	 * surfaces_ids[], which is empty for ffmpeg vaapi-copy consumers
+	 * that call vaCreateContext with surfaces_count=0 — those surfaces
+	 * exist in the heap but aren't in the param array. Walking the
+	 * heap catches both flows. Late-created surfaces (after this
+	 * CreateContext) fill via surface_fill_format_uniform in
+	 * CreateSurfaces2's per-surface init.
+	 */
+	{
+		struct object_surface *surface_iter;
+		int heap_iter;
+
+		surface_iter = (struct object_surface *)
+			object_heap_first(&driver_data->surface_heap,
+					  &heap_iter);
+		while (surface_iter != NULL) {
+			surface_fill_format_uniform(driver_data, surface_iter);
+			surface_iter = (struct object_surface *)
+				object_heap_next(&driver_data->surface_heap,
+						 &heap_iter);
+		}
+	}
+
 	id = object_heap_allocate(&driver_data->context_heap);
 	context_object = CONTEXT(driver_data, id);
 	if (context_object == NULL) {
@@ -91,40 +378,29 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 		goto error;
 	}
 	memset(&context_object->dpb, 0, sizeof(context_object->dpb));
+	context_object->timestamp_counter = 0;	/* iter9 α-7 */

-	switch (config_object->profile) {
-
-	case VAProfileMPEG2Simple:
-	case VAProfileMPEG2Main:
-		pixelformat = V4L2_PIX_FMT_MPEG2_SLICE;
-		break;
-
-	case VAProfileH264Main:
-	case VAProfileH264High:
-	case VAProfileH264ConstrainedBaseline:
-	case VAProfileH264MultiviewHigh:
-	case VAProfileH264StereoHigh:
-		pixelformat = V4L2_PIX_FMT_H264_SLICE_RAW;
-		break;
-
-	case VAProfileHEVCMain:
-		pixelformat = V4L2_PIX_FMT_HEVC_SLICE;
-		break;
-
-	default:
-		status = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
-		goto error;
-	}
-
-	rc = v4l2_set_format(driver_data->video_fd, output_type, pixelformat,
-			     picture_width, picture_height);
-	if (rc < 0) {
-		status = VA_STATUS_ERROR_OPERATION_FAILED;
-		goto error;
-	}
-
-	rc = v4l2_create_buffers(driver_data->video_fd, output_type,
-				 surfaces_count, &index_base);
+	/*
+	 * Initialize the OUTPUT (bitstream-input) buffer pool. Sized by
+	 * codec pipeline depth (a fixed 16 slots, see the iter6 note
+	 * below); independent of caller-supplied surfaces_count. Pool
+	 * state lives in driver_data; RequestDestroyContext tears it down
+	 * and the next CreateContext re-initializes it.
+	 *
+	 * This replaces the prior per-surface OUTPUT loop, which (a)
+	 * created an empty queue when surfaces_count==0 (ffmpeg vaapi-
+	 * copy path) and (b) only populated surface->source_* for
+	 * surfaces present at vaCreateContext time, NULL-derefing on
+	 * surfaces created later.
+	 */
+	/*
+	 * iter6: pool size 16 gives comfortable headroom over typical H.264
+	 * MaxDpbFrames (16) for any consumer that pipelines decode requests.
+	 * Each slot owns its own request_fd (REINIT'd per use).
+	 */
+	rc = request_pool_init(&driver_data->output_pool,
+			       driver_data->video_fd, driver_data->media_fd,
+			       output_type, 16);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_ALLOCATION_FAILED;
 		goto error;
@@ -135,40 +411,107 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	 * we don't have any indication wrt its life time. Let's make sure
 	 * its life span is under our control.
 	 */
-	ids = malloc(surfaces_count * sizeof(VASurfaceID));
-	if (ids == NULL) {
-		status = VA_STATUS_ERROR_ALLOCATION_FAILED;
-		goto error;
+	if (surfaces_count > 0) {
+		ids = malloc(surfaces_count * sizeof(VASurfaceID));
+		if (ids == NULL) {
+			status = VA_STATUS_ERROR_ALLOCATION_FAILED;
+			goto error;
+		}
+
+		memcpy(ids, surfaces_ids,
+		       surfaces_count * sizeof(VASurfaceID));
 	}

-	memcpy(ids, surfaces_ids, surfaces_count * sizeof(VASurfaceID));
+	/*
+	 * Stateless H.264 device-wide controls. The kernel V4L2 stateless
+	 * framework requires DECODE_MODE and START_CODE be set on the
+	 * device fd (request_fd=-1) before VIDIOC_STREAMON; per-request
+	 * controls (SPS/PPS/etc.) attached to a request_fd come later.
+	 *
+	 * hantro-vpu via rockchip,rk3568-vpu DT compatible (covers RK3568
+	 * and RK3566 — PineTab2 silicon — since they're close enough)
+	 * accepts only DECODE_MODE_FRAME_BASED.
+	 * START_CODE_ANNEX_B preserves leading 0x00000001 in the slice
+	 * payload that h264.c assembles. Errors here are not fatal: not
+	 * every backing driver supports both controls (e.g. cedrus may
+	 * default to SLICE_BASED without exposing DECODE_MODE).
+	 */
+	{
+		struct v4l2_ext_control dev_ctrls[2] = {
+			{
+				.id = V4L2_CID_STATELESS_H264_DECODE_MODE,
+				.value = V4L2_STATELESS_H264_DECODE_MODE_FRAME_BASED,
+			},
+			{
+				.id = V4L2_CID_STATELESS_H264_START_CODE,
+				.value = V4L2_STATELESS_H264_START_CODE_ANNEX_B,
+			},
+		};
+		(void)v4l2_set_controls(driver_data->video_fd, -1,
+					dev_ctrls, 2);
+	}

-	for (i = 0; i < surfaces_count; i++) {
-		index = index_base + i;
+	/*
+	 * iter2: HEVC device-wide controls. Same best-effort pattern as
+	 * H.264 above — separate batched call so a kernel that does not
+	 * advertise HEVC controls (e.g. hantro-vpu-dec on RK3568/RK3399)
+	 * silently fails on this batch without invalidating the H.264
+	 * batch. rkvdec on RK3399 advertises HEVC and accepts FRAME_BASED
+	 * + ANNEX_B (only supported menu values per Phase 0 v4l2_inventory).
+	 */
+	{
+		struct v4l2_ext_control hevc_dev_ctrls[2] = {
+			{
+				.id = V4L2_CID_STATELESS_HEVC_DECODE_MODE,
+				.value = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
+			},
+			{
+				.id = V4L2_CID_STATELESS_HEVC_START_CODE,
+				.value = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
+			},
+		};
+		(void)v4l2_set_controls(driver_data->video_fd, -1,
+					hevc_dev_ctrls, 2);
+	}

-		surface_object = SURFACE(driver_data, surfaces_ids[i]);
-		if (surface_object == NULL) {
-			status = VA_STATUS_ERROR_INVALID_SURFACE;
-			goto error;
-		}
-
-		rc = v4l2_query_buffer(driver_data->video_fd, output_type,
-				       index, &length, &offset, 1);
-		if (rc < 0) {
-			status = VA_STATUS_ERROR_ALLOCATION_FAILED;
-			goto error;
-		}
-
-		source_data = mmap(NULL, length, PROT_READ | PROT_WRITE,
-				   MAP_SHARED, driver_data->video_fd, offset);
-		if (source_data == MAP_FAILED) {
-			status = VA_STATUS_ERROR_ALLOCATION_FAILED;
-			goto error;
-		}
-
-		surface_object->source_index = index;
-		surface_object->source_data = source_data;
-		surface_object->source_size = length;
+	/*
+	 * Mirror the ANNEX_B start-code mode set on the device above
+	 * into context_object->h264_start_code so picture.c::
+	 * codec_store_buffer prepends 0x00 0x00 0x01 to each slice
+	 * payload it copies into the OUTPUT buffer. Without this, the
+	 * kernel — which we just told to expect ANNEX_B — sees a raw
+	 * NAL stream with no start codes, fails to find slice
+	 * boundaries, and emits a zeroed CAPTURE buffer (visually a
+	 * flat dark-green frame).
+	 *
+	 * iter4 fix: this start-code prepend is ANNEX-B-specific and
+	 * applies to H.264 and HEVC ONLY. MPEG-2, VP8, and VP9 use raw
+	 * frame bitstreams without start codes — prepending 0x00 0x00 0x01
+	 * to a VP9 uncompressed header produces a frame_marker mismatch
+	 * (kernel reads 0x00 instead of 0x10), the rkvdec driver silently
+	 * fails to find a valid frame, and the CAPTURE slot stays at its
+	 * cap_pool init pattern (a dim 0x4c green). Phase 7 verification
+	 * caught this for VP9; iter1+iter3 transitive proof masked it for
+	 * MPEG-2/VP8 because those iters compared payload bytes, not
+	 * decoded pixels.
+	 *
+	 * h264_get_controls() exists for this purpose but is never
+	 * called in the current code path; the planned probe-then-set
+	 * commit will replace this hardcoded assignment with a runtime
+	 * read of the kernel's accepted START_CODE value.
+	 */
+	switch (config_object->profile) {
+	case VAProfileH264Main:
+	case VAProfileH264High:
+	case VAProfileH264ConstrainedBaseline:
+	case VAProfileH264MultiviewHigh:
+	case VAProfileH264StereoHigh:
+	case VAProfileHEVCMain:
+		context_object->h264_start_code = true;
+		break;
+	default:
+		context_object->h264_start_code = false;
+		break;
 	}

 	rc = v4l2_set_stream(driver_data->video_fd, output_type, true);
@@ -197,9 +540,6 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	goto complete;

 error:
-	if (source_data != MAP_FAILED)
-		munmap(source_data, length);
-
 	if (ids != NULL)
 		free(ids);

@@ -251,13 +591,51 @@ VAStatus RequestDestroyContext(VADriverContextP context, VAContextID context_id)
 	object_heap_free(&driver_data->context_heap,
 			 (struct object_base *)context_object);

+	/*
+	 * iter5b-β: tear down the OUTPUT pool (mmap unmaps) BEFORE
+	 * REQBUFS(0) frees the kernel-side buffers. Pre-β this was done
+	 * only by surface.c's resolution-change branch — which β removed.
+	 * Without this here, the next CreateContext's request_pool_init
+	 * sees pool->initialized=true with stale slot pointers, returns
+	 * 0 without re-CREATE_BUFS, and the next QBUF EINVALs because
+	 * the slots reference buffer indices that no longer exist
+	 * (Phase 5 v2 review CRIT-2).
+	 */
+	if (driver_data->output_pool.initialized)
+		request_pool_destroy(&driver_data->output_pool);
+
 	rc = v4l2_request_buffers(driver_data->video_fd, output_type, 0);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;

-	rc = v4l2_request_buffers(driver_data->video_fd, capture_type, 0);
-	if (rc < 0)
-		return VA_STATUS_ERROR_OPERATION_FAILED;
+	/*
+	 * Iter2 Fix 3 (still relevant under β): cap_pool owns the
+	 * CAPTURE buffers' mmaps + any outstanding our_export_fds. Tear
+	 * it down (which also issues REQBUFS(0) on CAPTURE), so the next
+	 * CreateContext cycle sees a clean slate.
+	 */
+	cap_pool_destroy(&driver_data->capture_pool, driver_data->video_fd,
+			 capture_type);
+
+	/*
+	 * iter5b-β: driver_data->video_format is a static-ref pointer
+	 * into video.c's formats[]; it stays valid for the life of the
+	 * driver_data and intentionally survives DestroyContext cycles.
+	 * The next CreateContext's `if (!driver_data->video_format)`
+	 * guard skips the probe — correct, because the device's CAPTURE
+	 * format menu doesn't change.
+	 *
+	 * The pre-β surface_reset_format_cache() call here is removed:
+	 * β doesn't have a last_output_{width,height,pixelformat} cache
+	 * (those fields are deleted). Each CreateContext is a fresh
+	 * S_FMT(OUTPUT) cycle.
+	 *
+	 * Commit D: invalidate the format-uniform cache so a CreateSurfaces2
+	 * call between DestroyContext and the next CreateContext doesn't
+	 * lazy-fill with stale geometry from the now-torn-down session.
+	 * The next CreateContext re-populates the cache.
+	 */
+	driver_data->fmt_valid = false;

 	return VA_STATUS_SUCCESS;
 }
@@ -27,6 +27,9 @@
 #ifndef _CONTEXT_H_
 #define _CONTEXT_H_

+#include <stdbool.h>
+#include <stdint.h>
+
 #include <va/va_backend.h>

 #include "object_heap.h"
@@ -50,6 +53,27 @@ struct object_context {

 	/* H264 only */
 	struct h264_dpb dpb;
+	bool h264_start_code;
+
+	/*
+	 * iter9 α-7: monotonic per-context timestamp counter (us). Replaces
+	 * gettimeofday in EndPicture so DPB.reference_ts / OUTPUT QBUF ts
+	 * are small values matching ffmpeg-v4l2request's pattern. Placed
+	 * here (object_context) not driver_data per Phase 5 IMP-1 to avoid
+	 * cross-context collisions.
+	 */
+	uint64_t timestamp_counter;
+
+	/* fresnel-fourier iter4: VP9 loop-filter delta state, persisted across
+	 * frames per kernel UAPI <linux/v4l2-controls.h>:2578 ("If this syntax
+	 * element is not present in the bitstream, users should pass its last
+	 * value.") and VP9 spec defaults from FFmpeg vp9.c:666-671. Reset on
+	 * keyframe / error-resilient / intra-only via vp9_lf.initialized=false. */
+	struct {
+		int8_t ref_deltas[4];
+		int8_t mode_deltas[2];
+		bool initialized;
+	} vp9_lf;
 };

 VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
@@ -28,16 +28,18 @@
 #include <assert.h>
 #include <limits.h>
 #include <string.h>
+#include <stdio.h>

 #include <sys/ioctl.h>
 #include <sys/mman.h>

 #include <linux/videodev2.h>
-#include <h264-ctrls.h>

 #include "request.h"
+#include "utils.h"
 #include "surface.h"
 #include "v4l2.h"
+#include "h264_slice_header.h"

 enum h264_slice_type {
 	H264_SLICE_P    = 0,
@@ -95,7 +97,8 @@ static struct h264_dpb_entry *dpb_find_entry(struct object_context *context)
 }

 static struct h264_dpb_entry *dpb_lookup(struct object_context *context,
-					 VAPictureH264 *pic, unsigned int *idx)
+					 VAPictureH264 *pic, unsigned int *idx,
+					 unsigned char *fields)
 {
 	unsigned int i;

@@ -109,6 +112,16 @@ static struct h264_dpb_entry *dpb_lookup(struct object_context *context,
 			if (idx)
 				*idx = i;

+			if (fields) {
+				//if (entry->pic.TopFieldOrderCnt < entry->pic.BottomFieldOrderCnt) {
+				//	*fields = V4L2_H264_TOP_FIELD_REF;
+				//} else if (entry->pic.TopFieldOrderCnt > entry->pic.BottomFieldOrderCnt) {
+				//	*fields = V4L2_H264_BOTTOM_FIELD_REF;
+				//} else {
+					*fields = V4L2_H264_FRAME_REF;
+				//}
+			}
+
 			return entry;
 		}
 	}
@@ -130,7 +143,7 @@ static void dpb_insert(struct object_context *context, VAPictureH264 *pic,
 	if (is_picture_null(pic))
 		return;

-	if (dpb_lookup(context, pic, NULL))
+	if (dpb_lookup(context, pic, NULL, NULL))
 		return;

 	if (!entry)
@@ -165,7 +178,7 @@ static void dpb_update(struct object_context *context,
 		if (is_picture_null(pic))
 			continue;

-		entry = dpb_lookup(context, pic, NULL);
+		entry = dpb_lookup(context, pic, NULL, NULL);
 		if (entry) {
 			entry->age = context->dpb.age;
 			entry->used = true;
@@ -175,10 +188,61 @@ static void dpb_update(struct object_context *context,
 	}
 }

+/*
+ * Normalise a VAAPI picture-order-count value for the V4L2 controls.
+ *
+ * CURRENT BEHAVIOUR: returns 0 when VA_PICTURE_H264_INVALID is set in
+ * flags and otherwise returns poc unchanged. No sentinel is stripped
+ * any more (see the comment in the body); the name is historical.
+ *
+ * Background: ffmpeg's H264POCContext initialises prev_poc_msb to
+ * (1 << 16) = 0x10000 in libavcodec/h264dec.c (lines 301 and 444 of
+ * v8.0), and the idr() helper resets it to that value after an IDR.
+ * ff_h264_init_poc (libavcodec/h264_parse.c lines 296-305) computes
+ * pc->poc_msb as prev_poc_msb when the slice header's poc_lsb hasn't
+ * wrapped, so the sentinel leaks into field_poc[] and from there into
+ * VAPictureH264.TopFieldOrderCnt / BottomFieldOrderCnt at
+ * libavcodec/vaapi_h264.c::fill_vaapi_pic.
+ *
+ * Working VAAPI backends (intel-iHD, i965 verified empirically on
+ * meitner 2026-05-02) tolerate the high word — they either mask it
+ * or treat POCs as relative comparisons. An earlier revision of this
+ * helper subtracted the sentinel (detected by bit 16 being set) for
+ * hantro_h264.c::prepare_table, which feeds the value direct to
+ * tbl->poc[]; that strip was removed for rkvdec (see body comment).
+ *
+ * @poc:   TopFieldOrderCnt or BottomFieldOrderCnt of a VAPictureH264.
+ * @flags: the flags member of that same VAPictureH264.
+ *
+ * Empty DPB slots (VA_PICTURE_H264_INVALID) are reported as POC 0.
+ */
+static inline int32_t h264_strip_ffmpeg_poc_sentinel(int32_t poc, uint32_t flags)
+{
+	if (flags & VA_PICTURE_H264_INVALID)
+		return 0;
+	/*
+	 * iter8 α-2: pass POC values through unchanged for rkvdec. The
+	 * sentinel-subtract was added for hantro's tbl->poc[] prepare_table
+	 * which fed the value through unmasked. rkvdec writes POC to MMIO
+	 * via writel_relaxed (rkvdec-h264.c:975-978) and the macro
+	 * RKVDEC_CUR_POC is a 32-bit passthrough. kdirect (ffmpeg-v4l2request)
+	 * delivers the sentinel-encoded value directly and decodes
+	 * correctly; libva's strip was the cause of the 16x32 partial-fill
+	 * Bug 4 symptom. Hantro+H.264 isn't exercised on RK3399 (hantro-dec
+	 * doesn't advertise H.264 there) — restoring the strip per-driver
+	 * is iter9 work if it ever surfaces.
+	 */
+	return poc;
+}
+
 static void h264_fill_dpb(struct request_data *data,
 			  struct object_context *context,
+			  VAPictureParameterBufferH264 *VAPicture,
 			  struct v4l2_ctrl_h264_decode_params *decode)
 {
+	const int max_frame_num =
+		1 << (VAPicture->seq_fields.bits.log2_max_frame_num_minus4 + 4);
+	const int cur_frame_num = (int)VAPicture->frame_num;
 	int i;

 	for (i = 0; i < H264_DPB_SIZE; i++) {
@@ -188,7 +252,26 @@ static void h264_fill_dpb(struct request_data *data,
 			SURFACE(data, entry->pic.picture_id);
 		uint64_t timestamp;

-		if (!entry->valid)
+		/*
+		 * Skip entries no longer referenced by the consumer's
+		 * VAPictureParameterBufferH264.ReferenceFrames[]. dpb_update()
+		 * clears `used` for all entries then re-marks only those in the
+		 * current ReferenceFrames list; entries with valid=true but
+		 * used=false are stale (a frame the libva consumer has retired
+		 * from its DPB).
+		 *
+		 * Without this skip, our V4L2 dpb[] grows monotonically until
+		 * H264_DPB_SIZE; by frame_num=10 it carries 7+ entries while
+		 * SPS.max_num_ref_frames may be 4. The kernel reflist builder /
+		 * cluster validator rejects the request with EINVAL once the
+		 * count exceeds the SPS contract — which iter1+iter2+iter3
+		 * surfaced as the "frame-11 EINVAL" carryover. iter4 fix:
+		 * report only currently-used entries to match FFmpeg's
+		 * libavcodec/v4l2_request_h264.c::fill_dpb behaviour (which
+		 * iterates h->short_ref[] / h->long_ref[] — exactly the
+		 * currently-referenced set).
+		 */
+		if (!entry->valid || !entry->used)
 			continue;

 		if (surface) {
@@ -197,8 +280,47 @@ static void h264_fill_dpb(struct request_data *data,
 		}

 		dpb->frame_num = entry->pic.frame_idx;
-		dpb->top_field_order_cnt = entry->pic.TopFieldOrderCnt;
-		dpb->bottom_field_order_cnt = entry->pic.BottomFieldOrderCnt;
+
+		/*
+		 * Per ext-ctrls-codec-stateless.rst, dpb[].pic_num must
+		 * equal the H.264 spec's PicNum (8-28) for short-term refs
+		 * or LongTermPicNum (8-29) for long-term refs.
+		 *
+		 * For frames (not field-coded), PicNum = FrameNumWrap.
+		 * FrameNumWrap = (frame_num > cur_frame_num)
+		 *                ? frame_num - max_frame_num
+		 *                : frame_num
+		 * (per spec section 8.2.4.1, frame_num wraparound).
+		 *
+		 * VAAPI convention (libavcodec/vaapi_h264.c::fill_vaapi_pic
+		 * line 64): VAPictureH264.frame_idx holds long_term_frame_idx
+		 * for long-term refs and frame_num for short-term refs. So
+		 * for long-term entries we copy frame_idx straight through
+		 * as LongTermPicNum.
+		 *
+		 * fourier's previous code set pic_num to picture_id (the
+		 * VAAPI surface id) which is unrelated to H.264 PicNum;
+		 * mediatek's vdec_h264_req_common.c::dst_entry->pic_num is
+		 * one consumer that fails on that. Hantro doesn't read
+		 * pic_num at all (uses reference_ts for ref resolution),
+		 * which is why fourier's wrong value never surfaced on
+		 * PineTab2 (RK3566 via hantro/rk3568-vpu).
+		 */
+		if (entry->pic.flags & VA_PICTURE_H264_LONG_TERM_REFERENCE) {
+			dpb->pic_num = entry->pic.frame_idx;
+		} else {
+			int frame_num = (int)entry->pic.frame_idx;
+			dpb->pic_num = (frame_num > cur_frame_num)
+				? frame_num - max_frame_num
+				: frame_num;
+		}
+
+		dpb->top_field_order_cnt =
+			h264_strip_ffmpeg_poc_sentinel(entry->pic.TopFieldOrderCnt,
+						       entry->pic.flags);
+		dpb->bottom_field_order_cnt =
+			h264_strip_ffmpeg_poc_sentinel(entry->pic.BottomFieldOrderCnt,
+						       entry->pic.flags);

 		dpb->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID;

@@ -207,6 +329,27 @@ static void h264_fill_dpb(struct request_data *data,

 		if (entry->pic.flags & VA_PICTURE_H264_LONG_TERM_REFERENCE)
 			dpb->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM;
+
+		/*
+		 * Mark this DPB entry as a frame reference (both top + bottom
+		 * fields). The kernel's v4l2_h264_init_reflist_builder iterates
+		 * dpb[] and skips entries whose `fields` member is zero — they
+		 * count as "no valid field reference for this entry." For
+		 * frame-coded streams (BBB and most desktop H.264) every
+		 * reference is a frame reference; per UAPI doc
+		 * (ext-ctrls-codec-stateless.rst), fields must be set to
+		 * V4L2_H264_FRAME_REF (= TOP|BOTTOM) for frames.
+		 *
+		 * Cross-reference: FFmpeg libavcodec/v4l2_request_h264.c::
+		 * fill_dpb_entry sets entry->fields from pic->reference; for
+		 * frames pic->reference includes V4L2_H264_FRAME_REF. Without
+		 * this, P-slices that need to walk the reference list (the
+		 * first one in BBB is at frame 11) hit "no valid refs" inside
+		 * the kernel's reflist builder and S_EXT_CTRLS rejects the
+		 * whole request with EINVAL (error_idx == count, the kernel's
+		 * "application bug" sentinel).
+		 */
+		dpb->fields = V4L2_H264_FRAME_REF;
 	}
 }

@@ -218,11 +361,126 @@ static void h264_va_picture_to_v4l2(struct request_data *driver_data,
 				    struct v4l2_ctrl_h264_pps *pps,
 				    struct v4l2_ctrl_h264_sps *sps)
 {
-	h264_fill_dpb(driver_data, context, decode);
+	unsigned char *b;
+	unsigned char nal_ref_idc;
+	unsigned char nal_unit_type;

-	decode->num_slices = surface->slices_count;
-	decode->top_field_order_cnt = VAPicture->CurrPic.TopFieldOrderCnt;
-	decode->bottom_field_order_cnt = VAPicture->CurrPic.BottomFieldOrderCnt;
+	/* Extract missing nal_ref_idc and nal_unit_type */
+	b = surface->source_data;
+	if (context->h264_start_code)
+		b += 3;
+	nal_ref_idc = (b[0] >> 5) & 0x3;
+	nal_unit_type = b[0] & 0x1f;
+
+	/*
+	 * Bit-parse the slice_header() to recover fields VAAPI doesn't
+	 * forward and that hantro G1 hardware reads out of DECODE_PARAMS:
+	 *
+	 *   - dec_ref_pic_marking_bit_size  -> G1_REG_DEC_CTRL5_REFPIC_MK_LEN
+	 *   - idr_pic_id                    -> G1_REG_DEC_CTRL5_IDR_PIC_ID
+	 *   - pic_order_cnt_bit_size        -> G1_REG_DEC_CTRL6_POC_LENGTH
+	 *   - pic_order_cnt_lsb / delta_pic_order_cnt_* (used by hantro
+	 *     reference-list builder for poc_type=0/1 inter prediction)
+	 *
+	 * Without these set correctly, hantro's hardware bitstream parser
+	 * walks past zero bits, lands on garbage, decodes zero pixels —
+	 * the all-zero CAPTURE output observed during 2026-05-04 Phase 0.
+	 *
+	 * Spec: ITU-T H.264 §7.3.3 slice_header. Cross-reference (proven
+	 * working): FFmpeg libavcodec/h264_slice.c populates
+	 * H264SliceContext::ref_pic_marking_bit_size and
+	 * pic_order_cnt_bit_size by the same bit-precise parse.
+	 */
+	{
+		const struct h264_slice_header_context sh_ctx = {
+			.separate_colour_plane_flag =
+				(VAPicture->seq_fields.bits.residual_colour_transform_flag != 0),
+			.log2_max_frame_num_minus4 =
+				VAPicture->seq_fields.bits.log2_max_frame_num_minus4,
+			.frame_mbs_only_flag =
+				(VAPicture->seq_fields.bits.frame_mbs_only_flag != 0),
+			.pic_order_cnt_type =
+				VAPicture->seq_fields.bits.pic_order_cnt_type,
+			.log2_max_pic_order_cnt_lsb_minus4 =
+				VAPicture->seq_fields.bits.log2_max_pic_order_cnt_lsb_minus4,
+			.delta_pic_order_always_zero_flag =
+				(VAPicture->seq_fields.bits.delta_pic_order_always_zero_flag != 0),
+			.bottom_field_pic_order_in_frame_present_flag =
+				(VAPicture->pic_fields.bits.pic_order_present_flag != 0),
+			.redundant_pic_cnt_present_flag =
+				(VAPicture->pic_fields.bits.redundant_pic_cnt_present_flag != 0),
+			.weighted_pred_flag =
+				(VAPicture->pic_fields.bits.weighted_pred_flag != 0),
+			.weighted_bipred_idc =
+				VAPicture->pic_fields.bits.weighted_bipred_idc,
+			.num_ref_idx_l0_default_active_minus1 =
+				surface->params.h264.slice.num_ref_idx_l0_active_minus1,
+			.num_ref_idx_l1_default_active_minus1 =
+				surface->params.h264.slice.num_ref_idx_l1_active_minus1,
+			.chroma_format_idc =
+				VAPicture->seq_fields.bits.chroma_format_idc,
+			.bit_depth_luma_minus8 =
+				VAPicture->bit_depth_luma_minus8,
+			.bit_depth_chroma_minus8 =
+				VAPicture->bit_depth_chroma_minus8,
+			.nal_unit_type = nal_unit_type,
+			.nal_ref_idc   = nal_ref_idc,
+		};
+		struct h264_slice_header_info sh = { 0 };
+		unsigned char *nal_payload = b + 1; /* past NAL header byte */
+		size_t nal_payload_len = surface->slices_size -
+			(size_t)((nal_payload) - (unsigned char *)surface->source_data);
+		int sh_rc = h264_parse_slice_header(nal_payload, nal_payload_len,
+						    &sh_ctx, &sh);
+		if (sh_rc == 0) {
+			decode->idr_pic_id		= sh.idr_pic_id;
+			decode->pic_order_cnt_lsb	= sh.pic_order_cnt_lsb;
+			decode->delta_pic_order_cnt_bottom = sh.delta_pic_order_cnt_bottom;
+			decode->delta_pic_order_cnt0	= sh.delta_pic_order_cnt0;
+			decode->delta_pic_order_cnt1	= sh.delta_pic_order_cnt1;
+			decode->pic_order_cnt_bit_size	= sh.pic_order_cnt_bit_size;
+			decode->dec_ref_pic_marking_bit_size = sh.dec_ref_pic_marking_bit_size;
+		} else {
+			request_log("slice_header parse FAILED rc=%d "
+				    "(payload_len=%zu) — DECODE_PARAMS bit_size "
+				    "fields left zero, hantro will likely produce zeros\n",
+				    sh_rc, nal_payload_len);
+		}
+	}
+
+	h264_fill_dpb(driver_data, context, VAPicture, decode);
+
+	/*
+	 * Populate every V4L2_CID_STATELESS_H264_DECODE_PARAMS field
+	 * we can derive from VAAPI's pre-parsed VAPictureParameterBuffer
+	 * + bitstream byte. Cross-reference: GStreamer
+	 * gstv4l2codech264dec.c::gst_v4l2_codec_h264_dec_fill_decoder_params
+	 * (lines 632-678).
+	 *
+	 * Fields not derivable from VAAPI (idr_pic_id, pic_order_cnt_lsb,
+	 * delta_pic_order_cnt_*, dec_ref_pic_marking_bit_size,
+	 * pic_order_cnt_bit_size) are filled in by the slice_header()
+	 * bit-level parse above (h264_parse_slice_header). They stay at
+	 * zero-init only when that parse fails, in which case the
+	 * failure is logged. slice_group_change_cycle is not among the
+	 * fields copied from the parse result, so it is left at
+	 * zero-init.
+	 */
+	decode->nal_ref_idc = nal_ref_idc;
+	decode->frame_num = VAPicture->frame_num;
+	decode->top_field_order_cnt =
+		h264_strip_ffmpeg_poc_sentinel(VAPicture->CurrPic.TopFieldOrderCnt,
+					       VAPicture->CurrPic.flags);
+	decode->bottom_field_order_cnt =
+		h264_strip_ffmpeg_poc_sentinel(VAPicture->CurrPic.BottomFieldOrderCnt,
+					       VAPicture->CurrPic.flags);
+
+	if (nal_unit_type == 5)
+		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC;
+	if (VAPicture->pic_fields.bits.field_pic_flag)
+		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC;
+	if (VAPicture->CurrPic.flags & VA_PICTURE_H264_BOTTOM_FIELD)
+		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD;

 	pps->weighted_bipred_idc =
 		VAPicture->pic_fields.bits.weighted_bipred_idc;
@@ -255,6 +513,7 @@ static void h264_va_picture_to_v4l2(struct request_data *driver_data,
 	if (VAPicture->pic_fields.bits.redundant_pic_cnt_present_flag)
 		pps->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT;

+	sps->max_num_ref_frames = VAPicture->num_ref_frames;
 	sps->chroma_format_idc = VAPicture->seq_fields.bits.chroma_format_idc;
 	sps->bit_depth_luma_minus8 = VAPicture->bit_depth_luma_minus8;
 	sps->bit_depth_chroma_minus8 = VAPicture->bit_depth_chroma_minus8;
@@ -301,6 +560,32 @@ static void h264_va_matrix_to_v4l2(struct request_data *driver_data,
 	       sizeof(v4l2_matrix->scaling_list_8x8[3]));
 }

+/*
+ * Fill @v4l2_matrix with the H.264 flat scaling lists Flat_4x4_16 and
+ * Flat_8x8_16 (every byte of both lists is set to 16). When
+ * sps_scaling_matrix_present_flag and pps_scaling_matrix_present_flag
+ * are both false, the bitstream carries no explicit scaling lists and
+ * the decoder uses these flat defaults — matching ITU-T H.264
+ * (08/2024) §7.4.2.1.1.1 (sequence scaling) and §7.4.2.2 (picture scaling).
+ *
+ * Why we always provide the matrix: hantro G1's set_params reads
+ * pps->flags & V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT to drive
+ * the G1_REG_DEC_CTRL2_TYPE1_QUANT_E hardware bit. FFmpeg's
+ * v4l2_request_h264.c always submits the SCALING_MATRIX control
+ * with the spec default when the bitstream omits explicit lists,
+ * and always sets the SCALING_MATRIX_PRESENT flag (commit
+ * comment: "FFmpeg always provide a scaling matrix"). We mirror
+ * that so the kernel sees a consistent control set across drivers.
+ */
+static void h264_default_flat_scaling_matrix(
+	struct v4l2_ctrl_h264_scaling_matrix *v4l2_matrix)
+{
+	memset(v4l2_matrix->scaling_list_4x4, 16,
+	       sizeof(v4l2_matrix->scaling_list_4x4));
+	memset(v4l2_matrix->scaling_list_8x8, 16,
+	       sizeof(v4l2_matrix->scaling_list_8x8));
+}
+
 static void h264_copy_pred_table(struct v4l2_h264_weight_factors *factors,
 				 unsigned int num_refs,
 				 int16_t luma_weight[32],
@@ -327,10 +612,12 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 				  struct object_context *context,
 				  VASliceParameterBufferH264 *VASlice,
 				  VAPictureParameterBufferH264 *VAPicture,
-				  struct v4l2_ctrl_h264_slice_params *slice)
+				  struct v4l2_ctrl_h264_slice_params *slice,
+				  struct v4l2_ctrl_h264_pred_weights *weights)
 {
-	slice->size = VASlice->slice_data_size;
 	slice->header_bit_size = VASlice->slice_data_bit_offset;
+	//if (context->h264_start_code)	
+	//	slice->header_bit_size += 3 * 8;
 	slice->first_mb_in_slice = VASlice->first_mb_in_slice;
 	slice->slice_type = VASlice->slice_type;
 	slice->cabac_init_idc = VASlice->cabac_init_idc;
@@ -351,12 +638,14 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 			VAPictureH264 *pic = &VASlice->RefPicList0[i];
 			struct h264_dpb_entry *entry;
 			unsigned int idx;
+			unsigned char fields;

-			entry = dpb_lookup(context, pic, &idx);
+			entry = dpb_lookup(context, pic, &idx, &fields);
 			if (!entry)
 				continue;

-			slice->ref_pic_list0[i] = idx;
+			slice->ref_pic_list0[i].index = idx;
+			slice->ref_pic_list0[i].fields = fields;
 		}
 	}

@@ -370,26 +659,28 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 			VAPictureH264 *pic = &VASlice->RefPicList1[i];
 			struct h264_dpb_entry *entry;
 			unsigned int idx;
+			unsigned char fields;

-			entry = dpb_lookup(context, pic, &idx);
+			entry = dpb_lookup(context, pic, &idx, &fields);
 			if (!entry)
 				continue;

-			slice->ref_pic_list1[i] = idx;
+			slice->ref_pic_list1[i].index = idx;
+			slice->ref_pic_list1[i].fields = fields;
 		}
 	}

 	if (VASlice->direct_spatial_mv_pred_flag)
 		slice->flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED;

-	slice->pred_weight_table.chroma_log2_weight_denom =
+	weights->chroma_log2_weight_denom =
 		VASlice->chroma_log2_weight_denom;
-	slice->pred_weight_table.luma_log2_weight_denom =
+	weights->luma_log2_weight_denom =
 		VASlice->luma_log2_weight_denom;

 	if (((VASlice->slice_type % 5) == H264_SLICE_P) ||
 	    ((VASlice->slice_type % 5) == H264_SLICE_B))
-		h264_copy_pred_table(&slice->pred_weight_table.weight_factors[0],
+		h264_copy_pred_table(&weights->weight_factors[0],
 				     slice->num_ref_idx_l0_active_minus1 + 1,
 				     VASlice->luma_weight_l0,
 				     VASlice->luma_offset_l0,
@@ -397,7 +688,7 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 				     VASlice->chroma_offset_l0);

 	if ((VASlice->slice_type % 5) == H264_SLICE_B)
-		h264_copy_pred_table(&slice->pred_weight_table.weight_factors[1],
+		h264_copy_pred_table(&weights->weight_factors[1],
 				     slice->num_ref_idx_l1_active_minus1 + 1,
 				     VASlice->luma_weight_l1,
 				     VASlice->luma_offset_l1,
@@ -405,20 +696,130 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 				     VASlice->chroma_offset_l1);
 }

+int h264_get_controls(struct request_data *driver_data,
+		      struct object_context *context)
+{
+	struct v4l2_ext_control controls[2] = {
+		{
+			.id = V4L2_CID_STATELESS_H264_DECODE_MODE,
+		}, {
+			.id = V4L2_CID_STATELESS_H264_START_CODE,
+		}
+	};
+	int rc;
+	/* Read the DECODE_MODE and START_CODE controls with one v4l2_get_controls() call. */
+	rc = v4l2_get_controls(driver_data->video_fd, -1, controls, 2);
+	if (rc < 0)
+		return VA_STATUS_ERROR_OPERATION_FAILED;
+	/* Either decode mode is accepted; the value is only validated, not stored. */
+	switch (controls[0].value) {
+	case V4L2_STATELESS_H264_DECODE_MODE_SLICE_BASED:
+		break;
+	case V4L2_STATELESS_H264_DECODE_MODE_FRAME_BASED:
+		break;
+	default:
+		request_log("Unsupported decode mode\n");
+		return VA_STATUS_ERROR_OPERATION_FAILED;
+	}
+	/* Mirror the reported start-code mode into context->h264_start_code. */
+	switch (controls[1].value) {
+	case V4L2_STATELESS_H264_START_CODE_NONE:
+		context->h264_start_code = false;
+		break;
+	case V4L2_STATELESS_H264_START_CODE_ANNEX_B:
+		context->h264_start_code = true;
+		break;
+	default:
+		request_log("Unsupported start code\n");
+		return VA_STATUS_ERROR_OPERATION_FAILED;
+	}
+	/* NOTE(review): declared int but returns VAStatus codes on every path. */
+	return VA_STATUS_SUCCESS;
+}
+/* Map a VAAPI H.264 profile to its profile_idc number; any other profile yields 0. */
+static inline __u8 h264_profile_to_idc(VAProfile profile)
+{
+	switch (profile) {
+	case VAProfileH264Main:
+		return 77;
+	case VAProfileH264High:
+		return 100;
+	case VAProfileH264ConstrainedBaseline:
+		return 66;
+	case VAProfileH264MultiviewHigh:
+		return 118;
+	case VAProfileH264StereoHigh:
+		return 128;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Derive sps.level_idc from the frame size in macroblocks
+ * (width_in_mbs * height_in_mbs) using the H.264 Annex A.3
+ * (Table A-1) MaxFS thresholds. Each level's MaxFS is the maximum
+ * frame size in MBs the level supports. The result is the lowest
+ * level of the MaxFS bucket the frame fits in, except the 8192-MB
+ * bucket, which reports 4.1 rather than 4.0; frames larger than
+ * 139264 MBs report 62.
+ *
+ * Level encoding for the V4L2 control: level_idc = level * 10
+ *   Level 1.0 → 10, Level 4.1 → 41, Level 5.1 → 51, Level 6.0 → 60.
+ *
+ * VAAPI does not expose the bitstream's actual level_idc on the
+ * decode side (VAPictureParameterBufferH264 has no such field) — see
+ * va.h. The H.264 SPS NAL is parsed client-side by ffmpeg-vaapi /
+ * mpv and only slice data is forwarded in VASliceDataBuffer, so a
+ * SPS-NAL byte parser is not viable at this layer.
+ *
+ * Without framerate we cannot also check MaxMBPS / MaxBR / MaxCPB.
+ * That gap is accepted on the assumption that temporally-dense
+ * streams (high MBPS) usually also carry spatially-large frames
+ * (high MaxFS).
+ *
+ * Picks for typical content:
+ *   1080p (8160 MBs) → Level 4.1 (level_idc = 41)
+ *   4K   (32400 MBs) → Level 5.1 (level_idc = 51)
+ *   8K  (138240 MBs) → Level 6.0 (level_idc = 60)
+ *
+ * Replaces the hardcoded level_idc=51 from patch 0013.
+ */
+static inline __u8 h264_derive_level_idc(unsigned int width_in_mbs,
+					 unsigned int height_in_mbs)
+{
+	const unsigned int frame_size_mbs = width_in_mbs * height_in_mbs;
+
+	if (frame_size_mbs <= 99)     return 10;  /* Level 1.0 */
+	if (frame_size_mbs <= 396)    return 11;  /* Level 1.1 - 2.0 */
+	if (frame_size_mbs <= 792)    return 21;  /* Level 2.1 */
+	if (frame_size_mbs <= 1620)   return 22;  /* Level 2.2 - 3.0 */
+	if (frame_size_mbs <= 3600)   return 31;  /* Level 3.1 */
+	if (frame_size_mbs <= 5120)   return 32;  /* Level 3.2 */
+	if (frame_size_mbs <= 8192)   return 41;  /* Level 4.0 - 4.1 */
+	if (frame_size_mbs <= 8704)   return 42;  /* Level 4.2 */
+	if (frame_size_mbs <= 22080)  return 50;  /* Level 5.0 */
+	if (frame_size_mbs <= 36864)  return 51;  /* Level 5.1 - 5.2 */
+	if (frame_size_mbs <= 139264) return 60;  /* Level 6.0 - 6.2 */
+	return 62;                                /* > Level 6 ceiling */
+}
+
 int h264_set_controls(struct request_data *driver_data,
 		      struct object_context *context,
+		      VAProfile profile,
 		      struct object_surface *surface)
 {
 	struct v4l2_ctrl_h264_scaling_matrix matrix = { 0 };
 	struct v4l2_ctrl_h264_decode_params decode = { 0 };
 	struct v4l2_ctrl_h264_slice_params slice = { 0 };
+	struct v4l2_ctrl_h264_pred_weights weights = { 0 };
 	struct v4l2_ctrl_h264_pps pps = { 0 };
 	struct v4l2_ctrl_h264_sps sps = { 0 };
 	struct h264_dpb_entry *output;
 	int rc;

 	output = dpb_lookup(context, &surface->params.h264.picture.CurrPic,
-			    NULL);
+			    NULL, NULL);
 	if (!output)
 		output = dpb_find_entry(context);

@@ -429,37 +830,171 @@ int h264_set_controls(struct request_data *driver_data,
 	h264_va_picture_to_v4l2(driver_data, context, surface,
 				&surface->params.h264.picture,
 				&decode, &pps, &sps);
-	h264_va_matrix_to_v4l2(driver_data, context,
-			       &surface->params.h264.matrix, &matrix);
+
+	/*
+	 * Populate the scaling matrix unconditionally: from VAAPI's
+	 * VAIQMatrixBufferH264 when the consumer sent one this frame
+	 * (matrix_set), otherwise from the H.264 spec flat defaults.
+	 * Submitted to the kernel as V4L2_CID_STATELESS_H264_SCALING_MATRIX
+	 * for every request — required for FFmpeg/hantro contract parity
+	 * (see h264_default_flat_scaling_matrix() docblock).
+	 */
+	if (surface->params.h264.matrix_set)
+		h264_va_matrix_to_v4l2(driver_data, context,
+				       &surface->params.h264.matrix, &matrix);
+	else
+		h264_default_flat_scaling_matrix(&matrix);
+
 	h264_va_slice_to_v4l2(driver_data, context,
 			      &surface->params.h264.slice,
-			      &surface->params.h264.picture, &slice);
+			      &surface->params.h264.picture, &slice, &weights);

-	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-			      V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS, &decode,
-			      sizeof(decode));
-	if (rc < 0)
-		return VA_STATUS_ERROR_OPERATION_FAILED;
+	/*
+	 * Mirror SCALING_MATRIX_PRESENT in PPS flags. Hantro G1 set_params
+	 * gates its G1_REG_DEC_CTRL2_TYPE1_QUANT_E register bit on this;
+	 * FFmpeg sets it unconditionally with the comment "FFmpeg always
+	 * provide a scaling matrix." We submit the matrix always (above),
+	 * so the flag must be set always to match.
+	 */
+	pps.flags |= V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT;

-	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-			      V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS, &slice,
-			      sizeof(slice));
-	if (rc < 0)
-		return VA_STATUS_ERROR_OPERATION_FAILED;
+	/*
+	 * Populate pps->num_ref_idx_l0/l1_default_active_minus1. Hantro G1
+	 * writes both into G1_REG_DEC_CTRL6_REFIDX0_ACTIVE / REFIDX1_ACTIVE
+	 * MMIO registers (via "(field) + 1", so an uninitialized 0 here
+	 * would advertise "1 active reference per list" to hardware, wrong
+	 * for I/IDR frames with 0 refs and wrong for B frames with >1).
+	 *
+	 * VAAPI's VAPictureParameterBufferH264 does not carry the parsed
+	 * PPS num_ref_idx_l*_default_active_minus1 fields — those are in
+	 * the bitstream's PPS NAL which VAAPI consumers parse client-side
+	 * but don't forward. The closest available source is VASlice's
+	 * num_ref_idx_l*_active_minus1, which is the per-slice override
+	 * defaulting to the PPS value (H.264 §7.4.3 num_ref_idx_active_
+	 * override_flag). For most streams these values match; mismatch
+	 * only on streams with explicit per-slice overrides.
+	 *
+	 * For IDR frames (no references), the values are not used by
+	 * hantro's reference list builder, so a wrong value here is
+	 * harmless. For inter frames it matters and slice-derived is
+	 * the best we can do without a full PPS-NAL parser.
+	 */
+	pps.num_ref_idx_l0_default_active_minus1 =
+		surface->params.h264.slice.num_ref_idx_l0_active_minus1;
+	pps.num_ref_idx_l1_default_active_minus1 =
+		surface->params.h264.slice.num_ref_idx_l1_active_minus1;

-	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-			      V4L2_CID_MPEG_VIDEO_H264_PPS, &pps, sizeof(pps));
-	if (rc < 0)
-		return VA_STATUS_ERROR_OPERATION_FAILED;
+	/*
+	 * Derive PFRAME / BFRAME flags in v4l2_ctrl_h264_decode_params.flags
+	 * from VASliceParameterBufferH264.slice_type. VAAPI's slice_type
+	 * matches the H.264 spec slice_type semantic: 0=P, 1=B, 2=I, 3=SP,
+	 * 4=SI; values 5..9 mean "all slices in the picture have this
+	 * slice_type" (mod 5 yields the underlying type). VAAPI consumers
+	 * (ffmpeg, mpv) populate this for every slice; in FRAME_BASED mode
+	 * we only see the most-recent slice's params, but slice_type is
+	 * uniform across a single coded picture for our purposes.
+	 *
+	 * Kernel consumers that read these flags: tegra-vde
+	 * (drivers/media/platform/nvidia/tegra-vde/h264.c lines 783-799 of
+	 * 6.19.x) selects the inter-frame decode kernel. Hantro / rkvdec /
+	 * cedrus / mediatek / qcom-iris-stateless do not consume them.
+	 * Setting them keeps the libva-v4l2-request fork upstreamable
+	 * across drivers without affecting hantro behaviour.
+	 *
+	 * Cross-reference: ext-ctrls-codec-stateless.rst Decode Parameters
+	 * Flags — V4L2_H264_DECODE_PARAM_FLAG_PFRAME / _BFRAME.
+	 */
+	switch (surface->params.h264.slice.slice_type % 5) {
+	case H264_SLICE_P:
+		decode.flags |= V4L2_H264_DECODE_PARAM_FLAG_PFRAME;
+		break;
+	case H264_SLICE_B:
+		decode.flags |= V4L2_H264_DECODE_PARAM_FLAG_BFRAME;
+		break;
+	default:
+		/* I / SP / SI: no extra flag. */
+		break;
+	}

-	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-			      V4L2_CID_MPEG_VIDEO_H264_SPS, &sps, sizeof(sps));
-	if (rc < 0)
-		return VA_STATUS_ERROR_OPERATION_FAILED;
+	sps.profile_idc = h264_profile_to_idc(profile);

-	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-			      V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX, &matrix,
-			      sizeof(matrix));
+	/*
+	 * Derive level_idc from encoded frame size per H.264 Annex A.3.
+	 * VAAPI doesn't expose level_idc on the decode side (see
+	 * h264_derive_level_idc()'s docblock for the rationale); we pick
+	 * the smallest level whose MaxFS contains the picture dimensions.
+	 * Replaces patch 0013's intermediate hardcode of 51.
+	 */
+	sps.level_idc = h264_derive_level_idc(
+		(unsigned int)surface->params.h264.picture.picture_width_in_mbs_minus1 + 1u,
+		(unsigned int)surface->params.h264.picture.picture_height_in_mbs_minus1 + 1u);
+
+	/*
+	 * Build the per-request control list incrementally:
+	 *   - SPS, PPS, DECODE_PARAMS, SCALING_MATRIX: always required.
+	 *     Hantro G1 reads the SCALING_MATRIX_PRESENT flag from PPS to
+	 *     gate hardware register G1_REG_DEC_CTRL2_TYPE1_QUANT_E and
+	 *     reads the matrix entries directly into hardware tables when
+	 *     decoding. FFmpeg always submits the matrix (with spec-default
+	 *     flat values when no explicit lists are in the bitstream); we
+	 *     match that — see h264_default_flat_scaling_matrix() docblock.
+	 *     Earlier patch 0012 made SCALING_MATRIX submission conditional
+	 *     on VAAPI's VAIQMatrixBuffer arrival; that was corpus-correct
+	 *     (bbb has no explicit scaling lists) but inconsistent with the
+	 *     hantro contract — replaced 2026-05-04.
+	 *   - SLICE_PARAMS: SLICE_BASED only. Kernel doc
+	 *     ext-ctrls-codec-stateless.rst (FRAME_BASED entry):
+	 *     "When this mode is selected, the
+	 *     V4L2_CID_STATELESS_H264_SLICE_PARAMS control shall not be
+	 *     set." Submitting it under FRAME_BASED triggers cluster-
+	 *     validation EINVAL at error_idx=count.
+	 *   - PRED_WEIGHTS: SLICE_BASED + V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED.
+	 *
+	 * Patch 0002 unconditionally sets the device to FRAME_BASED,
+	 * so slice_based is hardcoded false here. When the planned
+	 * probe-then-set commit lands, this becomes
+	 *     context->decode_mode == V4L2_STATELESS_H264_DECODE_MODE_SLICE_BASED.
+	 */
+	struct v4l2_ext_control controls[6] = { 0 };
+	unsigned int num_controls = 0;
+	const bool slice_based = false; /* TODO: probe via context->decode_mode */
+
+	controls[num_controls].id = V4L2_CID_STATELESS_H264_SPS;
+	controls[num_controls].p_h264_sps = &sps;
+	controls[num_controls].size = sizeof(sps);
+	num_controls++;
+
+	controls[num_controls].id = V4L2_CID_STATELESS_H264_PPS;
+	controls[num_controls].p_h264_pps = &pps;
+	controls[num_controls].size = sizeof(pps);
+	num_controls++;
+
+	controls[num_controls].id = V4L2_CID_STATELESS_H264_DECODE_PARAMS;
+	controls[num_controls].p_h264_decode_params = &decode;
+	controls[num_controls].size = sizeof(decode);
+	num_controls++;
+
+	controls[num_controls].id = V4L2_CID_STATELESS_H264_SCALING_MATRIX;
+	controls[num_controls].p_h264_scaling_matrix = &matrix;
+	controls[num_controls].size = sizeof(matrix);
+	num_controls++;
+
+	if (slice_based) {
+		controls[num_controls].id = V4L2_CID_STATELESS_H264_SLICE_PARAMS;
+		controls[num_controls].p_h264_slice_params = &slice;
+		controls[num_controls].size = sizeof(slice);
+		num_controls++;
+
+		if (V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED(&pps, &slice)) {
+			controls[num_controls].id = V4L2_CID_STATELESS_H264_PRED_WEIGHTS;
+			controls[num_controls].ptr = &weights;
+			controls[num_controls].size = sizeof(weights);
+			num_controls++;
+		}
+	}
+
+	rc = v4l2_set_controls(driver_data->video_fd, surface->request_fd,
+			       controls, num_controls);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;

@@ -51,8 +51,11 @@ struct h264_dpb {
 	unsigned int age;
 };

+int h264_get_controls(struct request_data *driver_data,
+		      struct object_context *context);
 int h264_set_controls(struct request_data *data,
 		      struct object_context *context,
+		      VAProfile profile,
 		      struct object_surface *surface);

 #endif
@@ -0,0 +1,361 @@
+/*
+ * H.264 slice header bit-parser implementation.
+ *
+ * Implements just enough of ITU-T Rec. H.264 (08/2024) §7.3.3
+ * slice_header to populate the V4L2 DECODE_PARAMS bit-position
+ * fields (idr_pic_id, pic_order_cnt_lsb, delta_pic_order_cnt_*,
+ * pic_order_cnt_bit_size, dec_ref_pic_marking_bit_size).
+ *
+ * Skips through ref_pic_list_modification() and pred_weight_table()
+ * because dec_ref_pic_marking() (whose bit length we need) comes
+ * after them. MVC extensions (nal_unit_type 20/21) are not handled
+ * — this fork strips MVC alongside HEVC.
+ */
+
+#include "h264_slice_header.h"
+
+#include <errno.h>
+#include <string.h>
+
+/*
+ * Minimal RBSP bit reader. Reads bits MSB-first. Tracks bit_pos for
+ * caller use (e.g. computing the size of a syntax element by
+ * pre/post bit_pos delta).
+ */
+struct br {
+	const uint8_t	*data;	/* bytes to read bits from */
+	size_t		length;	/* bytes */
+	size_t		bit_pos;	/* index of the next bit to read; bit 0 is the MSB of data[0] */
+	bool		error;	/* set by br_read_u() when a read would pass the end of data */
+};
+
+/*
+ * Read @n bits MSB-first and return them right-aligned.
+ *
+ * If the data runs out part-way, b->error is set and 0 is returned;
+ * bits consumed before the shortfall stay consumed.
+ */
+static uint32_t br_read_u(struct br *b, unsigned n)
+{
+	const size_t limit_bits = b->length * 8;
+	uint32_t value = 0;
+	unsigned i;
+
+	for (i = 0; i < n; i++) {
+		const size_t pos = b->bit_pos;
+		uint32_t bit;
+
+		if (pos >= limit_bits) {
+			b->error = true;
+			return 0;
+		}
+
+		/* Bit 7 of each byte is the first bit of that byte. */
+		bit = (b->data[pos / 8] >> (7 - (pos % 8))) & 1u;
+		value = (value << 1) | bit;
+		b->bit_pos = pos + 1;
+	}
+
+	return value;
+}
+
+/*
+ * Read one unsigned Exp-Golomb code: count zero bits up to the first
+ * 1 bit, then read that many suffix bits.
+ *
+ * Returns the decoded value. On failure b->error is set and the return
+ * value is meaningless, so callers must check b->error. Failure means
+ * either the data ran out or the prefix reached 32 zero bits, which is
+ * longer than any code representable in 32 bits and is therefore
+ * treated as a malformed stream rather than silently decoded as 0.
+ */
+static uint32_t br_read_ue(struct br *b)
+{
+	unsigned zeros = 0;
+
+	while (br_read_u(b, 1) == 0) {
+		/* A 0 returned because the data ran out, not a real zero bit. */
+		if (b->error)
+			return 0;
+		if (++zeros >= 32) {
+			b->error = true;
+			return 0;
+		}
+	}
+	if (zeros == 0)
+		return 0;
+	return (1u << zeros) - 1u + br_read_u(b, zeros);
+}
+
+/*
+ * Read one signed Exp-Golomb code: decode the unsigned code, then map
+ * odd codes to positive values and even codes to zero or negative
+ * values, both with magnitude (code + 1) / 2 rounded down.
+ */
+static int32_t br_read_se(struct br *b)
+{
+	const uint32_t code = br_read_ue(b);
+	const int32_t magnitude = (int32_t)((code + 1u) >> 1);
+
+	return (code & 1u) ? magnitude : -magnitude;
+}
+
+/*
+ * RBSP unescape: strip emulation prevention bytes (after every
+ * 0x00 0x00 in the encoded stream, an extra 0x03 is inserted to
+ * prevent byte-aligned start-code emulation; we strip those before
+ * bit-parsing). Output buffer must hold at least
+ * min(in_len, H264_SLICE_HEADER_SCAN_BYTES) bytes.
+ *
+ * Slice headers are short (<100 bits typically), so we unescape
+ * only the first H264_SLICE_HEADER_SCAN_BYTES = 64 input bytes.
+ * That covers any realistic slice header including
+ * dec_ref_pic_marking() and a generous safety margin.
+ */
+#define H264_SLICE_HEADER_SCAN_BYTES 64
+
+static size_t rbsp_unescape(uint8_t *out, const uint8_t *in,
+			    size_t in_len)
+{
+	/* Never look at more than the scan window, however long the input. */
+	const size_t scan_len = (in_len > H264_SLICE_HEADER_SCAN_BYTES) ?
+				H264_SLICE_HEADER_SCAN_BYTES : in_len;
+	size_t rd = 0;
+	size_t wr = 0;
+	int zeros_seen = 0;
+
+	while (rd < scan_len) {
+		const uint8_t byte = in[rd++];
+
+		/* 0x03 after two or more zero bytes is an escape: drop it. */
+		if (byte == 0x03 && zeros_seen >= 2) {
+			zeros_seen = 0;
+			continue;
+		}
+
+		out[wr++] = byte;
+		if (byte == 0x00)
+			zeros_seen++;
+		else
+			zeros_seen = 0;
+	}
+
+	return wr;
+}
+
+/*
+ * Consume one list's modification operations up to and including the
+ * terminating modification_of_pic_nums_idc == 3. Codes 0, 1 and 2 are
+ * each followed by one ue(v) operand, which is read and discarded.
+ *
+ * Returns false if the bit reader reported an error, true otherwise.
+ */
+static bool skip_list_modification_ops(struct br *b)
+{
+	for (;;) {
+		const uint32_t op = br_read_ue(b);
+
+		if (op <= 2)
+			br_read_ue(b); /* abs_diff_pic_num_minus1 or long_term_pic_num */
+		if (b->error)
+			return false;
+		if (op == 3)
+			return true;
+	}
+}
+
+/*
+ * §7.3.3.1 ref_pic_list_modification(): advance the reader past it,
+ * keeping nothing. Every slice type except I and SI carries the list 0
+ * part; B slices carry a list 1 part as well. A reader error inside
+ * the list 0 loop ends the skip immediately.
+ */
+static void skip_ref_pic_list_modification(struct br *b,
+					   uint32_t slice_type)
+{
+	const uint32_t kind = slice_type % 5;
+	const bool has_l0 = (kind != 2 && kind != 4); /* P, SP, B */
+	const bool has_l1 = (kind == 1);              /* B */
+
+	/* ref_pic_list_modification_flag_l0 gates the list 0 loop. */
+	if (has_l0 && br_read_u(b, 1) && !skip_list_modification_ops(b))
+		return;
+
+	/* ref_pic_list_modification_flag_l1 gates the list 1 loop. */
+	if (has_l1 && br_read_u(b, 1))
+		(void)skip_list_modification_ops(b);
+}
+
+/*
+ * Consume the weight entries of one reference list, indices 0 through
+ * @last_idx inclusive, stopping early once the reader reports an
+ * error. Each entry is a luma flag with an optional weight/offset
+ * pair, then (when @has_chroma) a chroma flag with two optional
+ * weight/offset pairs.
+ */
+static void skip_weight_list(struct br *b, bool has_chroma,
+			     uint32_t last_idx)
+{
+	uint32_t idx;
+
+	for (idx = 0; idx <= last_idx && !b->error; idx++) {
+		if (br_read_u(b, 1)) {		/* luma_weight_lX_flag */
+			br_read_se(b);		/* luma_weight_lX */
+			br_read_se(b);		/* luma_offset_lX */
+		}
+
+		if (!has_chroma)
+			continue;
+
+		if (br_read_u(b, 1)) {		/* chroma_weight_lX_flag */
+			unsigned k;
+
+			/* weight and offset for each of the two chroma planes */
+			for (k = 0; k < 4; k++)
+				br_read_se(b);
+		}
+	}
+}
+
+/*
+ * §7.3.3.2 pred_weight_table(): advance the reader past it. How many
+ * bits that takes depends on the active reference counts and on
+ * whether chroma is present (chroma_format_idc != 0). The bit-depth
+ * arguments are accepted but not used.
+ */
+static void skip_pred_weight_table(struct br *b,
+				   uint32_t slice_type,
+				   uint8_t chroma_format_idc,
+				   uint8_t bit_depth_luma_minus8,
+				   uint8_t bit_depth_chroma_minus8,
+				   uint32_t num_ref_idx_l0_active_minus1,
+				   uint32_t num_ref_idx_l1_active_minus1)
+{
+	const bool has_chroma = (chroma_format_idc != 0);
+
+	(void)bit_depth_luma_minus8;
+	(void)bit_depth_chroma_minus8;
+
+	br_read_ue(b); /* luma_log2_weight_denom */
+	if (has_chroma)
+		br_read_ue(b); /* chroma_log2_weight_denom */
+
+	skip_weight_list(b, has_chroma, num_ref_idx_l0_active_minus1);
+
+	/* Only B slices carry list 1 weights. */
+	if (slice_type % 5 == 1)
+		skip_weight_list(b, has_chroma, num_ref_idx_l1_active_minus1);
+}
+
+/*
+ * Parse an H.264 slice_header() (§7.3.3) through dec_ref_pic_marking()
+ * and record the parsed values and measured bit lengths in @out.
+ *
+ * @nal_payload, @nal_payload_length: escaped bytes following the NAL
+ *   header. Only the first H264_SLICE_HEADER_SCAN_BYTES are unescaped
+ *   and examined.
+ * @ctx: SPS/PPS/NAL-header values that steer the parse.
+ * @out: zeroed first (whenever it is non-NULL), then filled field by
+ *   field as the header is parsed.
+ *
+ * Returns 0 on success, -EINVAL for a NULL or empty argument or fewer
+ * than two unescaped bytes, and -EIO when the bit reader reports an
+ * error (data exhausted or malformed code).
+ */
+int h264_parse_slice_header(const uint8_t *nal_payload,
+			    size_t nal_payload_length,
+			    const struct h264_slice_header_context *ctx,
+			    struct h264_slice_header_info *out)
+{
+	uint8_t unescaped[H264_SLICE_HEADER_SCAN_BYTES];
+	size_t unescaped_len;
+	struct br b = { 0 };
+	bool idr_pic_flag;
+	uint32_t slice_type;
+	uint32_t num_ref_idx_l0_active_minus1;
+	uint32_t num_ref_idx_l1_active_minus1;
+	size_t pic_order_cnt_start;
+	size_t pic_order_cnt_end;
+	size_t dec_ref_pic_marking_start;
+	size_t dec_ref_pic_marking_end;
+	bool field_pic_flag = false;
+
+	/* Validate before the first dereference of either pointer. */
+	if (!out)
+		return -EINVAL;
+
+	memset(out, 0, sizeof(*out));
+
+	if (!ctx || !nal_payload || nal_payload_length == 0)
+		return -EINVAL;
+
+	idr_pic_flag = (ctx->nal_unit_type == 5);
+
+	unescaped_len = rbsp_unescape(unescaped, nal_payload,
+				      nal_payload_length);
+	if (unescaped_len < 2)
+		return -EINVAL;
+
+	b.data = unescaped;
+	b.length = unescaped_len;
+	b.bit_pos = 0;
+	b.error = false;
+
+	/* slice_header() per §7.3.3 */
+	out->first_mb_in_slice		= br_read_ue(&b);
+	slice_type			= br_read_ue(&b);
+	out->slice_type			= slice_type;
+	out->pic_parameter_set_id	= br_read_ue(&b);
+
+	if (ctx->separate_colour_plane_flag)
+		(void)br_read_u(&b, 2); /* colour_plane_id */
+
+	out->frame_num = br_read_u(&b, ctx->log2_max_frame_num_minus4 + 4u);
+
+	if (!ctx->frame_mbs_only_flag) {
+		field_pic_flag = (br_read_u(&b, 1) != 0);
+		if (field_pic_flag)
+			(void)br_read_u(&b, 1); /* bottom_field_flag */
+	}
+
+	if (idr_pic_flag)
+		out->idr_pic_id = (uint16_t)br_read_ue(&b);
+
+	/*
+	 * pic_order_cnt syntax — measure bit length from the start of
+	 * pic_order_cnt_lsb / delta_pic_order_cnt[0] to the end of
+	 * delta_pic_order_cnt_bottom / delta_pic_order_cnt[1]. This is
+	 * what V4L2 calls pic_order_cnt_bit_size and what hantro G1
+	 * writes into G1_REG_DEC_CTRL6_POC_LENGTH.
+	 */
+	pic_order_cnt_start = b.bit_pos;
+	if (ctx->pic_order_cnt_type == 0) {
+		out->pic_order_cnt_lsb = (uint16_t)br_read_u(
+			&b, ctx->log2_max_pic_order_cnt_lsb_minus4 + 4u);
+		if (ctx->bottom_field_pic_order_in_frame_present_flag &&
+		    !field_pic_flag)
+			out->delta_pic_order_cnt_bottom = br_read_se(&b);
+	} else if (ctx->pic_order_cnt_type == 1 &&
+		   !ctx->delta_pic_order_always_zero_flag) {
+		out->delta_pic_order_cnt0 = br_read_se(&b);
+		if (ctx->bottom_field_pic_order_in_frame_present_flag &&
+		    !field_pic_flag)
+			out->delta_pic_order_cnt1 = br_read_se(&b);
+	}
+	pic_order_cnt_end = b.bit_pos;
+	out->pic_order_cnt_bit_size = (uint32_t)(pic_order_cnt_end -
+						 pic_order_cnt_start);
+
+	if (ctx->redundant_pic_cnt_present_flag)
+		(void)br_read_ue(&b); /* redundant_pic_cnt */
+
+	if (slice_type % 5 == 1) /* B */
+		(void)br_read_u(&b, 1); /* direct_spatial_mv_pred_flag */
+
+	/* Start from the PPS defaults; the slice may override them below. */
+	num_ref_idx_l0_active_minus1 = ctx->num_ref_idx_l0_default_active_minus1;
+	num_ref_idx_l1_active_minus1 = ctx->num_ref_idx_l1_default_active_minus1;
+
+	{
+		uint32_t st = slice_type % 5;
+		if (st == 0 || st == 3 || st == 1) {
+			/* P, SP, B */
+			uint32_t override = br_read_u(&b, 1);
+			if (override) {
+				num_ref_idx_l0_active_minus1 = br_read_ue(&b);
+				if (st == 1)
+					num_ref_idx_l1_active_minus1 = br_read_ue(&b);
+			}
+		}
+	}
+
+	skip_ref_pic_list_modification(&b, slice_type);
+	if (b.error)
+		return -EIO;
+
+	{
+		uint32_t st = slice_type % 5;
+		bool do_pwt =
+			(ctx->weighted_pred_flag && (st == 0 || st == 3)) ||
+			(ctx->weighted_bipred_idc == 1 && st == 1);
+		if (do_pwt) {
+			skip_pred_weight_table(&b, slice_type,
+					       ctx->chroma_format_idc,
+					       ctx->bit_depth_luma_minus8,
+					       ctx->bit_depth_chroma_minus8,
+					       num_ref_idx_l0_active_minus1,
+					       num_ref_idx_l1_active_minus1);
+			if (b.error)
+				return -EIO;
+		}
+	}
+
+	/*
+	 * dec_ref_pic_marking() per §7.3.3.3 — measure bit length;
+	 * hantro G1 writes this into G1_REG_DEC_CTRL5_REFPIC_MK_LEN.
+	 */
+	dec_ref_pic_marking_start = b.bit_pos;
+	if (ctx->nal_ref_idc != 0) {
+		if (idr_pic_flag) {
+			(void)br_read_u(&b, 1); /* no_output_of_prior_pics_flag */
+			(void)br_read_u(&b, 1); /* long_term_reference_flag */
+		} else {
+			uint32_t adaptive = br_read_u(&b, 1);
+			if (adaptive) {
+				uint32_t mmco;
+				do {
+					mmco = br_read_ue(&b);
+					if (mmco == 1 || mmco == 3)
+						br_read_ue(&b); /* difference_of_pic_nums_minus1 */
+					if (mmco == 2)
+						br_read_ue(&b); /* long_term_pic_num */
+					if (mmco == 3 || mmco == 6)
+						br_read_ue(&b); /* long_term_frame_idx */
+					if (mmco == 4)
+						br_read_ue(&b); /* max_long_term_frame_idx_plus1 */
+					if (b.error)
+						return -EIO;
+				} while (mmco != 0);
+			}
+		}
+	}
+	dec_ref_pic_marking_end = b.bit_pos;
+	out->dec_ref_pic_marking_bit_size =
+		(uint32_t)(dec_ref_pic_marking_end - dec_ref_pic_marking_start);
+
+	if (b.error)
+		return -EIO;
+
+	return 0;
+}
@@ -0,0 +1,95 @@
+/*
+ * H.264 slice header bit-parser for libva-v4l2-request.
+ *
+ * Extracts the slice-header bit-position and value fields that
+ * V4L2_CID_STATELESS_H264_DECODE_PARAMS requires (idr_pic_id,
+ * pic_order_cnt_lsb, delta_pic_order_cnt_*, pic_order_cnt_bit_size,
+ * dec_ref_pic_marking_bit_size). VAAPI's pre-parsed
+ * VAPictureParameterBufferH264 / VASliceParameterBufferH264 do not
+ * carry these — they live only in the bitstream's slice_header()
+ * syntax. Hantro G1 (drivers/media/platform/verisilicon/
+ * hantro_g1_h264_dec.c::set_params) writes the bit_size fields
+ * directly into MMIO registers G1_REG_DEC_CTRL5_REFPIC_MK_LEN and
+ * G1_REG_DEC_CTRL6_POC_LENGTH; with zeros the hardware bitstream
+ * parser walks past zero bits, lands on garbage, decodes nothing.
+ *
+ * Spec reference: ITU-T Rec. H.264 (08/2024) §7.3.3 slice_header
+ * and §7.3.3.1 ref_pic_list_modification, §7.3.3.2 pred_weight_table,
+ * §7.3.3.3 dec_ref_pic_marking.
+ *
+ * Cross-reference (proven working on hantro): FFmpeg's
+ * libavcodec/h264_slice.c populates H264SliceContext::ref_pic_marking_
+ * bit_size and pic_order_cnt_bit_size from its bit-precise slice
+ * header parse, then v4l2_request_h264.c forwards them.
+ */
+
+#ifndef H264_SLICE_HEADER_H
+#define H264_SLICE_HEADER_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct h264_slice_header_context {
+	/* From SPS (the active SPS at slice-time). */
+	bool		separate_colour_plane_flag;	/* when set, a 2-bit colour_plane_id is skipped */
+	uint8_t		log2_max_frame_num_minus4;	/* frame_num is read as (this + 4) bits */
+	bool		frame_mbs_only_flag;	/* when clear, field_pic_flag is read */
+	uint8_t		pic_order_cnt_type;	/* 0: pic_order_cnt_lsb is read; 1: delta_pic_order_cnt[] may be read */
+	uint8_t		log2_max_pic_order_cnt_lsb_minus4;	/* pic_order_cnt_lsb is read as (this + 4) bits */
+	bool		delta_pic_order_always_zero_flag;	/* when set, the type-1 deltas are not read */
+
+	/* From PPS (the active PPS at slice-time). */
+	bool		bottom_field_pic_order_in_frame_present_flag;	/* enables the bottom / [1] delta when field_pic_flag is 0 */
+	bool		redundant_pic_cnt_present_flag;	/* when set, redundant_pic_cnt is skipped */
+	bool		weighted_pred_flag;	/* P/SP slices then carry pred_weight_table() */
+	uint8_t		weighted_bipred_idc;	/* == 1: B slices carry pred_weight_table() */
+	uint8_t		num_ref_idx_l0_default_active_minus1;	/* used unless the slice header overrides it */
+	uint8_t		num_ref_idx_l1_default_active_minus1;	/* used unless a B slice header overrides it */
+	uint8_t		chroma_format_idc;	/* != 0: chroma weights are parsed. NOTE(review): an SPS element in H.264 despite this group's heading — confirm */
+	uint8_t		bit_depth_luma_minus8;	/* passed through but unused by the parser. NOTE(review): SPS element — confirm */
+	uint8_t		bit_depth_chroma_minus8;	/* passed through but unused by the parser. NOTE(review): SPS element — confirm */
+
+	/* From the NAL unit header (already extracted by the caller). */
+	uint8_t		nal_unit_type;	/* 5 is treated as an IDR slice */
+	uint8_t		nal_ref_idc;	/* 0: dec_ref_pic_marking() is not parsed */
+};
+
+struct h264_slice_header_info {
+	uint16_t	idr_pic_id;	/* parsed only when nal_unit_type == 5, else 0 */
+	uint16_t	pic_order_cnt_lsb;	/* parsed only for pic_order_cnt_type 0 */
+	int32_t		delta_pic_order_cnt_bottom;	/* type 0, frame pictures with the PPS flag set */
+	int32_t		delta_pic_order_cnt0;	/* type 1, unless the always-zero flag is set */
+	int32_t		delta_pic_order_cnt1;	/* type 1, frame pictures with the PPS flag set */
+	uint32_t	pic_order_cnt_bit_size;	/* bits spanned by the POC fields above; 0 if none were read */
+	uint32_t	dec_ref_pic_marking_bit_size;	/* bits spanned by dec_ref_pic_marking(); 0 when nal_ref_idc == 0 */
+
+	/* Diagnostic — useful for cross-checking VAAPI vs bitstream values. */
+	uint32_t	first_mb_in_slice;
+	uint32_t	slice_type;	/* raw value as coded (the parser reduces it mod 5 where needed) */
+	uint32_t	pic_parameter_set_id;
+	uint32_t	frame_num;
+};
+
+/*
+ * Parse slice_header() up to dec_ref_pic_marking() (inclusive) of
+ * the H.264 RBSP slice_layer_without_partitioning_rbsp() syntax,
+ * extracting the V4L2 DECODE_PARAMS fields. Returns 0 on success,
+ * negative errno-shaped value on parse failure (insufficient data,
+ * malformed exp-Golomb, etc.).
+ *
+ * @nal_payload: pointer to the byte AFTER the NAL header byte
+ *               (i.e. start of the RBSP proper; caller has already
+ *               skipped any ANNEX_B start code and the 1-byte
+ *               nal_unit_header). Will be RBSP-unescaped internally
+ *               before parsing.
+ * @nal_payload_length: bytes available at @nal_payload.
+ * @ctx: SPS/PPS/NAL context required to drive the parse.
+ * @out: filled on success. All fields zero-initialized first.
+ */
+int h264_parse_slice_header(const uint8_t *nal_payload,
+			    size_t nal_payload_length,
+			    const struct h264_slice_header_context *ctx,
+			    struct h264_slice_header_info *out);
+
+#endif /* H264_SLICE_HEADER_H */
@@ -27,6 +27,12 @@
 #ifndef _H265_H_
 #define _H265_H_

+/* Maximum number of slices per frame the libva backend will accumulate
+ * before submitting to the kernel (kernel HEVC slice_params dynamic-array
+ * accepts up to 600 entries per Phase 0 V4L2 inventory; 64 is a
+ * conservative cap for typical fixtures + safety bound). */
+#define HEVC_MAX_SLICES_PER_FRAME 64
+
 struct object_context;
 struct object_surface;
 struct request_data;
@@ -0,0 +1,14 @@
+/* Stub for <gst/base/base-prelude.h> — GStreamer base-lib prelude.
+ * In upstream GStreamer, this sets up the GstBaseExport macro + GObject
+ * boilerplate. We bypass all of that and provide only what our four
+ * vendored .c files actually need (gst_compat.h's typedefs).
+ *
+ * Crucially we also #define GST_BASE_API to nothing so the function
+ * declarations in gstbitreader.h / gstbytereader.h drop the
+ * dllimport / visibility attribute prefix.
+ */
+#ifndef LIBVA_V4L2_REQUEST_FOURIER_BASE_PRELUDE_STUB
+#define LIBVA_V4L2_REQUEST_FOURIER_BASE_PRELUDE_STUB
+#include "gst_compat.h"
+#define GST_BASE_API
+#endif
@@ -0,0 +1,307 @@
+/* GStreamer
+ *
+ * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define GST_BIT_READER_DISABLE_INLINES
+#include "gstbitreader.h"
+
+#include <string.h>
+
+/**
+ * SECTION:gstbitreader
+ * @title: GstBitReader
+ * @short_description: Reads any number of bits from a memory buffer
+ * @symbols:
+ * - gst_bit_reader_skip_unchecked
+ * - gst_bit_reader_skip_to_byte_unchecked
+ * - gst_bit_reader_get_bits_uint8_unchecked
+ * - gst_bit_reader_peek_bits_uint8_unchecked
+ * - gst_bit_reader_get_bits_uint16_unchecked
+ * - gst_bit_reader_peek_bits_uint16_unchecked
+ * - gst_bit_reader_get_bits_uint32_unchecked
+ * - gst_bit_reader_peek_bits_uint32_unchecked
+ * - gst_bit_reader_get_bits_uint64_unchecked
+ * - gst_bit_reader_peek_bits_uint64_unchecked
+ *
+ * #GstBitReader provides a bit reader that can read any number of bits
+ * from a memory buffer. It provides functions for reading any number of bits
+ * into 8, 16, 32 and 64 bit variables.
+ */
+
+/**
+ * gst_bit_reader_new: (skip)
+ * @data: (array length=size): Data from which the #GstBitReader
+ *   should read
+ * @size: Size of @data in bytes
+ *
+ * Create a new #GstBitReader instance, which will read from @data.
+ *
+ * Free-function: gst_bit_reader_free
+ *
+ * Returns: (transfer full): a new #GstBitReader instance
+ */
+GstBitReader *
+gst_bit_reader_new (const guint8 * data, guint size)
+{
+  GstBitReader *reader;
+
+  /* Only data and size are assigned here; the position fields rely on
+   * g_new0() returning zero-filled memory (GLib semantics — confirm for
+   * the compat layer in use). */
+  reader = g_new0 (GstBitReader, 1);
+  reader->size = size;
+  reader->data = data;
+
+  return reader;
+}
+
+/**
+ * gst_bit_reader_free:
+ * @reader: (in) (transfer full): a #GstBitReader instance
+ *
+ * Frees a #GstBitReader instance, which was previously allocated by
+ * gst_bit_reader_new().
+ */
+void
+gst_bit_reader_free (GstBitReader * reader)
+{
+  g_return_if_fail (reader != NULL);
+
+  g_free (reader);  /* frees only the reader struct; reader->data is not touched here */
+}
+
+/**
+ * gst_bit_reader_init:
+ * @reader: a #GstBitReader instance
+ * @data: (in) (array length=size): data from which the bit reader should read
+ * @size: Size of @data in bytes
+ *
+ * Initializes a #GstBitReader instance to read from @data. This function
+ * can be called on already initialized instances.
+ */
+void
+gst_bit_reader_init (GstBitReader * reader, const guint8 * data, guint size)
+{
+  g_return_if_fail (reader != NULL);
+
+  /* Rewind to the first bit of the first byte. */
+  reader->bit = 0;
+  reader->byte = 0;
+
+  /* Attach the caller's buffer. */
+  reader->size = size;
+  reader->data = data;
+}
+
+/**
+ * gst_bit_reader_set_pos:
+ * @reader: a #GstBitReader instance
+ * @pos: The new position in bits
+ *
+ * Sets the new position of a #GstBitReader instance to @pos in bits.
+ *
+ * Returns: %TRUE if the position could be set successfully, %FALSE
+ * otherwise.
+ */
+gboolean
+gst_bit_reader_set_pos (GstBitReader * reader, guint pos)
+{
+  guint total_bits;
+
+  g_return_val_if_fail (reader != NULL, FALSE);
+
+  /* A position equal to the total bit count (one past the last bit)
+   * is accepted. */
+  total_bits = reader->size * 8;
+  if (pos <= total_bits) {
+    reader->bit = pos % 8;
+    reader->byte = pos / 8;
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+/**
+ * gst_bit_reader_get_pos:
+ * @reader: a #GstBitReader instance
+ *
+ * Returns the current position of a #GstBitReader instance in bits.
+ *
+ * Returns: The current position of @reader in bits.
+ */
+guint
+gst_bit_reader_get_pos (const GstBitReader * reader)
+{
+  return _gst_bit_reader_get_pos_inline (reader);  /* out-of-line wrapper; the logic lives in the _inline helper */
+}
+
+/**
+ * gst_bit_reader_get_remaining:
+ * @reader: a #GstBitReader instance
+ *
+ * Returns the remaining number of bits of a #GstBitReader instance.
+ *
+ * Returns: The remaining number of bits of @reader instance.
+ */
+guint
+gst_bit_reader_get_remaining (const GstBitReader * reader)
+{
+  return _gst_bit_reader_get_remaining_inline (reader);  /* out-of-line wrapper; the logic lives in the _inline helper */
+}
+
+/**
+ * gst_bit_reader_get_size:
+ * @reader: a #GstBitReader instance
+ *
+ * Returns the total number of bits of a #GstBitReader instance.
+ *
+ * Returns: The total number of bits of @reader instance.
+ */
+guint
+gst_bit_reader_get_size (const GstBitReader * reader)
+{
+  return _gst_bit_reader_get_size_inline (reader);  /* out-of-line wrapper; the logic lives in the _inline helper */
+}
+
+/**
+ * gst_bit_reader_skip:
+ * @reader: a #GstBitReader instance
+ * @nbits: the number of bits to skip
+ *
+ * Skips @nbits bits of the #GstBitReader instance.
+ *
+ * Returns: %TRUE if @nbits bits could be skipped, %FALSE otherwise.
+ */
+gboolean
+gst_bit_reader_skip (GstBitReader * reader, guint nbits)
+{
+  return _gst_bit_reader_skip_inline (reader, nbits);  /* out-of-line wrapper; the logic lives in the _inline helper */
+}
+
+/**
+ * gst_bit_reader_skip_to_byte:
+ * @reader: a #GstBitReader instance
+ *
+ * Skips until the next byte.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+gboolean
+gst_bit_reader_skip_to_byte (GstBitReader * reader)
+{
+  return _gst_bit_reader_skip_to_byte_inline (reader);  /* out-of-line wrapper; the logic lives in the _inline helper */
+}
+
+/**
+ * gst_bit_reader_get_bits_uint8:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint8 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val and update the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+/**
+ * gst_bit_reader_get_bits_uint16:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint16 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val and update the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+/**
+ * gst_bit_reader_get_bits_uint32:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint32 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val and update the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+/**
+ * gst_bit_reader_get_bits_uint64:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint64 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val and update the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+/**
+ * gst_bit_reader_peek_bits_uint8:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint8 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val but keep the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+/**
+ * gst_bit_reader_peek_bits_uint16:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint16 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val but keep the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+/**
+ * gst_bit_reader_peek_bits_uint32:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint32 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val but keep the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+/**
+ * gst_bit_reader_peek_bits_uint64:
+ * @reader: a #GstBitReader instance
+ * @val: (out): Pointer to a #guint64 to store the result
+ * @nbits: number of bits to read
+ *
+ * Read @nbits bits into @val but keep the current position.
+ *
+ * Returns: %TRUE if successful, %FALSE otherwise.
+ */
+
+#define GST_BIT_READER_READ_BITS(bits) \
+gboolean \
+gst_bit_reader_peek_bits_uint##bits (const GstBitReader *reader, guint##bits *val, guint nbits) \
+{ \
+  return _gst_bit_reader_peek_bits_uint##bits##_inline (reader, val, nbits); \
+} \
+\
+gboolean \
+gst_bit_reader_get_bits_uint##bits (GstBitReader *reader, guint##bits *val, guint nbits) \
+{ \
+  return _gst_bit_reader_get_bits_uint##bits##_inline (reader, val, nbits); \
+}
+
+GST_BIT_READER_READ_BITS (8);   /* defines gst_bit_reader_peek_bits_uint8 and gst_bit_reader_get_bits_uint8 */
+GST_BIT_READER_READ_BITS (16);  /* defines the uint16 peek/get pair */
+GST_BIT_READER_READ_BITS (32);  /* defines the uint32 peek/get pair */
+GST_BIT_READER_READ_BITS (64);  /* defines the uint64 peek/get pair */
@@ -0,0 +1,328 @@
+/* GStreamer
+ *
+ * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GST_BIT_READER_H__
+#define __GST_BIT_READER_H__
+
+#include <gst/gst.h>
+#include <gst/base/base-prelude.h>
+
+/* FIXME: inline functions */
+
+G_BEGIN_DECLS
+
+#define GST_BIT_READER(reader) ((GstBitReader *) (reader))
+
+/**
+ * GstBitReader:
+ * @data: (array length=size): Data from which the bit reader will
+ *   read
+ * @size: Size of @data in bytes
+ * @byte: Current byte position
+ * @bit: Bit position in the current byte
+ *
+ * A bit reader instance.
+ */
+typedef struct {
+  const guint8 *data; /* Data from which the bit reader reads */
+  guint size; /* Size of data in bytes */
+
+  guint byte;  /* Byte position */
+  guint bit;   /* Bit position in the current byte */
+
+  /* < private > */
+  gpointer _gst_reserved[GST_PADDING];
+} GstBitReader;
+
+GST_BASE_API
+GstBitReader *  gst_bit_reader_new              (const guint8 *data, guint size) G_GNUC_MALLOC;
+
+GST_BASE_API
+void            gst_bit_reader_free             (GstBitReader *reader);
+
+GST_BASE_API
+void            gst_bit_reader_init             (GstBitReader *reader, const guint8 *data, guint size);
+
+GST_BASE_API
+gboolean        gst_bit_reader_set_pos          (GstBitReader *reader, guint pos);
+
+GST_BASE_API
+guint           gst_bit_reader_get_pos          (const GstBitReader *reader);
+
+GST_BASE_API
+guint           gst_bit_reader_get_remaining    (const GstBitReader *reader);
+
+GST_BASE_API
+guint           gst_bit_reader_get_size         (const GstBitReader *reader);
+
+GST_BASE_API
+gboolean        gst_bit_reader_skip             (GstBitReader *reader, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_skip_to_byte     (GstBitReader *reader);
+
+GST_BASE_API
+gboolean        gst_bit_reader_get_bits_uint8   (GstBitReader *reader, guint8 *val, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_get_bits_uint16  (GstBitReader *reader, guint16 *val, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_get_bits_uint32  (GstBitReader *reader, guint32 *val, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_get_bits_uint64  (GstBitReader *reader, guint64 *val, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_peek_bits_uint8  (const GstBitReader *reader, guint8 *val, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_peek_bits_uint16 (const GstBitReader *reader, guint16 *val, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_peek_bits_uint32 (const GstBitReader *reader, guint32 *val, guint nbits);
+
+GST_BASE_API
+gboolean        gst_bit_reader_peek_bits_uint64 (const GstBitReader *reader, guint64 *val, guint nbits);
+
+/**
+ * GST_BIT_READER_INIT:
+ * @data: Data from which the #GstBitReader should read
+ * @size: Size of @data in bytes
+ *
+ * A #GstBitReader must be initialized with this macro before it can be
+ * used. This macro can be used to initialize a variable, but it cannot
+ * be assigned to a variable. In that case you have to use
+ * gst_bit_reader_init().
+ */
+#define GST_BIT_READER_INIT(data, size) {data, size, 0, 0}
+
+/* Unchecked variants */
+
+/* Move the read position forward by @nbits bits. No bounds check is done
+ * here; the position is simply re-normalised into a byte index plus a bit
+ * offset in the range 0..7. */
+static inline void
+gst_bit_reader_skip_unchecked (GstBitReader * reader, guint nbits)
+{
+  guint bit_offset = reader->bit + nbits;
+
+  reader->byte += bit_offset / 8;
+  reader->bit = bit_offset % 8;
+}
+
+/* Round the read position up to the next byte boundary. A reader that is
+ * already byte-aligned is left untouched. No bounds check is done. */
+static inline void
+gst_bit_reader_skip_to_byte_unchecked (GstBitReader * reader)
+{
+  if (reader->bit == 0)
+    return;
+
+  reader->byte += 1;
+  reader->bit = 0;
+}
+
+#define __GST_BIT_READER_READ_BITS_UNCHECKED(bits) /* generates the unchecked peek/get pair for guint<bits> */ \
+static inline guint##bits \
+gst_bit_reader_peek_bits_uint##bits##_unchecked (const GstBitReader *reader, guint nbits) \
+{ \
+  guint##bits ret = 0; /* result, built up most significant bits first */ \
+  const guint8 *data; \
+  guint byte, bit; /* local copy of the position: the reader itself is not modified */ \
+  \
+  data = reader->data; \
+  byte = reader->byte; \
+  bit = reader->bit; \
+  \
+  while (nbits > 0) { \
+    guint toread = MIN (nbits, 8 - bit); /* bits to take from the current byte */ \
+    \
+    ret <<= toread; /* make room for the new bits */ \
+    ret |= (data[byte] & (0xff >> bit)) >> (8 - toread - bit); /* mask off consumed high bits, then shift the wanted bits down to bit 0 */ \
+    \
+    bit += toread; \
+    if (bit >= 8) { /* current byte exhausted: continue with the next one */ \
+      byte++; \
+      bit = 0; \
+    } \
+    nbits -= toread; \
+  } \
+  \
+  return ret; \
+} \
+\
+static inline guint##bits \
+gst_bit_reader_get_bits_uint##bits##_unchecked (GstBitReader *reader, guint nbits) \
+{ \
+  guint##bits ret; \
+  \
+  ret = gst_bit_reader_peek_bits_uint##bits##_unchecked (reader, nbits); \
+  \
+  gst_bit_reader_skip_unchecked (reader, nbits); /* advance past the bits just read */ \
+  \
+  return ret; \
+}
+
+__GST_BIT_READER_READ_BITS_UNCHECKED (8)
+__GST_BIT_READER_READ_BITS_UNCHECKED (16)
+__GST_BIT_READER_READ_BITS_UNCHECKED (32)
+__GST_BIT_READER_READ_BITS_UNCHECKED (64)
+
+#undef __GST_BIT_READER_READ_BITS_UNCHECKED
+
+/* unchecked variants -- do not use */
+
+/* Total size of the reader's data, expressed in bits. */
+static inline guint
+_gst_bit_reader_get_size_unchecked (const GstBitReader * reader)
+{
+  const guint size_in_bytes = reader->size;
+
+  return size_in_bytes * 8;
+}
+
+/* Current read position, expressed in bits from the start of the data. */
+static inline guint
+_gst_bit_reader_get_pos_unchecked (const GstBitReader * reader)
+{
+  const guint whole_bytes_in_bits = reader->byte * 8;
+
+  return whole_bytes_in_bits + reader->bit;
+}
+
+/* Number of bits between the current read position and the end of the
+ * data: total size in bits minus the current position in bits. */
+static inline guint
+_gst_bit_reader_get_remaining_unchecked (const GstBitReader * reader)
+{
+  const guint total_bits = _gst_bit_reader_get_size_unchecked (reader);
+  const guint consumed_bits = _gst_bit_reader_get_pos_unchecked (reader);
+
+  return total_bits - consumed_bits;
+}
+
+/* inlined variants -- do not use directly */
+/* Size of the data in bits. A NULL @reader is rejected by the
+ * g_return_val_if_fail() guard, which returns 0. */
+static inline guint
+_gst_bit_reader_get_size_inline (const GstBitReader * reader)
+{
+  guint size_bits;
+
+  g_return_val_if_fail (reader != NULL, 0);
+
+  size_bits = _gst_bit_reader_get_size_unchecked (reader);
+  return size_bits;
+}
+
+/* Current position in bits. A NULL @reader is rejected by the
+ * g_return_val_if_fail() guard, which returns 0. */
+static inline guint
+_gst_bit_reader_get_pos_inline (const GstBitReader * reader)
+{
+  guint pos_bits;
+
+  g_return_val_if_fail (reader != NULL, 0);
+
+  pos_bits = _gst_bit_reader_get_pos_unchecked (reader);
+  return pos_bits;
+}
+
+/* Number of bits left to read. A NULL @reader is rejected by the
+ * g_return_val_if_fail() guard, which returns 0. */
+static inline guint
+_gst_bit_reader_get_remaining_inline (const GstBitReader * reader)
+{
+  guint remaining_bits;
+
+  g_return_val_if_fail (reader != NULL, 0);
+
+  remaining_bits = _gst_bit_reader_get_remaining_unchecked (reader);
+  return remaining_bits;
+}
+
+/* Checked skip: advance by @nbits bits if that many bits are still
+ * available. Returns TRUE when the position was advanced and FALSE (with
+ * the position unchanged) when fewer than @nbits bits remain. */
+static inline gboolean
+_gst_bit_reader_skip_inline (GstBitReader * reader, guint nbits)
+{
+  g_return_val_if_fail (reader != NULL, FALSE);
+
+  if (nbits <= _gst_bit_reader_get_remaining_unchecked (reader)) {
+    gst_bit_reader_skip_unchecked (reader, nbits);
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+/* Checked variant of gst_bit_reader_skip_to_byte_unchecked(): align the
+ * position to the next byte boundary. Returns FALSE without touching the
+ * reader when the byte position already lies beyond the data size. */
+static inline gboolean
+_gst_bit_reader_skip_to_byte_inline (GstBitReader * reader)
+{
+  g_return_val_if_fail (reader != NULL, FALSE);
+
+  if (reader->byte <= reader->size) {
+    gst_bit_reader_skip_to_byte_unchecked (reader);
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+#define __GST_BIT_READER_READ_BITS_INLINE(bits) /* generates the checked get/peek pair for guint<bits> */ \
+static inline gboolean \
+_gst_bit_reader_get_bits_uint##bits##_inline (GstBitReader *reader, guint##bits *val, guint nbits) \
+{ \
+  g_return_val_if_fail (reader != NULL, FALSE); \
+  g_return_val_if_fail (val != NULL, FALSE); \
+  g_return_val_if_fail (nbits <= bits, FALSE); /* request must fit into the result type */ \
+  \
+  if (_gst_bit_reader_get_remaining_unchecked (reader) < nbits) /* not enough bits left: *val and the position stay untouched */ \
+    return FALSE; \
+\
+  *val = gst_bit_reader_get_bits_uint##bits##_unchecked (reader, nbits); /* reads and advances the position */ \
+  return TRUE; \
+} \
+\
+static inline gboolean \
+_gst_bit_reader_peek_bits_uint##bits##_inline (const GstBitReader *reader, guint##bits *val, guint nbits) \
+{ \
+  g_return_val_if_fail (reader != NULL, FALSE); \
+  g_return_val_if_fail (val != NULL, FALSE); \
+  g_return_val_if_fail (nbits <= bits, FALSE); /* request must fit into the result type */ \
+  \
+  if (_gst_bit_reader_get_remaining_unchecked (reader) < nbits) /* not enough bits left: *val stays untouched */ \
+    return FALSE; \
+\
+  *val = gst_bit_reader_peek_bits_uint##bits##_unchecked (reader, nbits); /* reads without moving the position */ \
+  return TRUE; \
+}
+
+__GST_BIT_READER_READ_BITS_INLINE (8)
+__GST_BIT_READER_READ_BITS_INLINE (16)
+__GST_BIT_READER_READ_BITS_INLINE (32)
+__GST_BIT_READER_READ_BITS_INLINE (64)
+
+#undef __GST_BIT_READER_READ_BITS_INLINE
+
+#ifndef GST_BIT_READER_DISABLE_INLINES /* when defined, the names below keep referring to the exported functions declared above */
+
+#define gst_bit_reader_get_size(reader) \
+    _gst_bit_reader_get_size_inline (reader)
+#define gst_bit_reader_get_pos(reader) \
+    _gst_bit_reader_get_pos_inline (reader)
+#define gst_bit_reader_get_remaining(reader) \
+    _gst_bit_reader_get_remaining_inline (reader)
+
+/* we use defines here so we can add the G_LIKELY() */
+
+#define gst_bit_reader_skip(reader, nbits)\
+    G_LIKELY (_gst_bit_reader_skip_inline(reader, nbits))
+#define gst_bit_reader_skip_to_byte(reader)\
+    G_LIKELY (_gst_bit_reader_skip_to_byte_inline(reader))
+
+#define gst_bit_reader_get_bits_uint8(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_get_bits_uint8_inline (reader, val, nbits))
+#define gst_bit_reader_get_bits_uint16(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_get_bits_uint16_inline (reader, val, nbits))
+#define gst_bit_reader_get_bits_uint32(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_get_bits_uint32_inline (reader, val, nbits))
+#define gst_bit_reader_get_bits_uint64(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_get_bits_uint64_inline (reader, val, nbits))
+
+#define gst_bit_reader_peek_bits_uint8(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_peek_bits_uint8_inline (reader, val, nbits))
+#define gst_bit_reader_peek_bits_uint16(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_peek_bits_uint16_inline (reader, val, nbits))
+#define gst_bit_reader_peek_bits_uint32(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_peek_bits_uint32_inline (reader, val, nbits))
+#define gst_bit_reader_peek_bits_uint64(reader, val, nbits) \
+    G_LIKELY (_gst_bit_reader_peek_bits_uint64_inline (reader, val, nbits))
+#endif /* GST_BIT_READER_DISABLE_INLINES */
+
+G_END_DECLS
+
+#endif /* __GST_BIT_READER_H__ */
@@ -0,0 +1,67 @@
+/* Stub for <gst/base/gstbitwriter.h>.
+ *
+ * The vendored nalutils.c uses GstBitWriter for NAL emulation-prevention
+ * byte INSERTION during write-side (encoder) operations. The libva
+ * backend never invokes those paths — we only PARSE NAL units, never
+ * write them. The functions must still compile + link though, so we
+ * stub them with abort() runtime guards: if any future code path
+ * accidentally invokes a writer function, we fail-fast instead of
+ * silently corrupting.
+ *
+ * Header surface mirrors upstream gstbitwriter.h minimally — enough
+ * for nalutils.c to compile.
+ */
+#ifndef LIBVA_V4L2_REQUEST_FOURIER_GSTBITWRITER_STUB
+#define LIBVA_V4L2_REQUEST_FOURIER_GSTBITWRITER_STUB
+
+#include "gst_compat.h"
+
+typedef struct {
+    guint8 *data; /* read through GST_BIT_WRITER_DATA() */
+    guint  bit_size; /* read through GST_BIT_WRITER_BIT_SIZE() */
+    guint  bit_capacity; /* not referenced by any stub in this header */
+    gboolean auto_grow; /* not referenced by any stub in this header */
+    gboolean owned; /* not referenced by any stub in this header */
+} GstBitWriter; /* NOTE(review): field list presumably mirrors upstream GstBitWriter — confirm against gstbitwriter.h */
+
+static inline void
+gst_bit_writer_init(GstBitWriter *bw) { (void)bw; abort(); } /* stub: every writer entry point aborts instead of writing */
+static inline void
+gst_bit_writer_init_with_size(GstBitWriter *bw, guint size, gboolean fixed) {
+    (void)bw; (void)size; (void)fixed; abort(); /* (void) casts silence unused-parameter warnings */
+}
+static inline void
+gst_bit_writer_reset(GstBitWriter *bw) { (void)bw; abort(); }
+static inline gboolean
+gst_bit_writer_put_bits_uint8(GstBitWriter *bw, guint8 value, guint nbits) {
+    (void)bw; (void)value; (void)nbits; abort(); /* abort() does not return, so no return statement is needed */
+}
+static inline gboolean
+gst_bit_writer_align_bytes(GstBitWriter *bw, guint8 trailing_bit) {
+    (void)bw; (void)trailing_bit; abort();
+}
+static inline guint8 *
+gst_bit_writer_get_data(GstBitWriter *bw) { (void)bw; abort(); }
+static inline guint
+gst_bit_writer_get_size(const GstBitWriter *bw) { (void)bw; abort(); }
+static inline guint
+gst_bit_writer_reset_and_get_size(GstBitWriter *bw) { (void)bw; abort(); }
+static inline guint8 *
+gst_bit_writer_reset_and_get_data(GstBitWriter *bw) { (void)bw; abort(); }
+static inline gboolean
+gst_bit_writer_put_bits_uint16(GstBitWriter *bw, guint16 value, guint nbits) {
+    (void)bw; (void)value; (void)nbits; abort();
+}
+static inline gboolean
+gst_bit_writer_put_bits_uint32(GstBitWriter *bw, guint32 value, guint nbits) {
+    (void)bw; (void)value; (void)nbits; abort();
+}
+static inline gboolean
+gst_bit_writer_put_bytes(GstBitWriter *bw, const guint8 *data, guint nbytes) {
+    (void)bw; (void)data; (void)nbytes; abort();
+}
+
+#define GST_BIT_WRITER_BIT_SIZE(bw)  ((bw)->bit_size)
+#define GST_BIT_WRITER_DATA(bw)      ((bw)->data)
+
+#endif
@@ -0,0 +1,684 @@
+/* GStreamer byte reader
+ *
+ * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>.
+ * Copyright (C) 2009 Tim-Philipp Müller <tim centricular net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GST_BYTE_READER_H__
+#define __GST_BYTE_READER_H__
+
+#include <gst/gst.h>
+#include <gst/base/base-prelude.h>
+
+G_BEGIN_DECLS
+
+#define GST_BYTE_READER(reader) ((GstByteReader *) (reader))
+
+/**
+ * GstByteReader:
+ * @data: (array length=size): Data from which the byte reader will
+ *   read
+ * @size: Size of @data in bytes
+ * @byte: Current byte position
+ *
+ * A byte reader instance.
+ */
+typedef struct {
+  const guint8 *data;
+  guint size;
+
+  guint byte;  /* Byte position */
+
+  /* < private > */
+  gpointer _gst_reserved[GST_PADDING];
+} GstByteReader;
+
+GST_BASE_API
+GstByteReader * gst_byte_reader_new             (const guint8 *data, guint size) G_GNUC_MALLOC;
+
+GST_BASE_API
+void            gst_byte_reader_free            (GstByteReader *reader);
+
+GST_BASE_API
+void            gst_byte_reader_init            (GstByteReader *reader, const guint8 *data, guint size);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_sub_reader (GstByteReader * reader,
+                                                 GstByteReader * sub_reader,
+                                                 guint           size);
+GST_BASE_API
+gboolean        gst_byte_reader_get_sub_reader  (GstByteReader * reader,
+                                                 GstByteReader * sub_reader,
+                                                 guint           size);
+GST_BASE_API
+gboolean        gst_byte_reader_set_pos         (GstByteReader *reader, guint pos);
+
+GST_BASE_API
+guint           gst_byte_reader_get_pos         (const GstByteReader *reader);
+
+GST_BASE_API
+guint           gst_byte_reader_get_remaining   (const GstByteReader *reader);
+
+GST_BASE_API
+guint           gst_byte_reader_get_size        (const GstByteReader *reader);
+
+GST_BASE_API
+gboolean        gst_byte_reader_skip            (GstByteReader *reader, guint nbytes);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint8       (GstByteReader *reader, guint8 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int8        (GstByteReader *reader, gint8 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint16_le   (GstByteReader *reader, guint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int16_le    (GstByteReader *reader, gint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint16_be   (GstByteReader *reader, guint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int16_be    (GstByteReader *reader, gint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint24_le   (GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int24_le    (GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint24_be   (GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int24_be    (GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint32_le   (GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int32_le    (GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint32_be   (GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int32_be    (GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint64_le   (GstByteReader *reader, guint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int64_le    (GstByteReader *reader, gint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_uint64_be   (GstByteReader *reader, guint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_int64_be    (GstByteReader *reader, gint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint8      (const GstByteReader *reader, guint8 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int8       (const GstByteReader *reader, gint8 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint16_le  (const GstByteReader *reader, guint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int16_le   (const GstByteReader *reader, gint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint16_be  (const GstByteReader *reader, guint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int16_be   (const GstByteReader *reader, gint16 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint24_le  (const GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int24_le   (const GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint24_be  (const GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int24_be   (const GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint32_le  (const GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int32_le   (const GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint32_be  (const GstByteReader *reader, guint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int32_be   (const GstByteReader *reader, gint32 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint64_le  (const GstByteReader *reader, guint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int64_le   (const GstByteReader *reader, gint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_uint64_be  (const GstByteReader *reader, guint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_int64_be   (const GstByteReader *reader, gint64 *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_float32_le  (GstByteReader *reader, gfloat *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_float32_be  (GstByteReader *reader, gfloat *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_float64_le  (GstByteReader *reader, gdouble *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_float64_be  (GstByteReader *reader, gdouble *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_float32_le (const GstByteReader *reader, gfloat *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_float32_be (const GstByteReader *reader, gfloat *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_float64_le (const GstByteReader *reader, gdouble *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_float64_be (const GstByteReader *reader, gdouble *val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_dup_data        (GstByteReader * reader, guint size, guint8       ** val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_data        (GstByteReader * reader, guint size, const guint8 ** val);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_data       (const GstByteReader * reader, guint size, const guint8 ** val);
+
+#define gst_byte_reader_dup_string(reader,str) \
+    gst_byte_reader_dup_string_utf8(reader,str)
+
+GST_BASE_API
+gboolean        gst_byte_reader_dup_string_utf8  (GstByteReader * reader, gchar   ** str);
+
+GST_BASE_API
+gboolean        gst_byte_reader_dup_string_utf16 (GstByteReader * reader, guint16 ** str);
+
+GST_BASE_API
+gboolean        gst_byte_reader_dup_string_utf32 (GstByteReader * reader, guint32 ** str);
+
+#define gst_byte_reader_skip_string(reader) \
+    gst_byte_reader_skip_string_utf8(reader)
+
+GST_BASE_API
+gboolean        gst_byte_reader_skip_string_utf8  (GstByteReader * reader);
+
+GST_BASE_API
+gboolean        gst_byte_reader_skip_string_utf16 (GstByteReader * reader);
+
+GST_BASE_API
+gboolean        gst_byte_reader_skip_string_utf32 (GstByteReader * reader);
+
+#define gst_byte_reader_get_string(reader,str) \
+    gst_byte_reader_get_string_utf8(reader,str)
+
+#define gst_byte_reader_peek_string(reader,str) \
+    gst_byte_reader_peek_string_utf8(reader,str)
+
+GST_BASE_API
+gboolean        gst_byte_reader_get_string_utf8    (GstByteReader * reader, const gchar ** str);
+
+GST_BASE_API
+gboolean        gst_byte_reader_peek_string_utf8   (const GstByteReader * reader, const gchar ** str);
+
+GST_BASE_API
+guint           gst_byte_reader_masked_scan_uint32 (const GstByteReader * reader,
+                                                    guint32               mask,
+                                                    guint32               pattern,
+                                                    guint                 offset,
+                                                    guint                 size);
+GST_BASE_API
+guint           gst_byte_reader_masked_scan_uint32_peek (const GstByteReader * reader,
+                                                         guint32 mask,
+                                                         guint32 pattern,
+                                                         guint offset,
+                                                         guint size,
+                                                         guint32 * value);
+
+/**
+ * GST_BYTE_READER_INIT:
+ * @data: Data from which the #GstByteReader should read
+ * @size: Size of @data in bytes
+ *
+ * A #GstByteReader must be initialized with this macro before it can be
+ * used. This macro can be used to initialize a variable, but it cannot
+ * be assigned to a variable. In that case you have to use
+ * gst_byte_reader_init().
+ */
+#define GST_BYTE_READER_INIT(data, size) {data, size, 0}
+
+/* unchecked variants */
+/* Advance the read position by @nbytes bytes without any bounds check. */
+static inline void
+gst_byte_reader_skip_unchecked (GstByteReader * reader, guint nbytes)
+{
+  const guint new_pos = reader->byte + nbytes;
+
+  reader->byte = new_pos;
+}
+
+#define __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(bits,type,lower,upper,adj) /* generates the unchecked peek/get pair for one value format */ \
+\
+static inline type \
+gst_byte_reader_peek_##lower##_unchecked (const GstByteReader * reader) \
+{ \
+  type val = (type) GST_READ_##upper (reader->data + reader->byte); /* decode at the current position via GST_READ_<upper> */ \
+  adj /* optional fix-up statement supplied by the instantiation; may be an empty comment */ \
+  return val; \
+} \
+\
+static inline type \
+gst_byte_reader_get_##lower##_unchecked (GstByteReader * reader) \
+{ \
+  type val = gst_byte_reader_peek_##lower##_unchecked (reader); \
+  reader->byte += bits / 8; /* consume the bytes just read */ \
+  return val; \
+}
+
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(8,guint8,uint8,UINT8,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(8,gint8,int8,UINT8,/* */)
+
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,guint16,uint16_le,UINT16_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,guint16,uint16_be,UINT16_BE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,gint16,int16_le,UINT16_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,gint16,int16_be,UINT16_BE,/* */)
+
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,guint32,uint32_le,UINT32_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,guint32,uint32_be,UINT32_BE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gint32,int32_le,UINT32_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gint32,int32_be,UINT32_BE,/* */)
+
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,guint32,uint24_le,UINT24_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,guint32,uint24_be,UINT24_BE,/* */)
+
+/* fix up the sign for 24-bit signed ints stored in 32-bit signed ints */
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,gint32,int24_le,UINT24_LE,
+    if (val & 0x00800000) val |= 0xff000000;)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,gint32,int24_be,UINT24_BE,
+    if (val & 0x00800000) val |= 0xff000000;)
+
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,guint64,uint64_le,UINT64_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,guint64,uint64_be,UINT64_BE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gint64,int64_le,UINT64_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gint64,int64_be,UINT64_BE,/* */)
+
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gfloat,float32_le,FLOAT_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gfloat,float32_be,FLOAT_BE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gdouble,float64_le,DOUBLE_LE,/* */)
+__GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gdouble,float64_be,DOUBLE_BE,/* */)
+
+#undef __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED /* generator macro is only needed for the instantiations above; do not leak it to includers */
+
+/* Pointer to the byte at the current read position. Neither the position
+ * nor the amount of remaining data is checked. */
+static inline const guint8 *
+gst_byte_reader_peek_data_unchecked (const GstByteReader * reader)
+{
+  const guint8 *current = &reader->data[reader->byte];
+
+  return current;
+}
+
+/* Return a pointer to the data at the current position and then advance
+ * the position by @size bytes. Nothing is copied and no bounds check is
+ * done. */
+static inline const guint8 *
+gst_byte_reader_get_data_unchecked (GstByteReader * reader, guint size)
+{
+  const guint8 *start = reader->data + reader->byte;
+
+  reader->byte += size;
+
+  return start;
+}
+
+/* Copy the next @size bytes into a buffer obtained from g_malloc() and
+ * advance the read position past them. No bounds check is done.
+ *
+ * Returns the newly allocated copy. For @size == 0 the result is whatever
+ * g_malloc (0) yields (NULL per the GLib documentation) and nothing is
+ * copied. */
+static inline guint8 *
+gst_byte_reader_dup_data_unchecked (GstByteReader * reader, guint size)
+{
+  gconstpointer data = gst_byte_reader_get_data_unchecked (reader, size);
+  guint8 *dup_data = (guint8 *) g_malloc (size);
+
+  /* memcpy() requires valid pointers even for a zero length, and
+   * g_malloc (0) returns NULL, so skip the copy for empty requests. */
+  if (size > 0)
+    memcpy (dup_data, data, size);
+  return dup_data;
+}
+
+/* Unchecked variants that should not be used */
+/* Current read position in bytes. */
+static inline guint
+_gst_byte_reader_get_pos_unchecked (const GstByteReader * reader)
+{
+  const guint pos = reader->byte;
+
+  return pos;
+}
+
+/* Number of bytes between the current read position and the end of the
+ * data. */
+static inline guint
+_gst_byte_reader_get_remaining_unchecked (const GstByteReader * reader)
+{
+  const guint total = reader->size;
+  const guint consumed = reader->byte;
+
+  return total - consumed;
+}
+
+/* Total size of the reader's data in bytes. */
+static inline guint
+_gst_byte_reader_get_size_unchecked (const GstByteReader * reader)
+{
+  const guint total = reader->size;
+
+  return total;
+}
+
+/* inlined variants (do not use directly) */
+
+/* Number of bytes left to read. A NULL @reader is rejected by the
+ * g_return_val_if_fail() guard, which returns 0. */
+static inline guint
+_gst_byte_reader_get_remaining_inline (const GstByteReader * reader)
+{
+  guint remaining;
+
+  g_return_val_if_fail (reader != NULL, 0);
+
+  remaining = _gst_byte_reader_get_remaining_unchecked (reader);
+  return remaining;
+}
+
+/* Total data size in bytes. A NULL @reader is rejected by the
+ * g_return_val_if_fail() guard, which returns 0. */
+static inline guint
+_gst_byte_reader_get_size_inline (const GstByteReader * reader)
+{
+  guint total;
+
+  g_return_val_if_fail (reader != NULL, 0);
+
+  total = _gst_byte_reader_get_size_unchecked (reader);
+  return total;
+}
+
+#define __GST_BYTE_READER_GET_PEEK_BITS_INLINE(bits,type,name) /* generates the checked peek/get pair for one value format */ \
+\
+static inline gboolean \
+_gst_byte_reader_peek_##name##_inline (const GstByteReader * reader, type * val) \
+{ \
+  g_return_val_if_fail (reader != NULL, FALSE); \
+  g_return_val_if_fail (val != NULL, FALSE); \
+  \
+  if (_gst_byte_reader_get_remaining_unchecked (reader) < (bits / 8)) /* fewer bytes left than the value needs: *val stays untouched */ \
+    return FALSE; \
+\
+  *val = gst_byte_reader_peek_##name##_unchecked (reader); /* reads without moving the position */ \
+  return TRUE; \
+} \
+\
+static inline gboolean \
+_gst_byte_reader_get_##name##_inline (GstByteReader * reader, type * val) \
+{ \
+  g_return_val_if_fail (reader != NULL, FALSE); \
+  g_return_val_if_fail (val != NULL, FALSE); \
+  \
+  if (_gst_byte_reader_get_remaining_unchecked (reader) < (bits / 8)) /* fewer bytes left than the value needs: nothing is consumed */ \
+    return FALSE; \
+\
+  *val = gst_byte_reader_get_##name##_unchecked (reader); /* reads and advances the position */ \
+  return TRUE; \
+}
+
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(8,guint8,uint8)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(8,gint8,int8)
+
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,guint16,uint16_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,guint16,uint16_be)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,gint16,int16_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,gint16,int16_be)
+
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,guint32,uint32_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,guint32,uint32_be)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gint32,int32_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gint32,int32_be)
+
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,guint32,uint24_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,guint32,uint24_be)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,gint32,int24_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,gint32,int24_be)
+
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,guint64,uint64_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,guint64,uint64_be)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gint64,int64_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gint64,int64_be)
+
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gfloat,float32_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gfloat,float32_be)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gdouble,float64_le)
+__GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gdouble,float64_be)
+
+#undef __GST_BYTE_READER_GET_PEEK_BITS_INLINE
+
+#ifndef GST_BYTE_READER_DISABLE_INLINES
+
+#define gst_byte_reader_init(reader,data,size) \
+    _gst_byte_reader_init_inline(reader,data,size)
+
+#define gst_byte_reader_get_remaining(reader) \
+    _gst_byte_reader_get_remaining_inline(reader)
+
+#define gst_byte_reader_get_size(reader) \
+    _gst_byte_reader_get_size_inline(reader)
+
+#define gst_byte_reader_get_pos(reader) \
+    _gst_byte_reader_get_pos_inline(reader)
+
+/* we use defines here so we can add the G_LIKELY() */
+#define gst_byte_reader_get_uint8(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint8_inline(reader,val))
+#define gst_byte_reader_get_int8(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int8_inline(reader,val))
+#define gst_byte_reader_get_uint16_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint16_le_inline(reader,val))
+#define gst_byte_reader_get_int16_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int16_le_inline(reader,val))
+#define gst_byte_reader_get_uint16_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint16_be_inline(reader,val))
+#define gst_byte_reader_get_int16_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int16_be_inline(reader,val))
+#define gst_byte_reader_get_uint24_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint24_le_inline(reader,val))
+#define gst_byte_reader_get_int24_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int24_le_inline(reader,val))
+#define gst_byte_reader_get_uint24_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint24_be_inline(reader,val))
+#define gst_byte_reader_get_int24_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int24_be_inline(reader,val))
+#define gst_byte_reader_get_uint32_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint32_le_inline(reader,val))
+#define gst_byte_reader_get_int32_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int32_le_inline(reader,val))
+#define gst_byte_reader_get_uint32_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint32_be_inline(reader,val))
+#define gst_byte_reader_get_int32_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int32_be_inline(reader,val))
+#define gst_byte_reader_get_uint64_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint64_le_inline(reader,val))
+#define gst_byte_reader_get_int64_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int64_le_inline(reader,val))
+#define gst_byte_reader_get_uint64_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_uint64_be_inline(reader,val))
+#define gst_byte_reader_get_int64_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_int64_be_inline(reader,val))
+
+#define gst_byte_reader_peek_uint8(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint8_inline(reader,val))
+#define gst_byte_reader_peek_int8(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int8_inline(reader,val))
+#define gst_byte_reader_peek_uint16_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint16_le_inline(reader,val))
+#define gst_byte_reader_peek_int16_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int16_le_inline(reader,val))
+#define gst_byte_reader_peek_uint16_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint16_be_inline(reader,val))
+#define gst_byte_reader_peek_int16_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int16_be_inline(reader,val))
+#define gst_byte_reader_peek_uint24_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint24_le_inline(reader,val))
+#define gst_byte_reader_peek_int24_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int24_le_inline(reader,val))
+#define gst_byte_reader_peek_uint24_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint24_be_inline(reader,val))
+#define gst_byte_reader_peek_int24_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int24_be_inline(reader,val))
+#define gst_byte_reader_peek_uint32_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint32_le_inline(reader,val))
+#define gst_byte_reader_peek_int32_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int32_le_inline(reader,val))
+#define gst_byte_reader_peek_uint32_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint32_be_inline(reader,val))
+#define gst_byte_reader_peek_int32_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int32_be_inline(reader,val))
+#define gst_byte_reader_peek_uint64_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint64_le_inline(reader,val))
+#define gst_byte_reader_peek_int64_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int64_le_inline(reader,val))
+#define gst_byte_reader_peek_uint64_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_uint64_be_inline(reader,val))
+#define gst_byte_reader_peek_int64_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_int64_be_inline(reader,val))
+
+#define gst_byte_reader_get_float32_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_float32_le_inline(reader,val))
+#define gst_byte_reader_get_float32_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_float32_be_inline(reader,val))
+#define gst_byte_reader_get_float64_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_float64_le_inline(reader,val))
+#define gst_byte_reader_get_float64_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_get_float64_be_inline(reader,val))
+#define gst_byte_reader_peek_float32_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_float32_le_inline(reader,val))
+#define gst_byte_reader_peek_float32_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_float32_be_inline(reader,val))
+#define gst_byte_reader_peek_float64_le(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_float64_le_inline(reader,val))
+#define gst_byte_reader_peek_float64_be(reader,val) \
+    G_LIKELY(_gst_byte_reader_peek_float64_be_inline(reader,val))
+
+#endif /* GST_BYTE_READER_DISABLE_INLINES */
+
+/* Attach @reader to the @size bytes starting at @data and rewind its
+ * read position to the first byte. */
+static inline void
+_gst_byte_reader_init_inline (GstByteReader * reader, const guint8 * data, guint size)
+{
+  g_return_if_fail (reader != NULL);
+
+  reader->byte = 0;
+  reader->size = size;
+  reader->data = data;
+}
+
+/* Set @sub_reader up as a window over the next @size bytes of @reader
+ * without moving @reader. Returns FALSE when fewer than @size bytes are
+ * left, in which case @sub_reader is not touched. */
+static inline gboolean
+_gst_byte_reader_peek_sub_reader_inline (GstByteReader * reader,
+    GstByteReader * sub_reader, guint size)
+{
+  g_return_val_if_fail (reader != NULL, FALSE);
+  g_return_val_if_fail (sub_reader != NULL, FALSE);
+
+  if (size <= _gst_byte_reader_get_remaining_unchecked (reader)) {
+    /* data is derived from reader->byte, so it is assigned first */
+    sub_reader->data = reader->data + reader->byte;
+    sub_reader->byte = 0;
+    sub_reader->size = size;
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+/* Like the peek variant, but on success also advances @reader past the
+ * @size bytes handed to @sub_reader. */
+static inline gboolean
+_gst_byte_reader_get_sub_reader_inline (GstByteReader * reader,
+    GstByteReader * sub_reader, guint size)
+{
+  if (_gst_byte_reader_peek_sub_reader_inline (reader, sub_reader, size)) {
+    gst_byte_reader_skip_unchecked (reader, size);
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+/* Bounds-checked front end for gst_byte_reader_dup_data_unchecked():
+ * stores its result in *@val and returns TRUE, or returns FALSE (leaving
+ * *@val alone) when @size bytes are not available. */
+static inline gboolean
+_gst_byte_reader_dup_data_inline (GstByteReader * reader, guint size, guint8 ** val)
+{
+  g_return_val_if_fail (reader != NULL, FALSE);
+  g_return_val_if_fail (val != NULL, FALSE);
+
+  if (G_UNLIKELY (size > reader->size))
+    return FALSE;
+  if (G_UNLIKELY (_gst_byte_reader_get_remaining_unchecked (reader) < size))
+    return FALSE;
+
+  *val = gst_byte_reader_dup_data_unchecked (reader, size);
+  return TRUE;
+}
+
+/* Bounds-checked front end for gst_byte_reader_get_data_unchecked():
+ * stores its result in *@val and returns TRUE, or returns FALSE (leaving
+ * *@val alone) when @size bytes are not available. */
+static inline gboolean
+_gst_byte_reader_get_data_inline (GstByteReader * reader, guint size, const guint8 ** val)
+{
+  g_return_val_if_fail (reader != NULL, FALSE);
+  g_return_val_if_fail (val != NULL, FALSE);
+
+  if (G_UNLIKELY (size > reader->size))
+    return FALSE;
+  if (G_UNLIKELY (_gst_byte_reader_get_remaining_unchecked (reader) < size))
+    return FALSE;
+
+  *val = gst_byte_reader_get_data_unchecked (reader, size);
+  return TRUE;
+}
+
+/* Bounds-checked front end for gst_byte_reader_peek_data_unchecked():
+ * stores its result in *@val and returns TRUE, or returns FALSE (leaving
+ * *@val alone) when @size bytes are not available. */
+static inline gboolean
+_gst_byte_reader_peek_data_inline (const GstByteReader * reader, guint size, const guint8 ** val)
+{
+  g_return_val_if_fail (reader != NULL, FALSE);
+  g_return_val_if_fail (val != NULL, FALSE);
+
+  if (G_UNLIKELY (size > reader->size))
+    return FALSE;
+  if (G_UNLIKELY (_gst_byte_reader_get_remaining_unchecked (reader) < size))
+    return FALSE;
+
+  *val = gst_byte_reader_peek_data_unchecked (reader);
+  return TRUE;
+}
+
+/* NULL-guarded wrapper that forwards to
+ * _gst_byte_reader_get_pos_unchecked() and returns its result. */
+static inline guint
+_gst_byte_reader_get_pos_inline (const GstByteReader * reader)
+{
+  g_return_val_if_fail (reader != NULL, 0);
+
+  return _gst_byte_reader_get_pos_unchecked (reader);
+}
+
+/* Advance @reader by @nbytes. Returns FALSE, without moving, when fewer
+ * than @nbytes bytes remain. */
+static inline gboolean
+_gst_byte_reader_skip_inline (GstByteReader * reader, guint nbytes)
+{
+  g_return_val_if_fail (reader != NULL, FALSE);
+
+  if (G_LIKELY (nbytes <= _gst_byte_reader_get_remaining_unchecked (reader))) {
+    reader->byte += nbytes;
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+#ifndef GST_BYTE_READER_DISABLE_INLINES
+
+/* Unless inlines are disabled, map the public dup/get/peek/skip names
+ * onto the static inline implementations defined above. Each call is
+ * wrapped in G_LIKELY, i.e. success is hinted as the expected result. */
+#define gst_byte_reader_dup_data(reader,size,val) \
+    G_LIKELY(_gst_byte_reader_dup_data_inline(reader,size,val))
+#define gst_byte_reader_get_data(reader,size,val) \
+    G_LIKELY(_gst_byte_reader_get_data_inline(reader,size,val))
+#define gst_byte_reader_peek_data(reader,size,val) \
+    G_LIKELY(_gst_byte_reader_peek_data_inline(reader,size,val))
+#define gst_byte_reader_skip(reader,nbytes) \
+    G_LIKELY(_gst_byte_reader_skip_inline(reader,nbytes))
+
+#endif /* GST_BYTE_READER_DISABLE_INLINES */
+
+G_END_DECLS
+
+#endif /* __GST_BYTE_READER_H__ */
@@ -0,0 +1,9 @@
+/* Stub for <gst/codecparsers/codecparsers-prelude.h>.
+ * Same shape as base-prelude.h: drop the GObject boilerplate and define
+ * the GST_CODEC_PARSERS_API macro to nothing.
+ */
+#ifndef LIBVA_V4L2_REQUEST_FOURIER_CODECPARSERS_PRELUDE_STUB
+#define LIBVA_V4L2_REQUEST_FOURIER_CODECPARSERS_PRELUDE_STUB
+#include "gst_compat.h"
+#define GST_CODEC_PARSERS_API
+#endif
@@ -0,0 +1,545 @@
+/* Gstreamer
+ * Copyright (C) <2011> Intel Corporation
+ * Copyright (C) <2011> Collabora Ltd.
+ * Copyright (C) <2011> Thibault Saunier <thibault.saunier@collabora.com>
+ *
+ * Some bits C-c,C-v'ed and s/4/3 from h264parse and videoparsers/h264parse.c:
+ *    Copyright (C) <2010> Mark Nauwelaerts <mark.nauwelaerts@collabora.co.uk>
+ *    Copyright (C) <2010> Collabora Multimedia
+ *    Copyright (C) <2010> Nokia Corporation
+ *
+ *    (C) 2005 Michal Benes <michal.benes@itonis.tv>
+ *    (C) 2008 Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * Common code for NAL parsing from h264 and h265 parsers.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "nalutils.h"
+
+/****** Nal parser ******/
+
+/* Reset @nr so that it reads the @size bytes at @data from the start,
+ * with an empty bit cache and a zeroed emulation-prevention counter. */
+void
+nal_reader_init (NalReader * nr, const guint8 * data, guint size)
+{
+  /* input window */
+  nr->data = data;
+  nr->size = size;
+
+  /* read position and counters */
+  nr->byte = 0;
+  nr->n_epb = 0;
+  nr->bits_in_cache = 0;
+
+  /* fill with something other than 0 to detect emulation prevention bytes */
+  nr->cache = 0xff;
+  nr->epb_cache = 0xff;
+  nr->first_byte = 0xff;
+}
+
+/* Make sure at least @nbits bits are buffered in @nr, fetching further
+ * input bytes as needed. Bytes identified as emulation prevention bytes
+ * are counted in n_epb and not added to the cache. Returns FALSE when
+ * the input cannot supply the requested bits. */
+gboolean
+nal_reader_read (NalReader * nr, guint nbits)
+{
+  /* up-front bound: the bits already fetched plus the bits still missing
+     must not exceed the size of the input in bits */
+  if (G_UNLIKELY (nr->byte * 8 + (nbits - nr->bits_in_cache) > nr->size * 8)) {
+    GST_DEBUG ("Can not read %u bits, bits in cache %u, Byte * 8 %u, size in "
+        "bits %u", nbits, nr->bits_in_cache, nr->byte * 8, nr->size * 8);
+    return FALSE;
+  }
+
+  while (nr->bits_in_cache < nbits) {
+    guint8 byte;
+
+  next_byte:
+    /* re-checked per byte because skipped emulation prevention bytes
+       consume input without adding bits to the cache */
+    if (G_UNLIKELY (nr->byte >= nr->size))
+      return FALSE;
+
+    byte = nr->data[nr->byte++];
+    nr->epb_cache = (nr->epb_cache << 8) | byte;
+
+    /* check if the byte is a emulation_prevention_three_byte */
+    if ((nr->epb_cache & 0xffffff) == 0x3) {
+      /* the last three bytes were 00 00 03: count the 03 and fetch the
+         next byte in its place */
+      nr->n_epb++;
+      goto next_byte;
+    }
+    /* the previously fetched byte moves into the cache; the new byte
+       waits in first_byte */
+    nr->cache = (nr->cache << 8) | nr->first_byte;
+    nr->first_byte = byte;
+    nr->bits_in_cache += 8;
+  }
+
+  return TRUE;
+}
+
+/* Drop @nbits bits from the stream. Only valid for bit counts that fit
+   in the reader's cache (asserted); use nal_reader_skip_long() for
+   arbitrary counts. Returns FALSE when the bits are not available. */
+gboolean
+nal_reader_skip (NalReader * nr, guint nbits)
+{
+  g_assert (nbits <= 8 * sizeof (nr->cache));
+
+  if (G_LIKELY (nal_reader_read (nr, nbits))) {
+    nr->bits_in_cache -= nbits;
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+/* Skip an arbitrary number of bits by issuing a series of cache-sized
+   nal_reader_skip() calls: first the remainder modulo the chunk size,
+   then whole chunks. Returns FALSE as soon as one of them fails. */
+gboolean
+nal_reader_skip_long (NalReader * nr, guint nbits)
+{
+  /* Leave out enough bits in the cache once we are finished */
+  const guint skip_size = 4 * sizeof (nr->cache);
+  guint chunk = nbits % skip_size;
+  guint left;
+
+  for (left = nbits; left > 0; left -= chunk, chunk = skip_size) {
+    if (!nal_reader_skip (nr, chunk))
+      return FALSE;
+  }
+
+  return TRUE;
+}
+
+/* Current read position in bits: bytes fetched from the input so far
+ * (times 8) minus the bits still waiting in the cache. The byte counter
+ * also advances over skipped emulation prevention bytes. */
+guint
+nal_reader_get_pos (const NalReader * nr)
+{
+  return nr->byte * 8 - nr->bits_in_cache;
+}
+
+/* Number of bits not yet consumed: the bytes not yet fetched from the
+ * input (times 8) plus the bits still waiting in the cache. */
+guint
+nal_reader_get_remaining (const NalReader * nr)
+{
+  return (nr->size - nr->byte) * 8 + nr->bits_in_cache;
+}
+
+/* Number of emulation prevention bytes skipped by this reader so far. */
+guint
+nal_reader_get_epb_count (const NalReader * nr)
+{
+  return nr->n_epb;
+}
+
+/* Generates nal_reader_get_bits_uint<bits>(): reads @nbits bits from @nr
+ * into *@val and consumes them. The value is assembled from first_byte
+ * (the newest fetched byte) and the older bytes held in cache, then
+ * masked down to @nbits bits. Returns FALSE when nal_reader_read()
+ * cannot supply the bits. */
+#define NAL_READER_READ_BITS(bits) \
+gboolean \
+nal_reader_get_bits_uint##bits (NalReader *nr, guint##bits *val, guint nbits) \
+{ \
+  guint shift; \
+  \
+  if (!nal_reader_read (nr, nbits)) \
+    return FALSE; \
+  \
+  /* bring the required bits down and truncate */ \
+  shift = nr->bits_in_cache - nbits; \
+  *val = nr->first_byte >> shift; \
+  \
+  *val |= nr->cache << (8 - shift); \
+  /* mask out required bits */ \
+  if (nbits < bits) \
+    *val &= ((guint##bits)1 << nbits) - 1; \
+  \
+  nr->bits_in_cache = shift; \
+  \
+  return TRUE; \
+} \
+
+NAL_READER_READ_BITS (8);
+NAL_READER_READ_BITS (16);
+NAL_READER_READ_BITS (32);
+
+/* Generates nal_reader_peek_bits_uint<bits>(): same as the get variant
+ * but run on a by-value copy of the reader, so @nr itself is left
+ * unchanged. */
+#define NAL_READER_PEEK_BITS(bits) \
+gboolean \
+nal_reader_peek_bits_uint##bits (const NalReader *nr, guint##bits *val, guint nbits) \
+{ \
+  NalReader tmp; \
+  \
+  tmp = *nr; \
+  return nal_reader_get_bits_uint##bits (&tmp, val, nbits); \
+}
+
+NAL_READER_PEEK_BITS (8);
+
+/* Read one unsigned Exp-Golomb code (ue(v)) from @nr.
+ *
+ * Counts the leading zero bits up to the first 1 bit, reads that many
+ * further bits as the suffix and stores 2^zeros - 1 + suffix in *@val.
+ *
+ * Returns FALSE when the stream runs out of bits or when more than 31
+ * leading zeros are seen (the value would not fit in 32 bits); *@val is
+ * left untouched in that case.
+ */
+gboolean
+nal_reader_get_ue (NalReader * nr, guint32 * val)
+{
+  guint i = 0;
+  guint8 bit;
+  guint32 value;
+
+  if (G_UNLIKELY (!nal_reader_get_bits_uint8 (nr, &bit, 1)))
+    return FALSE;
+
+  while (bit == 0) {
+    i++;
+    if (G_UNLIKELY (!nal_reader_get_bits_uint8 (nr, &bit, 1)))
+      return FALSE;
+  }
+
+  if (G_UNLIKELY (i > 31))
+    return FALSE;
+
+  if (G_UNLIKELY (!nal_reader_get_bits_uint32 (nr, &value, i)))
+    return FALSE;
+
+  /* shift an unsigned one: with i == 31, (1 << i) on a signed int
+     overflows, which is undefined behaviour */
+  *val = ((guint32) 1 << i) - 1 + value;
+
+  return TRUE;
+}
+
+/* Read one signed Exp-Golomb code (se(v)) from @nr: an odd ue(v) code k
+ * maps to k / 2 + 1, an even one to -(k / 2). Returns FALSE when the
+ * underlying ue(v) read fails. */
+gboolean
+nal_reader_get_se (NalReader * nr, gint32 * val)
+{
+  guint32 code;
+  guint32 half;
+
+  if (G_UNLIKELY (!nal_reader_get_ue (nr, &code)))
+    return FALSE;
+
+  half = code / 2;
+  *val = (code % 2) ? half + 1 : -half;
+
+  return TRUE;
+}
+
+/* TRUE when no bits are pending in the cache. */
+gboolean
+nal_reader_is_byte_aligned (NalReader * nr)
+{
+  return (nr->bits_in_cache == 0) ? TRUE : FALSE;
+}
+
+/* more_rbsp_data() test: returns TRUE when payload bits remain before
+ * the RBSP trailing bits, FALSE when only the stop bit and zero padding
+ * are left (or when nothing is left or a read fails). Works on a copy of
+ * the reader, so the caller's position is not changed. */
+gboolean
+nal_reader_has_more_data (NalReader * nr)
+{
+  NalReader nr_tmp;
+  guint remaining, nbits;
+  guint8 rbsp_stop_one_bit, zero_bits;
+
+  remaining = nal_reader_get_remaining (nr);
+  if (remaining == 0)
+    return FALSE;
+
+  /* from here on all reads go through the local copy */
+  nr_tmp = *nr;
+  nr = &nr_tmp;
+
+  /* The spec defines that more_rbsp_data() searches for the last bit
+     equal to 1, and that it is the rbsp_stop_one_bit. Subsequent bits
+     until byte boundary is reached shall be zero.
+
+     This means that more_rbsp_data() is FALSE if the next bit is 1
+     and the remaining bits until byte boundary are zero. One way to
+     be sure that this bit was the very last one, is that every other
+     bit after we reached byte boundary are also set to zero.
+     Otherwise, if the next bit is 0 or if there are non-zero bits
+     afterwards, then we have more_rbsp_data() */
+  if (!nal_reader_get_bits_uint8 (nr, &rbsp_stop_one_bit, 1))
+    return FALSE;
+  if (!rbsp_stop_one_bit)
+    return TRUE;
+
+  /* first chunk covers the bits up to the next byte boundary, later
+     chunks are whole bytes */
+  nbits = --remaining % 8;
+  while (remaining > 0) {
+    if (!nal_reader_get_bits_uint8 (nr, &zero_bits, nbits))
+      return FALSE;
+    if (zero_bits != 0)
+      return TRUE;
+    remaining -= nbits;
+    nbits = 8;
+  }
+  return FALSE;
+}
+
+/***********  end of nal parser ***************/
+
+/* Search the @size bytes at @data for a 00 00 01 start code followed by
+ * at least one more byte (the mask ignores the fourth byte's value).
+ * Returns whatever gst_byte_reader_masked_scan_uint32() reports for that
+ * pattern. */
+gint
+scan_for_start_codes (const guint8 * data, guint size)
+{
+  GstByteReader br;
+  gst_byte_reader_init (&br, data, size);
+
+  /* NALU not empty, so we can at least expect 1 (even 2) bytes following sc */
+  return gst_byte_reader_masked_scan_uint32 (&br, 0xffffff00, 0x00000100,
+      0, size);
+}
+
+/* Initialise @nw for writing one NAL unit.
+ *
+ * @nal_prefix_size: number of prefix bytes placed in front of the
+ *   payload; must be 2..4 when @packetized is set, 3 or 4 otherwise.
+ * @packetized: when set, nal_writer_create_nal_data() stores the payload
+ *   length in the prefix bytes instead of a start code.
+ */
+void
+nal_writer_init (NalWriter * nw, guint nal_prefix_size, gboolean packetized)
+{
+  g_return_if_fail (nw != NULL);
+  g_return_if_fail ((packetized && nal_prefix_size > 1 && nal_prefix_size < 5)
+      || (!packetized && (nal_prefix_size == 3 || nal_prefix_size == 4)));
+
+  gst_bit_writer_init (&nw->bw);
+  nw->nal_prefix_size = nal_prefix_size;
+  nw->packetized = packetized;
+}
+
+/* Reset the embedded bit writer, then zero the whole NalWriter
+ * (including nal_prefix_size and packetized). */
+void
+nal_writer_reset (NalWriter * nw)
+{
+  g_return_if_fail (nw != NULL);
+
+  gst_bit_writer_reset (&nw->bw);
+  memset (nw, 0, sizeof (NalWriter));
+}
+
+/* Append the RBSP trailing bits: a single 1 bit, then zero bits up to
+ * the next byte boundary. Returns FALSE (after logging a warning) when
+ * either write fails. */
+gboolean
+nal_writer_do_rbsp_trailing_bits (NalWriter * nw)
+{
+  g_return_val_if_fail (nw != NULL, FALSE);
+
+  if (gst_bit_writer_put_bits_uint8 (&nw->bw, 1, 1)) {
+    if (gst_bit_writer_align_bytes (&nw->bw, 0))
+      return TRUE;
+
+    GST_WARNING ("Cannot put align bits");
+    return FALSE;
+  }
+
+  GST_WARNING ("Cannot put trailing bits");
+  return FALSE;
+}
+
+/* Build the final NAL byte string from the bits accumulated in @nw:
+ * a prefix of nal_prefix_size bytes, followed by the payload with
+ * emulation prevention bytes inserted. In packetized mode the prefix is
+ * then overwritten with the big-endian length of what follows it.
+ *
+ * Stores the total byte count in *@ret_size and returns the buffer
+ * obtained from gst_bit_writer_reset_and_get_data().
+ */
+static gpointer
+nal_writer_create_nal_data (NalWriter * nw, guint32 * ret_size)
+{
+  GstBitWriter bw;
+  gint i;
+  guint8 *src, *dst;
+  gsize size;
+  gpointer data;
+
+  /* scan to put emulation_prevention_three_byte */
+  size = GST_BIT_WRITER_BIT_SIZE (&nw->bw) >> 3;
+  src = GST_BIT_WRITER_DATA (&nw->bw);
+
+  /* prefix: zero bytes followed by a single 0x01 (start-code shape) */
+  gst_bit_writer_init_with_size (&bw, size + nw->nal_prefix_size, FALSE);
+  for (i = 0; i < nw->nal_prefix_size - 1; i++)
+    gst_bit_writer_put_bits_uint8 (&bw, 0, 8);
+  gst_bit_writer_put_bits_uint8 (&bw, 1, 8);
+
+  for (i = 0; i < size; i++) {
+    guint pos = (GST_BIT_WRITER_BIT_SIZE (&bw) >> 3);
+    /* re-read the data pointer on every pass rather than caching it */
+    dst = GST_BIT_WRITER_DATA (&bw);
+    /* two zero payload bytes followed by a byte <= 3 would look like a
+       start code, so emit a 0x03 first; the pos test keeps the prefix
+       bytes out of the comparison */
+    if (pos >= nw->nal_prefix_size + 2 &&
+        dst[pos - 2] == 0 && dst[pos - 1] == 0 && src[i] <= 0x3) {
+      gst_bit_writer_put_bits_uint8 (&bw, 0x3, 8);
+    }
+
+    gst_bit_writer_put_bits_uint8 (&bw, src[i], 8);
+  }
+
+  *ret_size = bw.bit_size >> 3;
+  data = gst_bit_writer_reset_and_get_data (&bw);
+
+  if (nw->packetized) {
+    /* length of everything after the prefix */
+    size = *ret_size - nw->nal_prefix_size;
+
+    switch (nw->nal_prefix_size) {
+      case 1:
+        GST_WRITE_UINT8 (data, size);
+        break;
+      case 2:
+        GST_WRITE_UINT16_BE (data, size);
+        break;
+      case 3:
+        GST_WRITE_UINT24_BE (data, size);
+        break;
+      case 4:
+        GST_WRITE_UINT32_BE (data, size);
+        break;
+      default:
+        g_assert_not_reached ();
+        break;
+    }
+  }
+
+  return data;
+}
+
+/* Finish the NAL unit held in @nw and return it wrapped in a GstMemory
+ * (created with g_free as the release callback for the buffer).
+ *
+ * If the written stream is not byte aligned, RBSP trailing bits are
+ * appended first. Returns NULL when nothing was written or a step
+ * fails. The bit writer inside @nw is reset on every path past the
+ * NULL check.
+ */
+GstMemory *
+nal_writer_reset_and_get_memory (NalWriter * nw)
+{
+  guint32 size = 0;
+  GstMemory *ret = NULL;
+  gpointer data;
+
+  g_return_val_if_fail (nw != NULL, NULL);
+
+  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) >> 3) == 0) {
+    GST_WARNING ("No written byte");
+    goto done;
+  }
+
+  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) & 0x7) != 0) {
+    GST_WARNING ("Written stream is not byte aligned");
+    if (!nal_writer_do_rbsp_trailing_bits (nw))
+      goto done;
+  }
+
+  data = nal_writer_create_nal_data (nw, &size);
+  if (!data) {
+    GST_WARNING ("Failed to create nal data");
+    goto done;
+  }
+
+  ret = gst_memory_new_wrapped (0, data, size, 0, size, data, g_free);
+
+done:
+  gst_bit_writer_reset (&nw->bw);
+
+  return ret;
+}
+
+/* Finish the NAL unit held in @nw and return it as a raw buffer, with
+ * its length stored in *@ret_size.
+ *
+ * If the written stream is not byte aligned, RBSP trailing bits are
+ * appended first. Returns NULL with *@ret_size == 0 when nothing was
+ * written or a step fails. The bit writer inside @nw is reset on every
+ * path past the argument checks.
+ */
+guint8 *
+nal_writer_reset_and_get_data (NalWriter * nw, guint32 * ret_size)
+{
+  guint32 size = 0;
+  guint8 *data = NULL;
+
+  g_return_val_if_fail (nw != NULL, NULL);
+  g_return_val_if_fail (ret_size != NULL, NULL);
+
+  *ret_size = 0;
+
+  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) >> 3) == 0) {
+    GST_WARNING ("No written byte");
+    goto done;
+  }
+
+  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) & 0x7) != 0) {
+    GST_WARNING ("Written stream is not byte aligned");
+    if (!nal_writer_do_rbsp_trailing_bits (nw))
+      goto done;
+  }
+
+  data = nal_writer_create_nal_data (nw, &size);
+  if (!data) {
+    GST_WARNING ("Failed to create nal data");
+    goto done;
+  }
+
+  *ret_size = size;
+
+done:
+  gst_bit_writer_reset (&nw->bw);
+
+  return data;
+}
+
+/* Append the low @nbits bits of @value to the NAL payload. Returns TRUE
+ * when the underlying bit writer accepted them. */
+gboolean
+nal_writer_put_bits_uint8 (NalWriter * nw, guint8 value, guint nbits)
+{
+  g_return_val_if_fail (nw != NULL, FALSE);
+
+  return gst_bit_writer_put_bits_uint8 (&nw->bw, value, nbits) ? TRUE : FALSE;
+}
+
+/* Append the low @nbits bits of @value to the NAL payload. Returns TRUE
+ * when the underlying bit writer accepted them. */
+gboolean
+nal_writer_put_bits_uint16 (NalWriter * nw, guint16 value, guint nbits)
+{
+  g_return_val_if_fail (nw != NULL, FALSE);
+
+  return gst_bit_writer_put_bits_uint16 (&nw->bw, value, nbits) ? TRUE : FALSE;
+}
+
+/* Append the low @nbits bits of @value to the NAL payload. Returns TRUE
+ * when the underlying bit writer accepted them. */
+gboolean
+nal_writer_put_bits_uint32 (NalWriter * nw, guint32 value, guint nbits)
+{
+  g_return_val_if_fail (nw != NULL, FALSE);
+
+  return gst_bit_writer_put_bits_uint32 (&nw->bw, value, nbits) ? TRUE : FALSE;
+}
+
+/* Append @nbytes whole bytes from @data to the NAL payload. @data must
+ * be non-NULL and @nbytes non-zero. Returns TRUE when the underlying bit
+ * writer accepted them. */
+gboolean
+nal_writer_put_bytes (NalWriter * nw, const guint8 * data, guint nbytes)
+{
+  g_return_val_if_fail (nw != NULL, FALSE);
+  g_return_val_if_fail (data != NULL, FALSE);
+  g_return_val_if_fail (nbytes != 0, FALSE);
+
+  return gst_bit_writer_put_bytes (&nw->bw, data, nbytes) ? TRUE : FALSE;
+}
+
+/* Write @value as an unsigned Exp-Golomb code: the zero prefix computed
+ * by count_exp_golomb_bits(), then value + 1 in the reported number of
+ * bits. Returns FALSE as soon as one of the writes fails. */
+gboolean
+nal_writer_put_ue (NalWriter * nw, guint32 value)
+{
+  guint zeros;
+  guint width;
+
+  g_return_val_if_fail (nw != NULL, FALSE);
+
+  count_exp_golomb_bits (value, &zeros, &width);
+
+  /* zero prefix, skipped entirely when there is none */
+  if (zeros != 0 && !nal_writer_put_bits_uint32 (nw, 0, zeros))
+    return FALSE;
+
+  /* value + 1 in its natural bit length */
+  return nal_writer_put_bits_uint32 (nw, value + 1, width) ? TRUE : FALSE;
+}
+
+/* Work out the layout of the ue(v) code for @value: *@leading_zeros
+ * receives the number of zero prefix bits and *@rest the bit length of
+ * value + 1. Either output pointer may be NULL. Always returns TRUE.
+ *
+ * value + 1 is computed in 32 bits, so the maximum guint32 wraps to 0
+ * and both outputs come out as 0. */
+gboolean
+count_exp_golomb_bits (guint32 value, guint * leading_zeros, guint * rest)
+{
+  guint nbits = 0;
+  guint32 v;
+
+  /* https://en.wikipedia.org/wiki/Exponential-Golomb_coding */
+  /* bit length of value + 1 */
+  for (v = value + 1; v != 0; v >>= 1)
+    nbits++;
+
+  if (leading_zeros)
+    *leading_zeros = (nbits > 1) ? nbits - 1 : 0;
+
+  if (rest)
+    *rest = nbits;
+
+  return TRUE;
+}
@@ -0,0 +1,269 @@
+/* Gstreamer
+ * Copyright (C) <2011> Intel Corporation
+ * Copyright (C) <2011> Collabora Ltd.
+ * Copyright (C) <2011> Thibault Saunier <thibault.saunier@collabora.com>
+ *
+ * Some bits C-c,C-v'ed and s/4/3 from h264parse and videoparsers/h264parse.c:
+ *    Copyright (C) <2010> Mark Nauwelaerts <mark.nauwelaerts@collabora.co.uk>
+ *    Copyright (C) <2010> Collabora Multimedia
+ *    Copyright (C) <2010> Nokia Corporation
+ *
+ *    (C) 2005 Michal Benes <michal.benes@itonis.tv>
+ *    (C) 2008 Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/**
+ * Common code for NAL parsing from h264 and h265 parsers.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include <gst/base/gstbytereader.h>
+#include <gst/base/gstbitreader.h>
+#include <gst/base/gstbitwriter.h>
+
+/* Bit reader over a NAL unit that skips emulation prevention bytes
+ * (the 03 in a 00 00 03 sequence) while reading. */
+typedef struct
+{
+  const guint8 *data;           /* Input buffer */
+  guint size;                   /* Size of the input buffer in bytes */
+
+  guint n_epb;                  /* Number of emulation prevention bytes */
+  guint byte;                   /* Byte position */
+  guint bits_in_cache;          /* bitpos in the cache of next bit */
+  guint8 first_byte;            /* Newest fetched byte, not yet shifted into cache */
+  guint32 epb_cache;            /* cache 3 bytes to check emulation prevention bytes */
+  guint64 cache;                /* cached bytes */
+} NalReader;
+
+/* Writer state for building one NAL unit on top of a GstBitWriter. */
+typedef struct
+{
+  GstBitWriter bw;              /* Accumulates the NAL payload bits */
+
+  guint nal_prefix_size;        /* Number of prefix bytes in front of the payload */
+  gboolean packetized;          /* Prefix carries the payload length instead of a start code */
+} NalWriter;
+
+G_GNUC_INTERNAL
+void nal_reader_init (NalReader * nr, const guint8 * data, guint size);
+
+G_GNUC_INTERNAL
+gboolean nal_reader_read (NalReader * nr, guint nbits);
+
+G_GNUC_INTERNAL
+gboolean nal_reader_skip (NalReader * nr, guint nbits);
+
+G_GNUC_INTERNAL
+gboolean nal_reader_skip_long (NalReader * nr, guint nbits);
+
+G_GNUC_INTERNAL
+guint nal_reader_get_pos (const NalReader * nr);
+
+G_GNUC_INTERNAL
+guint nal_reader_get_remaining (const NalReader * nr);
+
+G_GNUC_INTERNAL
+guint nal_reader_get_epb_count (const NalReader * nr);
+
+G_GNUC_INTERNAL
+gboolean nal_reader_is_byte_aligned (NalReader * nr);
+
+G_GNUC_INTERNAL
+gboolean nal_reader_has_more_data (NalReader * nr);
+
+#define NAL_READER_READ_BITS_H(bits) \
+G_GNUC_INTERNAL \
+gboolean nal_reader_get_bits_uint##bits (NalReader *nr, guint##bits *val, guint nbits)
+
+NAL_READER_READ_BITS_H (8);
+NAL_READER_READ_BITS_H (16);
+NAL_READER_READ_BITS_H (32);
+
+#define NAL_READER_PEEK_BITS_H(bits) \
+G_GNUC_INTERNAL \
+gboolean nal_reader_peek_bits_uint##bits (const NalReader *nr, guint##bits *val, guint nbits)
+
+NAL_READER_PEEK_BITS_H (8);
+
+G_GNUC_INTERNAL
+gboolean nal_reader_get_ue (NalReader * nr, guint32 * val);
+
+G_GNUC_INTERNAL
+gboolean nal_reader_get_se (NalReader * nr, gint32 * val);
+
+/* Range-check helpers. On violation they log a warning and jump to an
+ * `error` label, which the enclosing function must provide. The
+ * _WITH_DEBUG forms take the name to print as a string literal; the
+ * short forms stringify the @val expression itself. */
+#define CHECK_ALLOWED_MAX_WITH_DEBUG(dbg, val, max) { \
+  if (val > max) { \
+    GST_WARNING ("value for '" dbg "' greater than max. value: %d, max %d", \
+                     val, max); \
+    goto error; \
+  } \
+}
+#define CHECK_ALLOWED_MAX(val, max) \
+  CHECK_ALLOWED_MAX_WITH_DEBUG (G_STRINGIFY (val), val, max)
+
+#define CHECK_ALLOWED_WITH_DEBUG(dbg, val, min, max) { \
+  if (val < min || val > max) { \
+    GST_WARNING ("value for '" dbg "' not in allowed range. value: %d, range %d-%d", \
+                     val, min, max); \
+    goto error; \
+  } \
+}
+#define CHECK_ALLOWED(val, min, max) \
+  CHECK_ALLOWED_WITH_DEBUG (G_STRINGIFY (val), val, min, max)
+
+/* Fixed-width read helpers: read @nbits bits from @nr into the lvalue
+ * @val; on failure log a warning and jump to the enclosing function's
+ * `error` label. */
+#define READ_UINT8(nr, val, nbits) { \
+  if (!nal_reader_get_bits_uint8 (nr, &val, nbits)) { \
+    GST_WARNING ("failed to read uint8 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
+    goto error; \
+  } \
+}
+
+#define READ_UINT16(nr, val, nbits) { \
+  if (!nal_reader_get_bits_uint16 (nr, &val, nbits)) { \
+  GST_WARNING ("failed to read uint16 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
+    goto error; \
+  } \
+}
+
+#define READ_UINT32(nr, val, nbits) { \
+  if (!nal_reader_get_bits_uint32 (nr, &val, nbits)) { \
+  GST_WARNING ("failed to read uint32 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
+    goto error; \
+  } \
+}
+
+/* 64-bit counterpart of READ_UINT8/16/32: read @nbits bits from @nr into
+ * the lvalue @val, or log a warning and jump to the enclosing function's
+ * `error` label.
+ * NOTE(review): this header only declares the 8/16/32-bit
+ * nal_reader_get_bits_uint* readers, so expanding this macro needs a
+ * nal_reader_get_bits_uint64() provided elsewhere. */
+#define READ_UINT64(nr, val, nbits) { \
+  if (!nal_reader_get_bits_uint64 (nr, &val, nbits)) { \
+    GST_WARNING ("failed to read uint64 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
+    goto error; \
+  } \
+}
+
+/* Exp-Golomb read helpers. READ_UE / READ_SE read straight into the
+ * lvalue @val; the _ALLOWED / _MAX forms read into a local temporary,
+ * range-check it and only then assign it to @val. Every failure logs a
+ * warning and jumps to the enclosing function's `error` label. */
+#define READ_UE(nr, val) { \
+  if (!nal_reader_get_ue (nr, &val)) { \
+    GST_WARNING ("failed to read UE for '" G_STRINGIFY (val) "'"); \
+    goto error; \
+  } \
+}
+
+#define READ_UE_ALLOWED(nr, val, min, max) { \
+  guint32 tmp; \
+  READ_UE (nr, tmp); \
+  CHECK_ALLOWED_WITH_DEBUG (G_STRINGIFY (val), tmp, min, max); \
+  val = tmp; \
+}
+
+#define READ_UE_MAX(nr, val, max) { \
+  guint32 tmp; \
+  READ_UE (nr, tmp); \
+  CHECK_ALLOWED_MAX_WITH_DEBUG (G_STRINGIFY (val), tmp, max); \
+  val = tmp; \
+}
+
+#define READ_SE(nr, val) { \
+  if (!nal_reader_get_se (nr, &val)) { \
+    GST_WARNING ("failed to read SE for '" G_STRINGIFY (val) "'"); \
+    goto error; \
+  } \
+}
+
+#define READ_SE_ALLOWED(nr, val, min, max) { \
+  gint32 tmp; \
+  READ_SE (nr, tmp); \
+  CHECK_ALLOWED_WITH_DEBUG (G_STRINGIFY (val), tmp, min, max); \
+  val = tmp; \
+}
+
+G_GNUC_INTERNAL
+gint scan_for_start_codes (const guint8 * data, guint size);
+
+G_GNUC_INTERNAL
+void nal_writer_init (NalWriter * nw, guint nal_prefix_size, gboolean packetized);
+
+G_GNUC_INTERNAL
+void nal_writer_reset (NalWriter * nw);
+
+G_GNUC_INTERNAL
+gboolean nal_writer_do_rbsp_trailing_bits (NalWriter * nw);
+
+G_GNUC_INTERNAL
+GstMemory * nal_writer_reset_and_get_memory (NalWriter * nw);
+
+G_GNUC_INTERNAL
+guint8 * nal_writer_reset_and_get_data (NalWriter * nw, guint32 * ret_size);
+
+G_GNUC_INTERNAL
+gboolean nal_writer_put_bits_uint8 (NalWriter * nw, guint8 value, guint nbits);
+
+G_GNUC_INTERNAL
+gboolean nal_writer_put_bits_uint16 (NalWriter * nw, guint16 value, guint nbits);
+
+G_GNUC_INTERNAL
+gboolean nal_writer_put_bits_uint32 (NalWriter * nw, guint32 value, guint nbits);
+
+G_GNUC_INTERNAL
+gboolean nal_writer_put_bytes (NalWriter * nw, const guint8 * data, guint nbytes);
+
+G_GNUC_INTERNAL
+gboolean nal_writer_put_ue (NalWriter * nw, guint32 value);
+
+G_GNUC_INTERNAL
+gboolean count_exp_golomb_bits (guint32 value, guint * leading_zeros, guint * rest);
+
+/* Fixed-width write helpers: append the low @nbits bits of @val to @nw;
+ * on failure log a warning and jump to the enclosing function's `error`
+ * label. */
+#define WRITE_UINT8(nw, val, nbits) { \
+  if (!nal_writer_put_bits_uint8 (nw, val, nbits)) { \
+    GST_WARNING ("failed to write uint8 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
+    goto error; \
+  } \
+}
+
+#define WRITE_UINT16(nw, val, nbits) { \
+  if (!nal_writer_put_bits_uint16 (nw, val, nbits)) { \
+    GST_WARNING ("failed to write uint16 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
+    goto error; \
+  } \
+}
+
+#define WRITE_UINT32(nw, val, nbits) { \
+  if (!nal_writer_put_bits_uint32 (nw, val, nbits)) { \
+    GST_WARNING ("failed to write uint32 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
+    goto error; \
+  } \
+}
+
+/* Append @nbytes whole bytes from @data to @nw; on failure log a warning
+ * naming the @data expression and jump to the enclosing function's
+ * `error` label. */
+#define WRITE_BYTES(nw, data, nbytes) { \
+  if (!nal_writer_put_bytes (nw, data, nbytes)) { \
+    GST_WARNING ("failed to write bytes for '" G_STRINGIFY (data) "', nbytes: %d", nbytes); \
+    goto error; \
+  } \
+}
+
+/* Append @val to @nw as an unsigned Exp-Golomb code; on failure log a
+ * warning and jump to the enclosing function's `error` label. */
+#define WRITE_UE(nw, val) { \
+  if (!nal_writer_put_ue (nw, val)) { \
+    GST_WARNING ("failed to write ue for '" G_STRINGIFY (val) "'"); \
+    goto error; \
+  } \
+}
+
+/* Integer division of @a by @b, rounded up. @b must be non-zero
+ * (asserted). */
+static inline guint32 div_ceil (guint32 a, guint32 b)
+{
+  guint32 quotient;
+
+  /* http://blog.pkh.me/p/36-figuring-out-round%2C-floor-and-ceil-with-integer-division.html */
+  g_assert (b > 0);
+
+  quotient = a / b;
+  if (a % b != 0)
+    quotient++;
+  return quotient;
+}
@@ -0,0 +1,10 @@
+/* Stub for <gst/glib-compat-private.h>.
+ * In upstream GStreamer this provides backwards-compat shims for older
+ * GLib versions (g_memdup2 polyfill being the load-bearing one).
+ * Our gst_compat.h already defines g_memdup2 as a static inline, so
+ * we just include the shim.
+ */
+#ifndef LIBVA_V4L2_REQUEST_FOURIER_GLIB_COMPAT_PRIVATE_STUB
+#define LIBVA_V4L2_REQUEST_FOURIER_GLIB_COMPAT_PRIVATE_STUB
+#include "gst_compat.h"
+#endif
@@ -0,0 +1,10 @@
+/* Stub for <gst/gst.h> — redirects to the project's gst_compat shim.
+ * The vendored GStreamer 1.28.2 H.265 parser was originally built against
+ * full GStreamer; we only need the GLib type aliases + memory helpers +
+ * macro stubs, all provided by gst_compat.h. Original gst.h would pull
+ * in GObject + GstObject + the entire framework, which we don't link.
+ */
+#ifndef LIBVA_V4L2_REQUEST_FOURIER_GST_H_STUB
+#define LIBVA_V4L2_REQUEST_FOURIER_GST_H_STUB
+#include "gst_compat.h"
+#endif
@@ -0,0 +1,145 @@
+/*
+ * gst_compat.c — GArray implementation for the vendored GStreamer parser.
+ *
+ * Scope: minimal subset of GArray API exercised by gsth265parser.c
+ * (g_array_new, g_array_sized_new, g_array_append_vals + the
+ * g_array_append_val macro, g_array_index macro, g_array_set_size,
+ * g_array_set_clear_func, g_array_free, g_array_unref).
+ *
+ * Non-thread-safe (matches GArray's documented semantics — GArray is
+ * not thread-safe in upstream GLib either, callers must serialize).
+ *
+ * License: MIT (matches backend's COPYING.MIT).
+ */
+
+#include "gst_compat.h"
+
+/* ===== internal helpers ===== */
+
+/*
+ * Ensure @array can hold at least @new_capacity elements.
+ *
+ * Capacity grows by doubling (starting at 4) for amortized O(1) appends.
+ * If doubling would overflow guint, the capacity is set to exactly
+ * @new_capacity instead. Returns FALSE, leaving @array untouched, when
+ * the byte size does not fit in size_t or realloc() fails.
+ *
+ * When the array was created with clear=TRUE the newly added capacity is
+ * zero-filled.
+ */
+static gboolean
+garray_grow(GArray *array, guint new_capacity)
+{
+    if (new_capacity <= array->capacity)
+        return TRUE;
+
+    /* round up to next power of two for amortized O(1) growth */
+    guint cap = array->capacity > 0 ? array->capacity : 4;
+    while (cap < new_capacity) {
+        if (cap > (guint)-1 / 2) {
+            /* doubling would wrap to 0 and loop forever */
+            cap = new_capacity;
+            break;
+        }
+        cap *= 2;
+    }
+
+    /* refuse requests whose byte count overflows size_t */
+    if (array->element_size != 0 &&
+        (size_t)cap > SIZE_MAX / array->element_size)
+        return FALSE;
+
+    char *new_data = realloc(array->data, (size_t)cap * array->element_size);
+    if (new_data == NULL)
+        return FALSE;
+
+    if (array->clear) {
+        memset(new_data + (size_t)array->capacity * array->element_size, 0,
+               (size_t)(cap - array->capacity) * array->element_size);
+    }
+
+    array->data = new_data;
+    array->capacity = cap;
+    return TRUE;
+}
+
+/* ===== public API ===== */
+
+/*
+ * Allocate an empty GArray of @element_size-byte elements with room
+ * pre-reserved for @reserved_size of them. @clear is stored on the array
+ * and makes newly exposed storage zero-filled. Returns NULL when an
+ * allocation fails.
+ */
+GArray *
+g_array_sized_new(gboolean zero_terminated, gboolean clear,
+                  guint element_size, guint reserved_size)
+{
+    /* zero_terminated is GLib-specific (appends a zero-element sentinel
+     * for trailing-NULL semantics). The vendored parser does not use it;
+     * we ignore the flag. */
+    (void)zero_terminated;
+
+    GArray *array = calloc(1, sizeof(GArray));
+    if (array != NULL) {
+        array->clear = clear;
+        array->element_size = element_size;
+
+        if (reserved_size != 0 && !garray_grow(array, reserved_size)) {
+            free(array);
+            array = NULL;
+        }
+    }
+    return array;
+}
+
+/* Same as g_array_sized_new() with no capacity reserved up front. */
+GArray *
+g_array_new(gboolean zero_terminated, gboolean clear, guint element_size)
+{
+    return g_array_sized_new(zero_terminated, clear, element_size, 0);
+}
+
+/*
+ * Resize @array to @length elements and return it.
+ *
+ * Shrinking runs the clear function (if set) on every dropped element.
+ * Growing zero-fills the newly exposed elements when the array was
+ * created with clear=TRUE. If more storage is needed and cannot be
+ * obtained, the array is returned unchanged.
+ */
+GArray *
+g_array_set_size(GArray *array, guint length)
+{
+    if (length > array->capacity && !garray_grow(array, length))
+        return array;
+
+    if (length < array->len) {
+        if (array->clear_func != NULL) {
+            for (guint idx = length; idx < array->len; idx++)
+                array->clear_func(array->data + (size_t)idx * array->element_size);
+        }
+    } else if (length > array->len && array->clear) {
+        memset(array->data + (size_t)array->len * array->element_size, 0,
+               (size_t)(length - array->len) * array->element_size);
+    }
+
+    array->len = length;
+    return array;
+}
+
+/*
+ * Append @len elements copied from @data to the end of @array and
+ * return @array.
+ *
+ * The array is returned unchanged when @len is 0, when @data is NULL,
+ * when the new length would overflow guint, or when storage cannot be
+ * grown.
+ */
+GArray *
+g_array_append_vals(GArray *array, gconstpointer data, guint len)
+{
+    if (len == 0 || data == NULL)
+        return array;
+
+    /* array->len + len must not wrap, or garray_grow() would be asked
+     * for too little space and the memcpy below would overrun it */
+    if (len > (guint)-1 - array->len)
+        return array;
+
+    if (!garray_grow(array, array->len + len))
+        return array;
+
+    memcpy(array->data + (size_t)array->len * array->element_size,
+           data, (size_t)len * array->element_size);
+    array->len += len;
+    return array;
+}
+
+/* Store @clear_func on @array; the other functions in this file call it
+ * on elements they drop (shrinking in g_array_set_size, and
+ * g_array_free). */
+void
+g_array_set_clear_func(GArray *array, void (*clear_func)(gpointer))
+{
+    array->clear_func = clear_func;
+}
+
+/*
+ * Destroy @array (NULL is accepted and ignored).
+ *
+ * With @free_segment TRUE, the clear function (if set) is run on every
+ * element, the element storage is freed and NULL is returned.
+ *
+ * With @free_segment FALSE, the element storage is handed over to the
+ * caller and returned as-is; as in GLib, the clear function is NOT run
+ * in this case, so the returned elements are still intact. The caller
+ * then owns the storage and releases it with free().
+ */
+gchar *
+g_array_free(GArray *array, gboolean free_segment)
+{
+    if (array == NULL)
+        return NULL;
+
+    gchar *data = NULL;
+    if (free_segment) {
+        if (array->clear_func != NULL) {
+            for (guint i = 0; i < array->len; i++)
+                array->clear_func(array->data + (size_t)i * array->element_size);
+        }
+        free(array->data);
+    } else {
+        data = array->data;
+    }
+    free(array);
+    return data;
+}
+
+/* Drop the array. There is no reference count in this shim: the call
+ * always frees the array together with its element storage and returns
+ * NULL. */
+GArray *
+g_array_unref(GArray *array)
+{
+    /* simplified to free; the backend never sub-references shared GArrays */
+    g_array_free(array, TRUE);
+    return NULL;
+}
@@ -0,0 +1,463 @@
+/*
+ * gst_compat.h — minimal GLib/GStreamer compatibility shim for vendored
+ * GStreamer 1.28.2 H.265 parser + bitreader + bytereader + nalutils.
+ *
+ * Strategy: provide #defines / typedefs for the GLib API surface those
+ * 4 vendored files use, so they can compile against libc + libv4l2 only
+ * (no glib2 / gst-base linkage). Vendored .c files are NOT modified
+ * directly; instead this header is force-included via the Makefile's
+ * `-include` flag on the vendored translation units.
+ *
+ * Coverage scoped to what gsth265parser.c + nalutils.c + gstbitreader.c
+ * + gstbytereader.c actually call. Surveyed in
+ * ampere-kernel-decoders phase4 step 2 prep — see
+ * ~/src/ampere-kernel-decoders/phase4_plan_iter2.md and the survey
+ * commit message for the empirical inventory.
+ *
+ * License: this shim is original work, MIT (matching the backend's
+ * COPYING.MIT). The vendored .c files keep their LGPL v2.1+ headers
+ * verbatim.
+ */
+
+#ifndef LIBVA_V4L2_REQUEST_FOURIER_GST_COMPAT_H
+#define LIBVA_V4L2_REQUEST_FOURIER_GST_COMPAT_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* ===== GLib type aliases ===== */
+
+typedef bool          gboolean;
+typedef char          gchar;
+typedef unsigned char guchar;
+typedef int           gint;
+typedef int8_t        gint8;
+typedef int16_t       gint16;
+typedef int32_t       gint32;
+typedef int64_t       gint64;
+typedef unsigned int  guint;
+typedef uint8_t       guint8;
+typedef uint16_t      guint16;
+typedef uint32_t      guint32;
+typedef uint64_t      guint64;
+typedef size_t        gsize;
+typedef ptrdiff_t     gssize;
+typedef void *        gpointer;
+typedef const void *  gconstpointer;
+typedef double        gdouble;
+typedef float         gfloat;
+
+/* GLib's gint64 / guint64 formatting is platform-conditional; for our
+ * aarch64 ALARM target we don't need the full G_*_FORMAT machinery, but
+ * gstbytereader uses G_GSIZE_FORMAT in a debug-only printf. */
+#define G_GSIZE_FORMAT "zu"
+
+#ifndef TRUE
+# define TRUE  true
+#endif
+#ifndef FALSE
+# define FALSE false
+#endif
+
+/* ===== memory ===== */
+
+#define g_malloc(n)         malloc((size_t)(n))
+#define g_malloc0(n)        calloc(1, (size_t)(n))
+#define g_realloc(p, n)     realloc((p), (size_t)(n))
+/* g_free needs to be addressable (passed as a function-pointer arg by
+ * nalutils.c::gst_memory_new_wrapped — even though that call site is
+ * dead code we don't invoke, it must compile). Plain `free` is
+ * compatible: signature is `void (void *)` either way. */
+#define g_free              free
+#define g_new(type, n)      ((type *)malloc(sizeof(type) * (size_t)(n)))
+#define g_new0(type, n)     ((type *)calloc((size_t)(n), sizeof(type)))
+#define g_slice_new(type)   ((type *)malloc(sizeof(type)))
+#define g_slice_new0(type)  ((type *)calloc(1, sizeof(type)))
+#define g_slice_free(type, p)        free(p)
+#define g_slice_free1(size, p)       free(p)
+/* g_clear_pointer(pp, freefn) — take the pointer stored at *pp, set *pp to
+ * NULL, then pass the old value to freefn unless it was already NULL.
+ * `pp` is evaluated exactly once, and the slot is cleared before freefn
+ * runs so freefn never sees a slot that still points at the dying object. */
+#define g_clear_pointer(pp, freefn)                                     \
+        do {                                                            \
+            __typeof__(pp) gcp_slot_ = (pp);                            \
+            __typeof__(*gcp_slot_) gcp_old_ = *gcp_slot_;               \
+            *gcp_slot_ = NULL;                                          \
+            if (gcp_old_ != NULL)                                       \
+                freefn(gcp_old_);                                       \
+        } while (0)
+
+/* g_memdup2 — GLib's 64-bit-safe memdup, used by gstbytereader.
+ * Returns a malloc()ed copy of the first `byte_size` bytes at `mem`, or
+ * NULL when `mem` is NULL, `byte_size` is 0, or the allocation fails. */
+static inline gpointer
+g_memdup2(gconstpointer mem, gsize byte_size)
+{
+    void *dup;
+
+    if (!mem || !byte_size)
+        return NULL;
+
+    dup = malloc(byte_size);
+    /* memcpy returns its destination, so this yields `dup` on success. */
+    return dup ? memcpy(dup, mem, byte_size) : NULL;
+}
+
+/* g_strcmp0 — NULL-safe strcmp. Used by gsth265parser in profile-name lookup.
+ * NULL compares equal to NULL and sorts before every non-NULL string. */
+static inline int
+g_strcmp0(const char *a, const char *b)
+{
+    if (a != NULL && b != NULL)
+        return (a == b) ? 0 : strcmp(a, b);
+
+    /* At least one side is NULL from here on. */
+    if (a == NULL && b == NULL)
+        return 0;
+    return (a == NULL) ? -1 : 1;
+}
+
+/* ===== asserts / return-guards =====
+ *
+ * Per ampere-kernel-decoders iter2 Phase 2 §"new failure modes" #5:
+ * g_assert must NOT abort the process. It becomes a no-op here;
+ * malformed bitstream is caught by the explicit parse-result returns
+ * the parser already implements.
+ *
+ * g_return_if_fail / g_return_val_if_fail propagate as the original
+ * GLib semantics (early return with optional value). */
+
+/* g_assert discards its condition entirely: `cond` is never evaluated. */
+#define g_assert(cond)                ((void)0)
+/* NOTE(review): unlike g_assert this is not a no-op. GCC documents that
+ * reaching __builtin_unreachable() is undefined behavior. Confirm that no
+ * vendored g_assert_not_reached() site can be reached by malformed input. */
+#define g_assert_not_reached()        __builtin_unreachable()
+/* Early-return guards: leave the calling function when `cond` is false. */
+#define g_return_if_fail(cond)        do { if (!(cond)) return; } while (0)
+#define g_return_val_if_fail(cond, v) do { if (!(cond)) return (v); } while (0)
+
+/* ===== GStreamer logging — no-ops =====
+ *
+ * The parser is heavy on debug logging. We compile all of it out;
+ * the backend's own logging (request_log/error_log) wraps the parser
+ * calls and reports parse-failure return codes from there. */
+
+#define GST_DISABLE_GST_DEBUG 1
+
+#define GST_DEBUG_CATEGORY_STATIC(name)
+#define GST_DEBUG_CATEGORY_INIT(...) ((void)0)
+#define GST_DEBUG_CATEGORY_GET(...)  ((void)0)
+#define GST_DEBUG(...)               ((void)0)
+#define GST_INFO(...)                ((void)0)
+#define GST_WARNING(...)             ((void)0)
+#define GST_ERROR(...)               ((void)0)
+#define GST_LOG(...)                 ((void)0)
+#define GST_FIXME(...)               ((void)0)
+#define GST_MEMDUMP(...)             ((void)0)
+#define GST_CAT_DEFAULT              (NULL)
+
+/* ===== compiler / language helpers ===== */
+
+#define G_LIKELY(x)        __builtin_expect(!!(x), 1)
+#define G_UNLIKELY(x)      __builtin_expect(!!(x), 0)
+#define G_GNUC_UNUSED      __attribute__((unused))
+#define G_GNUC_INTERNAL
+#define G_GNUC_MALLOC      __attribute__((malloc))
+#define G_GNUC_NORETURN    __attribute__((noreturn))
+#define G_GNUC_DEPRECATED
+#define G_GNUC_DEPRECATED_FOR(x)
+#define G_GNUC_PURE        __attribute__((pure))
+#define G_GNUC_CONST       __attribute__((const))
+#define G_GNUC_PRINTF(a, b) __attribute__((format(printf, a, b)))
+#define G_BEGIN_DECLS
+#define G_END_DECLS
+#define G_N_ELEMENTS(arr)  (sizeof(arr) / sizeof((arr)[0]))
+#define G_STMT_START       do
+#define G_STMT_END         while (0)
+#define G_STRINGIFY(x)     G_STRINGIFY_(x)
+#define G_STRINGIFY_(x)    #x
+
+/* GStreamer ABI-padding slot count; upstream uses 4 reserved gpointers
+ * at the end of public structs for future ABI extension. We replicate
+ * the size so struct layout matches what gst_byte_reader_init / friends
+ * write into. */
+#define GST_PADDING        4
+#define GST_PADDING_LARGE  20
+
+/* Public-symbol visibility — backend's shared module uses
+ * -fvisibility=hidden, so we don't need to mark anything public from
+ * within the vendored parser. The original GST_*_API macros expand to
+ * extern + dllimport on Windows; on Linux ELF builds where
+ * fvisibility=hidden is active, they would mark public symbols. The
+ * vendored functions are never called from outside h265_parser/, so
+ * leaving these empty hides them automatically. */
+#define GST_API
+#define GST_API_EXPORT     extern
+#define GST_API_IMPORT     extern
+
+/* ===== Opaque GStreamer pipeline types =====
+ *
+ * GstBuffer + GstMemory are referenced by encoder-side dead-code
+ * functions in gsth265parser.c (gst_h265_parser_insert_sei_hevc).
+ * We never call those; declaring them as opaque structs lets the
+ * function pointers / declarations compile, and the linker keeps the
+ * dead-code .text section even though it's unreachable.
+ *
+ * If you ever need to actually USE GstBuffer in this tree, replace
+ * these opaque decls with the project's own buffer abstraction; do not
+ * try to vendor in libgst itself. */
+
+typedef struct _GstBuffer GstBuffer;
+typedef struct _GstMemory GstMemory;
+typedef struct _GstMapInfo GstMapInfo;  /* forward declaration; struct _GstMapInfo is defined further down in this header */
+
+/* GLib min/max constants — dead-code unsigned-overflow guards in
+ * gsth265parser.c.
+ *
+ * The 32- and 64-bit signed minimums are spelled (-MAX - 1). The literal
+ * 0x80000000 does not fit in int, so it has an unsigned type; negating it
+ * and casting back to a signed type would rely on an implementation-defined
+ * conversion. */
+#define G_MAXUINT8   ((guint8)0xFF)
+#define G_MAXUINT16  ((guint16)0xFFFF)
+#define G_MAXUINT32  ((guint32)0xFFFFFFFFU)
+#define G_MAXUINT64  ((guint64)0xFFFFFFFFFFFFFFFFULL)
+#define G_MAXINT8    ((gint8)0x7F)
+#define G_MAXINT16   ((gint16)0x7FFF)
+#define G_MAXINT32   ((gint32)0x7FFFFFFF)
+#define G_MAXINT64   ((gint64)0x7FFFFFFFFFFFFFFFLL)
+#define G_MININT8    ((gint8)(-0x80))
+#define G_MININT16   ((gint16)(-0x8000))
+#define G_MININT32   ((gint32)(-0x7FFFFFFF - 1))
+#define G_MININT64   ((gint64)(-0x7FFFFFFFFFFFFFFFLL - 1))
+#define G_MAXSIZE    ((gsize)-1)
+
+/* GLib function-pointer typedefs used by g_list_* APIs (which our
+ * gst_compat declares as abort-stubs). They show up in code paths
+ * we never invoke but must compile. */
+typedef void (*GDestroyNotify)(gpointer data);
+typedef int  (*GCompareFunc)(gconstpointer a, gconstpointer b);
+typedef int  (*GCompareDataFunc)(gconstpointer a, gconstpointer b, gpointer user_data);
+
+/* GstMapFlags — passed to gst_memory_map / gst_buffer_map. Dead-code. */
+#define GST_MAP_READ      (1 << 0)
+#define GST_MAP_WRITE     (1 << 1)
+#define GST_MAP_READWRITE (GST_MAP_READ | GST_MAP_WRITE)
+
+/* Dead-code stubs for buffer / memory mapping (only referenced by
+ * gst_h265_parser_insert_sei_hevc which we never call). The compile
+ * needs declarations + addressable functions; abort on call. */
+static inline gboolean
+gst_memory_map(GstMemory *mem G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED,
+               int flags G_GNUC_UNUSED) { abort(); }
+static inline void
+gst_memory_unmap(GstMemory *mem G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED) { abort(); }
+static inline gboolean
+gst_buffer_map(GstBuffer *buf G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED,
+               int flags G_GNUC_UNUSED) { abort(); }
+static inline void
+gst_buffer_unmap(GstBuffer *buf G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED) { abort(); }
+static inline GstBuffer *
+gst_buffer_new(void) { abort(); }
+static inline gboolean
+gst_buffer_copy_into(GstBuffer *dst G_GNUC_UNUSED, GstBuffer *src G_GNUC_UNUSED,
+                     int flags G_GNUC_UNUSED, gsize offset G_GNUC_UNUSED,
+                     gssize size G_GNUC_UNUSED) { abort(); }
+static inline void
+gst_buffer_append_memory(GstBuffer *buf G_GNUC_UNUSED, GstMemory *mem G_GNUC_UNUSED) { abort(); }
+static inline GstMemory *
+gst_memory_ref(GstMemory *mem G_GNUC_UNUSED) { abort(); }
+static inline void
+gst_memory_unref(GstMemory *mem G_GNUC_UNUSED) { abort(); }
+static inline GstMemory *
+gst_memory_copy(GstMemory *mem G_GNUC_UNUSED, gssize offset G_GNUC_UNUSED, gssize size G_GNUC_UNUSED) { abort(); }
+static inline void
+gst_clear_buffer(GstBuffer **buf) { *buf = NULL; }
+#define GST_IS_BUFFER(b) (false)
+
+/* GstBufferCopyFlags — used only by gst_buffer_copy_into in dead code. */
+#define GST_BUFFER_COPY_METADATA  (1 << 0)
+#define GST_BUFFER_COPY_MEMORY    (1 << 1)
+#define GST_BUFFER_COPY_DEEP      (1 << 2)
+
+/* gst_util_ceil_log2(n) — ceil(log2(n)) for non-zero unsigned n.
+ * Used by gsth265parser.c::gst_h265_slice_parse_ref_pic_list_modification.
+ * That function is in the slice-header parser which the libva backend
+ * does NOT invoke (we only call parse_sps) — but the linker still
+ * needs a definition. Provide a real impl: cheaper to compute than to
+ * justify a dead-code stub at every call site.
+ *
+ * Inputs 0 and 1 both yield 0. */
+static inline guint
+gst_util_ceil_log2(guint32 n)
+{
+    guint32 below;
+
+    if (n < 2)
+        return 0;
+
+    /* Index of the highest set bit of n-1, plus one, is the number of bits
+     * needed to count up to n-1, i.e. ceil(log2(n)). `below` is non-zero
+     * here, so __builtin_clz is well defined. */
+    below = n - 1;
+    return (guint)(31 - __builtin_clz(below)) + 1;
+}
+
+/* GstMapInfo's real definition is in <gst/gstmemory.h>; we need at
+ * least enough to make `info->data` / `info->size` compile. */
+struct _GstMapInfo {
+    GstMemory *memory;
+    int        flags;
+    guint8    *data;
+    gsize      size;
+    gsize      maxsize;
+    gpointer   user_data[4];
+    gpointer   _gst_reserved[GST_PADDING];
+};
+
+/* gst_memory_new_wrapped — dead-code stub (nalutils.c calls it from
+ * the SEI-insertion path the libva backend never invokes). Aborts if it
+ * is ever reached; no argument is read. */
+static inline GstMemory *
+gst_memory_new_wrapped(int flags G_GNUC_UNUSED, gpointer data G_GNUC_UNUSED,
+                       gsize maxsize G_GNUC_UNUSED,
+                       gsize offset G_GNUC_UNUSED, gsize size G_GNUC_UNUSED,
+                       gpointer user_data G_GNUC_UNUSED,
+                       void (*notify)(gpointer) G_GNUC_UNUSED)
+{
+    abort();
+}
+
+/* ===== byte-order read / write macros =====
+ *
+ * GStreamer provides these as static-inline functions in
+ * <gst/gstutils.h>. We re-implement for aarch64 little-endian; the
+ * parser is byte-stream input, so endian-conversion is mechanical.
+ * The float / double variants are present in upstream but the parser
+ * never invokes them — provide stubs so the address-taking sites in
+ * gstbytereader.h's function table compile. */
+
+#define GST_READ_UINT8(data)                                    \
+    (*((const guint8 *)(data)))
+
+#define GST_READ_UINT16_LE(data) (                              \
+    ((guint16)((const guint8 *)(data))[0])           |          \
+    ((guint16)((const guint8 *)(data))[1] << 8))
+
+#define GST_READ_UINT16_BE(data) (                              \
+    ((guint16)((const guint8 *)(data))[0] << 8)      |          \
+    ((guint16)((const guint8 *)(data))[1]))
+
+#define GST_READ_UINT24_LE(data) (                              \
+    ((guint32)((const guint8 *)(data))[0])           |          \
+    ((guint32)((const guint8 *)(data))[1] << 8)      |          \
+    ((guint32)((const guint8 *)(data))[2] << 16))
+
+#define GST_READ_UINT24_BE(data) (                              \
+    ((guint32)((const guint8 *)(data))[0] << 16)     |          \
+    ((guint32)((const guint8 *)(data))[1] << 8)      |          \
+    ((guint32)((const guint8 *)(data))[2]))
+
+#define GST_READ_UINT32_LE(data) (                              \
+    ((guint32)((const guint8 *)(data))[0])           |          \
+    ((guint32)((const guint8 *)(data))[1] << 8)      |          \
+    ((guint32)((const guint8 *)(data))[2] << 16)     |          \
+    ((guint32)((const guint8 *)(data))[3] << 24))
+
+#define GST_READ_UINT32_BE(data) (                              \
+    ((guint32)((const guint8 *)(data))[0] << 24)     |          \
+    ((guint32)((const guint8 *)(data))[1] << 16)     |          \
+    ((guint32)((const guint8 *)(data))[2] << 8)      |          \
+    ((guint32)((const guint8 *)(data))[3]))
+
+#define GST_READ_UINT64_LE(data) (                              \
+    ((guint64)((const guint8 *)(data))[0])           |          \
+    ((guint64)((const guint8 *)(data))[1] << 8)      |          \
+    ((guint64)((const guint8 *)(data))[2] << 16)     |          \
+    ((guint64)((const guint8 *)(data))[3] << 24)     |          \
+    ((guint64)((const guint8 *)(data))[4] << 32)     |          \
+    ((guint64)((const guint8 *)(data))[5] << 40)     |          \
+    ((guint64)((const guint8 *)(data))[6] << 48)     |          \
+    ((guint64)((const guint8 *)(data))[7] << 56))
+
+#define GST_READ_UINT64_BE(data) (                              \
+    ((guint64)((const guint8 *)(data))[0] << 56)     |          \
+    ((guint64)((const guint8 *)(data))[1] << 48)     |          \
+    ((guint64)((const guint8 *)(data))[2] << 40)     |          \
+    ((guint64)((const guint8 *)(data))[3] << 32)     |          \
+    ((guint64)((const guint8 *)(data))[4] << 24)     |          \
+    ((guint64)((const guint8 *)(data))[5] << 16)     |          \
+    ((guint64)((const guint8 *)(data))[6] << 8)      |          \
+    ((guint64)((const guint8 *)(data))[7]))
+
+/* Float / double readers — dead-code, abort if called. The function
+ * table in gstbytereader.h takes the address of the underlying inline
+ * which we don't need to be functional, only addressable. */
+static inline gfloat
+GST_READ_FLOAT_LE(const guint8 *data) { (void)data; abort(); }
+static inline gfloat
+GST_READ_FLOAT_BE(const guint8 *data) { (void)data; abort(); }
+static inline gdouble
+GST_READ_DOUBLE_LE(const guint8 *data) { (void)data; abort(); }
+static inline gdouble
+GST_READ_DOUBLE_BE(const guint8 *data) { (void)data; abort(); }
+
+/* Write side — nalutils.c writes-out SEI bytes (dead path for us but
+ * must compile). */
+#define GST_WRITE_UINT8(data, val) do {                         \
+    ((guint8 *)(data))[0] = (guint8)(val);                      \
+} while (0)
+
+#define GST_WRITE_UINT16_BE(data, val) do {                     \
+    ((guint8 *)(data))[0] = (guint8)((val) >> 8);               \
+    ((guint8 *)(data))[1] = (guint8)((val));                    \
+} while (0)
+
+#define GST_WRITE_UINT24_BE(data, val) do {                     \
+    ((guint8 *)(data))[0] = (guint8)((val) >> 16);              \
+    ((guint8 *)(data))[1] = (guint8)((val) >> 8);               \
+    ((guint8 *)(data))[2] = (guint8)((val));                    \
+} while (0)
+
+#define GST_WRITE_UINT32_BE(data, val) do {                     \
+    ((guint8 *)(data))[0] = (guint8)((val) >> 24);              \
+    ((guint8 *)(data))[1] = (guint8)((val) >> 16);              \
+    ((guint8 *)(data))[2] = (guint8)((val) >> 8);               \
+    ((guint8 *)(data))[3] = (guint8)((val));                    \
+} while (0)
+
+#ifndef MIN
+# define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+# define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+/* ===== GArray ===== */
+
+typedef struct {
+    char *data;                 /* exposed via g_array_index / GArray->data */
+    guint len;                  /* element count */
+    guint capacity;             /* allocated element slots */
+    guint element_size;
+    gboolean clear;             /* zero-fill on grow */
+    void (*clear_func)(gpointer);
+} GArray;
+
+GArray *g_array_new(gboolean zero_terminated, gboolean clear, guint element_size);
+GArray *g_array_sized_new(gboolean zero_terminated, gboolean clear,
+                          guint element_size, guint reserved_size);
+GArray *g_array_set_size(GArray *array, guint length);
+GArray *g_array_append_vals(GArray *array, gconstpointer data, guint len);
+void    g_array_set_clear_func(GArray *array, void (*clear_func)(gpointer));
+gchar  *g_array_free(GArray *array, gboolean free_segment);
+GArray *g_array_unref(GArray *array);
+
+#define g_array_append_val(a, v) g_array_append_vals((a), &(v), 1)
+#define g_array_index(a, t, i)   (((t *)(void *)(a)->data)[i])
+
+/* ===== GList — stubs that abort if reached =====
+ *
+ * Surveyed call sites: gsth265parser.c uses g_list_prepend / g_list_sort /
+ * g_list_free_full in code paths the libva backend does not invoke for
+ * basic SPS parsing (likely SEI message accumulation). Stub to abort so
+ * any future call surfaces immediately rather than silently corrupting. */
+
+/* GList — full struct (not opaque) so callers can do `list->data`.
+ * The functions still abort because we never construct a GList. */
+typedef struct _GList GList;
+struct _GList {
+    gpointer data;
+    GList   *next;
+    GList   *prev;
+};
+
+static inline GList *g_list_prepend(GList *list G_GNUC_UNUSED, gpointer data G_GNUC_UNUSED) { abort(); }
+static inline GList *g_list_sort(GList *list G_GNUC_UNUSED, int (*cmp)(gconstpointer, gconstpointer) G_GNUC_UNUSED) { abort(); }
+static inline void g_list_free_full(GList *list G_GNUC_UNUSED, void (*free_func)(gpointer) G_GNUC_UNUSED) { abort(); }
+
+/* ===== g_once_init_enter / g_once_init_leave =====
+ *
+ * GLib's lazy-init guards. The parser uses these for one-shot static
+ * initialization (e.g. profile-name table). Our backend is single-
+ * threaded at the parser-init site (driver_init), so we can simplify
+ * to a plain run-once gate. */
+
+#define g_once_init_enter(loc)        (*(loc) == 0)
+#define g_once_init_leave(loc, val)   (*(loc) = (val))
+
+/* ===== conversions ===== */
+
+#define GINT_TO_POINTER(i) ((gpointer)(uintptr_t)(gint)(i))
+#define GPOINTER_TO_INT(p) ((gint)(uintptr_t)(p))
+
+#endif  /* LIBVA_V4L2_REQUEST_FOURIER_GST_COMPAT_H */
@@ -0,0 +1,90 @@
+/*
+ * v4l2-hevc-ext-controls.h — verbatim mirror of Linux 7.0+ V4L2 stateless
+ * HEVC extended-SPS RPS control definitions, shipped as an internal
+ * header so this libva backend can be built against pre-7.0
+ * linux-api-headers packages (currently ampere ships 6.19-1).
+ *
+ * Upstream source: linux kernel, include/uapi/linux/v4l2-controls.h
+ * As-of: Linux 7.0-rc3 (Detlev Casanova / Collabora "VDPU381/VDPU383"
+ * series, see lkml.org/lkml/2026/1/9/1334). The two CIDs + two structs
+ * + two flag macros below are byte-for-byte the kernel UAPI definitions.
+ *
+ * Once linux-api-headers >= 7.0 is the floor across the fleet, this
+ * shim becomes redundant — `<linux/v4l2-controls.h>` will provide the
+ * same symbols. This header includes <linux/v4l2-controls.h> itself
+ * before testing any guard, so the include order in h265.c does not
+ * matter: when the system catches up, the `#ifndef` guards on the macros
+ * below silently no-op and the system definitions are used.
+ *
+ * License: MIT (matches backend's COPYING.MIT). Per LGPL § 3.b., the
+ * kernel UAPI struct definitions themselves are excepted from the
+ * kernel's overall GPL and may be copied verbatim into userspace
+ * binaries without inheriting GPL.
+ *
+ * Rationale + iter2 plan: see
+ *   ~/src/ampere-kernel-decoders/phase4_plan_iter2.md (§Step 3)
+ *   ~/src/ampere-kernel-decoders/phase0_findings_iter2.md
+ */
+
+#ifndef LIBVA_V4L2_REQUEST_FOURIER_V4L2_HEVC_EXT_CONTROLS_H
+#define LIBVA_V4L2_REQUEST_FOURIER_V4L2_HEVC_EXT_CONTROLS_H
+
+#include <linux/types.h>
+#include <linux/v4l2-controls.h>
+
+/*
+ * Control IDs. <linux/v4l2-controls.h> has already been included, so a CID
+ * that is defined at this point comes from the system UAPI header. In that
+ * case the matching struct is taken to be defined there too, and the
+ * corresponding *_DEFINED marker is set so the struct mirror further down
+ * is skipped instead of redefining the struct. When the CID is absent, it
+ * is defined here and the mirror struct below is used.
+ *
+ * NOTE(review): assumes the kernel header introduces each CID together
+ * with its struct — confirm against the 7.0 UAPI header.
+ */
+#ifdef V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS
+# ifndef V4L2_HEVC_EXT_SPS_ST_RPS_DEFINED
+#  define V4L2_HEVC_EXT_SPS_ST_RPS_DEFINED 1
+# endif
+#else
+# define V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS \
+    (V4L2_CID_CODEC_STATELESS_BASE + 408)
+#endif
+
+#ifdef V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS
+# ifndef V4L2_HEVC_EXT_SPS_LT_RPS_DEFINED
+#  define V4L2_HEVC_EXT_SPS_LT_RPS_DEFINED 1
+# endif
+#else
+# define V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS \
+    (V4L2_CID_CODEC_STATELESS_BASE + 409)
+#endif
+
+#ifndef V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED
+# define V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED 0x1
+#endif
+
+#ifndef V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT
+# define V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT 0x1
+#endif
+
+/*
+ * struct v4l2_ctrl_hevc_ext_sps_st_rps — HEVC short-term RPS parameters.
+ *
+ * Dynamic-size 1-dimension array. Number of elements is
+ *   v4l2_ctrl_hevc_sps::num_short_term_ref_pic_sets
+ * Can contain up to 65 elements (the H.265 spec § 7.4.3.2.1 maximum).
+ */
+#ifndef V4L2_HEVC_EXT_SPS_ST_RPS_DEFINED
+# define V4L2_HEVC_EXT_SPS_ST_RPS_DEFINED 1
+struct v4l2_ctrl_hevc_ext_sps_st_rps {
+	__u8	delta_idx_minus1;
+	__u8	delta_rps_sign;
+	__u8	num_negative_pics;
+	__u8	num_positive_pics;
+	__u32	used_by_curr_pic;
+	__u32	use_delta_flag;
+	__u16	abs_delta_rps_minus1;
+	__u16	delta_poc_s0_minus1[16];
+	__u16	delta_poc_s1_minus1[16];
+	__u16	flags;
+};
+#endif
+
+/*
+ * struct v4l2_ctrl_hevc_ext_sps_lt_rps — HEVC long-term RPS parameters.
+ *
+ * Dynamic-size 1-dimension array. Number of elements is
+ *   v4l2_ctrl_hevc_sps::num_long_term_ref_pics_sps
+ * Can contain up to 33 elements (the H.265 spec § 7.4.3.2.1 maximum).
+ */
+#ifndef V4L2_HEVC_EXT_SPS_LT_RPS_DEFINED
+# define V4L2_HEVC_EXT_SPS_LT_RPS_DEFINED 1
+struct v4l2_ctrl_hevc_ext_sps_lt_rps {
+	__u16	lt_ref_pic_poc_lsb_sps;
+	__u16	flags;
+};
+#endif
+
+#endif  /* LIBVA_V4L2_REQUEST_FOURIER_V4L2_HEVC_EXT_CONTROLS_H */
@@ -31,7 +31,13 @@
 #include "video.h"

 #include <assert.h>
+#include <fcntl.h>
 #include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+
+#include <linux/dma-buf.h>

 #include "tiled_yuv.h"
 #include "utils.h"
@@ -125,6 +131,7 @@ VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,

 VAStatus RequestDestroyImage(VADriverContextP context, VAImageID image_id)
 {
+
 	struct request_data *driver_data = context->pDriverData;
 	struct object_image *image_object;
 	VAStatus status;
@@ -149,12 +156,111 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
 {
 	struct object_buffer *buffer_object;
 	unsigned int i;
+	int sync_fds[VIDEO_MAX_PLANES];
+	unsigned int n_sync_fds = 0;

 	buffer_object = BUFFER(driver_data, image->buf);
 	if (buffer_object == NULL)
 		return VA_STATUS_ERROR_INVALID_BUFFER;

+	for (i = 0; i < VIDEO_MAX_PLANES; i++)
+		sync_fds[i] = -1;
+
+	/*
+	 * iter13 α-17: explicit cache sync around the CAPTURE buffer read.
+	 *
+	 * The CAPTURE buffer is V4L2_MEMORY_MMAP and was mapped at
+	 * cap_pool_init time with cached attributes. Kernel decode writes to
+	 * the buffer via DMA, which doesn't propagate to the CPU's cache
+	 * observer for that virtual mapping. Reading from
+	 * surface_object->destination_data[] without an explicit cache
+	 * invalidation returns stale data — observed empirically as Bug 4
+	 * (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva went
+	 * through the SAME readback path that kdirect ffmpeg-v4l2request +
+	 * DRM_PRIME-mmap successfully reads (kdirect's drm-prime mmap
+	 * implicitly handles sync).
+	 *
+	 * DMA_BUF_IOCTL_SYNC(START | READ) makes the CPU mapping coherent
+	 * with the producing engine's writes; END releases the sync.
+	 * Per V4L2 + dma-buf spec, this is the userspace contract for
+	 * cached-mmap'd buffers (Tomasz Figa, linaro-mm-sig 2024-07-11).
+	 *
+	 * Requires a dma-buf fd: get one via VIDIOC_EXPBUF, sync, close.
+	 * Per-call cost is one ioctl pair + one fd open/close per plane.
+	 * Could be optimised by caching the EXPBUF fd on the cap_pool slot,
+	 * but doing it just-in-time keeps the lifecycle uncomplicated. The
+	 * EXPBUF fd's dup count doesn't affect the V4L2 buffer's underlying
+	 * pages; closing the fd is a no-op on memory.
+	 *
+	 * If EXPBUF fails (e.g., consumer-held EXPBUF prevents a second one
+	 * — only true for hantro G1 oddity), we skip the sync silently. The
+	 * existing pre-iter13 behavior is preserved on the error path.
+	 */
+	if (surface_object->current_slot != NULL &&
+	    driver_data->video_format != NULL) {
+		unsigned int capture_type =
+			v4l2_type_video_capture(driver_data->video_format->v4l2_mplane);
+		if (v4l2_export_buffer(driver_data->video_fd, capture_type,
+				       surface_object->destination_index,
+				       O_RDONLY, sync_fds,
+				       surface_object->destination_buffers_count) >= 0) {
+			n_sync_fds = surface_object->destination_buffers_count;
+			for (i = 0; i < n_sync_fds; i++) {
+				struct dma_buf_sync s = {
+					.flags = DMA_BUF_SYNC_START |
+						 DMA_BUF_SYNC_READ,
+				};
+				/* failure is non-fatal: we continue with the read */
+				(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
+			}
+		}
+	}
+
+	/*
+	 * AV1 film_grain: when this surface is the display surface of a
+	 * decode (current_display_picture != current_frame with apply_grain=1),
+	 * its slot is NULL because BeginPicture only fired on the decode
+	 * surface. Follow the back-link set in av1_set_controls and borrow
+	 * the decode surface's destination_data + sizes for the copy.
+	 */
+	if (surface_object->current_slot == NULL &&
+	    surface_object->linked_decode_surface_id != VA_INVALID_SURFACE) {
+		struct object_surface *decode_surface =
+			SURFACE(driver_data,
+				surface_object->linked_decode_surface_id);
+		if (decode_surface != NULL &&
+		    decode_surface->current_slot != NULL) {
+			/* Mirror the fields we read below. The surface heap
+			 * pointer is stable for the surface's lifetime; we
+			 * only need destination_data + destination_sizes +
+			 * destination_planes_count from it. */
+			surface_object->destination_planes_count =
+				decode_surface->destination_planes_count;
+			for (i = 0; i < decode_surface->destination_planes_count; i++) {
+				surface_object->destination_data[i] =
+					decode_surface->destination_data[i];
+				surface_object->destination_sizes[i] =
+					decode_surface->destination_sizes[i];
+			}
+		}
+	}
+
 	for (i = 0; i < surface_object->destination_planes_count; i++) {
+		/* AV1 Phase 3 diag: surface NULL-deref hunt. */
+		if (buffer_object->data == NULL ||
+		    surface_object->destination_data[i] == NULL) {
+			request_log("copy_surface_to_image NULL i=%u "
+				    "buf_data=%p dest_data=%p dest_size=%u "
+				    "planes=%u slot=%p linked=0x%x\n",
+				    i, (void *)buffer_object->data,
+				    (void *)surface_object->destination_data[i],
+				    surface_object->destination_sizes[i],
+				    surface_object->destination_planes_count,
+				    (void *)surface_object->current_slot,
+				    surface_object->linked_decode_surface_id);
+			/*
+			 * Bailing out early must still close the cache-sync
+			 * window opened above: issue the END that pairs with
+			 * each START and close every exported dma-buf fd.
+			 * Returning without this leaks one fd per plane on
+			 * each failed call.
+			 */
+			while (n_sync_fds > 0) {
+				struct dma_buf_sync end = {
+					.flags = DMA_BUF_SYNC_END |
+						 DMA_BUF_SYNC_READ,
+				};
+				n_sync_fds--;
+				(void)ioctl(sync_fds[n_sync_fds],
+					    DMA_BUF_IOCTL_SYNC, &end);
+				close(sync_fds[n_sync_fds]);
+			}
+			return VA_STATUS_ERROR_OPERATION_FAILED;
+		}
+#ifdef __arm__
 		if (!video_format_is_linear(driver_data->video_format))
 			tiled_to_planar(surface_object->destination_data[i],
 					buffer_object->data + image->offsets[i],
@@ -162,10 +268,22 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
 					i == 0 ? image->height :
 						 image->height / 2);
 		else {
+#endif
 			memcpy(buffer_object->data + image->offsets[i],
 			       surface_object->destination_data[i],
 			       surface_object->destination_sizes[i]);
+#ifdef __arm__
 		}
+#endif
+	}
+
+	/* iter13 α-17: release cache sync. END pairs with each START. */
+	for (i = 0; i < n_sync_fds; i++) {
+		struct dma_buf_sync s = {
+			.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ,
+		};
+		(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
+		close(sync_fds[i]);
 	}

 	return VA_STATUS_SUCCESS;
@@ -180,6 +298,7 @@ VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
 	VAImageFormat format;
 	VAStatus status;

+
 	surface_object = SURFACE(driver_data, surface_id);
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
@@ -190,16 +309,33 @@ VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
 			return status;
 	}

+	/* Fully populate VAImageFormat to match QueryImageFormats output. */
+	memset(&format, 0, sizeof(format));
 	format.fourcc = VA_FOURCC_NV12;
+	format.byte_order = VA_LSB_FIRST;
+	format.bits_per_pixel = 12;

 	status = RequestCreateImage(context, &format, surface_object->width,
 				    surface_object->height, image);
 	if (status != VA_STATUS_SUCCESS)
 		return status;

-	status = copy_surface_to_image (driver_data, surface_object, image);
-	if (status != VA_STATUS_SUCCESS)
-		return status;
+	/*
+	 * Iter2 Fix 3: skip the surface→image copy when no CAPTURE slot is
+	 * bound. ffmpeg's av_hwframe_ctx_init probes vaDeriveImage on a
+	 * never-decoded surface to learn the format; it doesn't read the
+	 * data. With the cap_pool decoupling, destination_data[] is NULL
+	 * until BeginPicture binds a slot — copying from a NULL source
+	 * crashed in memcpy. The image's buffer remains zero-initialized;
+	 * subsequent post-decode DeriveImage on the same surface (after
+	 * BeginPicture has bound a slot) does the real copy.
+	 */
+	if (surface_object->current_slot != NULL) {
+		status = copy_surface_to_image (driver_data, surface_object,
+						image);
+		if (status != VA_STATUS_SUCCESS)
+			return status;
+	}

 	surface_object->status = VASurfaceReady;

@@ -212,7 +348,25 @@ VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
 VAStatus RequestQueryImageFormats(VADriverContextP context,
 				  VAImageFormat *formats, int *formats_count)
 {
+
+	/*
+	 * Populate the VAImageFormat fully per VAAPI spec for NV12 —
+	 * not just .fourcc. Consumers (FFmpeg's hwcontext_vaapi, mpv,
+	 * Firefox) read .byte_order and .bits_per_pixel; leaving them
+	 * uninitialized inherits whatever caller-stack garbage is in
+	 * the buffer and produces non-deterministic behavior. Reference:
+	 * Mesa's gallium/frontends/va/image.c::vlVaQueryImageFormats and
+	 * intel-vaapi-driver's i965_drv_video.c — both publish NV12
+	 * with byte_order=VA_LSB_FIRST and bits_per_pixel=12.
+	 *
+	 * For YUV formats, depth/red_mask/green_mask/blue_mask/alpha_mask
+	 * are not meaningful (those describe RGB bit layouts); leave them
+	 * zeroed via memset before populating.
+	 */
+	memset(&formats[0], 0, sizeof(formats[0]));
 	formats[0].fourcc = VA_FOURCC_NV12;
+	formats[0].byte_order = VA_LSB_FIRST;
+	formats[0].bits_per_pixel = 12;
 	*formats_count = 1;

 	return VA_STATUS_SUCCESS;
@@ -233,6 +387,7 @@ VAStatus RequestGetImage(VADriverContextP context, VASurfaceID surface_id,
 	struct object_image *image_object;
 	VAImage *image;

+
 	surface_object = SURFACE(driver_data, surface_id);
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
@@ -26,7 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
-#include <sys/select.h>
+#include <poll.h>

 #include <linux/media.h>

@@ -78,19 +78,19 @@ int media_request_queue(int request_fd)

 int media_request_wait_completion(int request_fd)
 {
-	struct timeval tv = { 0, 300000 };
-	fd_set except_fds;
+	/* poll() instead of select(): Firefox's RDD seccomp policy admits
+	 * poll/ppoll but not select/pselect6 (as of FF150). Functionally
+	 * equivalent here — the media request fd signals completion via
+	 * exceptional condition, mapped to POLLPRI for poll(). */
+	struct pollfd pfd = { .fd = request_fd, .events = POLLPRI };
 	int rc;

-	FD_ZERO(&except_fds);
-	FD_SET(request_fd, &except_fds);
-
-	rc = select(request_fd + 1, NULL, NULL, &except_fds, &tv);
+	rc = poll(&pfd, 1, 300 /* ms */);
 	if (rc == 0) {
 		request_log("Timeout when waiting for media request\n");
 		return -1;
 	} else if (rc < 0) {
-		request_log("Unable to select media request: %s\n",
+		request_log("Unable to poll media request: %s\n",
 			    strerror(errno));
 		return -1;
 	}
@@ -44,7 +44,23 @@ sources = [
 	'v4l2.c',
 	'mpeg2.c',
 	'h264.c',
-	'h265.c'
+	'h264_slice_header.c',
+	'request_pool.c',
+	'cap_pool.c',
+	'h265.c',
+	'vp8.c',
+	'vp9.c',
+	'av1.c',
+	'codec.c',
+
+	# Vendored GStreamer 1.28.2 H.265 parser + utilities (LGPL v2.1+,
+	# see src/h265_parser/gst_compat.h for sourcing notes + per-iter2
+	# adaptation strategy).
+	'h265_parser/gst_compat.c',
+	'h265_parser/gst/base/gstbitreader.c',
+	'h265_parser/gst/base/gstbytereader.c',
+	'h265_parser/gst/codecparsers/nalutils.c',
+	'h265_parser/gst/codecparsers/gsth265parser.c'
 ]

 headers = [
@@ -64,11 +80,39 @@ headers = [
 	'v4l2.h',
 	'mpeg2.h',
 	'h264.h',
-	'h265.h'
+	'h264_slice_header.h',
+	'request_pool.h',
+	'cap_pool.h',
+	'h265.h',
+	'vp8.h',
+	'vp9.h',
+	'av1.h',
+	'codec.h',
+
+	# Internal mirror of Linux 7.0 V4L2 HEVC EXT_SPS_*_RPS UAPI defs
+	# (allows building against pre-7.0 linux-api-headers; redundant
+	# once the host headers are 7.0+).
+	'hevc-ctrls/v4l2-hevc-ext-controls.h',
+
+	# Vendored GStreamer + project shim headers (see sources above).
+	'h265_parser/gst_compat.h',
+	'h265_parser/gst/gst.h',
+	'h265_parser/gst/glib-compat-private.h',
+	'h265_parser/gst/base/base-prelude.h',
+	'h265_parser/gst/base/gstbitreader.h',
+	'h265_parser/gst/base/gstbytereader.h',
+	'h265_parser/gst/base/gstbitwriter.h',
+	'h265_parser/gst/codecparsers/codecparsers-prelude.h',
+	'h265_parser/gst/codecparsers/gsth265parser.h',
+	'h265_parser/gst/codecparsers/nalutils.h'
 ]

 includes = [
-	include_directories('../include')
+	include_directories('../include'),
+	# Vendored GStreamer parser tree — the parser's #include <gst/base/...>
+	# style references resolve here via stub headers that redirect to
+	# gst_compat.h.
+	include_directories('h265_parser')
 ]

 cflags = [
@@ -23,6 +23,34 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

+/*
+ * fresnel-fourier iter1 Phase 6 commit B: rewrite against new split
+ * V4L2_CID_STATELESS_MPEG2_{SEQUENCE,PICTURE,QUANTISATION} stateless
+ * controls (mainline kernel <linux/v4l2-controls.h>:1985-2105).
+ *
+ * Replaces the staging-era V4L2_CID_MPEG_VIDEO_MPEG2_{SLICE_PARAMS,
+ * QUANTIZATION} combined-struct API that the fork previously used
+ * via include/mpeg2-ctrls.h (deleted in commit C).
+ *
+ * Per-frame submission: one batched VIDIOC_S_EXT_CTRLS with three
+ * controls (12-byte SEQUENCE + 32-byte PICTURE + 256-byte QUANTISATION),
+ * matching FFmpeg libavcodec/v4l2_request_mpeg2.c:130-155 reference
+ * implementation. Verified empirically in fresnel-fourier Phase 0
+ * cross-validator sweep and Phase 3 Baseline C verbatim payload.
+ *
+ * Quantisation matrix order: zigzag scanning order per kernel doc
+ * v4l2-controls.h:2076. VAAPI VAIQMatrixBufferMPEG2 also stores in
+ * zigzag scanning order (per VAAPI spec). Direct memcpy works; no
+ * permutation in the libva backend. Kernel hantro_mpeg2.c::
+ * hantro_mpeg2_dec_copy_qtable applies the zigzag-to-raster
+ * permutation when copying to the hardware quantisation table.
+ *
+ * Default matrices (when iqmatrix_set==false): MPEG-2 spec defaults
+ * per ISO/IEC 13818-2 Table 7-3, transcribed from Phase 3 Baseline C
+ * QUANTISATION verbatim payload (256 bytes captured from
+ * ffmpeg-v4l2request decode of bbb_720p10s_mpeg2.ts).
+ */
+
 #include "mpeg2.h"
 #include "context.h"
 #include "request.h"
@@ -35,120 +63,187 @@
 #include <sys/mman.h>

 #include <linux/videodev2.h>
-#include <mpeg2-ctrls.h>
+#include <linux/v4l2-controls.h>

 #include "v4l2.h"

+/*
+ * MPEG-2 default intra quantisation matrix in zigzag scanning order
+ * (ISO/IEC 13818-2 Table 7-3, verified empirically against
+ * fresnel-fourier Phase 3 Baseline C QUANTISATION payload bytes 0..63
+ * from a ffmpeg-v4l2request decode of the BBB 720p10s MPEG-2 fixture).
+ */
+static const __u8 mpeg2_default_intra_matrix[64] = {
+	  8,  16,  16,  19,  16,  19,  22,  22,
+	 22,  22,  22,  22,  26,  24,  26,  27,
+	 27,  27,  26,  26,  26,  26,  27,  27,
+	 27,  29,  29,  29,  34,  34,  34,  29,
+	 29,  29,  27,  27,  29,  29,  32,  32,
+	 34,  34,  37,  38,  37,  35,  35,  34,
+	 35,  38,  38,  40,  40,  40,  48,  48,
+	 46,  46,  56,  56,  58,  69,  69,  83,
+};
+
+/*
+ * MPEG-2 default non-intra quantisation matrix is uniformly 16 in spec.
+ * Verified against Phase 3 Baseline C QUANTISATION payload bytes
+ * 64..127 (all 0x10 = 16). Same applies to chroma_non_intra
+ * (bytes 192..255). Filled at runtime via memset rather than a
+ * separate const array to keep the binary smaller.
+ */
+
 int mpeg2_set_controls(struct request_data *driver_data,
 		       struct object_context *context_object,
 		       struct object_surface *surface_object)
 {
 	VAPictureParameterBufferMPEG2 *picture =
 		&surface_object->params.mpeg2.picture;
-	VASliceParameterBufferMPEG2 *slice =
-		&surface_object->params.mpeg2.slice;
 	VAIQMatrixBufferMPEG2 *iqmatrix =
 		&surface_object->params.mpeg2.iqmatrix;
 	bool iqmatrix_set = surface_object->params.mpeg2.iqmatrix_set;
-	struct v4l2_ctrl_mpeg2_slice_params slice_params;
-	struct v4l2_ctrl_mpeg2_quantization quantization;
+
+	/* Clause 2: v4l2_ctrl_mpeg2_sequence (12 bytes) */
+	struct v4l2_ctrl_mpeg2_sequence sequence;
+	/* Clause 3: v4l2_ctrl_mpeg2_picture (32 bytes; reserved[5] must be zero) */
+	struct v4l2_ctrl_mpeg2_picture pic;
+	/* Clause 4: v4l2_ctrl_mpeg2_quantisation (256 bytes) */
+	struct v4l2_ctrl_mpeg2_quantisation quant;
+
 	struct object_surface *forward_reference_surface;
 	struct object_surface *backward_reference_surface;
-	uint64_t timestamp;
-	unsigned int i;
 	int rc;

-	memset(&slice_params, 0, sizeof(slice_params));
+	memset(&sequence, 0, sizeof sequence);
+	memset(&pic, 0, sizeof pic);  /* zeros pic.reserved[5] per Clause 3 */
+	memset(&quant, 0, sizeof quant);

-	slice_params.bit_size = surface_object->slices_size * 8;
-	slice_params.data_bit_offset = 0;
-
-	slice_params.sequence.horizontal_size = picture->horizontal_size;
-	slice_params.sequence.vertical_size = picture->vertical_size;
-	slice_params.sequence.vbv_buffer_size = SOURCE_SIZE_MAX;
-
-	slice_params.sequence.profile_and_level_indication = 0;
-	slice_params.sequence.progressive_sequence = 0;
-	slice_params.sequence.chroma_format = 1; // 4:2:0
-
-	slice_params.picture.picture_coding_type = picture->picture_coding_type;
-	slice_params.picture.f_code[0][0] = (picture->f_code >> 12) & 0x0f;
-	slice_params.picture.f_code[0][1] = (picture->f_code >> 8) & 0x0f;
-	slice_params.picture.f_code[1][0] = (picture->f_code >> 4) & 0x0f;
-	slice_params.picture.f_code[1][1] = (picture->f_code >> 0) & 0x0f;
-
-	slice_params.picture.intra_dc_precision =
-		picture->picture_coding_extension.bits.intra_dc_precision;
-	slice_params.picture.picture_structure =
-		picture->picture_coding_extension.bits.picture_structure;
-	slice_params.picture.top_field_first =
-		picture->picture_coding_extension.bits.top_field_first;
-	slice_params.picture.frame_pred_frame_dct =
-		picture->picture_coding_extension.bits.frame_pred_frame_dct;
-	slice_params.picture.concealment_motion_vectors =
-		picture->picture_coding_extension.bits
-			.concealment_motion_vectors;
-	slice_params.picture.q_scale_type =
-		picture->picture_coding_extension.bits.q_scale_type;
-	slice_params.picture.intra_vlc_format =
-		picture->picture_coding_extension.bits.intra_vlc_format;
-	slice_params.picture.alternate_scan =
-		picture->picture_coding_extension.bits.alternate_scan;
-	slice_params.picture.repeat_first_field =
-		picture->picture_coding_extension.bits.repeat_first_field;
-	slice_params.picture.progressive_frame =
-		picture->picture_coding_extension.bits.progressive_frame;
-
-	slice_params.quantiser_scale_code = slice->quantiser_scale_code;
+	/* === Clause 2: SEQUENCE ===
+	 * VAAPI's VAPictureParameterBufferMPEG2 doesn't expose the
+	 * sequence-extension's progressive_sequence flag separately;
+	 * use progressive_frame from the picture-coding extension as a
+	 * proxy. The two agree for fully progressive streams (BBB is
+	 * progressive throughout), but progressive_frame can also be set
+	 * in an interlaced sequence, so this may over-report PROGRESSIVE.
+	 */
+	sequence.horizontal_size = picture->horizontal_size;
+	sequence.vertical_size = picture->vertical_size;
+	sequence.vbv_buffer_size = surface_object->source_size;
+	sequence.profile_and_level_indication = 0;  /* not exposed by VAAPI */
+	sequence.chroma_format = 1;  /* 4:2:0 — campaign codec scope */
+	if (picture->picture_coding_extension.bits.progressive_frame)
+		sequence.flags |= V4L2_MPEG2_SEQ_FLAG_PROGRESSIVE;

+	/* === Clause 3: PICTURE ===
+	 *
+	 * Behavioral correction vs. previous mpeg2.c at this iter1:
+	 * old code self-referenced surface_object->timestamp when the
+	 * VAAPI ref picture was VA_INVALID_ID. New code sets ts = 0 for
+	 * missing refs, matching kernel doc's 0-as-sentinel convention
+	 * (verified against Phase 3 Baseline C frame 1: I-frame has both
+	 * forward_ref_ts and backward_ref_ts == 0; FFmpeg
+	 * libavcodec/v4l2_request_mpeg2.c:98-108 uses same convention).
+	 */
 	forward_reference_surface =
 		SURFACE(driver_data, picture->forward_reference_picture);
-	if (forward_reference_surface == NULL)
-		forward_reference_surface = surface_object;
-
-	timestamp = v4l2_timeval_to_ns(&forward_reference_surface->timestamp);
-	slice_params.forward_ref_ts = timestamp;
+	if (forward_reference_surface != NULL)
+		pic.forward_ref_ts =
+			v4l2_timeval_to_ns(&forward_reference_surface->timestamp);

 	backward_reference_surface =
 		SURFACE(driver_data, picture->backward_reference_picture);
-	if (backward_reference_surface == NULL)
-		backward_reference_surface = surface_object;
+	if (backward_reference_surface != NULL)
+		pic.backward_ref_ts =
+			v4l2_timeval_to_ns(&backward_reference_surface->timestamp);

-	timestamp = v4l2_timeval_to_ns(&backward_reference_surface->timestamp);
-	slice_params.backward_ref_ts = timestamp;
+	if (picture->picture_coding_extension.bits.top_field_first)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_TOP_FIELD_FIRST;
+	if (picture->picture_coding_extension.bits.frame_pred_frame_dct)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_FRAME_PRED_DCT;
+	if (picture->picture_coding_extension.bits.concealment_motion_vectors)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_CONCEALMENT_MV;
+	if (picture->picture_coding_extension.bits.q_scale_type)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_Q_SCALE_TYPE;
+	if (picture->picture_coding_extension.bits.intra_vlc_format)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_INTRA_VLC;
+	if (picture->picture_coding_extension.bits.alternate_scan)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_ALT_SCAN;
+	if (picture->picture_coding_extension.bits.repeat_first_field)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_REPEAT_FIRST;
+	if (picture->picture_coding_extension.bits.progressive_frame)
+		pic.flags |= V4L2_MPEG2_PIC_FLAG_PROGRESSIVE;

-	rc = v4l2_set_control(driver_data->video_fd, surface_object->request_fd,
-			      V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS,
-			      &slice_params, sizeof(slice_params));
+	pic.f_code[0][0] = (picture->f_code >> 12) & 0x0f;
+	pic.f_code[0][1] = (picture->f_code >>  8) & 0x0f;
+	pic.f_code[1][0] = (picture->f_code >>  4) & 0x0f;
+	pic.f_code[1][1] = (picture->f_code >>  0) & 0x0f;
+	pic.picture_coding_type = picture->picture_coding_type;
+	pic.picture_structure =
+		picture->picture_coding_extension.bits.picture_structure;
+	pic.intra_dc_precision =
+		picture->picture_coding_extension.bits.intra_dc_precision;
+	/* pic.reserved[5] zeroed by memset above */
+
+	/* === Clause 4: QUANTISATION ===
+	 *
+	 * Kernel always reads all four matrices unconditionally
+	 * (no load_* flags in the new API; kernel hantro_mpeg2.c
+	 * doesn't synthesize defaults). When VAAPI's consumer didn't
+	 * send VAIQMatrixBufferType (iqmatrix_set==false), populate
+	 * with MPEG-2 spec default matrices.
+	 *
+	 * VAAPI VAIQMatrixBufferMPEG2 stores matrices in zigzag scanning
+	 * order (per VAAPI spec). Kernel expects zigzag scanning order
+	 * (per v4l2-controls.h:2076). Direct memcpy.
+	 */
+	if (iqmatrix_set) {
+		memcpy(quant.intra_quantiser_matrix,
+		       iqmatrix->intra_quantiser_matrix, 64);
+		memcpy(quant.non_intra_quantiser_matrix,
+		       iqmatrix->non_intra_quantiser_matrix, 64);
+		memcpy(quant.chroma_intra_quantiser_matrix,
+		       iqmatrix->chroma_intra_quantiser_matrix, 64);
+		memcpy(quant.chroma_non_intra_quantiser_matrix,
+		       iqmatrix->chroma_non_intra_quantiser_matrix, 64);
+	} else {
+		memcpy(quant.intra_quantiser_matrix,
+		       mpeg2_default_intra_matrix, 64);
+		memset(quant.non_intra_quantiser_matrix, 16, 64);
+		memcpy(quant.chroma_intra_quantiser_matrix,
+		       mpeg2_default_intra_matrix, 64);
+		memset(quant.chroma_non_intra_quantiser_matrix, 16, 64);
+	}
+
+	/* === Clause 1+5: batched submission ===
+	 * One VIDIOC_S_EXT_CTRLS with all three controls. Matches
+	 * src/h264.c:986 pattern (single v4l2_set_controls call) and
+	 * FFmpeg ff_v4l2_request_decode_frame contract. Bound to the
+	 * surface's permanent request_fd (iter6 per-OUTPUT-slot binding;
+	 * picture.c:284 sets surface_object->request_fd at BeginPicture).
+	 * NOTE(review): error path returns a VAStatus (> 0); caller may test < 0.
+	 */
+	struct v4l2_ext_control ctrls[3] = {
+		{
+			.id = V4L2_CID_STATELESS_MPEG2_SEQUENCE,
+			.ptr = &sequence,
+			.size = sizeof sequence,
+		},
+		{
+			.id = V4L2_CID_STATELESS_MPEG2_PICTURE,
+			.ptr = &pic,
+			.size = sizeof pic,
+		},
+		{
+			.id = V4L2_CID_STATELESS_MPEG2_QUANTISATION,
+			.ptr = &quant,
+			.size = sizeof quant,
+		},
+	};
+
+	rc = v4l2_set_controls(driver_data->video_fd,
+			       surface_object->request_fd,
+			       ctrls, 3);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;

-	if (iqmatrix_set) {
-		quantization.load_intra_quantiser_matrix =
-			iqmatrix->load_intra_quantiser_matrix;
-		quantization.load_non_intra_quantiser_matrix =
-			iqmatrix->load_non_intra_quantiser_matrix;
-		quantization.load_chroma_intra_quantiser_matrix =
-			iqmatrix->load_chroma_intra_quantiser_matrix;
-		quantization.load_chroma_non_intra_quantiser_matrix =
-			iqmatrix->load_chroma_non_intra_quantiser_matrix;
-
-		for (i = 0; i < 64; i++) {
-			quantization.intra_quantiser_matrix[i] =
-				iqmatrix->intra_quantiser_matrix[i];
-			quantization.non_intra_quantiser_matrix[i] =
-				iqmatrix->non_intra_quantiser_matrix[i];
-			quantization.chroma_intra_quantiser_matrix[i] =
-				iqmatrix->chroma_intra_quantiser_matrix[i];
-			quantization.chroma_non_intra_quantiser_matrix[i] =
-				iqmatrix->chroma_non_intra_quantiser_matrix[i];
-		}
-
-		rc = v4l2_set_control(driver_data->video_fd,
-				      surface_object->request_fd,
-				      V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION,
-				      &quantization, sizeof(quantization));
-	}
-
 	return 0;
 }
@@ -34,8 +34,13 @@
 #include "h264.h"
 #include "h265.h"
 #include "mpeg2.h"
+#include "vp8.h"
+#include "vp9.h"
+#include "av1.h"

 #include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>

 #include <errno.h>
@@ -51,6 +56,7 @@
 #include "autoconfig.h"

 static VAStatus codec_store_buffer(struct request_data *driver_data,
+				   struct object_context *context,
 				   VAProfile profile,
 				   struct object_surface *surface_object,
 				   struct object_buffer *buffer_object)
@@ -63,6 +69,47 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 		 * RenderPicture), we can't use a V4L2 buffer directly
 		 * and have to copy from a regular buffer.
 		 */
+		if (context->h264_start_code) {
+			static const char start_code[3] = { 0x00, 0x00, 0x01 };
+
+			memcpy(surface_object->source_data +
+			       surface_object->slices_size,
+			       start_code, sizeof(start_code));
+			surface_object->slices_size += sizeof(start_code);
+		}
+		/*
+		 * iter33 α-30: VP8 OUTPUT buffer needs the uncompressed
+		 * frame header that ffmpeg-vaapi stripped before submitting
+		 * VASliceData. Hantro's vp8_dec_run reads OUTPUT[0..N] with
+		 * an assumed offset of 10 bytes (keyframe) or 3 bytes
+		 * (interframe) before the first_partition data — see
+		 * rockchip_vpu2_hw_vp8_dec.c:349.
+		 *
+		 * ffmpeg-vaapi (vaapi_vp8.c:191-192) strips
+		 *   header_size = 3 + 7 * s->keyframe
+		 * before submitting the slice data, so libva needs to
+		 * pre-pad the OUTPUT with that many bytes. Hantro only
+		 * uses these bytes for offset arithmetic, not parsing,
+		 * so zero-filled placeholder is sufficient.
+		 *
+		 * ffmpeg-v4l2request (kdirect path) does NOT strip the
+		 * header, hence its OUTPUT is byte-equal to SW reference
+		 * and decode works correctly. This is the only material
+		 * difference between the two front-ends for VP8.
+		 *
+		 * key_frame in VAAPI's pic_fields.bits is INVERTED:
+		 *   0 → keyframe, 1 → interframe.
+		 */
+		if (profile == VAProfileVP8Version0_3 &&
+		    surface_object->params.vp8.iqmatrix_set /* picture parsed by now */) {
+			unsigned int header_size =
+				surface_object->params.vp8.picture.pic_fields.bits.key_frame == 0 ?
+					10 : 3;
+			memset(surface_object->source_data +
+			       surface_object->slices_size,
+			       0, header_size);
+			surface_object->slices_size += header_size;
+		}
 		memcpy(surface_object->source_data +
 			       surface_object->slices_size,
 		       buffer_object->data,
@@ -97,6 +144,27 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			       sizeof(surface_object->params.h265.picture));
 			break;

+		case VAProfileVP8Version0_3:
+			memcpy(&surface_object->params.vp8.picture,
+			       buffer_object->data,
+			       sizeof(surface_object->params.vp8.picture));
+			break;
+
+		case VAProfileVP9Profile0:
+			memcpy(&surface_object->params.vp9.picture,
+			       buffer_object->data,
+			       sizeof(surface_object->params.vp9.picture));
+			break;
+
+		case VAProfileAV1Profile0:
+			memcpy(&surface_object->params.av1.picture,
+			       buffer_object->data,
+			       sizeof(surface_object->params.av1.picture));
+			/* Reset per-frame tile group entry array on each new
+			 * picture parameter buffer (start of a new frame). */
+			surface_object->params.av1.num_tile_group_entries = 0;
+			break;
+
 		default:
 			break;
 		}
@@ -114,11 +182,44 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			       sizeof(surface_object->params.h264.slice));
 			break;

-		case VAProfileHEVCMain:
+		case VAProfileHEVCMain: {
+			unsigned int n = surface_object->params.h265.num_slices;
+			if (n < HEVC_MAX_SLICES_PER_FRAME) {
+				memcpy(&surface_object->params.h265.slices[n],
+				       buffer_object->data,
+				       sizeof(VASliceParameterBufferHEVC));
+				surface_object->params.h265.num_slices = n + 1;
+			}
+			/* Keep .slice mirror populated as last-slice ref for
+			 * h265_fill_pps which reads dependent_slice_segment_flag */
 			memcpy(&surface_object->params.h265.slice,
 			       buffer_object->data,
 			       sizeof(surface_object->params.h265.slice));
 			break;
+		}
+
+		case VAProfileVP8Version0_3:
+			memcpy(&surface_object->params.vp8.slice,
+			       buffer_object->data,
+			       sizeof(surface_object->params.vp8.slice));
+			break;
+
+		case VAProfileVP9Profile0:
+			memcpy(&surface_object->params.vp9.slice,
+			       buffer_object->data,
+			       sizeof(surface_object->params.vp9.slice));
+			break;
+
+		case VAProfileAV1Profile0: {
+			unsigned int n = surface_object->params.av1.num_tile_group_entries;
+			if (n < AV1_MAX_TILES) {
+				memcpy(&surface_object->params.av1.tile_group_entries[n],
+				       buffer_object->data,
+				       sizeof(VASliceParameterBufferAV1));
+				surface_object->params.av1.num_tile_group_entries = n + 1;
+			}
+			break;
+		}

 		default:
 			break;
@@ -143,6 +244,7 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			memcpy(&surface_object->params.h264.matrix,
 			       buffer_object->data,
 			       sizeof(surface_object->params.h264.matrix));
+			surface_object->params.h264.matrix_set = true;
 			break;

 		case VAProfileHEVCMain:
@@ -152,6 +254,27 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			surface_object->params.h265.iqmatrix_set = true;
 			break;

+		case VAProfileVP8Version0_3:
+			memcpy(&surface_object->params.vp8.iqmatrix,
+			       buffer_object->data,
+			       sizeof(surface_object->params.vp8.iqmatrix));
+			surface_object->params.vp8.iqmatrix_set = true;
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	case VAProbabilityBufferType:
+		switch (profile) {
+		case VAProfileVP8Version0_3:
+			memcpy(&surface_object->params.vp8.probability,
+			       buffer_object->data,
+			       sizeof(surface_object->params.vp8.probability));
+			surface_object->params.vp8.probability_set = true;
+			break;
+
 		default:
 			break;
 		}
@@ -184,7 +307,8 @@ static VAStatus codec_set_controls(struct request_data *driver_data,
 	case VAProfileH264ConstrainedBaseline:
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
-		rc = h264_set_controls(driver_data, context, surface_object);
+		rc = h264_set_controls(driver_data, context, profile,
+				       surface_object);
 		if (rc < 0)
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;
@@ -195,6 +319,23 @@ static VAStatus codec_set_controls(struct request_data *driver_data,
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;

+	case VAProfileVP8Version0_3:
+		rc = vp8_set_controls(driver_data, context, surface_object);
+		if (rc < 0)
+			return VA_STATUS_ERROR_OPERATION_FAILED;
+		break;
+
+	case VAProfileVP9Profile0:
+		rc = vp9_set_controls(driver_data, context, surface_object);
+		if (rc < 0)
+			return VA_STATUS_ERROR_OPERATION_FAILED;
+		break;
+	case VAProfileAV1Profile0:
+		rc = av1_set_controls(driver_data, context, surface_object);
+		if (rc < 0)
+			return VA_STATUS_ERROR_OPERATION_FAILED;
+		break;
+
 	default:
 		return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 	}
@@ -208,6 +349,9 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id,
 	struct request_data *driver_data = context->pDriverData;
 	struct object_context *context_object;
 	struct object_surface *surface_object;
+	struct request_pool_slot *slot;
+	int slot_index;
+

 	context_object = CONTEXT(driver_data, context_id);
 	if (context_object == NULL)
@@ -217,9 +361,115 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id,
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;

+	/* AV1 Phase 3 diag */
+	request_log("BeginPicture id=0x%x prev_slot=%p status=%d\n",
+		    surface_object->base.id,
+		    (void *)surface_object->current_slot,
+		    surface_object->status);
+
 	if (surface_object->status == VASurfaceRendering)
 		RequestSyncSurface(context, surface_id);

+	/*
+	 * Iter2 Fix 3: acquire a CAPTURE-pool slot for this decode cycle.
+	 * If the surface still holds a slot from a prior decode (DECODED
+	 * or EXPORTED — the consumer is done with it by definition since
+	 * we got back to BeginPicture for the same surface), release it
+	 * first. The new slot is bound and its V4L2 index + mmap pointers
+	 * are mirrored into surface_object->destination_* so the existing
+	 * QBUF/DQBUF/EXPBUF code paths see no behavioral change.
+	 *
+	 * AV1 Phase 3 finding: LIBVA_SKIP_REBIND=1 experiment (do NOT
+	 * unbind on rebind) did not improve PASS count for the av1_larger
+	 * film_grain stress vector — proving the iter2 Fix 3 release is
+	 * NOT the source of the inter-frame divergence. The issue is
+	 * deeper in ffmpeg-vaapi's AV1 hwaccel: per byte-equal OUTPUT
+	 * comparison with the patched-ffmpeg-v4l2request reference run
+	 * (LD_LIBRARY_PATH override on a debug libavcodec.so), 7/7 first
+	 * EndPicture submissions are byte-identical, libva has 2 EXTRA.
+	 */
+	if (surface_object->current_slot != NULL)
+		surface_unbind_slot(driver_data, surface_object);
+
+	/*
+	 * AV1 Phase 5 review Amendment 4: clear any stale
+	 * linked_decode_surface_id from a prior film_grain display→decode
+	 * link. If ffmpeg-vaapi recycles a former display surface as a
+	 * decode target, BeginPicture binds a fresh slot — but without
+	 * this reset, copy_surface_to_image's link-follow would still
+	 * borrow from the now-stale linked surface and serve wrong data.
+	 * Cleared unconditionally (cheap) so the next AV1 grain frame
+	 * re-establishes the link if needed.
+	 */
+	surface_object->linked_decode_surface_id = VA_INVALID_SURFACE;
+	{
+		struct cap_pool_slot *cap_slot =
+			cap_pool_acquire(&driver_data->capture_pool, surface_id);
+		if (cap_slot == NULL)
+			return VA_STATUS_ERROR_ALLOCATION_FAILED;
+		surface_bind_slot(surface_object, cap_slot);
+
+		/*
+		 * iter8 Phase 7 IMP-1 experiment: env-gated CAPTURE buffer
+		 * pre-zero. LIBVA_V4L2_ZERO_CAPTURE=1 wipes the slot's mmap'd
+		 * region before kernel decode. Discriminates "kernel writes
+		 * partial then aborts" from "kernel writes nothing and we
+		 * see stale residue."
+		 */
+		{
+			static const char *zero_env = NULL;
+			static bool zero_env_checked = false;
+			if (!zero_env_checked) {
+				zero_env = getenv("LIBVA_V4L2_ZERO_CAPTURE");
+				zero_env_checked = true;
+			}
+			if (zero_env != NULL && zero_env[0] == '1') {
+				unsigned int b;
+				for (b = 0; b < cap_slot->buffers_count; b++)
+					if (cap_slot->map[b] != NULL)
+						memset(cap_slot->map[b], 0,
+						       cap_slot->map_lengths[b]);
+			}
+		}
+	}
+
+	/*
+	 * Borrow an OUTPUT (bitstream-input) slot from the driver-wide
+	 * pool for the duration of this Begin/Render/End cycle. The
+	 * surface's source_* fields hold the borrow's mmap pointer/size/
+	 * V4L2 buffer index until RequestSyncSurface releases it after
+	 * VIDIOC_DQBUF.
+	 */
+	slot_index = request_pool_acquire(&driver_data->output_pool);
+	if (slot_index < 0)
+		return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+	slot = request_pool_slot(&driver_data->output_pool,
+				 (unsigned int)slot_index);
+	if (slot == NULL) {
+		request_pool_release(&driver_data->output_pool,
+				     (unsigned int)slot_index);
+		return VA_STATUS_ERROR_ALLOCATION_FAILED;
+	}
+
+	surface_object->source_index = slot->index;
+	surface_object->source_data = slot->data;
+	surface_object->source_size = slot->size;
+	/*
+	 * iter6: bind the slot's permanent request_fd to this surface for the
+	 * duration of the decode cycle. Replaces the iter4 close+alloc-per-
+	 * frame model. The fd is REINIT'd (not closed) at RequestSyncSurface,
+	 * so the kernel-side request object is reset in place — no fd-reuse
+	 * race with another slot's pending decode.
+	 */
+	surface_object->request_fd = slot->request_fd;
+	surface_object->slices_size = 0;
+	surface_object->slices_count = 0;
+	surface_object->params.h264.matrix_set = false;
+	surface_object->params.h265.num_slices = 0;
+	surface_object->params.vp8.iqmatrix_set = false;
+	surface_object->params.vp8.probability_set = false;
+
 	surface_object->status = VASurfaceRendering;
 	context_object->render_surface_id = surface_id;

@@ -255,7 +505,8 @@ VAStatus RequestRenderPicture(VADriverContextP context, VAContextID context_id,
 		if (buffer_object == NULL)
 			return VA_STATUS_ERROR_INVALID_BUFFER;

-		rc = codec_store_buffer(driver_data, config_object->profile,
+		rc = codec_store_buffer(driver_data, context_object,
+					config_object->profile,
 					surface_object, buffer_object);
 		if (rc != VA_STATUS_SUCCESS)
 			return rc;
@@ -296,22 +547,75 @@ VAStatus RequestEndPicture(VADriverContextP context, VAContextID context_id)
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;

-	gettimeofday(&surface_object->timestamp, NULL);
+	/*
+	 * iter9 α-7: monotonic per-context counter instead of gettimeofday,
+	 * so DPB.reference_ts / OUTPUT QBUF ts stay small (matches
+	 * ffmpeg-v4l2request's pattern). Confirmed in iter30 sweep
+	 * (1×, 1000×, 1000000× multipliers all produce identical output);
+	 * the counter scheme works on both rkvdec and hantro vb2_find_buffer.
+	 */
+	context_object->timestamp_counter++;
+	surface_object->timestamp.tv_sec =
+		(time_t)(context_object->timestamp_counter / 1000000);
+	surface_object->timestamp.tv_usec =
+		(suseconds_t)(context_object->timestamp_counter % 1000000);

+	/*
+	 * iter6: request_fd was bound to the surface in BeginPicture from
+	 * the OUTPUT pool slot's permanent fd. Per-frame allocation is gone.
+	 */
 	request_fd = surface_object->request_fd;
-	if (request_fd < 0) {
-		request_fd = media_request_alloc(driver_data->media_fd);
-		if (request_fd < 0)
-			return VA_STATUS_ERROR_OPERATION_FAILED;
-
-		surface_object->request_fd = request_fd;
-	}
+	if (request_fd < 0)
+		return VA_STATUS_ERROR_OPERATION_FAILED;

 	rc = codec_set_controls(driver_data, context_object,
 				config_object->profile, surface_object);
 	if (rc != VA_STATUS_SUCCESS)
 		return rc;

+	/*
+	 * iter14 α-16: env-gated dump of OUTPUT bitstream bytes immediately
+	 * before QBUF. LIBVA_V4L2_DUMP_OUTPUT=<dir> writes source_data[0..
+	 * slices_size] to <dir>/output_p<profile>_s<surface>_t<counter>.bin.
+	 * Discriminates whether libva writes the same H.264/HEVC slice bytes
+	 * as kdirect — if YES, Bug 4/5 are not in the OUTPUT-side; if NO,
+	 * narrow to which slice-write path produces the divergence.
+	 *
+	 * Off by default; no behavior change when env unset.
+	 */
+	{
+		static const char *dump_env = NULL;
+		static bool dump_env_checked = false;
+		if (!dump_env_checked) {
+			dump_env = getenv("LIBVA_V4L2_DUMP_OUTPUT");
+			dump_env_checked = true;
+		}
+		if (dump_env != NULL && dump_env[0] != '\0' &&
+		    surface_object->source_data != NULL &&
+		    surface_object->slices_size > 0) {
+			char path[256];
+			snprintf(path, sizeof(path),
+				 "%s/output_p%d_s%u_t%llu.bin",
+				 dump_env, (int)config_object->profile,
+				 (unsigned int)surface_object->base.id,
+				 (unsigned long long)context_object->timestamp_counter);
+			FILE *fp = fopen(path, "wb");
+			if (fp != NULL) {
+				size_t w = fwrite(surface_object->source_data,
+						  1, surface_object->slices_size,
+						  fp);
+				request_log("α-16: dumped %zu bytes to %s "
+					    "(slices_count=%u)\n",
+					    w, path,
+					    surface_object->slices_count);
+				fclose(fp);
+			} else {
+				request_log("α-16: fopen(%s) failed: %s\n",
+					    path, strerror(errno));
+			}
+		}
+	}
+
 	rc = v4l2_queue_buffer(driver_data->video_fd, -1, capture_type, NULL,
 			       surface_object->destination_index, 0,
 			       surface_object->destination_buffers_count);
@@ -25,10 +25,12 @@
 */

 #include "buffer.h"
+#include "cap_pool.h"
 #include "config.h"
 #include "context.h"
 #include "image.h"
 #include "picture.h"
+#include "request_pool.h"
 #include "subpicture.h"
 #include "surface.h"

@@ -41,6 +43,7 @@
 #include "v4l2.h"

 #include <assert.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>

@@ -51,8 +54,524 @@

 #include <sys/ioctl.h>

+#include <linux/media.h>
 #include <linux/videodev2.h>

+#include "hevc-ctrls/v4l2-hevc-ext-controls.h"
+
+/*
+ * fresnel-fourier iter4 Phase 6 commit Z + iter7 Phase 6 (B1a): device-path
+ * auto-detect via media controller topology with decoder-entity discrimination.
+ *
+ * Pre-iter4 the backend hardcoded /dev/video0 + /dev/media0. On Linux 7.0 the
+ * udev/probe order changed and rockchip-rga (an RGB color converter, no codec
+ * support) now claims /dev/video0 — the legacy default returns an empty
+ * profile list. iter4 commit Z replaced enumeration-order discovery with
+ * media-topology discovery.
+ *
+ * iter7 (B1a): the iter4 walk treated the hantro-vpu driver name as a single
+ * unit, but hantro-vpu registers BOTH encoder and decoder entities under one
+ * /dev/mediaN on RK3399. iter4's "pick the first V4L_VIDEO interface" could
+ * land on the encoder. iter7 walks ENTITIES looking for
+ * MEDIA_ENT_F_PROC_VIDEO_DECODER, then follows the kernel's link graph
+ * (data link from proc to IO entity, interface link from IO entity to V4L
+ * interface) to the correct /dev/videoN.
+ *
+ * Two-pass to prefer rkvdec: pass 1 accepts only "rkvdec" (multi-codec
+ * decoder, 3 of 5 codecs); pass 2 accepts any known decoder driver. On
+ * RK3399 this makes auto-detect always pick rkvdec when available.
+ *
+ * iter4-B1b (multi-decoder routing — open BOTH rkvdec AND hantro from one
+ * backend instance, dispatch per codec) was still deferred at iter7. It is
+ * now provided by the iter38 multi-device probe in VA_DRIVER_INIT_FUNC plus
+ * request_switch_device_for_profile() below; the alt-probe is skipped only
+ * when LIBVA_V4L2_REQUEST_VIDEO_PATH / MEDIA_PATH pin a single device.
+ *
+ * Escape hatch: LIBVA_V4L2_REQUEST_NO_AUTODETECT=1 reverts to legacy
+ * hardcoded /dev/video0 + /dev/media0 for callers that relied on it.
+ */
+static const char * const known_decoder_drivers[] = {	/* names compared against media_device_info.driver */
+	"rkvdec",	/* also tried on its own first by find_codec_device() */
+	"hantro-vpu",	/* also probed by name for the alt / AV1 device slots */
+	"cedrus",
+	"sun4i_csi",
+	NULL		/* sentinel: the lookup loop stops at the first NULL entry */
+};
+
+/* Resolve char device major:minor to "/dev/<name>" through the /sys/dev/char symlink; 0 on success, -1 on failure. */
+static int resolve_dev_node(uint32_t major, uint32_t minor, char *out, size_t out_sz)
+{
+	char sysfs_path[64], target[256], *base;
+	ssize_t n;
+	int len;
+
+	snprintf(sysfs_path, sizeof sysfs_path, "/sys/dev/char/%u:%u", major, minor);
+	n = readlink(sysfs_path, target, sizeof target - 1);
+	if (n < 0)
+		return -1;
+	target[n] = '\0';	/* readlink() does not NUL-terminate */
+	base = strrchr(target, '/');	/* keep only the last path component */
+	len = snprintf(out, out_sz, "/dev/%s", base ? base + 1 : target);
+	return (len < 0 || (size_t)len >= out_sz) ? -1 : 0;	/* a truncated name is a failure */
+}
+
+/*
+ * iter7 B1a: walk topology graph from decoder-proc entity to its V4L_VIDEO
+ * interface. Returns 0 + sets video_out on success; -1 on ioctl/alloc failure,
+ * no decoder entity (e.g. encoder-only device), or no resolvable video node.
+ *
+ * Algorithm (per Phase 5 review, empirically validated against
+ * boltzmann:~/src/linux-rockchip):
+ *   1. For each entity E with function == MEDIA_ENT_F_PROC_VIDEO_DECODER:
+ *   2.   Find IO entity neighbors via DATA links (pad↔pad, mapped to entities).
+ *   3.   Find the V4L_VIDEO interface via INTERFACE links from those IO
+ *        neighbors.
+ *   4.   Resolve interface.devnode.major:minor to /dev/videoN.
+ *
+ * Two-call MEDIA_IOC_G_TOPOLOGY pattern (Phase 5 IMP-3): first call gets
+ * counts; second call fills the four arrays after we allocate them.
+ *
+ * Link discrimination via MEDIA_LNK_FL_INTERFACE_LINK (1U<<28):
+ * data links have flags & MEDIA_LNK_FL_INTERFACE_LINK == 0; interface
+ * links have it set. source_id/sink_id ordering is not guaranteed —
+ * check both endpoints.
+ */
+static int find_decoder_video_node_via_topology(int media_fd,
+						char *video_out,
+						size_t video_out_sz)
+{
+	struct media_v2_topology topo;
+	struct media_v2_entity *entities = NULL;
+	struct media_v2_interface *interfaces = NULL;
+	struct media_v2_link *links = NULL;
+	struct media_v2_pad *pads = NULL;
+	int ret = -1;	/* stays -1 unless a devnode is resolved below */
+	unsigned int i, j;
+
+	memset(&topo, 0, sizeof topo);
+	if (ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topo) < 0)	/* call 1: counts only */
+		return -1;
+	if (topo.num_entities == 0 || topo.num_interfaces == 0 ||
+	    topo.num_links == 0 || topo.num_pads == 0)
+		return -1;	/* any empty array means there is no path to walk */
+
+	entities   = calloc(topo.num_entities,   sizeof *entities);
+	interfaces = calloc(topo.num_interfaces, sizeof *interfaces);
+	links      = calloc(topo.num_links,      sizeof *links);
+	pads       = calloc(topo.num_pads,       sizeof *pads);
+	if (!entities || !interfaces || !links || !pads)
+		goto out;
+
+	topo.ptr_entities   = (uintptr_t)entities;
+	topo.ptr_interfaces = (uintptr_t)interfaces;
+	topo.ptr_links      = (uintptr_t)links;
+	topo.ptr_pads       = (uintptr_t)pads;
+
+	if (ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topo) < 0)	/* call 2: fill the arrays */
+		goto out;
+
+	for (i = 0; i < topo.num_entities; i++) {
+		uint32_t proc_id;
+		uint32_t proc_pad_ids[16];	/* fixed cap: pads beyond 16 are ignored */
+		uint32_t io_entity_ids[16];	/* fixed cap: neighbors beyond 16 are ignored */
+		unsigned int proc_pad_count = 0;
+		unsigned int io_count = 0;
+
+		if (entities[i].function != MEDIA_ENT_F_PROC_VIDEO_DECODER)
+			continue;
+		proc_id = entities[i].id;
+
+		/* Step 2a: collect pads belonging to the proc entity. Data
+		 * links connect PADs, not entities directly. */
+		for (j = 0; j < topo.num_pads; j++) {
+			if (pads[j].entity_id != proc_id)
+				continue;
+			if (proc_pad_count < (sizeof proc_pad_ids /
+					      sizeof proc_pad_ids[0]))
+				proc_pad_ids[proc_pad_count++] = pads[j].id;
+		}
+
+		/* Step 2b: walk data links. For each link with either endpoint
+		 * in proc_pad_ids[], the other endpoint is a pad belonging to
+		 * an IO neighbor. Resolve that pad's entity_id via pads[]. */
+		for (j = 0; j < topo.num_links; j++) {
+			uint32_t other_pad = 0;	/* 0 = link does not touch the proc entity */
+			unsigned int k;
+
+			if (links[j].flags & MEDIA_LNK_FL_INTERFACE_LINK)
+				continue;
+			for (k = 0; k < proc_pad_count; k++) {
+				if (links[j].source_id == proc_pad_ids[k])
+					other_pad = links[j].sink_id;
+				else if (links[j].sink_id == proc_pad_ids[k])
+					other_pad = links[j].source_id;
+				if (other_pad != 0)
+					break;
+			}
+			if (other_pad == 0)
+				continue;
+			/* Resolve other_pad to its entity_id. */
+			for (k = 0; k < topo.num_pads; k++) {
+				if (pads[k].id != other_pad)
+					continue;
+				if (io_count < (sizeof io_entity_ids /
+						sizeof io_entity_ids[0]))
+					io_entity_ids[io_count++] =
+						pads[k].entity_id;
+				break;
+			}
+		}
+
+		/* Step 3-4: find an interface link from any IO entity neighbor;
+		 * resolve devnode for the linked V4L_VIDEO interface.
+		 * Interface links connect interfaces↔entities directly (not
+		 * via pads), so source_id/sink_id is an entity ID on one side
+		 * and an interface ID on the other. */
+		for (j = 0; j < topo.num_links; j++) {
+			uint32_t intf_id = 0;	/* 0 = link does not touch an IO neighbor */
+			unsigned int k;
+
+			if (!(links[j].flags & MEDIA_LNK_FL_INTERFACE_LINK))
+				continue;
+			for (k = 0; k < io_count; k++) {
+				if (links[j].source_id == io_entity_ids[k])
+					intf_id = links[j].sink_id;
+				else if (links[j].sink_id == io_entity_ids[k])
+					intf_id = links[j].source_id;
+				if (intf_id != 0)
+					break;
+			}
+			if (intf_id == 0)
+				continue;
+
+			for (k = 0; k < topo.num_interfaces; k++) {
+				if (interfaces[k].id != intf_id)
+					continue;
+				if (interfaces[k].intf_type !=
+				    MEDIA_INTF_T_V4L_VIDEO)
+					break;	/* linked interface is not a video node */
+				if (resolve_dev_node(
+					    interfaces[k].devnode.major,
+					    interfaces[k].devnode.minor,
+					    video_out, video_out_sz) == 0)
+					ret = 0;
+				break;
+			}
+			if (ret == 0)
+				goto out;	/* first resolvable node wins */
+		}
+	}
+
+out:
+	free(entities);
+	free(interfaces);
+	free(links);
+	free(pads);
+	return ret;
+}
+
+/*
+ * iter7 B1a (documents find_codec_device(), defined further below): two-pass
+ * walk of /dev/media0..N. Pass 1 accepts only "rkvdec" (multi-codec decoder
+ * serving 3 of 5 codecs). Pass 2 accepts any known_decoder_drivers entry.
+ * Within each pass, the chosen media device must ALSO contain at least one
+ * MEDIA_ENT_F_PROC_VIDEO_DECODER entity — guards against encoder-only devices
+ * sharing a driver name (e.g. hantro-vpu encoder vs decoder in one /dev/mediaN).
+ */
+/*
+ * iter38 (documents find_decoder_device_by_driver(), defined further below):
+ * locate a /dev/mediaN whose driver name matches `want_driver` AND exposes
+ * at least one MEDIA_ENT_F_PROC_VIDEO_DECODER entity (rules out encoder-only
+ * devices sharing the same driver name). Resolves the matching /dev/videoM
+ * via topology graph walk.
+ * `want_driver`:
+ *   - non-NULL → match only that exact driver name
+ *   - NULL     → match any name in known_decoder_drivers[]
+ */
+/*
+ * iter2 (ampere-kernel-decoders campaign) — runtime probe for the V4L2
+ * stateless HEVC EXT_SPS_{ST,LT}_RPS controls added in Linux 7.0
+ * (Casanova VDPU381/VDPU383 series).
+ *
+ * Returns true iff VIDIOC_QUERYCTRL succeeds for BOTH control IDs on
+ * `video_fd`; a negative fd yields false without touching the device.
+ * The result is stored per-fd on driver_data so the multi-device-probe
+ * model (iter38) doesn't silently misbehave when codec routing switches
+ * devices.
+ *
+ * Both CIDs form one gate: the st-RPS and lt-RPS arrays must both be set
+ * to match the SPS num_short_term_ref_pic_sets /
+ * num_long_term_ref_pics_sps counts. Old kernels (RK3399 rkvdec on linux
+ * 6.x) register neither; RK3588 rkvdec (VDPU381/383 path) registers both.
+ * Reference: phase4_plan_iter2.md §Step 3 in ~/src/ampere-kernel-decoders/.
+ */
+static bool probe_hevc_ext_sps_rps_controls(int video_fd)
+{
+	static const uint32_t wanted_cids[] = {
+		V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS,
+		V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS,
+	};
+	struct v4l2_queryctrl query;
+	unsigned int n;
+
+	if (video_fd < 0)
+		return false;
+	for (n = 0; n < sizeof wanted_cids / sizeof wanted_cids[0]; n++) {
+		memset(&query, 0, sizeof(query));
+		query.id = wanted_cids[n];
+		if (ioctl(video_fd, VIDIOC_QUERYCTRL, &query) < 0)
+			return false;	/* one missing control fails the gate */
+	}
+	return true;
+}
+
+/*
+ * True iff `video_fd` enumerates `want_pixfmt` on its OUTPUT_MPLANE or
+ * OUTPUT queue (VIDIOC_ENUM_FMT, at most 64 indices per queue type).
+ * Tells apart devices sharing a driver name: RK3588 has 3 hantro-vpu
+ * instances, only vpu981 advertises V4L2_PIX_FMT_AV1_FRAME.
+ */
+static bool video_node_supports_output_fmt(int video_fd, uint32_t want_pixfmt)
+{
+	static const enum v4l2_buf_type queue_kinds[] = {
+		V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE,
+		V4L2_BUF_TYPE_VIDEO_OUTPUT,
+	};
+	size_t k;
+
+	for (k = 0; k < sizeof(queue_kinds) / sizeof(queue_kinds[0]); k++) {
+		struct v4l2_fmtdesc entry;
+		unsigned int idx = 0;
+
+		do {	/* a failing ENUM_FMT ends this queue type early */
+			memset(&entry, 0, sizeof entry);
+			entry.index = idx;
+			entry.type = queue_kinds[k];
+			if (ioctl(video_fd, VIDIOC_ENUM_FMT, &entry) < 0)
+				break;
+			if (entry.pixelformat == want_pixfmt)
+				return true;
+		} while (++idx < 64);
+	}
+	return false;
+}
+
+static int find_decoder_device_by_driver(const char *want_driver,
+					 char *video_out, size_t video_out_sz,
+					 char *media_out, size_t media_out_sz)
+{
+	struct media_device_info info;
+	char path[32];
+	const char * const *kd;
+	int fd, i;
+	/* First /dev/mediaN (N < 16) with a matching driver and a decoder video node wins: returns 0, else -1. */
+	for (i = 0; i < 16; i++) {
+		bool match;
+
+		snprintf(path, sizeof path, "/dev/media%d", i);
+		fd = open(path, O_RDWR | O_NONBLOCK);
+		if (fd < 0)
+			continue;	/* node absent or not openable: try the next index */
+		memset(&info, 0, sizeof info);
+		if (ioctl(fd, MEDIA_IOC_DEVICE_INFO, &info) != 0) {
+			close(fd);
+			continue;
+		}
+		if (want_driver != NULL) {
+			match = (strcmp(info.driver, want_driver) == 0);	/* exact name only */
+		} else {
+			match = false;	/* NULL want_driver: accept any known_decoder_drivers[] entry */
+			for (kd = known_decoder_drivers; *kd; kd++) {
+				if (strcmp(info.driver, *kd) == 0) {
+					match = true;
+					break;
+				}
+			}
+		}
+		if (!match) {
+			close(fd);
+			continue;
+		}
+		if (find_decoder_video_node_via_topology(
+			    fd, video_out, video_out_sz) == 0) {
+			snprintf(media_out, media_out_sz, "%s", path);	/* media_out is written only on success */
+			close(fd);
+			return 0;
+		}
+		close(fd);	/* driver matched but no decoder node resolved: keep scanning */
+	}
+	return -1;
+}
+
+/*
+ * ampere-av1-enablement Phase 2 — like find_decoder_device_by_driver but
+ * additionally verifies the resolved /dev/videoN advertises `want_pixfmt`
+ * as an OUTPUT format. Required for RK3588 where 3 hantro-vpu instances
+ * share the driver name but only one is vpu981 (AV1 decoder).
+ *
+ * Walks /dev/media0..15 with matching driver name (`want_driver` must be
+ * non-NULL: it is passed straight to strcmp); takes the first hit whose
+ * OUTPUT formats include `want_pixfmt`. Returns 0 on a hit, -1 otherwise.
+ */
+static int find_decoder_device_by_driver_with_fmt(const char *want_driver,
+						  uint32_t want_pixfmt,
+						  char *video_out,
+						  size_t video_out_sz,
+						  char *media_out,
+						  size_t media_out_sz)
+{
+	struct media_device_info info;
+	char path[32];
+	char vpath[32];	/* candidate video node; copied to video_out only on a hit */
+	int fd, vfd, i;
+
+	for (i = 0; i < 16; i++) {
+		snprintf(path, sizeof path, "/dev/media%d", i);
+		fd = open(path, O_RDWR | O_NONBLOCK);
+		if (fd < 0)
+			continue;
+		memset(&info, 0, sizeof info);
+		if (ioctl(fd, MEDIA_IOC_DEVICE_INFO, &info) != 0) {
+			close(fd);
+			continue;
+		}
+		if (strcmp(info.driver, want_driver) != 0) {
+			close(fd);
+			continue;
+		}
+		if (find_decoder_video_node_via_topology(fd, vpath,
+							 sizeof vpath) != 0) {
+			close(fd);
+			continue;	/* no decoder video node resolved on this media device */
+		}
+		close(fd);	/* media fd is no longer needed once vpath is known */
+
+		/* Capability check: does this /dev/videoN advertise the
+		 * codec-specific OUTPUT format? */
+		vfd = open(vpath, O_RDWR | O_NONBLOCK);
+		if (vfd < 0)
+			continue;
+		if (video_node_supports_output_fmt(vfd, want_pixfmt)) {
+			close(vfd);
+			snprintf(video_out, video_out_sz, "%s", vpath);
+			snprintf(media_out, media_out_sz, "%s", path);
+			return 0;
+		}
+		close(vfd);	/* decoder without want_pixfmt: skip this candidate */
+	}
+	return -1;
+}
+
+static int find_codec_device(char *video_out, size_t video_out_sz,
+			     char *media_out, size_t media_out_sz)
+{
+	/* Pass 1 insists on "rkvdec"; pass 2 (NULL) takes any known driver. */
+	int rc = find_decoder_device_by_driver("rkvdec", video_out, video_out_sz,
+					       media_out, media_out_sz);
+	if (rc != 0)
+		rc = find_decoder_device_by_driver(NULL, video_out, video_out_sz,
+						   media_out, media_out_sz);
+	return rc;
+}
+
+/*
+ * iter38: profile → which physical decoder device should serve it.
+ * Returns 'r' for rkvdec, 'h' for hantro, 'a' for the vpu981 AV1
+ * decoder, '?' for any profile the switch below does not list.
+ *
+ * This is RK3399-shaped knowledge (plus vpu981 for AV1) — a more general
+ * impl would interrogate each open device's supported OUTPUT formats.
+ */
+char request_device_kind_for_profile(VAProfile profile);	/* prototype ahead of the non-static definition */
+char request_device_kind_for_profile(VAProfile profile)
+{
+	switch (profile) {
+	case VAProfileH264Main:
+	case VAProfileH264High:
+	case VAProfileH264ConstrainedBaseline:
+	case VAProfileH264MultiviewHigh:
+	case VAProfileH264StereoHigh:
+	case VAProfileHEVCMain:
+	case VAProfileVP9Profile0:
+		return 'r';
+	case VAProfileMPEG2Simple:
+	case VAProfileMPEG2Main:
+	case VAProfileVP8Version0_3:
+		return 'h';
+	case VAProfileAV1Profile0:
+		return 'a';   /* ampere-av1-enablement: vpu981 dedicated AV1 */
+	default:
+		return '?';
+	}
+}
+
+/*
+ * iter38: retarget driver_data->{video,media}_fd to the device kind
+ * required by `profile`. If a switch is needed, tear down any per-device
+ * pool state so the next RequestCreateContext rebuilds it against the
+ * new device. Returns -1 only when `profile` maps to no device kind; an
+ * unprobed target device returns 0 and leaves the active fds untouched.
+ *
+ * Safe to call repeatedly with the same profile: if the active fd
+ * already matches, the function is a no-op.
+ */
+int request_switch_device_for_profile(struct request_data *driver_data,
+				      VAProfile profile);
+int request_switch_device_for_profile(struct request_data *driver_data,
+				      VAProfile profile)
+{
+	char kind = request_device_kind_for_profile(profile);
+	int target_video, target_media;
+
+	if (kind == 'r') {
+		target_video = driver_data->video_fd_rkvdec;
+		target_media = driver_data->media_fd_rkvdec;
+	} else if (kind == 'h') {
+		target_video = driver_data->video_fd_hantro;
+		target_media = driver_data->media_fd_hantro;
+	} else if (kind == 'a') {
+		target_video = driver_data->video_fd_vpu981;
+		target_media = driver_data->media_fd_vpu981;
+	} else {
+		return -1;	/* kind == '?': profile has no device mapping */
+	}
+
+	/* Either side never probed (e.g. env-override single-device init,
+	 * or this kind isn't present on the running kernel) → tolerate by
+	 * staying on whatever's already active. RequestCreateConfig still
+	 * accepted the profile via the format probe, so the active fd
+	 * supports it. */
+	if (target_video < 0 || target_media < 0)
+		return 0;
+
+	if (driver_data->video_fd == target_video &&
+	    driver_data->media_fd == target_media)
+		return 0;  /* already active, nothing to do */
+
+	/*
+	 * Tear down any per-device pool state. cap_pool needs capture_type,
+	 * which comes from video_format. Both rkvdec and hantro use
+	 * V4L2_PIX_FMT_NV12 MPLANE on RK3399 (verified Phase 0 inventory)
+	 * so the MPLANE form is always right here.
+	 */
+	if (driver_data->capture_pool.initialized) {
+		cap_pool_destroy(&driver_data->capture_pool,
+				 driver_data->video_fd,	/* still the OLD device's fd at this point */
+				 V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
+	}
+	if (driver_data->output_pool.initialized)
+		request_pool_destroy(&driver_data->output_pool);
+
+	/* video_format is a static-ref pointer; re-probe on next
+	 * CreateContext since the new device's format menu may differ. */
+	driver_data->video_format = NULL;
+	driver_data->fmt_valid = false;
+
+	driver_data->video_fd = target_video;	/* the switch itself: active fds now alias the target slot */
+	driver_data->media_fd = target_media;
+	return 0;
+}
+
 /* Set default visibility for the init function only. */
 VAStatus __attribute__((visibility("default")))
 VA_DRIVER_INIT_FUNC(VADriverContextP context);
@@ -146,9 +665,23 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 	object_heap_init(&driver_data->image_heap, sizeof(struct object_image),
 			 IMAGE_ID_OFFSET);

+	static char auto_video[32], auto_media[32];
+	bool auto_media_set = false;
+
 	video_path = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
-	if (video_path == NULL)
-		video_path = "/dev/video0";
+	if (video_path == NULL) {
+		if (getenv("LIBVA_V4L2_REQUEST_NO_AUTODETECT")) {
+			video_path = "/dev/video0";
+		} else if (find_codec_device(auto_video, sizeof auto_video,
+					     auto_media, sizeof auto_media) == 0) {
+			video_path = auto_video;
+			auto_media_set = true;
+			request_log("auto-selected codec device: %s + %s\n",
+				    auto_video, auto_media);
+		} else {
+			video_path = "/dev/video0";
+		}
+	}

 	video_fd = open(video_path, O_RDWR | O_NONBLOCK);
 	if (video_fd < 0)
@@ -169,8 +702,12 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 	}

 	media_path = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
-	if (media_path == NULL)
-		media_path = "/dev/media0";
+	if (media_path == NULL) {
+		if (auto_media_set)
+			media_path = auto_media;
+		else
+			media_path = "/dev/media0";
+	}

 	media_fd = open(media_path, O_RDWR | O_NONBLOCK);
 	if (media_fd < 0)
@@ -178,6 +715,138 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)

 	driver_data->video_fd = video_fd;
 	driver_data->media_fd = media_fd;
+	driver_data->video_fd_rkvdec = -1;
+	driver_data->media_fd_rkvdec = -1;
+	driver_data->video_fd_hantro = -1;
+	driver_data->media_fd_hantro = -1;
+	driver_data->video_fd_vpu981 = -1;
+	driver_data->media_fd_vpu981 = -1;
+
+	/*
+	 * iter38: probe BOTH rkvdec and hantro-vpu so a single libva session
+	 * can serve all 5 codecs. Tag the primary fd (already opened above)
+	 * by inspecting which driver the media_fd is on, then probe the OTHER
+	 * driver and open its fds if present. RequestCreateConfig retargets
+	 * driver_data->{video,media}_fd to the right pair per profile.
+	 *
+	 * Skip the alt-probe when the user provided explicit
+	 * LIBVA_V4L2_REQUEST_VIDEO_PATH / MEDIA_PATH overrides — they signal
+	 * a specific single device intent.
+	 */
+	if (!getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH") &&
+	    !getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH")) {
+		struct media_device_info info;
+		const char *primary_driver = NULL;
+		const char *alt_driver = NULL;
+
+		memset(&info, 0, sizeof info);
+		if (ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &info) == 0) {
+			if (strcmp(info.driver, "rkvdec") == 0) {
+				primary_driver = "rkvdec";
+				alt_driver = "hantro-vpu";
+				driver_data->video_fd_rkvdec = video_fd;
+				driver_data->media_fd_rkvdec = media_fd;
+			} else if (strcmp(info.driver, "hantro-vpu") == 0) {
+				primary_driver = "hantro-vpu";
+				alt_driver = "rkvdec";
+				driver_data->video_fd_hantro = video_fd;
+				driver_data->media_fd_hantro = media_fd;
+			}
+		}
+
+		if (alt_driver != NULL) {
+			static char alt_video[32], alt_media[32];
+			if (find_decoder_device_by_driver(alt_driver,
+							  alt_video, sizeof alt_video,
+							  alt_media, sizeof alt_media) == 0) {
+				int alt_v = open(alt_video, O_RDWR | O_NONBLOCK);
+				int alt_m = (alt_v >= 0) ? open(alt_media, O_RDWR | O_NONBLOCK) : -1;
+				if (alt_v >= 0 && alt_m >= 0) {
+					if (strcmp(alt_driver, "rkvdec") == 0) {
+						driver_data->video_fd_rkvdec = alt_v;
+						driver_data->media_fd_rkvdec = alt_m;
+					} else {
+						driver_data->video_fd_hantro = alt_v;
+						driver_data->media_fd_hantro = alt_m;
+					}
+					request_log("iter38: also opened %s decoder at %s + %s\n",
+						    alt_driver, alt_video, alt_media);
+				} else {
+					if (alt_v >= 0) close(alt_v);
+					if (alt_m >= 0) close(alt_m);
+				}
+			}
+		}
+		(void)primary_driver;
+
+		/*
+		 * ampere-av1-enablement Phase 2 — additionally probe for
+		 * vpu981 (RK3588's dedicated AV1 decoder). Driver name
+		 * "hantro-vpu" alone is ambiguous on RK3588 (3 instances:
+		 * legacy MPEG2/VP8, encoder, vpu981 AV1). Discriminate by
+		 * V4L2_PIX_FMT_AV1_FRAME capability. If the primary or alt
+		 * hantro happens to BE vpu981 (unlikely but possible on
+		 * non-RK3588 boards), this probe finds it again and opens a
+		 * second, independent fd pair on it — nothing is deduped.
+		 */
+		{
+			static char av1_video[32], av1_media[32];
+			if (find_decoder_device_by_driver_with_fmt(
+				    "hantro-vpu", V4L2_PIX_FMT_AV1_FRAME,
+				    av1_video, sizeof av1_video,
+				    av1_media, sizeof av1_media) == 0) {
+				int av1_v = open(av1_video, O_RDWR | O_NONBLOCK);
+				int av1_m = (av1_v >= 0)
+					? open(av1_media, O_RDWR | O_NONBLOCK)
+					: -1;
+				if (av1_v >= 0 && av1_m >= 0) {
+					driver_data->video_fd_vpu981 = av1_v;
+					driver_data->media_fd_vpu981 = av1_m;
+					request_log(
+					    "ampere-av1: vpu981 AV1 decoder "
+					    "at %s + %s\n",
+					    av1_video, av1_media);
+				} else {
+					if (av1_v >= 0) close(av1_v);
+					if (av1_m >= 0) close(av1_m);
+				}
+			}
+		}
+	}
+
+	/*
+	 * iter2 (ampere-kernel-decoders): probe the new HEVC EXT_SPS_RPS
+	 * controls on each rkvdec/hantro fd. Result is consumed by
+	 * h265_set_controls per-codec gate. Per-fd storage matches the
+	 * iter38 multi-device-probe pattern (Phase 5 review item).
+	 */
+	driver_data->has_hevc_ext_sps_rps_rkvdec =
+		probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rkvdec);
+	driver_data->has_hevc_ext_sps_rps_hantro =
+		probe_hevc_ext_sps_rps_controls(driver_data->video_fd_hantro);
+	if (driver_data->has_hevc_ext_sps_rps_rkvdec) {
+		request_log("iter2: kernel registers HEVC EXT_SPS_{ST,LT}_RPS "
+			    "controls on rkvdec fd (will route through "
+			    "vendored GStreamer parser)\n");
+	}
+
+	/*
+	 * ampere-av1 Phase 2.1: probe V4L2_CID_STATELESS_AV1_FILM_GRAIN
+	 * on the vpu981 fd. Per Janet v3 amendment, this runs at backend
+	 * init (not lazily) so any race window with concurrent device
+	 * switching can't observe an inconsistent flag.
+	 */
+	driver_data->has_av1_film_grain = false;
+	if (driver_data->video_fd_vpu981 >= 0) {
+		struct v4l2_query_ext_ctrl qec;
+		if (v4l2_query_ext_ctrl(driver_data->video_fd_vpu981,
+					V4L2_CID_STATELESS_AV1_FILM_GRAIN,
+					&qec) == 0) {
+			driver_data->has_av1_film_grain = true;
+			request_log("ampere-av1: vpu981 advertises FILM_GRAIN "
+				    "control (will include in per-frame batch)\n");
+		}
+	}

 	status = VA_STATUS_SUCCESS;
 	goto complete;
@@ -205,8 +874,40 @@ VAStatus RequestTerminate(VADriverContextP context)
 	struct object_config *config_object;
 	int iterator;

-	close(driver_data->video_fd);
-	close(driver_data->media_fd);
+	/*
+	 * Tear down the OUTPUT buffer pool before closing video_fd so
+	 * the munmap calls in request_pool_destroy() can still touch the
+	 * mmap regions (which are tied to that fd's lifetime).
+	 */
+	request_pool_destroy(&driver_data->output_pool);
+
+	/* iter38: close every probed device pair. video_fd / media_fd are
+	 * normally aliases of one pair; a slot is -1 if never opened. */
+	if (driver_data->video_fd_rkvdec >= 0)
+		close(driver_data->video_fd_rkvdec);
+	if (driver_data->media_fd_rkvdec >= 0)
+		close(driver_data->media_fd_rkvdec);
+	if (driver_data->video_fd_hantro >= 0)
+		close(driver_data->video_fd_hantro);
+	if (driver_data->media_fd_hantro >= 0)
+		close(driver_data->media_fd_hantro);
+	if (driver_data->video_fd_vpu981 >= 0)
+		close(driver_data->video_fd_vpu981);
+	if (driver_data->media_fd_vpu981 >= 0)
+		close(driver_data->media_fd_vpu981);
+	/* The active pair is not always owned by a slot (env override, or an
+	 * untagged primary next to a probed vpu981): close it unless a slot
+	 * above already did, so it neither leaks nor gets closed twice. */
+	if (driver_data->video_fd >= 0 &&
+	    driver_data->video_fd != driver_data->video_fd_rkvdec &&
+	    driver_data->video_fd != driver_data->video_fd_hantro &&
+	    driver_data->video_fd != driver_data->video_fd_vpu981)
+		close(driver_data->video_fd);
+	if (driver_data->media_fd >= 0 &&
+	    driver_data->media_fd != driver_data->media_fd_rkvdec &&
+	    driver_data->media_fd != driver_data->media_fd_hantro &&
+	    driver_data->media_fd != driver_data->media_fd_vpu981)
+		close(driver_data->media_fd);

 	/* Cleanup leftover buffers. */

@@ -31,11 +31,15 @@

 #include "context.h"
 #include "object_heap.h"
+#include "request_pool.h"
+#include "cap_pool.h"
 #include "video.h"
 #include <va/va.h>

 #include <linux/videodev2.h>

+#include "hevc-ctrls/v4l2-hevc-ext-controls.h"
+
 #define V4L2_REQUEST_STR_VENDOR			"v4l2-request"

 #define V4L2_REQUEST_MAX_PROFILES		11
@@ -54,10 +58,163 @@ struct request_data {
 	int video_fd;
 	int media_fd;

+	/*
+	 * iter38: multi-device probe. RK3399 has two V4L2 stateless decoders:
+	 *   - rkvdec → H264 / HEVC / VP9
+	 *   - hantro-vpu (rk3399-vpu-dec) → MPEG-2 / VP8
+	 * At VA_DRIVER_INIT we probe both, open their fds, and store them
+	 * here. driver_data->video_fd / media_fd above are the "active" fds
+	 * (point at one of the pairs below). RequestCreateConfig retargets
+	 * them based on the profile's required device. Pools and video_format
+	 * are torn down at retarget time so the next CreateContext rebuilds
+	 * them against the right device.
+	 *
+	 * -1 means that device kind isn't present on this kernel boot.
+	 * Honours LIBVA_V4L2_REQUEST_VIDEO_PATH / MEDIA_PATH explicit
+	 * overrides — when those are set, only the single requested device
+	 * is opened and the alt fds stay -1.
+	 */
+	int video_fd_rkvdec;
+	int media_fd_rkvdec;
+	int video_fd_hantro;
+	int media_fd_hantro;
+
+	/*
+	 * ampere-av1-enablement Phase 2 — vpu981 is a THIRD physical
+	 * hantro-vpu instance on RK3588 (separate from the legacy MPEG2/VP8
+	 * hantro at /dev/video2). It's the dedicated AV1 decoder at
+	 * /dev/video4 with card name "rockchip,rk3588-av1-vpu-dec".
+	 *
+	 * Driver-name alone ("hantro-vpu") is ambiguous on RK3588 — three
+	 * instances share the name. The probe discriminates by capability:
+	 * which OUTPUT format does the device advertise? Only vpu981
+	 * exposes V4L2_PIX_FMT_AV1_FRAME.
+	 */
+	int video_fd_vpu981;
+	int media_fd_vpu981;
+
+	/*
+	 * iter2 (ampere-kernel-decoders campaign) — per-fd probe result
+	 * for the V4L2_CID_STATELESS_HEVC_EXT_SPS_{ST,LT}_RPS controls
+	 * introduced in Linux 7.0 (Casanova VDPU381/VDPU383 series).
+	 * RK3399 rkvdec doesn't have them and the probe returns false;
+	 * RK3588 rkvdec (VDPU381/383) registers them and the probe is
+	 * true. h265_set_controls consults only the rkvdec entry because
+	 * HEVC routes through rkvdec only — hantro's entry stays false
+	 * naturally (it doesn't have rkvdec-specific controls).
+	 *
+	 * The pair-of-flags layout mirrors video_fd_rkvdec /
+	 * video_fd_hantro above (iter38 multi-device-probe pattern,
+	 * memory feedback_multi_device_probe_design). Phase 5 review
+	 * surfaced this as a correctness item: a single scalar on
+	 * driver_data would silently misbehave across device-switch
+	 * boundaries; per-fd storage is the safe shape.
+	 */
+	bool has_hevc_ext_sps_rps_rkvdec;
+	bool has_hevc_ext_sps_rps_hantro;
+
+	/*
+	 * ampere-av1 Phase 2.1: probe result for the optional
+	 * V4L2_CID_STATELESS_AV1_FILM_GRAIN control on the vpu981 fd.
+	 * Probed at VA_DRIVER_INIT (per Janet v3 amendment — init-time
+	 * not lazy). Consumed by av1_set_controls to conditionally include
+	 * the 4th control in the per-frame batch.
+	 *
+	 * True iff vpu981 advertises the control via VIDIOC_QUERY_EXT_CTRL.
+	 * False for non-RK3588 hosts (no vpu981 fd) or older kernels.
+	 */
+	bool has_av1_film_grain;
+
+	/*
+	 * iter2 — cached SPS-derived RPS arrays. SPS NALs only appear in
+	 * source_data on IDR frames; non-IDR frames' h265_set_controls
+	 * reuse the cached arrays so we don't submit zero-filled RPS to
+	 * the kernel (which would re-trigger the OOPS the iter2 fix is
+	 * designed to prevent). Single-slot cache (sps_id 0 only) —
+	 * adequate for the BBB / typical-stream case; multi-SPS streams
+	 * would need expanding to a [16] cache keyed by sps_id.
+	 *
+	 * The cache stores the post-mapped V4L2 control struct arrays
+	 * (not the intermediate GstH265SPS) so request.h doesn't need
+	 * to know about the vendored GStreamer parser types — only the
+	 * V4L2 UAPI structs from hevc-ctrls/v4l2-hevc-ext-controls.h
+	 * included above.
+	 *
+	 * Owned by h265.c; freed at RequestTerminate.
+	 */
+	struct v4l2_ctrl_hevc_ext_sps_st_rps *hevc_rps_cache_st;
+	unsigned int                          hevc_rps_cache_st_count;
+	struct v4l2_ctrl_hevc_ext_sps_lt_rps *hevc_rps_cache_lt;
+	unsigned int                          hevc_rps_cache_lt_count;
+	bool                                  hevc_rps_cache_valid;
+
 	struct video_format *video_format;
+
+	/*
+	 * OUTPUT (bitstream-input) buffer pool, decoupled from VA
+	 * surfaces. Sized by codec pipeline depth, populated on first
+	 * RequestCreateContext, torn down at driver Terminate.
+	 */
+	struct request_pool output_pool;
+
+	/*
+	 * CAPTURE (decoded-frame) buffer pool, decoupled from VA
+	 * surfaces (iter2 Fix 3). Each surface acquires a slot at
+	 * vaBeginPicture time and releases it on the next acquisition
+	 * or vaDestroySurfaces. Pool sized to max(surfaces_count,
+	 * MIN_CAP_POOL) at first vaCreateSurfaces2; torn down at
+	 * vaDestroyContext.
+	 *
+	 * Background: pre-iter2 each surface was 1:1 bound to one
+	 * CAPTURE buffer index; mpv re-using a surface for a new decode
+	 * caused V4L2 to re-QBUF the same physical buffer while a
+	 * compositor still held an EXPBUF'd dma_buf fd, producing
+	 * visible stutter on mpv vaapi --vo=gpu.
+	 */
+	struct cap_pool capture_pool;
+
+	/*
+	 * iter5b-β: the pre-β last_output_{width,height} cache fields
+	 * and surface_reset_format_cache() helper are deleted. They
+	 * existed because CreateSurfaces2 owned the OUTPUT-side V4L2
+	 * device-format lifecycle and needed to gate re-S_FMT on
+	 * resolution change. β moves that lifecycle to CreateContext,
+	 * which is naturally one-shot per context cycle; no caching is
+	 * required. DestroyContext + next CreateContext rebuild from
+	 * scratch.
+	 *
+	 * iter5b-β Commit D: cache the format-uniform CAPTURE-side
+	 * geometry from v4l2_get_format so CreateSurfaces2 can populate
+	 * a newly-created surface's destination_* fields without
+	 * re-querying the device. Set by CreateContext after the
+	 * v4l2_get_format(CAPTURE) call; consumed by both:
+	 *   1. CreateContext's surface_heap walk (fills surfaces that
+	 *      pre-exist when CreateContext fires);
+	 *   2. CreateSurfaces2's per-surface init (fills surfaces
+	 *      created AFTER CreateContext, e.g. ffmpeg vaapi-copy
+	 *      pool dynamics where the consumer passes surfaces_count=0
+	 *      to vaCreateContext and creates surfaces lazily).
+	 *
+	 * fmt_valid is true once CreateContext has populated the cache;
+	 * CreateSurfaces2 only lazy-fills when fmt_valid is true.
+	 */
+	bool fmt_valid;
+	unsigned int fmt_format_height;
+	unsigned int fmt_planes_count;
+	unsigned int fmt_buffers_count;
+	unsigned int fmt_sizes[VIDEO_MAX_PLANES];
+	unsigned int fmt_bytesperlines[VIDEO_MAX_PLANES];
 };

 VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context);
 VAStatus RequestTerminate(VADriverContextP context);

+/*
+ * iter38: retarget driver_data->{video,media}_fd to the device required by
+ * `profile`. Returns 0 on success, -1 on profile not mappable to any kind.
+ * Defined in request.c.
+ */
+int request_switch_device_for_profile(struct request_data *driver_data,
+				      VAProfile profile);
+
 #endif
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
+ */
+
+#include "request_pool.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "media.h"
+#include "utils.h"
+#include "v4l2.h"
+
+/*
+ * Build the OUTPUT buffer pool: create `count` V4L2 buffers of
+ * `output_type` on video_fd, mmap each of them, and give every slot
+ * its own media-request fd allocated from media_fd.
+ *
+ * Returns 0 on success (or when the pool is already initialized) and
+ * -1 for a NULL pool, a zero count, or any allocation / query / mmap /
+ * request-alloc failure. On failure everything acquired so far is
+ * torn down through request_pool_destroy().
+ */
+int request_pool_init(struct request_pool *pool, int video_fd, int media_fd,
+		      unsigned int output_type, unsigned int count)
+{
+	struct request_pool_slot *slot;
+	unsigned int first_index;
+	unsigned int map_length;
+	unsigned int map_offset;
+	unsigned int n;
+
+	if (pool == NULL || count == 0)
+		return -1;
+
+	/* A second init on a live pool is a successful no-op. */
+	if (pool->initialized)
+		return 0;
+
+	pool->slots = calloc(count, sizeof(*pool->slots));
+	if (pool->slots == NULL)
+		return -1;
+
+	pool->count = count;
+	pool->next = 0;
+	/* iter7: remembered so force_release can allocate a fresh request. */
+	pool->media_fd = media_fd;
+
+	/*
+	 * calloc leaves request_fd at 0; mark every slot as "no fd"
+	 * before anything can fail, because request_pool_destroy closes
+	 * any request_fd that is >= 0.
+	 */
+	for (n = 0; n < count; n++)
+		pool->slots[n].request_fd = -1;
+
+	if (v4l2_create_buffers(video_fd, output_type, count,
+				&first_index) < 0)
+		goto fail;
+
+	for (n = 0; n < count; n++) {
+		slot = &pool->slots[n];
+		slot->index = first_index + n;
+		slot->busy = false;
+
+		if (v4l2_query_buffer(video_fd, output_type, slot->index,
+				      &map_length, &map_offset, 1) < 0)
+			goto fail;
+
+		slot->data = mmap(NULL, map_length, PROT_READ | PROT_WRITE,
+				  MAP_SHARED, video_fd, map_offset);
+		if (slot->data == MAP_FAILED) {
+			/* NULL tells request_pool_destroy there is no mapping. */
+			slot->data = NULL;
+			goto fail;
+		}
+		slot->size = map_length;
+
+		/*
+		 * iter6: the slot keeps this media-request fd for its
+		 * whole lifetime; it is REINIT'd between uses (in
+		 * RequestSyncSurface) instead of being closed and
+		 * re-allocated per frame as in iter4. The 1:1 slot-to-fd
+		 * binding avoids the cross-slot fd-number reuse that
+		 * raced with the kernel's per-buffer teardown when a
+		 * consumer rotated through slots quickly (Firefox's
+		 * MediaSource pattern).
+		 */
+		slot->request_fd = media_request_alloc(media_fd);
+		if (slot->request_fd < 0)
+			goto fail;
+	}
+
+	pool->initialized = true;
+	return 0;
+
+fail:
+	request_pool_destroy(pool);
+	return -1;
+}
+
+/*
+ * Tear the pool down: close every slot's request fd, unmap every
+ * mapped slot, free the slot array and reset the bookkeeping fields
+ * (media_fd is left as it was). Does nothing for a NULL pool or one
+ * whose slot array is already gone, so it may be called repeatedly.
+ */
+void request_pool_destroy(struct request_pool *pool)
+{
+	struct request_pool_slot *slot;
+	struct request_pool_slot *last;
+
+	if (pool == NULL || pool->slots == NULL)
+		return;
+
+	last = pool->slots + pool->count;
+	for (slot = pool->slots; slot < last; slot++) {
+		if (slot->request_fd >= 0)
+			close(slot->request_fd);
+
+		/* Only slots that were actually mapped carry data + size. */
+		if (slot->data != NULL && slot->size > 0)
+			munmap(slot->data, slot->size);
+	}
+
+	free(pool->slots);
+
+	pool->slots = NULL;
+	pool->count = 0;
+	pool->next = 0;
+	pool->initialized = false;
+}
+
+/*
+ * Claim a free slot, probing round-robin from the pool's cursor.
+ *
+ * Returns the claimed slot's V4L2 buffer index, or -1 when the pool is
+ * NULL, not initialized, empty, or every slot is currently busy (the
+ * caller then has to wait for an in-flight DQBUF).
+ */
+int request_pool_acquire(struct request_pool *pool)
+{
+	struct request_pool_slot *candidate;
+	unsigned int origin;
+	unsigned int probed;
+	unsigned int pos;
+
+	if (pool == NULL || !pool->initialized || pool->count == 0)
+		return -1;
+
+	origin = pool->next;
+	for (probed = 0; probed < pool->count; probed++) {
+		pos = (origin + probed) % pool->count;
+		candidate = &pool->slots[pos];
+
+		if (candidate->busy)
+			continue;
+
+		candidate->busy = true;
+		/* Next search starts just past the slot handed out now. */
+		pool->next = (pos + 1) % pool->count;
+		return (int)candidate->index;
+	}
+
+	return -1;
+}
+
+/*
+ * Clear the busy flag of the first slot whose V4L2 buffer index equals
+ * `index`. A NULL/destroyed pool or an index no slot carries is
+ * silently ignored.
+ */
+void request_pool_release(struct request_pool *pool, unsigned int index)
+{
+	unsigned int n = 0;
+
+	if (pool == NULL || pool->slots == NULL)
+		return;
+
+	while (n < pool->count && pool->slots[n].index != index)
+		n++;
+
+	if (n < pool->count)
+		pool->slots[n].busy = false;
+}
+
+/*
+ * Error-recovery release of the slot carrying V4L2 buffer index
+ * `index`: bring its request fd back to a usable state and only then
+ * mark the slot free.
+ *
+ *   1. REINIT the existing fd; on success the slot is freed.
+ *   2. Otherwise close the fd (if any) and allocate a new request from
+ *      pool->media_fd; on success the slot is freed.
+ *   3. If that allocation fails too, busy is left untouched, so a slot
+ *      that was busy stays out of acquire's rotation.
+ *
+ * A NULL/destroyed pool or an unknown index is ignored.
+ */
+void request_pool_force_release(struct request_pool *pool, unsigned int index)
+{
+	struct request_pool_slot *slot = NULL;
+	unsigned int n;
+
+	if (pool == NULL || pool->slots == NULL)
+		return;
+
+	for (n = 0; n < pool->count && slot == NULL; n++) {
+		if (pool->slots[n].index == index)
+			slot = &pool->slots[n];
+	}
+	if (slot == NULL)
+		return;
+
+	if (slot->request_fd >= 0) {
+		/*
+		 * Cheap path: REINIT resets the kernel-side request in
+		 * place, the fd stays valid and the slot is reusable
+		 * right away.
+		 */
+		if (media_request_reinit(slot->request_fd) == 0) {
+			slot->busy = false;
+			return;
+		}
+
+		/*
+		 * REINIT failed: drop the fd before replacing it. The
+		 * replacement may get the same fd number back; per the
+		 * original notes that is acceptable on this rare recovery
+		 * path because the iter6 cross-slot fd-reuse race concerns
+		 * a buffer still in kernel cleanup, which is not the
+		 * situation here.
+		 */
+		close(slot->request_fd);
+	}
+
+	slot->request_fd = media_request_alloc(pool->media_fd);
+
+	/*
+	 * On allocation failure the slot is permanently unusable: busy is
+	 * not cleared, so acquire keeps skipping it and the pool runs
+	 * with one slot less until it is destroyed.
+	 */
+	if (slot->request_fd >= 0)
+		slot->busy = false;
+}
+
+/*
+ * Find the slot whose V4L2 buffer index equals `index`.
+ *
+ * Returns a pointer into pool->slots[] (owned by the pool; do not
+ * free), or NULL when the pool is NULL/destroyed or no slot matches.
+ */
+struct request_pool_slot *request_pool_slot(struct request_pool *pool,
+					    unsigned int index)
+{
+	struct request_pool_slot *cur;
+	unsigned int remaining;
+
+	if (pool == NULL || pool->slots == NULL)
+		return NULL;
+
+	cur = pool->slots;
+	for (remaining = pool->count; remaining > 0; remaining--, cur++) {
+		if (cur->index == index)
+			return cur;
+	}
+
+	return NULL;
+}
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
+ */
+
+#ifndef _REQUEST_POOL_H_
+#define _REQUEST_POOL_H_
+
+#include <stdbool.h>
+
+/*
+ * OUTPUT (bitstream-input) buffer pool, decoupled from caller-allocated
+ * VA surfaces. Sizing is driven by codec pipeline depth (typically 4
+ * for H.264), not by the consumer's surface count.
+ *
+ * The pool owns the V4L2 buffer indices and their mmap pointers. A
+ * decode request "borrows" a slot at vaBeginPicture, fills it across
+ * vaRenderPicture calls, queues it at vaEndPicture, and releases it
+ * after VIDIOC_DQBUF returns.
+ *
+ * This replaces the per-surface OUTPUT-buffer ownership model in the
+ * pre-refactor code, where object_surface.source_* fields permanently
+ * held a single OUTPUT buffer per surface — incorrect because OUTPUT
+ * buffers are request-time resources, not picture-time resources, and
+ * because the per-surface loop in RequestCreateContext only ran when
+ * surfaces_count > 0 (breaking ffmpeg's vaapi-copy num_render_targets=0
+ * convention).
+ */
+
+/*
+ * One entry of the OUTPUT buffer pool: a V4L2 buffer index, the CPU
+ * mapping of that buffer, a borrow flag and the media-request fd bound
+ * to the slot. request_pool_init() fills these in; request_pool_destroy()
+ * closes request_fd (when >= 0) and unmaps data (when non-NULL with a
+ * non-zero size).
+ */
+struct request_pool_slot {
+	unsigned int	index;		/* V4L2 buffer index in OUTPUT queue */
+	void		*data;		/* mmap pointer for this slot */
+	unsigned int	size;		/* mmap size in bytes */
+	bool		busy;		/* true while borrowed for a request */
+	int		request_fd;	/* per-slot media-request fd, allocated
+					 * once at pool init, REINIT'd between
+					 * uses. iter6: replaces iter4 close+
+					 * alloc-per-frame to eliminate cross-
+					 * slot fd-reuse race that broke Firefox
+					 * MediaSource's multi-surface decode. */
+};
+
+/*
+ * Pool state. `slots` is a heap array of `count` entries (NULL once the
+ * pool has been destroyed); `initialized` is set only after
+ * request_pool_init() has set up every slot and is cleared again by
+ * request_pool_destroy().
+ */
+struct request_pool {
+	struct request_pool_slot	*slots;
+	unsigned int			 count;
+	unsigned int			 next;	/* round-robin acquire cursor */
+	int				 media_fd;	/* iter7: kept for
+							 * force_release re-alloc */
+	bool				 initialized;
+};
+
+/*
+ * Allocate count OUTPUT buffers via VIDIOC_CREATE_BUFS, query and mmap
+ * each, populate pool->slots[]. Caller must have already done
+ * VIDIOC_S_FMT on the OUTPUT queue. Returns 0 on success, -1 on
+ * failure.
+ */
+int request_pool_init(struct request_pool *pool, int video_fd, int media_fd,
+		      unsigned int output_type, unsigned int count);
+
+/*
+ * Close each slot's request fd, munmap all slots and free the slots
+ * array. Idempotent.
+ */
+void request_pool_destroy(struct request_pool *pool);
+
+/*
+ * Claim the next free slot (round-robin). Returns the slot's V4L2
+ * buffer index on success (slot in pool->slots[] is determined by
+ * the returned index), or -1 if all slots are busy.
+ */
+int request_pool_acquire(struct request_pool *pool);
+
+/*
+ * Mark the slot whose V4L2 buffer index equals `index` free for reuse.
+ * Caller must pass the index returned earlier from
+ * request_pool_acquire(); an index no slot carries is ignored.
+ */
+void request_pool_release(struct request_pool *pool, unsigned int index);
+
+/*
+ * iter7: error-recovery release. Called from RequestSyncSurface error
+ * paths (request queue / wait-for-completion / REINIT failures) when
+ * the slot's request_fd is in an undefined state. An OUTPUT DQBUF
+ * failure deliberately does NOT come here: that slot is left busy
+ * because its V4L2 buffer state is indeterminate. REINITs the fd;
+ * if REINIT fails (kernel-side request object too far gone), close
+ * the fd and re-alloc a fresh one. If realloc also fails, the slot
+ * is left busy=true (effectively dead; pool->count itself is
+ * unchanged, but one fewer slot can be acquired) — pool survives but
+ * with reduced capacity until driver terminate. Other slots are
+ * unaffected.
+ *
+ * Caller passes the V4L2 buffer index from request_pool_acquire().
+ */
+void request_pool_force_release(struct request_pool *pool,
+				unsigned int index);
+
+/*
+ * Look up the pool slot owning a given V4L2 buffer index. Returns
+ * pointer to the slot on success, NULL if the index is out of range.
+ * The returned pointer is valid until pool destruction; do not free.
+ */
+struct request_pool_slot *request_pool_slot(struct request_pool *pool,
+					    unsigned int index);
+
+#endif
@@ -29,6 +29,7 @@

 #include <assert.h>
 #include <errno.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -46,6 +47,119 @@
 #include "v4l2.h"
 #include "video.h"

+/*
+ * iter5b-β: the OUTPUT-side V4L2 device-format lifecycle moved out
+ * of this file. Pre-β CreateSurfaces2 owned the S_FMT(OUTPUT) +
+ * CAPTURE-format probe + cap_pool_init + per-surface destination_*
+ * fill; now that responsibility lives in context.c::RequestCreateContext
+ * where the bound config (and therefore the active VAProfile) is
+ * known via config_id. CreateSurfaces2 retains only surface object
+ * ID allocation and per-surface bookkeeping. The previous
+ * `surface_reset_format_cache` helper and `last_output_width/height`
+ * fields are deleted (β doesn't gate re-S_FMT on
+ * resolution — the lifecycle is CreateContext-centric and natural
+ * setup/teardown happens at each context cycle).
+ */
+
+/*
+ * Iter2 Fix 3 helpers — attach a cap_pool_slot to an object_surface
+ * and detach it again. Per the original notes, binding happens in
+ * BeginPicture (right after a slot is acquired) and unbinding in
+ * DestroySurfaces.
+ *
+ * surface_bind_slot mirrors the slot's buffer index, buffer count and
+ * mmap bookkeeping into surface_object->destination_* so the existing
+ * consumers of those fields (the QBUF in picture.c::EndPicture, the
+ * EXPBUF in ExportSurfaceHandle, the mmap read in
+ * copy_surface_to_image) keep working unchanged.
+ *
+ * The format-uniform fields (destination_planes_count,
+ * destination_offsets) are only read here, never written: they are
+ * expected to have been populated before a slot is bound (see
+ * surface_fill_format_uniform). While destination_planes_count is
+ * still 0, destination_data[] is left untouched.
+ */
+void surface_bind_slot(struct object_surface *surface_object,
+		       struct cap_pool_slot *slot)
+{
+	const bool single_buffer = slot->buffers_count == 1;
+	unsigned int buf;
+	unsigned int plane;
+
+	surface_object->current_slot = slot;
+	surface_object->destination_index = slot->v4l2_index;
+	surface_object->destination_buffers_count = slot->buffers_count;
+
+	/* Per-buffer mmap bookkeeping is copied verbatim from the slot. */
+	for (buf = 0; buf < slot->buffers_count; buf++) {
+		surface_object->destination_map[buf] = slot->map[buf];
+		surface_object->destination_map_lengths[buf] =
+			slot->map_lengths[buf];
+		surface_object->destination_map_offsets[buf] =
+			slot->map_offsets[buf];
+	}
+
+	/*
+	 * destination_data[] holds the per-plane CPU pointers used by
+	 * copy_surface_to_image. With a single buffer (the common
+	 * single-buffer MPLANE NV12 case) every plane lives inside
+	 * slot->map[0] at destination_offsets[plane]; otherwise plane N
+	 * is taken to start at slot->map[N].
+	 */
+	for (plane = 0; plane < surface_object->destination_planes_count;
+	     plane++) {
+		if (single_buffer)
+			surface_object->destination_data[plane] =
+				(unsigned char *)slot->map[0] +
+				surface_object->destination_offsets[plane];
+		else
+			surface_object->destination_data[plane] =
+				slot->map[plane];
+	}
+}
+
+/*
+ * Detach the surface from its CAPTURE pool slot, if it has one: log
+ * the unbind, hand the slot back through cap_pool_release() and clear
+ * surface_object->current_slot. No-op for an unbound surface.
+ */
+void surface_unbind_slot(struct request_data *driver_data,
+			 struct object_surface *surface_object)
+{
+	struct cap_pool_slot *slot = surface_object->current_slot;
+
+	if (slot == NULL)
+		return;
+
+	/*
+	 * AV1 Phase 3 diagnostic: every unbind is logged with surface
+	 * id, status and slot index, to show whether a BeginPicture
+	 * rebind races the consumer's vaGetImage on the previous frame.
+	 */
+	request_log("surface_unbind_slot id=0x%x status=%d slot_idx=%u\n",
+		    surface_object->base.id,
+		    surface_object->status,
+		    slot->v4l2_index);
+
+	cap_pool_release(&driver_data->capture_pool, slot);
+	surface_object->current_slot = NULL;
+}
+
+/*
+ * iter5b-β Commit D: fill the format-uniform destination_* fields of a
+ * surface (plane count, buffer count, per-plane offset / size / pitch)
+ * from driver_data's CAPTURE-format cache.
+ *
+ * No-op when:
+ *   - the cache has not been populated yet (fmt_valid is false);
+ *   - the surface was already filled (destination_planes_count != 0),
+ *     which makes the call idempotent;
+ *   - the cached layout cannot be described here: more planes than the
+ *     cache arrays hold, or a buffer count that is neither 1 (all
+ *     planes in one buffer) nor equal to the plane count (one buffer
+ *     per plane). In that case the surface is deliberately left
+ *     unfilled rather than marked filled with unset offsets/sizes.
+ */
+void surface_fill_format_uniform(struct request_data *driver_data,
+				 struct object_surface *surface_object)
+{
+	unsigned int planes;
+	unsigned int buffers;
+	unsigned int offset;
+	unsigned int j;
+
+	if (!driver_data->fmt_valid)
+		return;
+	if (surface_object->destination_planes_count != 0)
+		return;
+
+	planes = driver_data->fmt_planes_count;
+	buffers = driver_data->fmt_buffers_count;
+
+	/* fmt_sizes[] / fmt_bytesperlines[] hold VIDEO_MAX_PLANES entries. */
+	if (planes > VIDEO_MAX_PLANES)
+		return;
+
+	/*
+	 * Validate the layout before touching the surface: committing the
+	 * counts for an unsupported layout would trip the idempotence
+	 * guard above on every later call while leaving offsets, sizes
+	 * and pitches unset.
+	 */
+	if (buffers != 1 && buffers != planes)
+		return;
+
+	surface_object->destination_planes_count = planes;
+	surface_object->destination_buffers_count = buffers;
+
+	if (buffers == 1) {
+		/*
+		 * All planes share one buffer and are laid out back to
+		 * back, so each plane starts at the running sum of the
+		 * preceding plane sizes (identical to the previous
+		 * "size of plane j-1" rule for up to two planes). The
+		 * pitch of plane 0 is used for every plane, as before.
+		 */
+		offset = 0;
+		for (j = 0; j < planes; j++) {
+			surface_object->destination_offsets[j] = offset;
+			surface_object->destination_sizes[j] =
+				driver_data->fmt_sizes[j];
+			surface_object->destination_bytesperlines[j] =
+				driver_data->fmt_bytesperlines[0];
+			offset += driver_data->fmt_sizes[j];
+		}
+	} else {
+		/* One buffer per plane: each plane starts at offset 0. */
+		for (j = 0; j < planes; j++) {
+			surface_object->destination_offsets[j] = 0;
+			surface_object->destination_sizes[j] =
+				driver_data->fmt_sizes[j];
+			surface_object->destination_bytesperlines[j] =
+				driver_data->fmt_bytesperlines[j];
+		}
+	}
+}
+
 VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
 				unsigned int width, unsigned int height,
 				VASurfaceID *surfaces_ids,
@@ -55,130 +169,41 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
 {
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;
-	struct video_format *video_format = NULL;
-	unsigned int destination_sizes[VIDEO_MAX_PLANES];
-	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
-	unsigned int destination_planes_count;
-	unsigned int format_width, format_height;
-	unsigned int capture_type;
-	unsigned int index_base;
-	unsigned int index;
-	unsigned int i, j;
+	unsigned int i;
 	VASurfaceID id;
-	bool found;
-	int rc;

+	/*
+	 * iter5b-β: only RT-format-level validation here. All V4L2
+	 * device state (OUTPUT format, CAPTURE format probe,
+	 * cap_pool_init, per-surface destination_* fill) is deferred
+	 * to RequestCreateContext where the bound VAConfigID
+	 * (and therefore the active VAProfile) is known. CreateSurfaces2
+	 * has no config_id parameter; the VA-API contract is
+	 * CreateConfig → CreateSurfaces → CreateContext, and we
+	 * can't know the OUTPUT pixel format until CreateContext binds.
+	 *
+	 * Surface objects allocated here hold only the requested
+	 * width/height and per-surface lifecycle bookkeeping
+	 * (current_slot, status, params, etc). The format-uniform
+	 * destination_* fields are filled by CreateContext via
+	 * surface_bind_format_uniform_fields(); the per-slot
+	 * destination_* fields fill at BeginPicture via surface_bind_slot.
+	 */
 	if (format != VA_RT_FORMAT_YUV420)
 		return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;

-
-        if (!driver_data->video_format) {
-		found = v4l2_find_format(driver_data->video_fd,
-					 V4L2_BUF_TYPE_VIDEO_CAPTURE,
-					 V4L2_PIX_FMT_SUNXI_TILED_NV12);
-		if (found)
-			video_format = video_format_find(V4L2_PIX_FMT_SUNXI_TILED_NV12);
-
-		found = v4l2_find_format(driver_data->video_fd,
-					 V4L2_BUF_TYPE_VIDEO_CAPTURE,
-					 V4L2_PIX_FMT_NV12);
-		if (found)
-			video_format = video_format_find(V4L2_PIX_FMT_NV12);
-
-		if (video_format == NULL)
-			return VA_STATUS_ERROR_OPERATION_FAILED;
-
-		driver_data->video_format = video_format;
-
-		capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);
-
-		rc = v4l2_set_format(driver_data->video_fd, capture_type,
-				     video_format->v4l2_format, width, height);
-		if (rc < 0)
-			return VA_STATUS_ERROR_OPERATION_FAILED;
-        } else {
-		video_format = driver_data->video_format;
-		capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);
-	}
-
-	rc = v4l2_get_format(driver_data->video_fd, capture_type, &format_width,
-			     &format_height, destination_bytesperlines,
-			     destination_sizes, NULL);
-	if (rc < 0)
-		return VA_STATUS_ERROR_OPERATION_FAILED;
-
-	destination_planes_count = video_format->planes_count;
-
-	rc = v4l2_create_buffers(driver_data->video_fd, capture_type,
-				 surfaces_count, &index_base);
-	if (rc < 0)
-		return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
 	for (i = 0; i < surfaces_count; i++) {
-		index = index_base + i;
-
 		id = object_heap_allocate(&driver_data->surface_heap);
 		surface_object = SURFACE(driver_data, id);
 		if (surface_object == NULL)
 			return VA_STATUS_ERROR_ALLOCATION_FAILED;

-		rc = v4l2_query_buffer(driver_data->video_fd, capture_type,
-				       index,
-				       surface_object->destination_map_lengths,
-				       surface_object->destination_map_offsets,
-				       video_format->v4l2_buffers_count);
-		if (rc < 0)
-			return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
-		for (j = 0; j < video_format->v4l2_buffers_count; j++) {
-			surface_object->destination_map[j] =
-				mmap(NULL,
-				     surface_object->destination_map_lengths[j],
-				     PROT_READ | PROT_WRITE, MAP_SHARED,
-				     driver_data->video_fd,
-				     surface_object->destination_map_offsets[j]);
-
-			if (surface_object->destination_map[j] == MAP_FAILED)
-				return VA_STATUS_ERROR_ALLOCATION_FAILED;
-		}
-
-		/*
-		 * FIXME: Handle this per-pixelformat, trying to generalize it
-		 * is not a reasonable approach. The final description should be
-		 * in terms of (logical) planes.
-		 */
-
-		if (video_format->v4l2_buffers_count == 1) {
-			destination_sizes[0] = destination_bytesperlines[0] *
-					       format_height;
-
-			for (j = 1; j < destination_planes_count; j++)
-				destination_sizes[j] = destination_sizes[0] / 2;
-
-			for (j = 0; j < destination_planes_count; j++) {
-				surface_object->destination_offsets[j] =
-					j > 0 ? destination_sizes[j - 1] : 0;
-				surface_object->destination_data[j] =
-					((unsigned char *)surface_object->destination_map[0] +
-					 surface_object->destination_offsets[j]);
-				surface_object->destination_sizes[j] =
-					destination_sizes[j];
-				surface_object->destination_bytesperlines[j] =
-					destination_bytesperlines[0];
-			}
-		} else if (video_format->v4l2_buffers_count == destination_planes_count) {
-			for (j = 0; j < destination_planes_count; j++) {
-				surface_object->destination_offsets[j] = 0;
-				surface_object->destination_data[j] =
-					surface_object->destination_map[j];
-				surface_object->destination_sizes[j] =
-					destination_sizes[j];
-				surface_object->destination_bytesperlines[j] =
-					destination_bytesperlines[j];
-			}
-		} else {
-			return VA_STATUS_ERROR_ALLOCATION_FAILED;
-		}
+		surface_object->current_slot = NULL;	/* iter2 Fix 3 */
+		surface_object->linked_decode_surface_id = VA_INVALID_SURFACE;
+		surface_object->av1_order_hint = 0;
+		surface_object->destination_index = 0;	/* set on bind */
+		surface_object->destination_planes_count = 0;	/* set at CreateContext */
+		surface_object->destination_buffers_count = 0;	/* set at CreateContext */

 		surface_object->status = VASurfaceReady;
 		surface_object->width = width;
@@ -188,13 +213,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
 		surface_object->source_data = NULL;
 		surface_object->source_size = 0;

-		surface_object->destination_index = index;
-
-		surface_object->destination_planes_count =
-			destination_planes_count;
-		surface_object->destination_buffers_count =
-			video_format->v4l2_buffers_count;
-
 		memset(&surface_object->params, 0,
 		       sizeof(surface_object->params));
 		surface_object->slices_count = 0;
@@ -202,6 +220,17 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,

 		surface_object->request_fd = -1;

+		/*
+		 * iter5b-β Commit D: if CreateContext has already populated
+		 * the format-uniform cache (driver_data->fmt_valid), fill
+		 * the new surface's destination_* immediately. This covers
+		 * the case where a consumer creates more surfaces AFTER
+		 * CreateContext. The first batch of surfaces (created before
+		 * CreateContext) gets filled by CreateContext's surface_heap
+		 * walk; this lazy-fill handles late arrivals.
+		 */
+		surface_fill_format_uniform(driver_data, surface_object);
+
 		surfaces_ids[i] = id;
 	}

@@ -221,26 +250,32 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,
 {
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;
-	unsigned int i, j;
+	unsigned int i;

 	for (i = 0; i < surfaces_count; i++) {
 		surface_object = SURFACE(driver_data, surfaces_ids[i]);
 		if (surface_object == NULL)
 			return VA_STATUS_ERROR_INVALID_SURFACE;

-		if (surface_object->source_data != NULL &&
-		    surface_object->source_size > 0)
-			munmap(surface_object->source_data,
-			       surface_object->source_size);
+		/*
+		 * source_* are now transient borrows from request_pool, not
+		 * surface-owned mappings; the pool owns the underlying mmap.
+		 * Nothing to free here.
+		 *
+		 * Iter2 Fix 3: destination_* mappings are owned by cap_pool;
+		 * surface_unbind_slot returns the slot to FREE (closing OUR
+		 * EXPBUF fd if any). Pool-owned mmaps are freed at
+		 * cap_pool_destroy time (RequestDestroyContext).
+		 */
+		surface_unbind_slot(driver_data, surface_object);

-		for (j = 0; j < surface_object->destination_buffers_count; j++)
-			if (surface_object->destination_map[j] != NULL &&
-			    surface_object->destination_map_lengths[j] > 0)
-				munmap(surface_object->destination_map[j],
-				       surface_object->destination_map_lengths[j]);
-
-		if (surface_object->request_fd > 0)
-			close(surface_object->request_fd);
+		/*
+		 * iter6: request_fd is owned by the OUTPUT pool slot, not by
+		 * the surface. Do not close here. The pool closes all slot
+		 * fds at request_pool_destroy time, which fires from
+		 * RequestTerminate (driver unload) — the pool is driver-wide
+		 * and survives context destroy/recreate cycles.
+		 */

 		object_heap_free(&driver_data->surface_heap,
 				 (struct object_base *)surface_object);
@@ -251,8 +286,9 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,

 VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
 {
+
 	struct request_data *driver_data = context->pDriverData;
-	struct object_surface *surface_object;
+	struct object_surface *surface_object = NULL;
 	VAStatus status;
 	struct video_format *video_format;
 	unsigned int output_type, capture_type;
@@ -297,19 +333,51 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
 		goto error;
 	}

+	/*
+	 * iter6: the request_fd belongs to the OUTPUT pool slot, not to the
+	 * surface. REINIT to reset its state in place — close+alloc would
+	 * reuse the lowest-free fd number against a kernel object whose
+	 * teardown hasn't fully drained, racing with QBUF on a slot that
+	 * was just released. The pool's 1:1 slot-to-fd binding eliminates
+	 * cross-slot fd reuse, and REINIT here resets the request object
+	 * for the next decode cycle on the same slot.
+	 *
+	 * Iter4's frame-11 EINVAL (which prompted the iter4 close+alloc
+	 * model) was a control-payload bug — DPB carry-over with FFmpeg's
+	 * V4L2_H264_FRAME_REF semantics not yet matched. That's been fixed
+	 * since iter4 (`74d8dd1`), so REINIT is no longer compromised by
+	 * the cluster-validation EINVAL pattern.
+	 */
 	rc = media_request_reinit(request_fd);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
+	surface_object->request_fd = -1;

 	rc = v4l2_dequeue_buffer(driver_data->video_fd, -1, output_type,
 				 surface_object->source_index, 1);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
-		goto error;
+		/*
+		 * iter7: OUTPUT DQBUF failed. The V4L2 buffer is in an
+		 * indeterminate kernel state — it may still be QUEUED. Do
+		 * NOT return the slot to acquire-rotation: the next QBUF
+		 * on it would EINVAL. Jump past the generic error handler
+		 * so force_release is skipped and the slot stays dead-busy.
+		 */
+		goto error_buffer_indeterminate;
 	}

+	/*
+	 * OUTPUT buffer is back from the kernel: return its pool slot
+	 * for reuse and clear the surface's transient borrow handle.
+	 */
+	request_pool_release(&driver_data->output_pool,
+			     surface_object->source_index);
+	surface_object->source_data = NULL;
+	surface_object->source_size = 0;
+
 	rc = v4l2_dequeue_buffer(driver_data->video_fd, -1, capture_type,
 				 surface_object->destination_index,
 				 surface_object->destination_buffers_count);
@@ -318,14 +386,152 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
 		goto error;
 	}

+	/*
+	 * Iter2 Fix 3: CAPTURE buffer is back from the kernel with valid
+	 * pixel content. Transition the slot IN_DECODE → DECODED. The slot
+	 * stays bound to this surface until either ExportSurfaceHandle
+	 * (→ EXPORTED), the next BeginPicture for this surface (slot is
+	 * released first), or DestroySurfaces (release).
+	 */
+	if (surface_object->current_slot != NULL) {
+		cap_pool_mark_decoded(&driver_data->capture_pool,
+				      surface_object->current_slot);
+
+		/*
+		 * iter8 Phase 6 (γ): env-gated diagnostic dump of the CAPTURE
+		 * buffer immediately after DQBUF + mark_decoded. Distinguishes
+		 * "kernel didn't write" from "libva mis-reads" for Bug 4
+		 * (H.264 partial-fill). Off by default; enable with
+		 * LIBVA_V4L2_DUMP_CAPTURE=1. destination_data[] is valid here
+		 * (surface_bind_slot populated it at BeginPicture).
+		 */
+		static const char *dump_env = NULL;
+		static bool dump_env_checked = false;
+		if (!dump_env_checked) {
+			dump_env = getenv("LIBVA_V4L2_DUMP_CAPTURE");
+			dump_env_checked = true;
+		}
+		if (dump_env != NULL && dump_env[0] == '1') {
+			unsigned int p;
+			char hexbuf[128];
+			request_log("γ-dump: surface_id=%u v4l2_index=%u planes=%u\n",
+				    (unsigned int)surface_id,
+				    surface_object->destination_index,
+				    surface_object->destination_planes_count);
+			for (p = 0; p < surface_object->destination_planes_count; p++) {
+				const unsigned char *d = surface_object->destination_data[p];
+				size_t sz = surface_object->destination_sizes[p];
+				size_t scan_lim;
+				unsigned int nz = 0;
+				size_t i;
+				int pos;
+
+				if (d == NULL) {
+					request_log("γ-dump:  plane[%u] NULL ptr (size=%zu)\n",
+						    p, sz);
+					continue;
+				}
+
+				/*
+				 * Phase 5 MIN-2: scan at least one Y-MB row
+				 * (16 lines * bytesperline) for plane 0, else
+				 * 1024 bytes for chroma plane.
+				 */
+				if (p == 0) {
+					size_t mbrow =
+					    surface_object->destination_bytesperlines[0] * 16;
+					scan_lim = sz < mbrow ? sz : mbrow;
+				} else {
+					scan_lim = sz < 1024 ? sz : 1024;
+				}
+				for (i = 0; i < scan_lim; i++)
+					if (d[i] != 0)
+						nz++;
+
+				request_log("γ-dump:  plane[%u] sz=%zu bpl=%u "
+					    "scan=%zu non_zero=%u\n",
+					    p, sz,
+					    surface_object->destination_bytesperlines[p],
+					    scan_lim, nz);
+
+				pos = 0;
+				for (i = 0; i < 32 && i < sz; i++)
+					pos += snprintf(hexbuf + pos,
+							sizeof(hexbuf) - pos,
+							"%02x ", d[i]);
+				request_log("γ-dump:  plane[%u] head[0..32]: %s\n",
+					    p, hexbuf);
+
+				if (sz >= 32) {
+					pos = 0;
+					for (i = 0; i < 32; i++)
+						pos += snprintf(hexbuf + pos,
+								sizeof(hexbuf) - pos,
+								"%02x ", d[sz - 32 + i]);
+					request_log("γ-dump:  plane[%u] tail[%zu..%zu]: %s\n",
+						    p, sz - 32, sz - 1, hexbuf);
+				}
+			}
+		}
+	}
+
 	surface_object->status = VASurfaceDisplaying;

 	status = VA_STATUS_SUCCESS;
 	goto complete;

 error:
-	if (request_fd >= 0) {
-		close(request_fd);
+	/*
+	 * iter7: error recovery for the OUTPUT pool slot. If the surface
+	 * acquired a slot in BeginPicture (source_data != NULL indicates
+	 * an active borrow), reset the slot's request_fd via
+	 * request_pool_force_release so the slot returns to the
+	 * acquire-rotation. force_release tries REINIT first; falls back
+	 * to close+alloc if REINIT fails; leaves the slot dead-busy if
+	 * even alloc fails (other slots unaffected). Replaces iter6's
+	 * accepted bounded leak.
+	 *
+	 * Reachable from: media_request_queue / wait_completion / REINIT
+	 * failures. NOT reachable for OUTPUT-DQBUF failure (separate label
+	 * `error_buffer_indeterminate` below) because in that case the
+	 * V4L2 buffer is in an indeterminate kernel state and reusing the
+	 * slot would EINVAL on the next QBUF.
+	 *
+	 * If the surface never acquired a slot (source_data == NULL),
+	 * there is no slot to release; nothing to do.
+	 */
+	if (surface_object != NULL) {
+		if (surface_object->source_data != NULL) {
+			request_pool_force_release(&driver_data->output_pool,
+						   surface_object->source_index);
+			surface_object->source_data = NULL;
+			surface_object->source_size = 0;
+		}
+		surface_object->request_fd = -1;
+	}
+	goto complete;
+
+error_buffer_indeterminate:
+	/*
+	 * iter7: OUTPUT DQBUF failed after a successful REINIT. The kernel
+	 * V4L2 buffer is in an unknown state (possibly still QUEUED with
+	 * pending decode result, possibly half-dequeued, possibly stuck
+	 * in driver internals). The slot's request_fd has already been
+	 * REINIT'd to a clean state, but reusing the slot for a new
+	 * decode would QBUF on a buffer the kernel may still hold —
+	 * triggering exactly the iter6 race we eliminated for the happy
+	 * path.
+	 *
+	 * Leave the slot dead-busy: don't release, don't force_release.
+	 * Other slots are unaffected. If this fires repeatedly, the pool
+	 * leaks slots until starvation, at which point acquire returns -1
+	 * and BeginPicture cleanly propagates ALLOCATION_FAILED. This is
+	 * a strictly safer failure mode than reusing an indeterminate
+	 * V4L2 buffer.
+	 */
+	if (surface_object != NULL) {
+		surface_object->source_data = NULL;
+		surface_object->source_size = 0;
 		surface_object->request_fd = -1;
 	}

@@ -338,6 +544,7 @@ VAStatus RequestQuerySurfaceAttributes(VADriverContextP context,
 				       VASurfaceAttrib *attributes,
 				       unsigned int *attributes_count)
 {
+
 	struct request_data *driver_data = context->pDriverData;
 	VASurfaceAttrib *attributes_list;
 	unsigned int attributes_list_size = V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES *
@@ -416,6 +623,7 @@ VAStatus RequestQuerySurfaceStatus(VADriverContextP context,
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;

+
 	surface_object = SURFACE(driver_data, surface_id);
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
@@ -493,6 +701,18 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
 		goto error;
 	}

+	/*
+	 * Iter2 Fix 3: pool now owns OUR copy of the EXPBUF'd fd. The
+	 * consumer receives a dup'd / equivalent fd via the descriptor.
+	 * Slot transitions DECODED → EXPORTED; it will be force-recyclable
+	 * by LRU when the pool is exhausted, but FREE slots are always
+	 * preferred.
+	 */
+	if (surface_object->current_slot != NULL && export_fds_count > 0)
+		cap_pool_mark_exported(&driver_data->capture_pool,
+				       surface_object->current_slot,
+				       export_fds[0]);
+
 	planes_count = surface_object->destination_planes_count;

 	surface_descriptor->fourcc = VA_FOURCC_NV12;
@@ -506,27 +726,102 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
 		for (i = 0; i < planes_count; i++)
 			size += surface_object->destination_sizes[i];

+	/*
+	 * Iteration 2 Fix 2: choose drm_format_modifier conditionally on
+	 * pitch alignment. Mesa's WSI / Panfrost compositor path rejects
+	 * DRM_FORMAT_MOD_NONE (= LINEAR explicit) buffers whose pitch isn't
+	 * GPU-aligned (typically 64+ bytes for Mali). For 1920-wide content
+	 * the pitch is 1920 (64-aligned, fine); for 864-wide content the
+	 * pitch is 864 (only 16-aligned), Mesa rejects with "WSI pitch not
+	 * properly aligned" and Firefox falls back to SW.
+	 *
+	 * Setting DRM_FORMAT_MOD_INVALID tells the importer "modifier
+	 * unknown, treat as implicit / texture-only" — Firefox's
+	 * DMABufSurface.cpp:1920 explicitly omits modifier attribs from
+	 * eglCreateImage when the value is MOD_INVALID, bypassing Mesa's
+	 * scanout-alignment check. The buffer is then texture-imported
+	 * (small perf cost) instead of WSI scanout-imported, which is
+	 * the correct behavior for a buffer that doesn't meet scanout
+	 * alignment requirements.
+	 *
+	 * We branch on pitch alignment to preserve LINEAR semantics for
+	 * already-aligned content (avoids unnecessary perf cost on the
+	 * common 1920-wide case).
+	 *
+	 * Sonnet Phase 5 review (iter2 question 4) endorsed this
+	 * conditional approach over a universal MOD_INVALID change.
+	 */
 	for (i = 0; i < export_fds_count; i++) {
-		surface_descriptor->objects[i].drm_format_modifier =
-			video_format->drm_modifier;
+		uint64_t modifier = video_format->drm_modifier;
+		unsigned int bytesperline =
+			surface_object->destination_bytesperlines[0];
+		if (bytesperline & 63) /* not 64-byte aligned */
+			modifier = DRM_FORMAT_MOD_INVALID;
+		surface_descriptor->objects[i].drm_format_modifier = modifier;
 		surface_descriptor->objects[i].fd = export_fds[i];
 		surface_descriptor->objects[i].size = export_fds_count == 1 ?
 						      size :
 						      surface_object->destination_sizes[i];
 	}

-	surface_descriptor->num_layers = 1;
+	/*
+	 * Layer construction depends on the consumer's request flags
+	 * (VA_EXPORT_SURFACE_*_LAYERS):
+	 *
+	 *   COMPOSED_LAYERS (default, mpv): one layer carrying both
+	 *   Y and UV planes (drm_format=NV12, num_planes=2). Mesa
+	 *   imports as a single NV12 EGLImage.
+	 *
+	 *   SEPARATE_LAYERS (Firefox 150 RDD): two layers, Y as a
+	 *   single-plane R8 layer, UV as a single-plane GR88 layer.
+	 *   Firefox's GetVAAPISurfaceDescriptor passes
+	 *   VA_EXPORT_SURFACE_SEPARATE_LAYERS so its DMABufSurfaceYUV
+	 *   import code can address Y and UV planes independently.
+	 *   Without this branch, Firefox parsed our COMPOSED layout
+	 *   as if it were SEPARATE, found bogus layer-1 data, and
+	 *   silently fell back to FFmpeg(FFVPX) software decode.
+	 *
+	 * The earlier path 0001 mplane port assumed a single COMPOSED
+	 * shape — fine for mpv but breaks any consumer requesting
+	 * SEPARATE. Honor the flag.
+	 */
+	if ((flags & VA_EXPORT_SURFACE_SEPARATE_LAYERS) && planes_count == 2) {
+		surface_descriptor->num_layers = 2;

-	surface_descriptor->layers[0].drm_format = video_format->drm_format;
-	surface_descriptor->layers[0].num_planes = planes_count;
+		/* Layer 0: Y plane as DRM_FORMAT_R8 (1 byte/pixel luma). */
+		surface_descriptor->layers[0].drm_format = DRM_FORMAT_R8;
+		surface_descriptor->layers[0].num_planes = 1;
+		surface_descriptor->layers[0].object_index[0] =
+			export_fds_count == 1 ? 0 : 0;
+		surface_descriptor->layers[0].offset[0] =
+			surface_object->destination_offsets[0];
+		surface_descriptor->layers[0].pitch[0] =
+			surface_object->destination_bytesperlines[0];

-	for (i = 0; i < planes_count; i++) {
-		surface_descriptor->layers[0].object_index[i] =
-			export_fds_count == 1 ? 0 : i;
-		surface_descriptor->layers[0].offset[i] =
-			surface_object->destination_offsets[i];
-		surface_descriptor->layers[0].pitch[i] =
-			surface_object->destination_bytesperlines[i];
+		/* Layer 1: UV plane as DRM_FORMAT_GR88 (interleaved
+		 * U+V, 2 bytes/pixel chroma at half resolution). */
+		surface_descriptor->layers[1].drm_format = DRM_FORMAT_GR88;
+		surface_descriptor->layers[1].num_planes = 1;
+		surface_descriptor->layers[1].object_index[0] =
+			export_fds_count == 1 ? 0 : 1;
+		surface_descriptor->layers[1].offset[0] =
+			surface_object->destination_offsets[1];
+		surface_descriptor->layers[1].pitch[0] =
+			surface_object->destination_bytesperlines[1];
+	} else {
+		/* COMPOSED_LAYERS / default: one layer with all planes. */
+		surface_descriptor->num_layers = 1;
+		surface_descriptor->layers[0].drm_format = video_format->drm_format;
+		surface_descriptor->layers[0].num_planes = planes_count;
+
+		for (i = 0; i < planes_count; i++) {
+			surface_descriptor->layers[0].object_index[i] =
+				export_fds_count == 1 ? 0 : i;
+			surface_descriptor->layers[0].offset[i] =
+				surface_object->destination_offsets[i];
+			surface_descriptor->layers[0].pitch[i] =
+				surface_object->destination_bytesperlines[i];
+		}
 	}

 	status = VA_STATUS_SUCCESS;
@@ -32,6 +32,11 @@
 #include <va/va_backend.h>

 #include "object_heap.h"
+#include "cap_pool.h"
+
+#include "h265.h"
+
+struct request_data;

 #define SURFACE(data, id)                                                      \
 	((struct object_surface *)object_heap_lookup(&(data)->surface_heap, id))
@@ -40,7 +45,7 @@
 struct object_surface {
 	struct object_base base;

-	VAStatus status;
+	VASurfaceStatus status;
 	int width;
 	int height;

@@ -48,6 +53,26 @@ struct object_surface {
 	void *source_data;
 	unsigned int source_size;

+	/*
+	 * Iter2 Fix 3: destination_* fields below are now per-decode-cycle.
+	 * They are populated from current_slot in RequestBeginPicture and
+	 * remain valid through SyncSurface, ExportSurfaceHandle, and
+	 * DeriveImage/copy_surface_to_image (vaapi-copy path). Subsequent
+	 * BeginPicture for this surface releases the prior slot and
+	 * acquires a new one.
+	 *
+	 * destination_planes_count, destination_sizes, destination_offsets,
+	 * destination_bytesperlines are FORMAT-uniform across all CAPTURE
+	 * buffers, so they're set once at CreateSurfaces2 time and stay.
+	 *
+	 * destination_index, destination_map[], destination_map_lengths,
+	 * destination_map_offsets, destination_data[] are SLOT-specific
+	 * and re-populated each BeginPicture from current_slot.
+	 *
+	 * destination_buffers_count is also format-uniform (V4L2 planes
+	 * per buffer = 1 for single-plane MPLANE NV12).
+	 */
+	struct cap_pool_slot *current_slot;	/* iter2 Fix 3 */
 	unsigned int destination_index;
 	void *destination_map[VIDEO_MAX_PLANES];
 	unsigned int destination_map_lengths[VIDEO_MAX_PLANES];
@@ -64,6 +89,33 @@ struct object_surface {

 	struct timeval timestamp;

+	/*
+	 * AV1 Phase 3: for streams with apply_grain=1, VAAPI's
+	 * VADecPictureParameterBufferAV1 carries current_display_picture
+	 * (display-time surface) separate from current_frame (decode
+	 * target). vpu981 HW applies grain inline to the decode CAPTURE
+	 * buffer, so the decoded data lives in current_frame's slot — but
+	 * ffmpeg calls vaGetImage on current_display_picture which has no
+	 * slot bound. linked_decode_surface_id, set in av1_set_controls
+	 * on the display surface, points to the decode surface so
+	 * copy_surface_to_image can borrow its destination_data[].
+	 *
+	 * VA_INVALID_SURFACE = no link (the common case: 8-bit codecs,
+	 * AV1 with apply_grain=0, AV1 frames where cur_frame ==
+	 * cur_display).
+	 */
+	VASurfaceID linked_decode_surface_id;
+
+	/*
+	 * AV1 Phase 3: AV1 order_hint of the frame currently decoded into
+	 * this surface. VAAPI's VADecPictureParameterBufferAV1.order_hint
+	 * is per-frame; kernel's v4l2_ctrl_av1_frame.order_hints[8] is
+	 * per-reference. We track each decoded frame's order_hint here so
+	 * the next frame's av1_set_controls can populate order_hints[i]
+	 * from ref_frame_map[i] → SURFACE → av1_order_hint.
+	 */
+	uint8_t av1_order_hint;
+
 	union {
 		struct {
 			VAPictureParameterBufferMPEG2 picture;
@@ -73,15 +125,43 @@ struct object_surface {
 		} mpeg2;
 		struct {
 			VAIQMatrixBufferH264 matrix;
+			bool matrix_set;
 			VAPictureParameterBufferH264 picture;
 			VASliceParameterBufferH264 slice;
 		} h264;
 		struct {
 			VAPictureParameterBufferHEVC picture;
 			VASliceParameterBufferHEVC slice;
+			VASliceParameterBufferHEVC slices[HEVC_MAX_SLICES_PER_FRAME];
+			unsigned int num_slices;
 			VAIQMatrixBufferHEVC iqmatrix;
 			bool iqmatrix_set;
 		} h265;
+		struct {
+			VAPictureParameterBufferVP8 picture;
+			VASliceParameterBufferVP8 slice;
+			VAIQMatrixBufferVP8 iqmatrix;
+			bool iqmatrix_set;
+			VAProbabilityDataBufferVP8 probability;
+			bool probability_set;
+		} vp8;
+		struct {
+			VADecPictureParameterBufferVP9 picture;
+			VASliceParameterBufferVP9 slice;
+		} vp9;
+		/*
+		 * ampere-av1-enablement: AV1 needs picture-header +
+		 * variable number of slice/tile params (one per tile).
+		 * tile_group_entries[] holds parsed VASliceParameterBufferAV1
+		 * entries up to MAX_TILES; av1.c builds the matching
+		 * v4l2_ctrl_av1_tile_group_entry[] at set_controls time.
+		 */
+		struct {
+#define AV1_MAX_TILES 128
+			VADecPictureParameterBufferAV1 picture;
+			VASliceParameterBufferAV1 tile_group_entries[AV1_MAX_TILES];
+			unsigned int num_tile_group_entries;
+		} av1;
 	} params;

 	int request_fd;
@@ -125,4 +205,37 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
 				    VASurfaceID surface_id, uint32_t mem_type,
 				    uint32_t flags, void *descriptor);

+/*
+ * iter5b-β Commit D: populate a surface's format-uniform destination_*
+ * fields (planes_count, buffers_count, offsets, sizes, bytesperlines)
+ * from driver_data's cached CAPTURE-side geometry. Idempotent: skip
+ * if already filled (destination_planes_count != 0). Caller must
+ * ensure driver_data->fmt_valid is true (CreateContext has run).
+ *
+ * Called by:
+ *   - context.c::RequestCreateContext after v4l2_get_format(CAPTURE)
+ *     populates the cache; walks the surface_heap and fills every
+ *     existing surface (covers surfaces created before CreateContext,
+ *     including the ffmpeg vaapi-copy case where surfaces_count=0 is
+ *     passed but surfaces exist in the heap from earlier
+ *     CreateSurfaces2 calls).
+ *   - surface.c::RequestCreateSurfaces2 after surface allocation,
+ *     covering the case where CreateContext fired before this
+ *     CreateSurfaces2 call (fmt cache is valid, fill immediately).
+ */
+void surface_fill_format_uniform(struct request_data *driver_data,
+				 struct object_surface *surface_object);
+
+/*
+ * Iter2 Fix 3: bind / unbind a CAPTURE-pool slot to an object_surface.
+ * Called from picture.c::RequestBeginPicture (acquire+bind) and
+ * surface.c::RequestDestroySurfaces (unbind). Mirrors slot's V4L2 index
+ * and mmap pointers into surface_object->destination_* so existing
+ * QBUF/DQBUF/EXPBUF code paths see no behavioral change.
+ */
+void surface_bind_slot(struct object_surface *surface_object,
+		       struct cap_pool_slot *slot);
+void surface_unbind_slot(struct request_data *driver_data,
+			 struct object_surface *surface_object);
+
 #endif
@@ -27,7 +27,7 @@
 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
 #endif

-#ifndef __aarch64__
+#ifdef __arm__

 .text
 .syntax unified
@@ -428,37 +428,102 @@ int v4l2_export_buffer(int video_fd, unsigned int type, unsigned int index,
 	return 0;
 }

-int v4l2_set_control(int video_fd, int request_fd, unsigned int id, void *data,
-		     unsigned int size)
+/*
+ * Issue the extended-controls ioctl `ioc` (the callers in this file pass
+ * VIDIOC_G_EXT_CTRLS or VIDIOC_S_EXT_CTRLS) for `num_controls` entries of
+ * `control_array` on `video_fd`. When `request_fd` is non-negative the
+ * controls are scoped to that request (V4L2_CTRL_WHICH_REQUEST_VAL);
+ * otherwise `which` and `request_fd` are left zeroed.
+ *
+ * Returns the raw ioctl() result. On failure one diagnostic line is
+ * logged and errno is restored to the value ioctl() left behind, so the
+ * callers' own errno checks still see the syscall's error rather than
+ * whatever the logging path may have set.
+ */
+static int v4l2_ioctl_controls(int video_fd, int request_fd, unsigned long ioc,
+			       struct v4l2_ext_control *control_array,
+			       unsigned int num_controls)
 {
-	struct v4l2_ext_control control;
 	struct v4l2_ext_controls controls;
 	int rc;
 
-	memset(&control, 0, sizeof(control));
 	memset(&controls, 0, sizeof(controls));
 
-	control.id = id;
-	control.ptr = data;
-	control.size = size;
-
-	controls.controls = &control;
-	controls.count = 1;
+	controls.controls = control_array;
+	controls.count = num_controls;
 
 	if (request_fd >= 0) {
 		controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
 		controls.request_fd = request_fd;
 	}
 
-	rc = ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &controls);
+	rc = ioctl(video_fd, ioc, &controls);
 	if (rc < 0) {
-		request_log("Unable to set control: %s\n", strerror(errno));
+		/*
+		 * ampere-av1 Phase 2.1 diag: surface error_idx so the
+		 * caller's error path knows which CID failed validation.
+		 * error_idx >= count means the failure was pre-validation
+		 * (e.g., bad request_fd); in that case CID and size are
+		 * reported as 0.
+		 *
+		 * errno is captured first and restored after logging:
+		 * nothing visible here guarantees that request_log()
+		 * leaves errno untouched, and both v4l2_get_controls()
+		 * and v4l2_set_controls() read errno after we return.
+		 */
+		int saved_errno = errno;
+		unsigned int failed_cid = 0;
+		unsigned int failed_size = 0;
+
+		if (controls.error_idx < num_controls) {
+			failed_cid = control_array[controls.error_idx].id;
+			failed_size = control_array[controls.error_idx].size;
+		}
+
+		request_log("v4l2_ioctl_controls: rc=%d errno=%d (%s) "
+			    "ioc=0x%lx error_idx=%u count=%u "
+			    "failed_cid=0x%x failed_size=%u\n",
+			    rc, saved_errno, strerror(saved_errno), ioc,
+			    controls.error_idx, num_controls,
+			    failed_cid, failed_size);
+
+		errno = saved_errno;
+	}
+	return rc;
+}
+
+/*
+ * Read `num_controls` extended controls into `control_array` with
+ * VIDIOC_G_EXT_CTRLS, scoped to `request_fd` when it is non-negative.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int v4l2_get_controls(int video_fd, int request_fd,
+		      struct v4l2_ext_control *control_array,
+		      unsigned int num_controls)
+{
+	if (v4l2_ioctl_controls(video_fd, request_fd, VIDIOC_G_EXT_CTRLS,
+				control_array, num_controls) >= 0)
+		return 0;
+
+	/*
+	 * EACCES on G_EXT_CTRLS for request fds is the normal case on
+	 * this hantro rig — the kernel doesn't allow readback through
+	 * the request_fd. Caller (h264.c) tracks this with a one-time
+	 * "V4L2 readback unavailable" announcement, so this function
+	 * stays quiet for that errno and only reports other failures.
+	 */
+	if (errno == EACCES)
+		return -1;
+
+	request_log("Unable to get control(s): %s\n", strerror(errno));
+	return -1;
+}

+/*
+ * Write `num_controls` extended controls from `control_array` with
+ * VIDIOC_S_EXT_CTRLS, scoped to `request_fd` when it is non-negative.
+ *
+ * Returns 0 on success; on failure logs the errno text and returns -1.
+ */
+int v4l2_set_controls(int video_fd, int request_fd,
+		      struct v4l2_ext_control *control_array,
+		      unsigned int num_controls)
+{
+	if (v4l2_ioctl_controls(video_fd, request_fd, VIDIOC_S_EXT_CTRLS,
+				control_array, num_controls) < 0) {
+		request_log("Unable to set control(s): %s\n", strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Single-control convenience wrapper: wraps (`id`, `data`, `size`) in
+ * one zero-initialised v4l2_ext_control and hands it to
+ * v4l2_set_controls(). Returns whatever v4l2_set_controls() returns.
+ */
+int v4l2_set_control(int video_fd, int request_fd, unsigned int id, void *data,
+		     unsigned int size)
+{
+	struct v4l2_ext_control single;
+
+	/* Zero the whole entry so fields not set below are 0. */
+	memset(&single, 0, sizeof(single));
+	single.id = id;
+	single.size = size;
+	single.ptr = data;
+
+	return v4l2_set_controls(video_fd, request_fd, &single, 1);
+}
+
 int v4l2_set_stream(int video_fd, unsigned int type, bool enable)
 {
 	enum v4l2_buf_type buf_type = type;
@@ -474,3 +539,63 @@ int v4l2_set_stream(int video_fd, unsigned int type, bool enable)

 	return 0;
 }
+
+/*
+ * Ask the driver for the metadata of extended control `id` using
+ * VIDIOC_QUERY_EXT_CTRL. The result lands in `*qec`; when `qec` is NULL
+ * a local scratch structure is used instead, so the call only reports
+ * whether the query succeeded.
+ *
+ * Returns 0 when the ioctl succeeds, -1 otherwise.
+ */
+int v4l2_query_ext_ctrl(int video_fd, unsigned int id,
+			struct v4l2_query_ext_ctrl *qec)
+{
+	struct v4l2_query_ext_ctrl scratch;
+	struct v4l2_query_ext_ctrl *out = &scratch;
+
+	if (qec != NULL)
+		out = qec;
+
+	memset(out, 0, sizeof(*out));
+	out->id = id;
+
+	if (ioctl(video_fd, VIDIOC_QUERY_EXT_CTRL, out) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Query menu entry `index` of control `id` using VIDIOC_QUERYMENU and
+ * store the answer in `*qm`.
+ *
+ * Returns 0 when the ioctl succeeds, -1 when it fails or when `qm` is
+ * NULL (no ioctl is issued in the NULL case).
+ */
+int v4l2_query_menu(int video_fd, unsigned int id, unsigned int index,
+		    struct v4l2_querymenu *qm)
+{
+	if (qm == NULL)
+		return -1;
+
+	memset(qm, 0, sizeof(*qm));
+	qm->index = index;
+	qm->id = id;
+
+	return ioctl(video_fd, VIDIOC_QUERYMENU, qm) < 0 ? -1 : 0;
+}
+
+/*
+ * Report whether `value` is a menu index that the driver answers for
+ * control `id`.
+ *
+ * Returns false when the control cannot be queried or is neither a
+ * MENU nor an INTEGER_MENU control. Otherwise every index from the
+ * control's minimum to its maximum (advancing by its step, or by 1 when
+ * the step is 0) is queried in turn; the result is true as soon as the
+ * index equal to `value` is one the driver reports successfully.
+ */
+bool v4l2_ctrl_menu_has_value(int video_fd, unsigned int id,
+			      unsigned int value)
+{
+	struct v4l2_query_ext_ctrl qec;
+	struct v4l2_querymenu item;
+	long long idx;
+
+	if (v4l2_query_ext_ctrl(video_fd, id, &qec) < 0)
+		return false;
+
+	switch (qec.type) {
+	case V4L2_CTRL_TYPE_MENU:
+	case V4L2_CTRL_TYPE_INTEGER_MENU:
+		break;
+	default:
+		return false;
+	}
+
+	idx = qec.minimum;
+	while (idx <= qec.maximum) {
+		/* Query first, compare second: same ioctl sequence as before. */
+		if (v4l2_query_menu(video_fd, id, (unsigned int)idx,
+				    &item) >= 0 &&
+		    (unsigned int)idx == value)
+			return true;
+
+		idx += qec.step ? qec.step : 1;
+	}
+
+	return false;
+}
@@ -54,8 +54,47 @@ int v4l2_dequeue_buffer(int video_fd, int request_fd, unsigned int type,
 int v4l2_export_buffer(int video_fd, unsigned int type, unsigned int index,
 		       unsigned int flags, int *export_fds,
 		       unsigned int export_fds_count);
+int v4l2_get_controls(int video_fd, int request_fd,
+		      struct v4l2_ext_control *controls,
+		      unsigned int num_controls);
+int v4l2_set_controls(int video_fd, int request_fd,
+		      struct v4l2_ext_control *controls,
+		      unsigned int num_controls);
 int v4l2_set_control(int video_fd, int request_fd, unsigned int id, void *data,
 		     unsigned int size);
 int v4l2_set_stream(int video_fd, unsigned int type, bool enable);

+/*
+ * Capability-probe helpers. These let calling code discover what the
+ * backing kernel driver supports rather than hardcoding assumptions
+ * about specific decoder hardware.
+ */
+
+/*
+ * Query the metadata of an extended control by CID. Fills *qec on
+ * success. Returns 0 if the control exists, -1 (errno=EINVAL) if the
+ * driver does not expose this CID. Pass qec=NULL to test existence
+ * only.
+ */
+struct v4l2_query_ext_ctrl;
+int v4l2_query_ext_ctrl(int video_fd, unsigned int id,
+			struct v4l2_query_ext_ctrl *qec);
+
+/*
+ * Query a single menu item of a menu/intmenu control at the given
+ * index. Fills *qm on success. Returns 0 if the menu item exists at
+ * this index, -1 otherwise.
+ */
+struct v4l2_querymenu;
+int v4l2_query_menu(int video_fd, unsigned int id, unsigned int index,
+		    struct v4l2_querymenu *qm);
+
+/*
+ * Convenience: for a menu-type control, return true iff `value` is a
+ * valid menu entry (i.e. the driver accepts it). Walks all menu items
+ * up to the control's maximum to check.
+ */
+bool v4l2_ctrl_menu_has_value(int video_fd, unsigned int id,
+			      unsigned int value);
+
 #endif
@@ -39,12 +39,14 @@ static struct video_format formats[] = {
 		.description		= "NV12 YUV",
 		.v4l2_format		= V4L2_PIX_FMT_NV12,
 		.v4l2_buffers_count	= 1,
-		.v4l2_mplane		= false,
+		.v4l2_mplane		= true,
 		.drm_format		= DRM_FORMAT_NV12,
 		.drm_modifier		= DRM_FORMAT_MOD_NONE,
 		.planes_count		= 2,
 		.bpp			= 16,
 	},
+// Code to handle this DRM_FORMAT is __arm__ only
+#ifdef __arm__
 	{
 		.description		= "Sunxi tiled NV12 YUV",
 		.v4l2_format		= V4L2_PIX_FMT_SUNXI_TILED_NV12,
@@ -55,6 +57,7 @@ static struct video_format formats[] = {
 		.planes_count		= 2,
 		.bpp			= 16
 	},
+#endif
 };

 static unsigned int formats_count = sizeof(formats) / sizeof(formats[0]);
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * fresnel-fourier iter3 Phase 6 commit B: VP8 codec dispatcher
+ * implemented against V4L2_CID_STATELESS_VP8_FRAME (kernel UAPI
+ * <linux/v4l2-controls.h>:1900-1958). Single batched control per
+ * frame, no init-time device-wide menus (VP8 has no DECODE_MODE/
+ * START_CODE — confirmed by Phase 0 V4L2 inventory + Phase 3
+ * cross-validator strace).
+ *
+ * Reference: FFmpeg libavcodec/v4l2_request_vp8.c (kwiboo branch);
+ *            FFmpeg libavcodec/vaapi_vp8.c (VAAPI source-side
+ *            verification of the field semantics);
+ *            kernel drivers/media/platform/verisilicon/
+ *              hantro_g1_vp8_dec.c (RK3399 hardware reads
+ *              first_part_header_bits + first_part_size to compute
+ *              MB-data DMA offset).
+ *
+ * Phase 5 review amendments incorporated (see phase5_iter3_review.md):
+ *   C1 first_part_header_bits = slice->macroblock_offset
+ *      (NOT 0; kernel reads it unconditionally; same formula as
+ *      v4l2_request_vp8.c uses internally)
+ *   C2 first_part_size = slice->partition_size[0] +
+ *                        ((macroblock_offset + 7) / 8)
+ *      (recover total partition size from VAAPI's post-parse
+ *      remainder)
+ *   C3 VAProbabilityBufferType (not VAProbabilityDataBufferType)
+ *   C4 (int8_t) cast (not (s8); kernel-internal typedef not in
+ *      userspace UAPI)
+ *   S3 assert(probability_set) runtime guard (kernel has NO
+ *      coeff_probs default fallback; consumer MUST send
+ *      VAProbabilityBufferType per frame)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "vp8.h"
+#include "context.h"
+#include "request.h"
+#include "surface.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <sys/ioctl.h>
+
+#include <linux/videodev2.h>
+#include <linux/v4l2-controls.h>
+
+#include "v4l2.h"
+
+/*
+ * Translate the VA-API VP8 picture, slice, IQ-matrix and probability
+ * buffers stored in surface_object->params.vp8 into one
+ * struct v4l2_ctrl_vp8_frame and submit it as
+ * V4L2_CID_STATELESS_VP8_FRAME on the surface's request_fd.
+ *
+ * `context_object` is not used by this function.
+ *
+ * Returns 0 on success, or VA_STATUS_ERROR_OPERATION_FAILED when no
+ * probability buffer was recorded for this frame or when the control
+ * could not be set.
+ */
+int vp8_set_controls(struct request_data *driver_data,
+		     struct object_context *context_object,
+		     struct object_surface *surface_object)
+{
+	VAPictureParameterBufferVP8 *picture =
+		&surface_object->params.vp8.picture;
+	VASliceParameterBufferVP8 *slice =
+		&surface_object->params.vp8.slice;
+	VAIQMatrixBufferVP8 *iqmatrix =
+		&surface_object->params.vp8.iqmatrix;
+	VAProbabilityDataBufferVP8 *probability =
+		&surface_object->params.vp8.probability;
+	bool iqmatrix_set = surface_object->params.vp8.iqmatrix_set;
+	bool probability_set = surface_object->params.vp8.probability_set;
+
+	struct v4l2_ctrl_vp8_frame frame;
+	struct object_surface *last_ref;
+	struct object_surface *golden_ref;
+	struct object_surface *alt_ref;
+	int rc;
+	int i, j;
+
+	/* Phase 5 S3: kernel has no coeff_probs default fallback. The
+	 * VAAPI consumer chain (FFmpeg's vaapi_vp8.c:146-148, used by
+	 * mpv and ffmpeg-vaapi) always sends VAProbabilityBufferType
+	 * per frame. If a future consumer doesn't, fail the frame
+	 * instead of asserting: assert() would abort the host process
+	 * in debug builds and compile away under NDEBUG, in which case
+	 * an all-zero coeff_probs table would be submitted silently. */
+	if (!probability_set)
+		return VA_STATUS_ERROR_OPERATION_FAILED;
+
+	memset(&frame, 0, sizeof frame);
+
+	/* Clause 3: frame geometry + per-frame scalars */
+	frame.width = picture->frame_width;
+	frame.height = picture->frame_height;
+	frame.horizontal_scale = 0;  /* not exposed by VAAPI */
+	frame.vertical_scale = 0;
+	frame.version = picture->pic_fields.bits.version;
+	frame.prob_skip_false = picture->prob_skip_false;
+	frame.prob_intra = picture->prob_intra;
+	frame.prob_last = picture->prob_last;
+	frame.prob_gf = picture->prob_gf;
+	/* Phase 3 Q2: VAAPI counts include control partition;
+	 * kernel counts DCT only — off-by-one. */
+	frame.num_dct_parts = slice->num_of_partitions - 1;
+
+	/* Clause 4: DPB timestamp resolution (mirrors mpeg2.c pattern;
+	 * NULL surface → timestamp stays 0 from memset). */
+	last_ref = SURFACE(driver_data, picture->last_ref_frame);
+	golden_ref = SURFACE(driver_data, picture->golden_ref_frame);
+	alt_ref = SURFACE(driver_data, picture->alt_ref_frame);
+	if (last_ref != NULL)
+		frame.last_frame_ts =
+			v4l2_timeval_to_ns(&last_ref->timestamp);
+	if (golden_ref != NULL)
+		frame.golden_frame_ts =
+			v4l2_timeval_to_ns(&golden_ref->timestamp);
+	if (alt_ref != NULL)
+		frame.alt_frame_ts =
+			v4l2_timeval_to_ns(&alt_ref->timestamp);
+
+	/* Clause 5: loop filter mapping */
+	for (i = 0; i < 4; i++) {
+		frame.lf.ref_frm_delta[i] =
+			picture->loop_filter_deltas_ref_frame[i];
+		frame.lf.mb_mode_delta[i] =
+			picture->loop_filter_deltas_mode[i];
+	}
+	frame.lf.sharpness_level = picture->pic_fields.bits.sharpness_level;
+	frame.lf.level = picture->loop_filter_level[0];
+	if (picture->pic_fields.bits.loop_filter_adj_enable)
+		frame.lf.flags |= V4L2_VP8_LF_ADJ_ENABLE;
+	if (picture->pic_fields.bits.mode_ref_lf_delta_update)
+		frame.lf.flags |= V4L2_VP8_LF_DELTA_UPDATE;
+	if (picture->pic_fields.bits.filter_type)
+		frame.lf.flags |= V4L2_VP8_LF_FILTER_TYPE_SIMPLE;
+
+	/* Clause 6: quantization base + delta derivation. Every delta
+	 * is expressed relative to quantization_index[0][0]. */
+	if (iqmatrix_set) {
+		frame.quant.y_ac_qi =
+			iqmatrix->quantization_index[0][0];
+		frame.quant.y_dc_delta = (int8_t)
+			(iqmatrix->quantization_index[0][1] -
+			 iqmatrix->quantization_index[0][0]);
+		frame.quant.y2_dc_delta = (int8_t)
+			(iqmatrix->quantization_index[0][2] -
+			 iqmatrix->quantization_index[0][0]);
+		frame.quant.y2_ac_delta = (int8_t)
+			(iqmatrix->quantization_index[0][3] -
+			 iqmatrix->quantization_index[0][0]);
+		frame.quant.uv_dc_delta = (int8_t)
+			(iqmatrix->quantization_index[0][4] -
+			 iqmatrix->quantization_index[0][0]);
+		frame.quant.uv_ac_delta = (int8_t)
+			(iqmatrix->quantization_index[0][5] -
+			 iqmatrix->quantization_index[0][0]);
+	}
+
+	/* Segment 0 is the base, so quant_update[0] stays 0 from memset. */
+	if (picture->pic_fields.bits.segmentation_enabled && iqmatrix_set) {
+		for (i = 1; i < 4; i++)
+			frame.segment.quant_update[i] = (int8_t)
+				(iqmatrix->quantization_index[i][0] -
+				 iqmatrix->quantization_index[0][0]);
+	}
+
+	/* Clause 7: segment fields */
+	for (i = 0; i < 3; i++)
+		frame.segment.segment_probs[i] =
+			picture->mb_segment_tree_probs[i];
+	if (picture->pic_fields.bits.segmentation_enabled)
+		frame.segment.flags |= V4L2_VP8_SEGMENT_FLAG_ENABLED;
+	if (picture->pic_fields.bits.update_mb_segmentation_map)
+		frame.segment.flags |= V4L2_VP8_SEGMENT_FLAG_UPDATE_MAP;
+	if (picture->pic_fields.bits.update_segment_feature_data)
+		frame.segment.flags |=
+			V4L2_VP8_SEGMENT_FLAG_UPDATE_FEATURE_DATA;
+	/* DELTA_VALUE_MODE: VAAPI doesn't expose abs_delta. FFmpeg sets
+	 * unconditionally per !s->segmentation.absolute_vals (default).
+	 * Kernel ignores when ENABLED bit clear (BBB case). */
+	frame.segment.flags |= V4L2_VP8_SEGMENT_FLAG_DELTA_VALUE_MODE;
+
+	/* Per-segment loop-filter levels, relative to segment 0. */
+	if (picture->pic_fields.bits.segmentation_enabled) {
+		for (i = 0; i < 4; i++)
+			frame.segment.lf_update[i] = (int8_t)
+				(picture->loop_filter_level[i] -
+				 picture->loop_filter_level[0]);
+	}
+
+	/* Clause 8: entropy table mapping (3 VAAPI sources merged) */
+	for (i = 0; i < 4; i++)
+		frame.entropy.y_mode_probs[i] = picture->y_mode_probs[i];
+	for (i = 0; i < 3; i++)
+		frame.entropy.uv_mode_probs[i] = picture->uv_mode_probs[i];
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < 19; j++)
+			frame.entropy.mv_probs[i][j] =
+				picture->mv_probs[i][j];
+	/* coeff_probs[4][8][3][11]: VAAPI layout matches kernel exactly;
+	 * direct memcpy. Both vaapi_vp8.c:133-143 and v4l2_request_vp8.c:
+	 * 141-153 apply identical coeff_bands_inverse reordering before
+	 * writing — VAAPI consumer has done the reordering for us. */
+	memcpy(frame.entropy.coeff_probs,
+	       probability->dct_coeff_probs,
+	       sizeof frame.entropy.coeff_probs);
+
+	/* Clause 9: coder state + first-partition fields */
+	frame.coder_state.range = picture->bool_coder_ctx.range;
+	frame.coder_state.value = picture->bool_coder_ctx.value;
+	frame.coder_state.bit_count = picture->bool_coder_ctx.count;
+
+	/* Phase 5 C1+C2: macroblock_offset IS first_part_header_bits by
+	 * source identity; kernel hantro_g1_vp8_dec.c:260 reads it
+	 * unconditionally to compute MB-data DMA offset. partition_size[0]
+	 * is the post-parse REMAINDER; recover total via
+	 * + ceil(macroblock_offset/8). */
+	frame.first_part_header_bits = slice->macroblock_offset;
+	frame.first_part_size =
+		slice->partition_size[0] +
+		((uint32_t)slice->macroblock_offset + 7) / 8;
+
+	/* partition_size[0] is the first partition; DCT partitions
+	 * follow at indices 1..8. */
+	for (i = 0; i < 8; i++)
+		frame.dct_part_sizes[i] = slice->partition_size[i + 1];
+
+	/* Clause 9: flags assembly (6 mainline-documented bits only;
+	 * EXPERIMENTAL + bit 0x40 NOT replicated despite ffmpeg-v4l2-
+	 * request-git setting them — kernel hantro_vp8.c only inspects
+	 * KEY_FRAME bit). VAAPI inverts: key_frame=0 means it IS a
+	 * keyframe per VP8 spec. */
+	if (!picture->pic_fields.bits.key_frame)
+		frame.flags |= V4L2_VP8_FRAME_FLAG_KEY_FRAME;
+	frame.flags |= V4L2_VP8_FRAME_FLAG_SHOW_FRAME;
+	if (picture->pic_fields.bits.mb_no_coeff_skip)
+		frame.flags |= V4L2_VP8_FRAME_FLAG_MB_NO_SKIP_COEFF;
+	if (picture->pic_fields.bits.sign_bias_golden)
+		frame.flags |= V4L2_VP8_FRAME_FLAG_SIGN_BIAS_GOLDEN;
+	if (picture->pic_fields.bits.sign_bias_alternate)
+		frame.flags |= V4L2_VP8_FRAME_FLAG_SIGN_BIAS_ALT;
+
+	/* Clause 1+10: single-control batched submission */
+	struct v4l2_ext_control ctrls[1] = {
+		{
+			.id = V4L2_CID_STATELESS_VP8_FRAME,
+			.ptr = &frame,
+			.size = sizeof frame,
+		},
+	};
+
+	rc = v4l2_set_controls(driver_data->video_fd,
+			       surface_object->request_fd,
+			       ctrls, 1);
+	if (rc < 0)
+		return VA_STATUS_ERROR_OPERATION_FAILED;
+
+	return 0;
+}
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * fresnel-fourier iter3: VP8 codec dispatcher header.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _VP8_H_
+#define _VP8_H_
+
+struct object_context;
+struct object_surface;
+struct request_data;
+
+int vp8_set_controls(struct request_data *driver_data,
+		     struct object_context *context,
+		     struct object_surface *surface_object);
+
+#endif
@@ -0,0 +1,754 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * fresnel-fourier iter4 Phase 6 commit B: VP9 codec dispatcher
+ * implemented against V4L2_CID_STATELESS_VP9_FRAME (0xa40a2c) +
+ * V4L2_CID_STATELESS_VP9_COMPRESSED_HDR (0xa40a2d). rkvdec on
+ * RK3399 mandatorily requires both controls per
+ * drivers/staging/media/rkvdec/rkvdec-vp9.c::rkvdec_vp9_run_preamble:752.
+ *
+ * Reference: FFmpeg libavcodec/v4l2_request_vp9.c (kwiboo branch);
+ *            FFmpeg libavcodec/vaapi_vp9.c (VAAPI source-side
+ *            verification of field semantics);
+ *            kernel drivers/media/v4l2-core/v4l2-vp9.c +
+ *              drivers/staging/media/rkvdec/rkvdec-vp9.c.
+ *
+ * Phase 5 review amendments incorporated (see phase5_iter4_review.md):
+ *   C1 frame.interpolation_filter = picture->mcomp_filter_type
+ *      (NO XOR; vaapi_vp9.c:62 already applied the XOR before storing
+ *      into VAAPI's mcomp_filter_type; double-XOR would swap
+ *      EIGHTTAP and EIGHTTAP_SMOOTH for inter frames)
+ *   C2 LF deltas persisted across frames in object_context.vp9_lf,
+ *      init to VP9 spec defaults {1,0,-1,-1,0,0} on
+ *      keyframe/intra_only/error_resilient, updated only when parsed
+ *      lf_delta.update=1, ALWAYS copied to kernel control
+ *   C3 vp9_fill_compressed_hdr takes out_reference_mode pointer
+ *      (reference_mode lives in v4l2_ctrl_vp9_frame, NOT in
+ *      _compressed_hdr; threaded via parameter)
+ *
+ * Suggested findings incorporated:
+ *   S4 uv_mode memcpy from FFmpeg's fill_compressed_hdr is omitted —
+ *      rkvdec reads uv_mode from kernel's persistent
+ *      probability_tables, NOT from prob_updates ctrl
+ *   S5 lossless_flag semantics align with FFmpeg's s->s.h.lossless
+ *      (LosslessFlag = base_qindex==0 && y_dc_delta_q==0 &&
+ *      uv_dc_delta_q==0 && uv_ac_delta_q==0)
+ */
+
+#include "vp9.h"
+
+#include "v4l2.h"
+#include "utils.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/v4l2-controls.h>
+#include <linux/videodev2.h>
+
+/*
+ * Clause 3: compile-time size assertions. UAPI shifts must fail loudly.
+ * The expected byte counts (168 and 2040) are pinned here so that building
+ * against a kernel header with a different struct size stops the build
+ * instead of submitting a control payload of the wrong length.
+ */
+_Static_assert(sizeof(struct v4l2_ctrl_vp9_frame) == 168,
+	       "v4l2_ctrl_vp9_frame size mismatch — kernel UAPI changed");
+_Static_assert(sizeof(struct v4l2_ctrl_vp9_compressed_hdr) == 2040,
+	       "v4l2_ctrl_vp9_compressed_hdr size mismatch — kernel UAPI changed");
+
+/*
+ * VPX range coder — minimal port of FFmpeg vpx_rac.[ch] + vp89_rac.h.
+ * Stateless static helpers; bitstream-only readers. ~80 LOC.
+ */
+
+/*
+ * Renormalisation shift table, indexed by the decoder's current range
+ * (vp9_rac.high) in vp9_rac_renorm(). Entry v is the left shift that
+ * brings v up to at least 128: 0 for 128..255, 1 for 64..127, and so on
+ * up to 7 for v == 1 (entry 0 is 8).
+ */
+static const uint8_t vpx_norm_shift[256] = {
+	8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,
+	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+/* Range-decoder state shared by the vp9_rac_* helpers below. */
+struct vp9_rac {
+	int high;			/* current range; vp9_rac_init() starts it at 255 */
+	int bits;			/* refill counter; starts at -16, a refill is due once it is >= 0 */
+	const uint8_t *buffer;		/* next input byte to be consumed */
+	const uint8_t *end;		/* one past the last input byte (buf + size) */
+	unsigned int code_word;		/* current code window, primed from the first three input bytes */
+};
+
+/*
+ * Initialise the range decoder over buf[0..size).
+ *
+ * code_word is primed with the first three input bytes, most significant
+ * first. Inputs shorter than three bytes are zero-extended rather than
+ * read past their end, and the read pointer is never moved beyond c->end.
+ * For size >= 3 the resulting state is the same as a plain 24-bit
+ * big-endian load followed by advancing three bytes.
+ *
+ * Returns 0 on success, -1 when buf is a null pointer or size < 1.
+ */
+static int vp9_rac_init(struct vp9_rac *c, const uint8_t *buf, int size)
+{
+	int primed;
+	int i;
+
+	if (!buf || size < 1)
+		return -1;
+
+	/* Number of real bytes available for the 24-bit priming load. */
+	primed = size < 3 ? size : 3;
+
+	c->high = 255;
+	c->bits = -16;
+	c->buffer = buf;
+	c->end = buf + size;
+	c->code_word = 0;
+	for (i = 0; i < 3; i++) {
+		c->code_word <<= 8;
+		if (i < primed)
+			c->code_word |= buf[i];
+	}
+	c->buffer += primed;
+	return 0;
+}
+
+/*
+ * Renormalise the decoder: scale high and the code word by the shift
+ * looked up in vpx_norm_shift[] and, once the refill counter is
+ * non-negative, pull up to 16 more input bits into the code word.
+ *
+ * A single trailing input byte is still consumed (its low half is
+ * zero-filled) so that the last byte of an odd-length remainder takes part
+ * in decoding; once the input is exhausted only zero bits are shifted in.
+ *
+ * Updates c->high, c->bits and c->buffer. Returns the renormalised code
+ * word; the caller is responsible for storing it back into c->code_word.
+ */
+static unsigned vp9_rac_renorm(struct vp9_rac *c)
+{
+	int shift = vpx_norm_shift[c->high];
+	int bits = c->bits;
+	unsigned code_word = c->code_word;
+
+	c->high <<= shift;
+	code_word <<= shift;
+	bits += shift;
+	if (bits >= 0 && c->buffer < c->end) {
+		/* High half always comes from a real byte ... */
+		unsigned refill = (unsigned)c->buffer[0] << 8;
+
+		c->buffer++;
+		/* ... the low half only when a second byte exists. */
+		if (c->buffer < c->end) {
+			refill |= c->buffer[0];
+			c->buffer++;
+		}
+		code_word |= refill << bits;
+		bits -= 16;
+	}
+	c->bits = bits;
+	return code_word;
+}
+
+/*
+ * Decode one binary symbol with probability prob (out of 256) for a 0.
+ * Renormalises first, then splits the range and keeps the half the code
+ * word falls into. Returns the decoded bit (0 or 1).
+ */
+static int vp9_rac_get_prob(struct vp9_rac *c, uint8_t prob)
+{
+	unsigned value = vp9_rac_renorm(c);
+	unsigned split = 1 + (((c->high - 1) * prob) >> 8);
+	unsigned threshold = split << 16;
+
+	if (value >= threshold) {
+		/* Upper part of the range: symbol is 1. */
+		c->high = c->high - split;
+		c->code_word = value - threshold;
+		return 1;
+	}
+
+	/* Lower part of the range: symbol is 0. */
+	c->high = split;
+	c->code_word = value;
+	return 0;
+}
+
+/*
+ * Convenience wrapper around vp9_rac_get_prob() that takes the probability
+ * as an int; the value is narrowed to uint8_t before use.
+ */
+static int vp9_rac_get_branchy(struct vp9_rac *c, int prob)
+{
+	return vp9_rac_get_prob(c, (uint8_t)prob);
+}
+
+/* Decode one symbol with probability 128 (out of 256), i.e. an even split. */
+static int vp9_rac_bit(struct vp9_rac *c)
+{
+	return vp9_rac_get_prob(c, 128);
+}
+
+/*
+ * Decode a `bits`-wide unsigned value, most significant bit first, one
+ * vp9_rac_bit() call per bit. Returns 0 when bits is 0.
+ */
+static int vp9_rac_uint(struct vp9_rac *c, int bits)
+{
+	int acc = 0;
+
+	for (; bits != 0; bits--)
+		acc = (acc << 1) | vp9_rac_bit(c);
+	return acc;
+}
+
+/* inv_map_table: VP9 differential probability update table.
+ * Verbatim copy from FFmpeg v4l2_request_vp9.c:44-64.
+ * 255 entries (indices 0..254); read_prob_delta() uses its decoded index
+ * to look up the value it returns. */
+static const uint8_t inv_map_table[255] = {
+	  7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
+	189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
+	 10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
+	 25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
+	 40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
+	 55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
+	 70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
+	 86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
+	101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
+	116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
+	131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
+	146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
+	161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+	177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
+	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
+	207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
+	222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
+	237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
+	252, 253, 253,
+};
+
+/*
+ * Decode one probability-update index and map it through inv_map_table.
+ *
+ * The index is coded with a prefix of up to three bits selecting one of
+ * four ranges: 0..15 (4 bits), 16..31 (4 bits), 32..63 (5 bits) or
+ * 64..254 (7 bits plus an optional extension bit).
+ */
+static int read_prob_delta(struct vp9_rac *c)
+{
+	int idx;
+
+	if (!vp9_rac_bit(c))
+		return inv_map_table[vp9_rac_uint(c, 4)];
+
+	if (!vp9_rac_bit(c))
+		return inv_map_table[vp9_rac_uint(c, 4) + 16];
+
+	if (!vp9_rac_bit(c))
+		return inv_map_table[vp9_rac_uint(c, 5) + 32];
+
+	idx = vp9_rac_uint(c, 7);
+	if (idx >= 65)
+		idx = (idx << 1) - 65 + vp9_rac_bit(c);
+	return inv_map_table[idx + 64];
+}
+
+/*
+ * Clause 6: minimal big-endian bit reader over the uncompressed header
+ * for the fields VAAPI doesn't expose: lf_delta_enabled / lf_delta_update /
+ * lf_ref_deltas / lf_mode_deltas / base_q_idx / delta_q_y_dc / delta_q_uv_dc /
+ * delta_q_uv_ac.
+ *
+ * Walks: frame_marker(2) + profile(2 or 3) + show_existing_frame(1) +
+ *        frame_type(1) + show_frame(1) + error_resilient(1) +
+ *        if keyframe: sync_code(24) + color_config + frame_size + render_size
+ *        else: intra_only(1 if !show_frame) + reset(2) +
+ *              if intra_only: sync_code(24) + (if profile>0: color_config) +
+ *                             refresh_flags(8) + frame_size + render_size
+ *              else: refresh_flags(8) + 3*(ref_idx(3)+sign_bias(1)) +
+ *                    frame_size_with_refs + allow_hpmv(1) + interp_filter(2 or 3)
+ *        loop_filter_params + quantization_params
+ *
+ * Only profile-0 paths are exercised for BBB; non-profile-0 fields read
+ * their bits but do not write them back. Keep targeted, not general.
+ */
+
+/* Cursor for the MSB-first bit reader used on the uncompressed header. */
+struct uh_reader {
+	const uint8_t *buf;	/* start of the header bytes */
+	size_t size;		/* number of readable bytes in buf */
+	size_t bit_pos;		/* index of the next bit to read, counted from bit 7 of buf[0] */
+};
+
+/*
+ * Read n bits, most significant first, and advance the cursor.
+ *
+ * If the cursor reaches the end of the buffer part-way through, the
+ * function returns 0 and leaves the cursor at the first unreadable bit.
+ * A non-positive n reads nothing and returns 0.
+ */
+static unsigned uh_read_bits(struct uh_reader *r, int n)
+{
+	unsigned acc = 0;
+
+	while (n-- > 0) {
+		size_t idx = r->bit_pos / 8;
+		unsigned shift = 7u - (unsigned)(r->bit_pos % 8);
+
+		if (idx >= r->size)
+			return 0;
+		acc = (acc << 1) | ((r->buf[idx] >> shift) & 1u);
+		r->bit_pos++;
+	}
+	return acc;
+}
+
+/* Phase 7 fix: VP9 spec s(N) is N magnitude bits + 1 sign bit (total N+1).
+ * Previous uh_read_signed_6 read 4+1=5 bits instead of 6+1=7; bit drift of
+ * 2 bits per ref_delta accumulated across the lf_delta updates and shifted
+ * base_q_idx by 8 bits, producing 0x41 (frame 1 keyframe) instead of 0x2e.
+ * Phase 3 anchor cross-check confirmed the corrected 7-bit read places
+ * base_q_idx at bit 111 with value 0x2e=46. */
+/*
+ * Read a sign-magnitude value: n magnitude bits followed by one sign bit
+ * (n + 1 bits in total). A set sign bit negates the magnitude.
+ */
+static int uh_read_sbits(struct uh_reader *r, int n)
+{
+	int magnitude = (int)uh_read_bits(r, n);
+
+	if (uh_read_bits(r, 1))
+		return -magnitude;
+	return magnitude;
+}
+
+/*
+ * read_delta_q(): one "coded" flag bit; when set, a 4-bit magnitude plus
+ * sign bit follows, otherwise the delta is 0.
+ */
+static int uh_read_delta_q(struct uh_reader *r)
+{
+	return uh_read_bits(r, 1) ? uh_read_sbits(r, 4) : 0;
+}
+
+/* Skip color_config(). The layout depends on the profile and on whether
+ * color_space is RGB (value 7): RGB carries no color_range or subsampling
+ * bits, only a reserved bit in profiles 1 and 3. */
+static void uh_skip_color_config(struct uh_reader *r, int profile)
+{
+	enum { UH_CS_RGB = 7 };
+	bool odd_profile = (profile == 1 || profile == 3);
+
+	if (profile >= 2)
+		uh_read_bits(r, 1);          /* ten_or_twelve_bit */
+	if (uh_read_bits(r, 3) != UH_CS_RGB) {   /* color_space */
+		uh_read_bits(r, 1);          /* color_range */
+		if (odd_profile) {
+			uh_read_bits(r, 2);  /* subsampling_x, subsampling_y */
+			uh_read_bits(r, 1);  /* reserved_zero */
+		}
+	} else if (odd_profile) {
+		uh_read_bits(r, 1);          /* reserved_zero */
+	}
+}
+
+/* Skip frame_size(): frame_width_minus_1(16) + frame_height_minus_1(16). */
+static void uh_skip_frame_size(struct uh_reader *r)
+{
+	uh_read_bits(r, 16);
+	uh_read_bits(r, 16);
+}
+
+/* Skip render_size(): one flag bit, then 16+16 bits when the flag is set. */
+static void uh_skip_render_size(struct uh_reader *r)
+{
+	if (uh_read_bits(r, 1)) {
+		uh_read_bits(r, 16);
+		uh_read_bits(r, 16);
+	}
+}
+
+/*
+ * Walk the VP9 uncompressed header far enough to recover the loop-filter
+ * delta and quantizer fields.
+ *
+ * data, size                header bytes and their count
+ * frame                     receives lf.flags (DELTA_ENABLED/DELTA_UPDATE)
+ *                           and quant.{base_q_idx, delta_q_y_dc,
+ *                           delta_q_uv_dc, delta_q_uv_ac}
+ * persistent_ref_deltas,    each entry is overwritten only when the header
+ * persistent_mode_deltas    carries an update for that entry
+ * out_keyframe_or_intraonly true when the header describes a keyframe or an
+ *                           intra-only frame
+ * out_lf_delta_updated      true when the header's delta-update bit is set
+ *
+ * When show_existing_frame is set the function returns early with frame
+ * untouched and both out flags false. Reads beyond `size` bytes yield 0
+ * (see uh_read_bits), so a truncated header is never read out of bounds.
+ *
+ * Bit-walk details that are easy to get wrong:
+ *  - frame_size_with_refs stops reading found_ref bits at the first one
+ *    that is set;
+ *  - refresh_frame_context and frame_parallel_decoding_mode are present
+ *    only when error_resilient_mode is 0, while frame_context_idx is
+ *    always present.
+ */
+static void vp9_parse_uncompressed_header_lf_quant(
+	const uint8_t *data, uint32_t size,
+	struct v4l2_ctrl_vp9_frame *frame,
+	int8_t persistent_ref_deltas[4],
+	int8_t persistent_mode_deltas[2],
+	bool *out_keyframe_or_intraonly,
+	bool *out_lf_delta_updated)
+{
+	struct uh_reader r = { .buf = data, .size = size, .bit_pos = 0 };
+	bool keyframe, intra_only = false, show_frame, error_resilient;
+	int profile;
+	int i;
+
+	*out_keyframe_or_intraonly = false;
+	*out_lf_delta_updated = false;
+
+	uh_read_bits(&r, 2);                 /* frame_marker */
+	{
+		int p_lo = uh_read_bits(&r, 1);
+		int p_hi = uh_read_bits(&r, 1);
+		profile = p_lo + (p_hi << 1);
+		if (profile == 3)
+			uh_read_bits(&r, 1); /* reserved_zero */
+	}
+
+	if (uh_read_bits(&r, 1))             /* show_existing_frame */
+		return;                      /* no LF/quant in the bitstream */
+
+	keyframe = !uh_read_bits(&r, 1);
+	show_frame = uh_read_bits(&r, 1);
+	error_resilient = uh_read_bits(&r, 1);
+
+	if (keyframe) {
+		uh_read_bits(&r, 24);        /* sync_code */
+		uh_skip_color_config(&r, profile);
+		uh_skip_frame_size(&r);
+		uh_skip_render_size(&r);
+	} else {
+		intra_only = show_frame ? 0 : uh_read_bits(&r, 1);
+		if (!error_resilient)
+			uh_read_bits(&r, 2); /* reset_frame_context */
+		if (intra_only) {
+			uh_read_bits(&r, 24); /* sync_code */
+			if (profile > 0)
+				uh_skip_color_config(&r, profile);
+			uh_read_bits(&r, 8); /* refresh_frame_flags */
+			uh_skip_frame_size(&r);
+			uh_skip_render_size(&r);
+		} else {
+			bool found = false;
+
+			uh_read_bits(&r, 8); /* refresh_frame_flags */
+			for (i = 0; i < 3; i++) {
+				uh_read_bits(&r, 3); /* ref_frame_idx */
+				uh_read_bits(&r, 1); /* ref_frame_sign_bias */
+			}
+			/* frame_size_with_refs: found_ref bits are read only
+			 * until the first one that is set; an explicit
+			 * frame_size follows only when none was set. */
+			for (i = 0; i < 3; i++) {
+				if (uh_read_bits(&r, 1)) {
+					found = true;
+					break;
+				}
+			}
+			if (!found)
+				uh_skip_frame_size(&r);
+			uh_skip_render_size(&r);
+
+			uh_read_bits(&r, 1); /* allow_hpmv */
+			if (!uh_read_bits(&r, 1))    /* interp_filter switchable */
+				uh_read_bits(&r, 2); /* interp_filter literal */
+		}
+	}
+
+	*out_keyframe_or_intraonly = keyframe || intra_only;
+
+	if (!error_resilient) {
+		uh_read_bits(&r, 1);         /* refresh_frame_context */
+		uh_read_bits(&r, 1);         /* frame_parallel_decoding_mode */
+	}
+	uh_read_bits(&r, 2);                 /* frame_context_idx */
+
+	/* loop_filter_params */
+	uh_read_bits(&r, 6);                 /* filter_level (already in VAAPI) */
+	uh_read_bits(&r, 3);                 /* sharpness (already in VAAPI) */
+	if (uh_read_bits(&r, 1)) {           /* lf_delta.enabled */
+		frame->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED;
+		if (uh_read_bits(&r, 1)) {   /* lf_delta.updated */
+			frame->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE;
+			*out_lf_delta_updated = true;
+			for (i = 0; i < 4; i++) {
+				if (uh_read_bits(&r, 1))
+					persistent_ref_deltas[i] =
+						(int8_t)uh_read_sbits(&r, 6);
+			}
+			for (i = 0; i < 2; i++) {
+				if (uh_read_bits(&r, 1))
+					persistent_mode_deltas[i] =
+						(int8_t)uh_read_sbits(&r, 6);
+			}
+		}
+	}
+
+	/* quantization_params */
+	frame->quant.base_q_idx     = (uint8_t)uh_read_bits(&r, 8);
+	frame->quant.delta_q_y_dc   = (int8_t)uh_read_delta_q(&r);
+	frame->quant.delta_q_uv_dc  = (int8_t)uh_read_delta_q(&r);
+	frame->quant.delta_q_uv_ac  = (int8_t)uh_read_delta_q(&r);
+}
+
+/*
+ * Clause 9: compressed-header parser — port of FFmpeg
+ * v4l2_request_vp9.c:99-261::fill_compressed_hdr.
+ *
+ * Phase 5 C3: out_reference_mode threaded via out-param. Callers
+ * derive `allowcompinter` from VAAPI sign-bias bits and pass it.
+ */
+/* Transform-mode values compared against / stored into ctrl->tx_mode by
+ * vp9_fill_compressed_hdr(). NOTE(review): presumably these mirror the
+ * kernel's VP9 tx_mode numbering — confirm against the UAPI header. */
+#define V4L2_VP9_TX_MODE_ONLY_4X4_LOCAL    0
+#define V4L2_VP9_TX_MODE_ALLOW_32X32_LOCAL 3
+#define V4L2_VP9_TX_MODE_SELECT_LOCAL      4
+
+/*
+ * Decode the VP9 compressed header with the range decoder and record the
+ * probability updates it carries in *ctrl.
+ *
+ * ctrl                      only entries whose update flag is set in the
+ *                           bitstream are written; all other entries keep
+ *                           whatever the caller put there
+ * buffer, size              compressed-header bytes
+ * lossless_flag             non-zero forces tx_mode to ONLY_4X4 without
+ *                           reading it from the bitstream
+ * keyframe_or_intraonly     true skips every inter-only section
+ * allowcompinter            true allows the reference-mode bits to be read
+ * highprecision_mvs         true enables the class0_hp / hp updates
+ * interp_filter_switchable  non-zero enables the interp_filter updates
+ * out_reference_mode        receives the decoded reference mode (0, 1 or 2);
+ *                           stays 0 on the early-return paths
+ *
+ * Returns early (after zeroing *out_reference_mode) when the range decoder
+ * cannot be initialised or when the leading marker bit is set.
+ *
+ * The order of reads below defines the bitstream layout; do not reorder.
+ */
+static void vp9_fill_compressed_hdr(
+	struct v4l2_ctrl_vp9_compressed_hdr *ctrl,
+	const uint8_t *buffer, uint32_t size,
+	uint8_t lossless_flag,
+	bool keyframe_or_intraonly,
+	bool allowcompinter,
+	bool highprecision_mvs,
+	int interp_filter_switchable,
+	uint8_t *out_reference_mode)
+{
+	struct vp9_rac c;
+	int comppredmode = 0;
+	int i, j, k, l, m, n;
+
+	*out_reference_mode = 0;
+
+	if (vp9_rac_init(&c, buffer, size) < 0)
+		return;
+
+	if (vp9_rac_get_branchy(&c, 128))    /* marker bit */
+		return;
+
+	/* Transform mode and, for TX_MODE_SELECT, the tx probability updates. */
+	if (lossless_flag) {
+		ctrl->tx_mode = V4L2_VP9_TX_MODE_ONLY_4X4_LOCAL;
+	} else {
+		ctrl->tx_mode = (uint8_t)vp9_rac_uint(&c, 2);
+		if (ctrl->tx_mode == V4L2_VP9_TX_MODE_ALLOW_32X32_LOCAL)
+			ctrl->tx_mode = (uint8_t)(ctrl->tx_mode + vp9_rac_bit(&c));
+		if (ctrl->tx_mode == V4L2_VP9_TX_MODE_SELECT_LOCAL) {
+			for (i = 0; i < 2; i++)
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->tx8[i][0] = (uint8_t)read_prob_delta(&c);
+			for (i = 0; i < 2; i++)
+				for (j = 0; j < 2; j++)
+					if (vp9_rac_get_branchy(&c, 252))
+						ctrl->tx16[i][j] = (uint8_t)read_prob_delta(&c);
+			for (i = 0; i < 2; i++)
+				for (j = 0; j < 3; j++)
+					if (vp9_rac_get_branchy(&c, 252))
+						ctrl->tx32[i][j] = (uint8_t)read_prob_delta(&c);
+		}
+	}
+
+	/* Coefficient probability updates, one group per transform size up to
+	 * and including the one selected by tx_mode. */
+	for (i = 0; i < 4; i++) {
+		if (vp9_rac_bit(&c)) {
+			for (j = 0; j < 2; j++)
+				for (k = 0; k < 2; k++)
+					for (l = 0; l < 6; l++)
+						for (m = 0; m < 6; m++) {
+							/* band 0 (l == 0) has only 3 entries */
+							if (m >= 3 && l == 0)
+								break;
+							for (n = 0; n < 3; n++)
+								if (vp9_rac_get_branchy(&c, 252))
+									ctrl->coef[i][j][k][l][m][n] =
+										(uint8_t)read_prob_delta(&c);
+						}
+		}
+		if (ctrl->tx_mode == i)
+			break;
+	}
+
+	/* Skip probability updates. */
+	for (i = 0; i < 3; i++)
+		if (vp9_rac_get_branchy(&c, 252))
+			ctrl->skip[i] = (uint8_t)read_prob_delta(&c);
+
+	/* Everything below exists only for inter frames. */
+	if (!keyframe_or_intraonly) {
+		for (i = 0; i < 7; i++)
+			for (j = 0; j < 3; j++)
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->inter_mode[i][j] = (uint8_t)read_prob_delta(&c);
+
+		if (interp_filter_switchable)
+			for (i = 0; i < 4; i++)
+				for (j = 0; j < 2; j++)
+					if (vp9_rac_get_branchy(&c, 252))
+						ctrl->interp_filter[i][j] =
+							(uint8_t)read_prob_delta(&c);
+
+		for (i = 0; i < 4; i++)
+			if (vp9_rac_get_branchy(&c, 252))
+				ctrl->is_inter[i] = (uint8_t)read_prob_delta(&c);
+
+		/* Reference mode: 0 = single, 1 = compound, 2 = switchable. */
+		if (allowcompinter) {
+			comppredmode = vp9_rac_bit(&c);
+			if (comppredmode)
+				comppredmode += vp9_rac_bit(&c);
+			if (comppredmode == 2)   /* PRED_SWITCHABLE */
+				for (i = 0; i < 5; i++)
+					if (vp9_rac_get_branchy(&c, 252))
+						ctrl->comp_mode[i] = (uint8_t)read_prob_delta(&c);
+		} else {
+			comppredmode = 0;        /* PRED_SINGLEREF */
+		}
+
+		if (comppredmode != 1) {        /* != PRED_COMPREF */
+			for (i = 0; i < 5; i++) {
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->single_ref[i][0] = (uint8_t)read_prob_delta(&c);
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->single_ref[i][1] = (uint8_t)read_prob_delta(&c);
+			}
+		}
+		if (comppredmode != 0) {        /* != PRED_SINGLEREF */
+			for (i = 0; i < 5; i++)
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->comp_ref[i] = (uint8_t)read_prob_delta(&c);
+		}
+
+		for (i = 0; i < 4; i++)
+			for (j = 0; j < 9; j++)
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->y_mode[i][j] = (uint8_t)read_prob_delta(&c);
+
+		for (i = 0; i < 4; i++)
+			for (j = 0; j < 4; j++)
+				for (k = 0; k < 3; k++)
+					if (vp9_rac_get_branchy(&c, 252))
+						ctrl->partition[(i * 4) + j][k] =
+							(uint8_t)read_prob_delta(&c);
+
+		/* Motion-vector probabilities are sent as a 7-bit literal that is
+		 * expanded to an odd 8-bit value: (literal << 1) | 1. */
+		for (i = 0; i < 3; i++)
+			if (vp9_rac_get_branchy(&c, 252))
+				ctrl->mv.joint[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+		for (i = 0; i < 2; i++) {
+			if (vp9_rac_get_branchy(&c, 252))
+				ctrl->mv.sign[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+			for (j = 0; j < 10; j++)
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->mv.classes[i][j] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+			if (vp9_rac_get_branchy(&c, 252))
+				ctrl->mv.class0_bit[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+			for (j = 0; j < 10; j++)
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->mv.bits[i][j] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+		}
+		for (i = 0; i < 2; i++) {
+			for (j = 0; j < 2; j++)
+				for (k = 0; k < 3; k++)
+					if (vp9_rac_get_branchy(&c, 252))
+						ctrl->mv.class0_fr[i][j][k] =
+							(uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+			for (j = 0; j < 3; j++)
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->mv.fr[i][j] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+		}
+		if (highprecision_mvs) {
+			for (i = 0; i < 2; i++) {
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->mv.class0_hp[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+				if (vp9_rac_get_branchy(&c, 252))
+					ctrl->mv.hp[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
+			}
+		}
+	}
+
+	*out_reference_mode = (uint8_t)comppredmode;
+}
+
+/*
+ * Clause 1+2+4+5+7+10+11+12: orchestrate VP9 control submission.
+ * 2 batched controls per frame: VP9_FRAME + VP9_COMPRESSED_HDR.
+ *
+ * Builds both controls from the VAAPI picture/slice parameters stored on
+ * surface_object plus the fields parsed directly out of the bitstream
+ * (loop-filter deltas, quantizer, compressed header), then submits them on
+ * surface_object->request_fd.
+ *
+ * context->vp9_lf carries the loop-filter deltas across frames; it is reset
+ * to {1,0,-1,-1} / {0,0} on the first frame and on every keyframe,
+ * intra-only or error-resilient frame.
+ *
+ * Returns VA_STATUS_SUCCESS on success and VA_STATUS_ERROR_OPERATION_FAILED
+ * when the header sizes reported by VAAPI do not fit inside the submitted
+ * bitstream (checked before any persistent state is touched) or when the
+ * control submission fails.
+ */
+int vp9_set_controls(struct request_data *driver_data,
+		     struct object_context *context,
+		     struct object_surface *surface_object)
+{
+	VADecPictureParameterBufferVP9 *picture =
+		&surface_object->params.vp9.picture;
+	VASliceParameterBufferVP9 *slice =
+		&surface_object->params.vp9.slice;
+
+	struct v4l2_ctrl_vp9_frame frame;
+	struct v4l2_ctrl_vp9_compressed_hdr compressed_hdr;
+	struct v4l2_ext_control ctrls[2];
+	int rc, i;
+	bool keyframe = !picture->pic_fields.bits.frame_type;
+	bool intra_only = picture->pic_fields.bits.intra_only;
+	bool error_resilient = picture->pic_fields.bits.error_resilient_mode;
+	bool allowcompinter;
+	bool keyframe_or_intraonly_parsed = false;
+	bool lf_delta_updated = false;
+	uint8_t parsed_reference_mode = 0;
+
+	memset(&frame, 0, sizeof frame);
+	memset(&compressed_hdr, 0, sizeof compressed_hdr);
+
+	/* Clause 4: frame geometry + per-frame scalars */
+	frame.frame_width_minus_1   = (uint16_t)(picture->frame_width - 1);
+	frame.frame_height_minus_1  = (uint16_t)(picture->frame_height - 1);
+	frame.render_width_minus_1  = frame.frame_width_minus_1;
+	frame.render_height_minus_1 = frame.frame_height_minus_1;
+
+	frame.profile           = picture->profile;
+	frame.bit_depth         = picture->bit_depth;
+	frame.tile_cols_log2    = picture->log2_tile_columns;
+	frame.tile_rows_log2    = picture->log2_tile_rows;
+	frame.frame_context_idx = picture->pic_fields.bits.frame_context_idx;
+
+	frame.lf.level     = picture->filter_level;
+	frame.lf.sharpness = picture->sharpness_level;
+
+	frame.uncompressed_header_size = picture->frame_header_length_in_bytes;
+	frame.compressed_header_size   = picture->first_partition_size;
+
+	/* The compressed-header parser below is pointed at
+	 * source_data + uncompressed_header_size for compressed_header_size
+	 * bytes; refuse sizes that would take it outside the bitstream. */
+	if ((uint64_t)frame.uncompressed_header_size +
+	    frame.compressed_header_size > surface_object->source_size)
+		return VA_STATUS_ERROR_OPERATION_FAILED;
+
+	/* Clause 5: DPB timestamp resolution */
+	{
+		VASurfaceID last_id = picture->reference_frames[picture->pic_fields.bits.last_ref_frame];
+		VASurfaceID gold_id = picture->reference_frames[picture->pic_fields.bits.golden_ref_frame];
+		VASurfaceID alt_id  = picture->reference_frames[picture->pic_fields.bits.alt_ref_frame];
+		struct object_surface *last_ref =
+			(last_id != VA_INVALID_SURFACE) ? SURFACE(driver_data, last_id) : NULL;
+		struct object_surface *gold_ref =
+			(gold_id != VA_INVALID_SURFACE) ? SURFACE(driver_data, gold_id) : NULL;
+		struct object_surface *alt_ref =
+			(alt_id  != VA_INVALID_SURFACE) ? SURFACE(driver_data, alt_id)  : NULL;
+
+		if (last_ref) frame.last_frame_ts   = v4l2_timeval_to_ns(&last_ref->timestamp);
+		if (gold_ref) frame.golden_frame_ts = v4l2_timeval_to_ns(&gold_ref->timestamp);
+		if (alt_ref)  frame.alt_frame_ts    = v4l2_timeval_to_ns(&alt_ref->timestamp);
+	}
+
+	if (picture->pic_fields.bits.last_ref_frame_sign_bias)
+		frame.ref_frame_sign_bias |= V4L2_VP9_SIGN_BIAS_LAST;
+	if (picture->pic_fields.bits.golden_ref_frame_sign_bias)
+		frame.ref_frame_sign_bias |= V4L2_VP9_SIGN_BIAS_GOLDEN;
+	if (picture->pic_fields.bits.alt_ref_frame_sign_bias)
+		frame.ref_frame_sign_bias |= V4L2_VP9_SIGN_BIAS_ALT;
+
+	/* Compound prediction is possible only when the three sign biases
+	 * are not all equal. */
+	allowcompinter = !(
+		picture->pic_fields.bits.last_ref_frame_sign_bias ==
+		    picture->pic_fields.bits.golden_ref_frame_sign_bias &&
+		picture->pic_fields.bits.golden_ref_frame_sign_bias ==
+		    picture->pic_fields.bits.alt_ref_frame_sign_bias);
+
+	/* Clause 6: persistent LF delta state — Phase 5 C2 */
+	if (!context->vp9_lf.initialized || keyframe || intra_only || error_resilient) {
+		context->vp9_lf.ref_deltas[0] = 1;
+		context->vp9_lf.ref_deltas[1] = 0;
+		context->vp9_lf.ref_deltas[2] = -1;
+		context->vp9_lf.ref_deltas[3] = -1;
+		context->vp9_lf.mode_deltas[0] = 0;
+		context->vp9_lf.mode_deltas[1] = 0;
+		context->vp9_lf.initialized = true;
+	}
+
+	vp9_parse_uncompressed_header_lf_quant(
+		surface_object->source_data,
+		surface_object->source_size,
+		&frame,
+		context->vp9_lf.ref_deltas,
+		context->vp9_lf.mode_deltas,
+		&keyframe_or_intraonly_parsed,
+		&lf_delta_updated);
+	(void)lf_delta_updated;
+
+	/* The persisted deltas are always copied, updated this frame or not. */
+	for (i = 0; i < 4; i++)
+		frame.lf.ref_deltas[i] = context->vp9_lf.ref_deltas[i];
+	for (i = 0; i < 2; i++)
+		frame.lf.mode_deltas[i] = context->vp9_lf.mode_deltas[i];
+
+	/* Clause 7: segmentation mapping */
+	for (i = 0; i < 7; i++)
+		frame.seg.tree_probs[i] = picture->mb_segment_tree_probs[i];
+	for (i = 0; i < 3; i++)
+		frame.seg.pred_probs[i] = picture->segment_pred_probs[i];
+
+	if (picture->pic_fields.bits.segmentation_enabled)
+		frame.seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ENABLED;
+	if (picture->pic_fields.bits.segmentation_update_map)
+		frame.seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP;
+	if (picture->pic_fields.bits.segmentation_temporal_update)
+		frame.seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE;
+
+	for (i = 0; i < 8; i++) {
+		if (slice->seg_param[i].segment_flags.fields.segment_reference_enabled) {
+			frame.seg.feature_enabled[i] |= 1 << V4L2_VP9_SEG_LVL_REF_FRAME;
+			frame.seg.feature_data[i][V4L2_VP9_SEG_LVL_REF_FRAME] =
+				(int16_t)slice->seg_param[i].segment_flags.fields.segment_reference;
+		}
+		if (slice->seg_param[i].segment_flags.fields.segment_reference_skipped)
+			frame.seg.feature_enabled[i] |= 1 << V4L2_VP9_SEG_LVL_SKIP;
+	}
+
+	/* Clause 10: frame flags + reference_mode + interpolation_filter */
+	if (keyframe)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_KEY_FRAME;
+	if (picture->pic_fields.bits.show_frame)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_SHOW_FRAME;
+	if (error_resilient)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT;
+	if (intra_only)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_INTRA_ONLY;
+	if (picture->pic_fields.bits.allow_high_precision_mv)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV;
+	if (picture->pic_fields.bits.refresh_frame_context)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX;
+	if (picture->pic_fields.bits.frame_parallel_decoding_mode)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE;
+	if (picture->pic_fields.bits.subsampling_x)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING;
+	if (picture->pic_fields.bits.subsampling_y)
+		frame.flags |= V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING;
+
+	/* Phase 5 C1: NO XOR. VAAPI's mcomp_filter_type is already post-XOR. */
+	frame.interpolation_filter = picture->pic_fields.bits.mcomp_filter_type;
+
+	/* reset_frame_context: FFmpeg's (resetctx > 0 ? resetctx - 1 : 0) */
+	frame.reset_frame_context =
+		picture->pic_fields.bits.reset_frame_context > 0
+		? (uint8_t)(picture->pic_fields.bits.reset_frame_context - 1)
+		: 0;
+
+	/* Clause 9: compressed-header parser fills both compressed_hdr and
+	 * out_reference_mode. allowcompinter derived from sign biases above. */
+	{
+		int interp_switchable = (frame.interpolation_filter == V4L2_VP9_INTERP_FILTER_SWITCHABLE);
+
+		vp9_fill_compressed_hdr(
+			&compressed_hdr,
+			surface_object->source_data + frame.uncompressed_header_size,
+			frame.compressed_header_size,
+			picture->pic_fields.bits.lossless_flag,
+			keyframe || intra_only,
+			allowcompinter,
+			picture->pic_fields.bits.allow_high_precision_mv,
+			interp_switchable,
+			&parsed_reference_mode);
+	}
+	frame.reference_mode = parsed_reference_mode;
+
+	/* Clause 11: 2-control batched submission */
+	memset(ctrls, 0, sizeof ctrls);
+	ctrls[0].id   = V4L2_CID_STATELESS_VP9_FRAME;
+	ctrls[0].ptr  = &frame;
+	ctrls[0].size = sizeof frame;
+	ctrls[1].id   = V4L2_CID_STATELESS_VP9_COMPRESSED_HDR;
+	ctrls[1].ptr  = &compressed_hdr;
+	ctrls[1].size = sizeof compressed_hdr;
+
+	rc = v4l2_set_controls(driver_data->video_fd,
+			       surface_object->request_fd,
+			       ctrls, 2);
+	if (rc < 0)
+		return VA_STATUS_ERROR_OPERATION_FAILED;
+
+	return VA_STATUS_SUCCESS;
+}
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
+ *
+ * fresnel-fourier iter4 Phase 6 commit B: VP9 codec dispatcher header.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _VP9_H_
+#define _VP9_H_
+
+#include "context.h"
+#include "request.h"
+#include "surface.h"
+
+/*
+ * Build and submit the per-frame VP9 controls for the given surface.
+ *
+ * The definition submits two controls in one call,
+ * V4L2_CID_STATELESS_VP9_FRAME and V4L2_CID_STATELESS_VP9_COMPRESSED_HDR,
+ * on the surface's request_fd, and keeps loop-filter delta state in
+ * context->vp9_lf between frames.
+ *
+ * Returns VA_STATUS_SUCCESS, or VA_STATUS_ERROR_OPERATION_FAILED when the
+ * control submission fails.
+ */
+int vp9_set_controls(struct request_data *driver_data,
+		     struct object_context *context,
+		     struct object_surface *surface);
+
+#endif /* _VP9_H_ */
@@ -0,0 +1,167 @@
+/*
+ * cap_pool_probe_pattern.c — synthetic regression test for the
+ * iter5 sonnet C4 / iter6 candidate A "cap_pool resolution-change race."
+ *
+ * Exercises the surface-allocation pattern that originally tripped
+ * REQBUFS-EBUSY on the iter5-end driver: vaCreateSurfaces at one
+ * resolution, then vaDestroySurfaces, then vaCreateSurfaces at a
+ * different resolution. iter6's REINIT discipline + cap_pool's
+ * REQBUFS(0)-on-CAPTURE-and-OUTPUT during S_FMT-on-resolution-change
+ * (CreateSurfaces2 in surface.c) closes this race; this test anchors
+ * that fact with a deterministic repro.
+ *
+ * Build:
+ *   gcc -O2 -Wall -Wextra -o cap_pool_probe_pattern \
+ *       cap_pool_probe_pattern.c \
+ *       $(pkg-config --cflags --libs libva libva-drm)
+ *
+ * Run:
+ *   LIBVA_DRIVER_NAME=v4l2_request \
+ *   LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
+ *   LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
+ *     ./cap_pool_probe_pattern
+ *
+ * Pass criterion (on iter6 driver and later):
+ *   - Exit code 0
+ *   - No "REQBUFS" / "EBUSY" / "Unable to request buffers" /
+ *     "Unable to set format" lines on the v4l2-request driver's stderr
+ *   - vainfo or visual inspection confirms the test program reached
+ *     the "PASS" line on stdout
+ *
+ * Fail behavior pre-iter5: vaCreateSurfaces at the second resolution
+ * would emit REQBUFS-EBUSY because OUTPUT/CAPTURE buffers from the
+ * first allocation hadn't been torn down before S_FMT was attempted
+ * on the new resolution. iter5's CreateSurfaces2 added the dual
+ * REQBUFS(0) drain; iter6's REINIT keeps the OUTPUT pool's request_fd
+ * lifecycle clean across the destroy-recreate cycle.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <va/va.h>
+#include <va/va_drm.h>
+
+#define DRM_RENDER_NODE "/dev/dri/renderD128"
+
+/* Human-readable text for a VAStatus; forwards directly to vaErrorStr(). */
+static const char *va_status_str(VAStatus s)
+{
+	return vaErrorStr(s);
+}
+
+/*
+ * Evaluate `call` once; if it does not yield VA_STATUS_SUCCESS, print
+ * "FAIL: <msg>: <error text> (0x<status>)" to stderr and return 10 from
+ * the ENCLOSING function. No cleanup is performed on that path, so this is
+ * only suitable where returning straight away is acceptable.
+ */
+#define VA_OK_OR_FAIL(call, msg) do { \
+	VAStatus _vs = (call); \
+	if (_vs != VA_STATUS_SUCCESS) { \
+		fprintf(stderr, "FAIL: %s: %s (0x%x)\n", \
+			(msg), va_status_str(_vs), _vs); \
+		return 10; \
+	} \
+} while (0)
+
+/*
+ * Test driver: create four small surfaces, destroy them, then create four
+ * large surfaces so the driver sees a resolution change between two
+ * allocation-only vaCreateSurfaces calls (no context is ever created).
+ *
+ * Exit codes: 0 on success, 1 if the render node cannot be opened, 2 if no
+ * VADisplay is returned, 10 if any libva call fails (via VA_OK_OR_FAIL).
+ */
+int main(void)
+{
+	enum { SURFACE_COUNT = 4 };
+	const unsigned int small_w = 128, small_h = 128;
+	const unsigned int big_w = 1920, big_h = 1080;
+	VASurfaceID small_surfaces[SURFACE_COUNT];
+	VASurfaceID big_surfaces[SURFACE_COUNT];
+	VAConfigID config = VA_INVALID_ID;
+	VADisplay dpy;
+	int va_major = 0, va_minor = 0;
+	int drm_fd;
+
+	/* Bring up libva on the DRM render node. */
+	drm_fd = open(DRM_RENDER_NODE, O_RDWR);
+	if (drm_fd < 0) {
+		fprintf(stderr, "FAIL: open(%s): %s\n",
+			DRM_RENDER_NODE, strerror(errno));
+		return 1;
+	}
+
+	dpy = vaGetDisplayDRM(drm_fd);
+	if (dpy == NULL) {
+		fprintf(stderr, "FAIL: vaGetDisplayDRM returned NULL\n");
+		close(drm_fd);
+		return 2;
+	}
+
+	VA_OK_OR_FAIL(vaInitialize(dpy, &va_major, &va_minor),
+		      "vaInitialize");
+	printf("libva %d.%d initialized via %s\n", va_major, va_minor,
+	       DRM_RENDER_NODE);
+
+	/*
+	 * An H.264 Main / VLD config is created up front; per the file header
+	 * this is what steers the driver onto the decode path whose surface
+	 * allocation code is under test.
+	 */
+	VA_OK_OR_FAIL(vaCreateConfig(dpy, VAProfileH264Main, VAEntrypointVLD,
+				     NULL, 0, &config),
+		      "vaCreateConfig(H264Main, VLD)");
+
+	/* Phase 1: first allocation, at the small size. */
+	printf("Phase 1: vaCreateSurfaces %ux%u, count=4\n", small_w, small_h);
+	VA_OK_OR_FAIL(vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420,
+				       small_w, small_h,
+				       small_surfaces, SURFACE_COUNT,
+				       NULL, 0),
+		      "vaCreateSurfaces (small)");
+
+	/* Phase 2: release the small surfaces again. */
+	printf("Phase 2: vaDestroySurfaces (small)\n");
+	VA_OK_OR_FAIL(vaDestroySurfaces(dpy, small_surfaces, SURFACE_COUNT),
+		      "vaDestroySurfaces (small)");
+
+	/*
+	 * Phase 3: second allocation, at a much larger size. This is the
+	 * step the regression test exists for: the driver has to cope with
+	 * the resolution differing from the Phase 1 allocation.
+	 */
+	printf("Phase 3: vaCreateSurfaces %ux%u, count=4 (resolution change)\n",
+	       big_w, big_h);
+	VA_OK_OR_FAIL(vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420,
+				       big_w, big_h,
+				       big_surfaces, SURFACE_COUNT,
+				       NULL, 0),
+		      "vaCreateSurfaces (big)");
+
+	/* Phase 4: tear everything down in reverse order of creation. */
+	printf("Phase 4: cleanup\n");
+	VA_OK_OR_FAIL(vaDestroySurfaces(dpy, big_surfaces, SURFACE_COUNT),
+		      "vaDestroySurfaces (big)");
+	VA_OK_OR_FAIL(vaDestroyConfig(dpy, config),
+		      "vaDestroyConfig");
+	VA_OK_OR_FAIL(vaTerminate(dpy),
+		      "vaTerminate");
+	close(drm_fd);
+
+	printf("PASS: cap_pool probe-pattern resolution-change handled cleanly.\n");
+	printf("Inspect driver stderr for absence of REQBUFS/EBUSY/Unable lines.\n");
+	return 0;
+}
@@ -0,0 +1,53 @@
+#!/bin/bash
+# run_cap_pool_probe.sh — orchestrate the cap_pool probe-pattern regression test.
+#
+# Runs the cap_pool_probe_pattern test program with the v4l2_request driver
+# and grep-checks the captured output for race indicators on driver log lines.
+#
+# Exit status:
+#   0  PASS
+#   1  FAIL (test binary exited non-zero, or race indicators were found)
+#   2  test binary missing or not executable
+#
+# Usage: ./run_cap_pool_probe.sh [path_to_test_binary]
+# If no argument, looks for ./cap_pool_probe_pattern in the same directory.
+
+set -eu
+
+BIN="${1:-$(dirname "$0")/cap_pool_probe_pattern}"
+
+if [[ ! -x "$BIN" ]]; then
+	echo "FAIL: test binary not found or not executable: $BIN" >&2
+	echo "Build it first:" >&2
+	echo "  gcc -O2 -Wall -Wextra -o $BIN $(dirname "$0")/cap_pool_probe_pattern.c \\" >&2
+	echo "      \$(pkg-config --cflags --libs libva libva-drm)" >&2
+	exit 2
+fi
+
+LOG=$(mktemp -t cap_pool_probe.XXXXXX.log)
+trap 'rm -f "$LOG"' EXIT
+
+# Collect the exit status with `|| rc=$?`. Under `set -e` a bare failing
+# command would abort the script on this line, so the captured output would
+# never be shown and the FAIL verdict below would be unreachable.
+rc=0
+env LIBVA_DRIVER_NAME=v4l2_request \
+    LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
+    LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
+    "$BIN" >"$LOG" 2>&1 || rc=$?
+
+echo "--- test program output ---"
+cat "$LOG"
+echo "--- end output ---"
+
+if [[ "$rc" -ne 0 ]]; then
+	echo "FAIL: test binary exited with rc=$rc" >&2
+	exit 1
+fi
+
+# Race indicators on driver-prefixed lines only (avoids matching the
+# test program's own informational output). Driver log lines start with
+# "v4l2-request:". The trailing `|| true` keeps a no-match grep (status 1)
+# from tripping `set -e`.
+race_lines=$(grep -E '^v4l2-request:' "$LOG" \
+	| grep -iE 'REQBUFS|EBUSY|Unable to request buffers|Unable to set format' \
+	|| true)
+if [[ -n "$race_lines" ]]; then
+	echo "FAIL: driver stderr contains race indicators:" >&2
+	echo "$race_lines" >&2
+	exit 1
+fi
+
+echo "PASS: cap_pool probe-pattern test clean (no race indicators)."
+exit 0
@@ -0,0 +1,139 @@
+#!/bin/bash
+# run_msync_pixel_verify.sh — verify decoded pixel correctness post-msync-removal.
+#
+# iter5 sweep commit d3a299b removed msync(MS_SYNC|MS_INVALIDATE) from the
+# CAPTURE buffer DQBUF path alongside the iter1 patch-0010 hex-dump diagnostic.
+# iter5 Phase 5 sonnet caveat C3 flagged: no formal pixel-correctness check
+# was done. This script is that check.
+#
+# Approach:
+#   1. SW reference: ffmpeg libavcodec H.264 decode of bbb_1080p30_h264.mp4,
+#      first 100 frames, NV12 raw output -> sw_ref.yuv.
+#   2. HW subject: same input through our v4l2_request driver via
+#      ffmpeg -hwaccel vaapi -hwaccel_output_format vaapi
+#             -i ... -vf hwdownload,format=nv12 -f rawvideo -pix_fmt nv12
+#      Captures the post-DQBUF buffer through libva readback, exercising
+#      the same code path we removed msync from.
+#   3. Compare: byte-for-byte cmp + per-frame sha256.
+#
+# Pass: byte-for-byte identical (or per-frame sha matches) -> msync
+# verifiably unnecessary on this hardware/kernel; iter5 sonnet C3 closes.
+# Fail: divergence; restore msync in surface.c, re-run, document outcome.
+#
+# Exit status:
+#   0  byte-for-byte identical
+#   1  decode failure, empty output, size mismatch, or pixel divergence
+#   2  fixture missing, or ffprobe could not determine its dimensions
+#   3  size mismatch explained by MB-padded height (stride artifact)
+#
+# Usage: ./run_msync_pixel_verify.sh [fixture_path]
+# If no argument, defaults to /home/mfritsche/fourier-test/bbb_1080p30_h264.mp4
+
+set -eu
+
+FIXTURE="${1:-/home/mfritsche/fourier-test/bbb_1080p30_h264.mp4}"
+N_FRAMES=100
+WORKDIR=$(mktemp -d -t msync_verify.XXXXXX)
+trap 'rm -rf "$WORKDIR"' EXIT
+
+if [[ ! -f "$FIXTURE" ]]; then
+	echo "FAIL: fixture not found: $FIXTURE" >&2
+	exit 2
+fi
+
+# Probe fixture dimensions for crop alignment of the HW path.
+# Hantro pads height to MB boundaries (16-line align); FFmpeg SW decode
+# returns crop-aligned (visible) frame size. Without explicit cropping
+# on the HW side, hwdownload + format=nv12 emits MB-padded frames, which
+# would diverge in size from SW even if pixels are correct.
+#
+# `read` returns non-zero when ffprobe prints nothing; `|| true` keeps
+# `set -e` from aborting silently so the validation below can report it.
+FIXTURE_W=""
+FIXTURE_H=""
+read -r FIXTURE_W FIXTURE_H < <(ffprobe -v error -select_streams v:0 \
+	-show_entries stream=width,height -of csv=p=0 "$FIXTURE" \
+	| tr ',' ' ') || true
+if [[ ! "$FIXTURE_W" =~ ^[1-9][0-9]*$ || ! "$FIXTURE_H" =~ ^[1-9][0-9]*$ ]]; then
+	echo "FAIL: ffprobe could not read width/height from $FIXTURE" >&2
+	exit 2
+fi
+
+# Bytes per NV12 frame: full-resolution luma plus an interleaved chroma
+# plane at half resolution in both directions (rounded up). For even
+# dimensions this equals W * H * 3 / 2.
+FRAME_SIZE=$(( FIXTURE_W * FIXTURE_H + 2 * ((FIXTURE_W + 1) / 2) * ((FIXTURE_H + 1) / 2) ))
+
+echo "Fixture: $FIXTURE ($FIXTURE_W x $FIXTURE_H)"
+echo "Frames:  $N_FRAMES"
+echo "Workdir: $WORKDIR"
+echo
+
+# 1. SW reference
+echo "[1/3] FFmpeg SW decode -> sw_ref.yuv"
+if ! ffmpeg -hide_banner -loglevel error -y \
+	-i "$FIXTURE" \
+	-frames:v "$N_FRAMES" \
+	-f rawvideo -pix_fmt nv12 \
+	"$WORKDIR/sw_ref.yuv"; then
+	echo "FAIL: FFmpeg SW decode of $FIXTURE failed." >&2
+	exit 1
+fi
+SW_BYTES=$(stat -c %s "$WORKDIR/sw_ref.yuv")
+SW_SHA=$(sha256sum "$WORKDIR/sw_ref.yuv" | cut -d' ' -f1)
+echo "    sw_ref.yuv: $SW_BYTES bytes, sha256=$SW_SHA"
+
+# Two empty outputs would compare equal and be reported as a PASS.
+if [[ "$SW_BYTES" -eq 0 ]]; then
+	echo "FAIL: SW reference decode produced no data; nothing to compare." >&2
+	exit 1
+fi
+
+# 2. HW subject via libva v4l2_request
+# Explicit crop=$FIXTURE_W:$FIXTURE_H after hwdownload normalizes any
+# MB-padding the HW driver applies (hantro pads height to multiples of
+# 16). Without this crop, an iter6+ correct decode could falsely
+# diverge in total byte count from the SW reference.
+echo "[2/3] FFmpeg HW decode via v4l2_request driver -> hw_capture.yuv"
+if ! env LIBVA_DRIVER_NAME=v4l2_request \
+    LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
+    LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
+    ffmpeg -hide_banner -loglevel error -y \
+	-hwaccel vaapi -hwaccel_output_format vaapi \
+	-i "$FIXTURE" \
+	-vf "hwdownload,format=nv12,crop=$FIXTURE_W:$FIXTURE_H:0:0" \
+	-frames:v "$N_FRAMES" \
+	-f rawvideo -pix_fmt nv12 \
+	"$WORKDIR/hw_capture.yuv"; then
+	echo "FAIL: FFmpeg HW decode via v4l2_request failed." >&2
+	exit 1
+fi
+HW_BYTES=$(stat -c %s "$WORKDIR/hw_capture.yuv")
+HW_SHA=$(sha256sum "$WORKDIR/hw_capture.yuv" | cut -d' ' -f1)
+echo "    hw_capture.yuv: $HW_BYTES bytes, sha256=$HW_SHA"
+echo
+
+# 3. Compare
+echo "[3/3] Compare"
+
+# Number of whole frames actually present in the SW reference. This is
+# smaller than N_FRAMES when the fixture is shorter than N_FRAMES frames.
+SW_FRAMES=$(( SW_BYTES / FRAME_SIZE ))
+
+if [[ "$SW_BYTES" -ne "$HW_BYTES" ]]; then
+	# Diagnose stride/padding artifacts before declaring pixel
+	# corruption. With explicit crop in step 2 this should not
+	# happen, but if a future kernel change shifts the alignment
+	# we want a clear diagnostic, not a false pixel-corruption
+	# accusation.
+	EXPECTED_SW=$(( FRAME_SIZE * N_FRAMES ))
+	for PAD in 16 32; do
+		PADDED_H=$(( (FIXTURE_H + PAD - 1) / PAD * PAD ))
+		EXPECTED_PADDED=$(( FIXTURE_W * PADDED_H * 3 / 2 * SW_FRAMES ))
+		if [[ "$HW_BYTES" -eq "$EXPECTED_PADDED" ]]; then
+			echo "DIAGNOSTIC: HW size $HW_BYTES matches MB-padded layout" >&2
+			echo "  ($FIXTURE_W x $PADDED_H, $PAD-line align). The crop=$FIXTURE_W:$FIXTURE_H" >&2
+			echo "  filter step did not normalize. Check FFmpeg version / hwdownload behavior." >&2
+			echo "  This is a stride artifact, not pixel corruption." >&2
+			exit 3
+		fi
+	done
+	echo "FAIL: size mismatch (SW=$SW_BYTES vs HW=$HW_BYTES, expected $EXPECTED_SW)" >&2
+	echo "      Different frame count or NV12 packing — investigate." >&2
+	exit 1
+fi
+
+if [[ "$SW_SHA" == "$HW_SHA" ]]; then
+	echo "PASS: byte-for-byte identical."
+	echo "      msync removal verified safe on this hardware/kernel."
+	exit 0
+fi
+
+# Per-frame divergence analysis on full-buffer mismatch. Frames are cut
+# at FRAME_SIZE (derived from the probed dimensions), not at
+# SW_BYTES / N_FRAMES, so the boundaries stay correct for short fixtures.
+echo "Buffer-level sha differs. Computing per-frame divergence..."
+DIVERGENT=0
+for ((i = 0; i < SW_FRAMES; i++)); do
+	SW_FRAME_SHA=$(dd if="$WORKDIR/sw_ref.yuv" bs="$FRAME_SIZE" \
+		count=1 skip="$i" 2>/dev/null | sha256sum | cut -d' ' -f1)
+	HW_FRAME_SHA=$(dd if="$WORKDIR/hw_capture.yuv" bs="$FRAME_SIZE" \
+		count=1 skip="$i" 2>/dev/null | sha256sum | cut -d' ' -f1)
+	if [[ "$SW_FRAME_SHA" != "$HW_FRAME_SHA" ]]; then
+		DIVERGENT=$(( DIVERGENT + 1 ))
+		# Only the first five divergent frames are listed.
+		if [[ "$DIVERGENT" -le 5 ]]; then
+			echo "    frame $i: SW=$SW_FRAME_SHA HW=$HW_FRAME_SHA"
+		fi
+	fi
+done
+
+echo "FAIL: $DIVERGENT / $SW_FRAMES frames diverge from SW reference."
+echo "      Action: restore msync(MS_SYNC|MS_INVALIDATE) in surface.c"
+echo "      RequestSyncSurface DQBUF path; re-run this script."
+exit 1
@@ -0,0 +1,299 @@
+#!/bin/bash
+# run_perf_binding_cell.sh — iter8 perf binding cell.
+#
+# Purpose: back campaign-wide claims with measured numbers. Four consumer
+# configurations are each run for $DURATION seconds against $FIXTURE, and
+# the results are rendered as one markdown table:
+#   1. mpv --hwdec=vaapi          (DMA-BUF zero-copy through libva)
+#   2. mpv --hwdec=vaapi-copy     (HW decode + VAImage readback)
+#   3. firefox (iter5-amend, sandbox enabled, file:// URL)
+#   4. mpv --hwdec=no             (SW decode baseline / control)
+#
+# Table columns per consumer: CPU% (median + p90), drops in the
+# measurement window, p50 frame interval (ms), GPU freq (median MHz),
+# VmRSS delta (MiB).
+#
+# Usage:
+#   ./run_perf_binding_cell.sh [fixture_path]
+#
+# fixture_path defaults to /home/mfritsche/fourier-test/bbb_1080p30_h264.mp4.
+# Environment overrides: DURATION (seconds per consumer, default 30),
+# WORKDIR (default: fresh mktemp directory), GPU_DEVFREQ_PATH.
+#   e.g. DURATION=60 ./run_perf_binding_cell.sh
+#
+# Reproducibility: results depend on (a) the iter7-end driver being installed
+# at /usr/lib/dri/v4l2_request_drv_video.so, (b) ohm idle (no other compute
+# load), (c) fixture present at the expected path. Run on a stable thermal
+# state (after a few minutes of cool-down).
+
+set -eu
+
+FIXTURE="${1:-/home/mfritsche/fourier-test/bbb_1080p30_h264.mp4}"
+: "${DURATION:=30}"
+if [[ -z "${WORKDIR:-}" ]]; then
+	WORKDIR=$(mktemp -d -t perf_binding.XXXXXX)
+fi
+: "${GPU_DEVFREQ_PATH:=/sys/class/devfreq/fde60000.gpu/cur_freq}"
+
+# Session environment (Wayland/X11) that Firefox needs when launched under
+# sudo; values already present in the caller's environment win.
+: "${XDG_RUNTIME_DIR:=/run/user/1001}"
+: "${WAYLAND_DISPLAY:=wayland-0}"
+: "${DISPLAY:=:0}"
+: "${XAUTHORITY:=/run/user/1001/xauth_pxiMur}"
+export XDG_RUNTIME_DIR WAYLAND_DISPLAY DISPLAY XAUTHORITY
+
+# Point libva at the v4l2_request driver and its device nodes.
+export LIBVA_DRIVER_NAME=v4l2_request \
+	LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
+	LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0
+
+[[ -f "$FIXTURE" ]] || {
+	echo "FAIL: fixture not found: $FIXTURE" >&2
+	exit 2
+}
+
+mkdir -p "$WORKDIR"
+
+# Run banner: records the inputs and the driver build under test.
+printf 'Fixture:   %s (%s bytes)\n' "$FIXTURE" "$(stat -c %s "$FIXTURE")"
+printf 'Duration:  %ss per configuration\n' "$DURATION"
+printf 'Workdir:   %s\n' "$WORKDIR"
+printf 'GPU freq:  %s\n' "$GPU_DEVFREQ_PATH"
+printf 'Driver sha: %s\n' "$(sha256sum /usr/lib/dri/v4l2_request_drv_video.so | cut -d' ' -f1)"
+printf '\n'
+
+# percentile_from_stream file pct -> prints the pct-th percentile of file.
+#   file  one numeric value per line, no header; need not be pre-sorted
+#         (the values are sorted here). Floats are accepted.
+#   pct   percentile as a number (50, 90, ...).
+# Prints "0" for an empty file. The rank is round-half-up of pct% of the
+# sample count, clamped to the valid 1..count range.
+percentile_from_stream() {
+	local samples_file="$1" percentile="$2"
+	awk -v pct="$percentile" '
+	{ val[NR] = $1 }
+	END {
+		if (NR == 0) {
+			print "0"
+			exit
+		}
+		# Ascending in-place exchange sort.
+		for (lo = 1; lo <= NR; lo++) {
+			for (hi = lo + 1; hi <= NR; hi++) {
+				if (val[lo] > val[hi]) {
+					swap = val[lo]
+					val[lo] = val[hi]
+					val[hi] = swap
+				}
+			}
+		}
+		rank = int((pct / 100.0) * NR + 0.5)
+		if (rank < 1)
+			rank = 1
+		else if (rank > NR)
+			rank = NR
+		print val[rank]
+	}' "$samples_file"
+}
+
+# poll_gpu_freq out_file
+# Sampler meant to run in the background while a consumer is active.
+# Truncates out_file, then every 0.1 s appends the current contents of
+# $GPU_DEVFREQ_PATH (skipped while that path is unreadable). Loops for as
+# long as /proc/$BG_PARENT_PID exists; the caller must set BG_PARENT_PID
+# before starting the sampler.
+poll_gpu_freq() {
+	local sink="$1"
+	: >"$sink"
+	until [[ ! -e "/proc/$BG_PARENT_PID" ]]; do
+		# Best effort: an unreadable or vanished node just drops this sample.
+		[[ ! -r "$GPU_DEVFREQ_PATH" ]] \
+			|| cat "$GPU_DEVFREQ_PATH" 2>/dev/null >>"$sink" \
+			|| true
+		sleep 0.1
+	done
+}
+
+# Run a single consumer configuration, measure it, and append one
+# tab-separated row to $WORKDIR/results.tsv. Args:
+#   $1 label (used for the per-consumer log directory name, no spaces)
+#   $2 launcher cmd (a single command line; it is eval'd in a background
+#      subshell with stdout+stderr captured to <label>/consumer.log)
+#   $3 'mpv' or 'firefox' — affects how we find the PID to track
+#
+# Row columns: label, CPU% p50, CPU% p90, drops, p50 frame ms, GPU MHz
+# median, VmRSS delta MiB. Unavailable values are written as "—" (or "ERR"
+# when no CPU samples could be parsed).
+#
+# Returns 1 for an unknown kind. Returns 0 WITHOUT writing a row when the
+# process to track cannot be located (a WARN is printed instead).
+run_consumer() {
+	local label="$1"
+	local launcher="$2"
+	local kind="$3"
+	local logdir="$WORKDIR/$label"
+	mkdir -p "$logdir"
+
+	echo "=== Running: $label ==="
+
+	# Kill any running firefox/mpv first to clean state.
+	pkill -f firefox 2>/dev/null || true
+	pkill -x mpv     2>/dev/null || true
+	sleep 1
+
+	# Launch consumer in background, capture stdout+stderr to a log.
+	(
+		eval "$launcher" >"$logdir/consumer.log" 2>&1
+	) &
+	local launcher_pid=$!
+
+	# Wait briefly for the process tree to spawn the actual decode worker.
+	sleep 4
+
+	local target_pid
+	case "$kind" in
+		mpv)
+			target_pid=$(pgrep -x mpv | head -1)
+			;;
+		firefox)
+			# Firefox's RDD process holds /dev/video1; that's the one with
+			# the libva decoder context. Wait an extra few seconds for it
+			# to spawn and bind the device.
+			sleep 6
+			target_pid=$(pgrep -af 'contentproc.*\brdd\b' | awk '{print $1}' | head -1)
+			if [[ -z "${target_pid:-}" ]]; then
+				# Fallback: find whichever firefox process holds /dev/video1.
+				target_pid=$(sudo lsof -t /dev/video1 2>/dev/null | head -1 || true)
+			fi
+			;;
+		*)
+			echo "  bad kind: $kind" >&2
+			return 1
+			;;
+	esac
+
+	if [[ -z "${target_pid:-}" ]]; then
+		echo "  WARN: could not locate $kind process; skipping pidstat" >&2
+		# Let the consumer run for the duration anyway so the log gets data.
+		sleep "$DURATION"
+		kill -TERM "$launcher_pid" 2>/dev/null || true
+		pkill -f firefox 2>/dev/null || true
+		pkill -x mpv     2>/dev/null || true
+		return 0
+	fi
+
+	echo "  Tracking PID $target_pid"
+
+	# VmRSS at start (read again at the end to compute the delta).
+	local rss_start
+	rss_start=$(awk '/^VmRSS:/{print $2}' "/proc/$target_pid/status" 2>/dev/null || echo 0)
+	echo "  VmRSS start: ${rss_start} kB"
+
+	# Poll GPU freq in background (keyed off launcher_pid).
+	BG_PARENT_PID=$launcher_pid
+	poll_gpu_freq "$logdir/gpu_freq.log" &
+	local poll_pid=$!
+
+	# Run pidstat for $DURATION seconds. LC_ALL=C pins the output format:
+	# single-field timestamps and '.' as the decimal separator, which the
+	# numeric match in the parser below relies on.
+	LC_ALL=C pidstat -u -p "$target_pid" 1 "$DURATION" >"$logdir/pidstat.log" 2>&1 || true
+
+	# VmRSS at end (before killing).
+	local rss_end
+	rss_end=$(awk '/^VmRSS:/{print $2}' "/proc/$target_pid/status" 2>/dev/null || echo "$rss_start")
+
+	# Stop everything.
+	kill "$poll_pid" 2>/dev/null || true
+	kill -TERM "$launcher_pid" 2>/dev/null || true
+	pkill -f firefox 2>/dev/null || true
+	pkill -x mpv     2>/dev/null || true
+	sleep 1
+
+	# Parse pidstat by header: locate the %CPU column index from the
+	# column-name row (where any field equals "%CPU"), then apply it
+	# to data rows. Robust across sysstat 12.x point releases.
+	# pidstat default output has no '#' header marker — the header is
+	# the first row containing "%CPU" as a field.
+	awk '
+		# Header row: any line where some field equals "%CPU".
+		!col {
+			for (i = 1; i <= NF; i++) if ($i == "%CPU") { col = i; next }
+		}
+		# The trailing "Average:" summary row has a numeric %CPU in the
+		# same column as the samples, so a numeric test alone would
+		# count it as one more sample. Exclude it by name.
+		$1 == "Average:" { next }
+		# Data row: lines whose value at $col is numeric.
+		col && NF >= col && $col ~ /^[0-9]+(\.[0-9]+)?$/ {
+			print $col
+		}
+	' "$logdir/pidstat.log" >"$logdir/cpu_pct.log" || true
+
+	local cpu_p50 cpu_p90
+	if [[ -s "$logdir/cpu_pct.log" ]]; then
+		cpu_p50=$(percentile_from_stream "$logdir/cpu_pct.log" 50)
+		cpu_p90=$(percentile_from_stream "$logdir/cpu_pct.log" 90)
+	else
+		cpu_p50="ERR"
+		cpu_p90="ERR"
+	fi
+
+	# GPU freq median. Values are Hz; convert to MHz via temp file (avoids
+	# unreliable /dev/stdin in a nested subshell-over-pipe).
+	local gpu_med_mhz
+	if [[ -s "$logdir/gpu_freq.log" ]]; then
+		awk '{print $1/1000000}' "$logdir/gpu_freq.log" >"$logdir/gpu_freq_mhz.log"
+		gpu_med_mhz=$(percentile_from_stream "$logdir/gpu_freq_mhz.log" 50)
+	else
+		gpu_med_mhz="—"
+	fi
+
+	# RSS delta MiB (VmRSS is reported in kB).
+	local rss_delta_mib
+	rss_delta_mib=$(awk -v s="$rss_start" -v e="$rss_end" 'BEGIN{printf "%.1f", (e-s)/1024.0}')
+
+	# Drops + p50 frame interval — only available for mpv.
+	local drops="—"
+	local p50_frame_ms="—"
+	if [[ "$kind" == "mpv" ]]; then
+		# Last "frame-drop-count...=<n>" occurrence in the consumer log;
+		# falls back to 0 when the log contains no such text.
+		drops=$(grep -oE 'frame-drop-count[^\t ]*\s*=\s*[0-9]+' "$logdir/consumer.log" \
+			| awk -F= '{print $2}' | tr -d ' ' | tail -1)
+		drops="${drops:-0}"
+		# p50 frame interval from mpv vsync-jitter or frame timing — leave
+		# as "—" unless mpv emitted detailed timing.
+	fi
+
+	# Emit row. Explicit \t separators keep the TSV layout independent of
+	# how this script's own whitespace is preserved.
+	printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
+		"$label" "$cpu_p50" "$cpu_p90" "$drops" "$p50_frame_ms" \
+		"$gpu_med_mhz" "$rss_delta_mib" >>"$WORKDIR/results.tsv"
+
+	echo "  CPU% p50=$cpu_p50  p90=$cpu_p90  drops=$drops  gpu_med=$gpu_med_mhz MHz  rss_delta=$rss_delta_mib MiB"
+	echo
+}
+
+# Header for results (tab-separated; one row per consumer is appended by
+# run_consumer).
+printf 'consumer\tcpu_p50\tcpu_p90\tdrops_%ss\tp50_frame_ms\tgpu_med_mhz\trss_delta_mib\n' \
+	"$DURATION" >"$WORKDIR/results.tsv"
+
+# === Configurations ===
+#
+# The mpv status message carries an explicit "frame-drop-count=" label.
+# mpv prints only the expanded property value for ${frame-drop-count}, and
+# run_consumer extracts the drop counter by grepping consumer.log for that
+# label; a bare number would never match, leaving drops stuck at 0.
+# The \$ keeps ${...} literal here so that mpv, not the shell, expands it.
+
+# 1. mpv DMA-BUF zero-copy
+run_consumer "mpv-vaapi-dmabuf" \
+	"sudo -u mfritsche env LIBVA_DRIVER_NAME=v4l2_request \
+		LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
+		LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
+		mpv --no-config --hwdec=vaapi --vo=null --no-audio \
+		    --term-status-msg='frame-drop-count=\${frame-drop-count}' \
+		    --length=$DURATION '$FIXTURE'" \
+	mpv
+
+# 2. mpv vaapi-copy
+run_consumer "mpv-vaapi-copy" \
+	"sudo -u mfritsche env LIBVA_DRIVER_NAME=v4l2_request \
+		LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
+		LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
+		mpv --no-config --hwdec=vaapi-copy --vo=null --no-audio \
+		    --term-status-msg='frame-drop-count=\${frame-drop-count}' \
+		    --length=$DURATION '$FIXTURE'" \
+	mpv
+
+# 3. Firefox-fourier (iter5-amend, sandbox enabled)
+run_consumer "firefox-fourier-hw" \
+	"sudo -u mfritsche env XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \
+		WAYLAND_DISPLAY=$WAYLAND_DISPLAY DISPLAY=$DISPLAY \
+		XAUTHORITY=$XAUTHORITY \
+		LIBVA_DRIVER_NAME=v4l2_request \
+		LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
+		LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
+		firefox --new-window 'file://$FIXTURE'" \
+	firefox
+
+# 4. SW baseline
+run_consumer "mpv-sw-baseline" \
+	"sudo -u mfritsche mpv --no-config --hwdec=no --vo=null --no-audio \
+		--term-status-msg='frame-drop-count=\${frame-drop-count}' \
+		--length=$DURATION '$FIXTURE'" \
+	mpv
+
+# === Generate markdown table ===
+# Writes $WORKDIR/perf_binding_cell.md: a provenance header (date, host,
+# kernel, driver checksum, fixture, duration) followed by one table row
+# per data line of results.tsv (the TSV header line is skipped).
+{
+	printf '%s\n\n' "# Performance binding cell — iter8 (libva-multiplanar campaign)"
+	printf 'Run date: %s\n' "$(date -Iseconds)"
+	printf 'Host: %s (%s)\n' "$(uname -n)" "$(uname -m)"
+	printf 'Kernel: %s\n' "$(uname -r)"
+	printf 'Driver sha256: `%s`\n' "$(sha256sum /usr/lib/dri/v4l2_request_drv_video.so | cut -d' ' -f1)"
+	printf 'Fixture: `%s` (%s bytes)\n' "$FIXTURE" "$(stat -c %s "$FIXTURE")"
+	printf 'Duration per consumer: %ss\n\n' "$DURATION"
+	printf '%s\n' \
+		"| Consumer | CPU% p50 | CPU% p90 | Drops in window | p50 frame ms | GPU MHz median | VmRSS Δ MiB |" \
+		"|---|---|---|---|---|---|---|"
+	tail -n +2 "$WORKDIR/results.tsv" | awk 'BEGIN { FS = "\t" } {
+		printf "| %s | %s | %s | %s | %s | %s | %s |\n",
+			$1, $2, $3, $4, $5, $6, $7
+	}'
+} >"$WORKDIR/perf_binding_cell.md"
+
+# Point the operator at the artifacts, then show the report inline.
+printf '%s\n' \
+	"=== Done ===" \
+	"Results: $WORKDIR/perf_binding_cell.md" \
+	"Per-consumer logs: $WORKDIR/{mpv-vaapi-dmabuf,mpv-vaapi-copy,firefox-fourier-hw,mpv-sw-baseline}/" \
+	""
+cat "$WORKDIR/perf_binding_cell.md"