STUDY.md: replace with pointer to libva-multiplanar campaign Phase 0

The Phase 0 / Phase 2 substrate that lived here has been transformed into ../phase0_findings.md as the campaign-level Phase 0 document. This file is reduced to a pointer + a git-show recipe to recover the prior content from commit e0acc33.
STUDY.md: phase 2 finding — libva surface stack works; Brave wall is chromeos pipeline
2026-05-04 08:08:32 +00:00 · 2026-04-25 22:41:54 +00:00 · 2026-04-25 22:34:22 +00:00 · 2026-04-25 22:30:44 +00:00 · 2026-04-25 22:25:36 +00:00 · 2026-04-25 22:21:44 +00:00
61 changed files with 1010 additions and 19979 deletions
@@ -0,0 +1,46 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
 * Fourier-local: H.264 stateless controls and structs were upstreamed to
 * the kernel some time after this library went dormant. Defer all of:
 *
 *   struct v4l2_ctrl_h264_sps / pps / scaling_matrix / pred_weights /
 *          slice_params / decode_params
 *   struct v4l2_h264_weight_factors / dpb_entry / reference
 *   enum v4l2_*_h264_*
 *
 * to <linux/v4l2-controls.h>, where they now live with the same field
 * names. The duplicated definitions in the original bootlin
 * include/h264-ctrls.h conflict at compile time on any current kernel.
 *
 * The CID prefix was also renamed from V4L2_CID_MPEG_VIDEO_H264_* to
 * V4L2_CID_STATELESS_H264_*. Provide compatibility aliases so h264.c
 * keeps compiling without an invasive sed across that file.
 *
 * V4L2_PIX_FMT_H264_SLICE itself lives in <linux/videodev2.h> on current
 * kernels, so no need to redefine it here.
 */
 #ifndef _H264_CTRLS_H_
 #define _H264_CTRLS_H_
 #include <linux/videodev2.h>
 #include <linux/v4l2-controls.h>
 #ifndef V4L2_CID_MPEG_VIDEO_H264_SPS
 /*
  * Legacy V4L2_CID_MPEG_VIDEO_H264_* spellings, each mapped one-to-one onto
  * the V4L2_CID_STATELESS_H264_* identifier of the same suffix. The whole
  * group is skipped when the legacy SPS name is already defined.
  */
 #define V4L2_CID_MPEG_VIDEO_H264_SPS            V4L2_CID_STATELESS_H264_SPS
 #define V4L2_CID_MPEG_VIDEO_H264_PPS            V4L2_CID_STATELESS_H264_PPS
 #define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX V4L2_CID_STATELESS_H264_SCALING_MATRIX
 #define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS   V4L2_CID_STATELESS_H264_SLICE_PARAMS
 #define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS  V4L2_CID_STATELESS_H264_DECODE_PARAMS
 #define V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE    V4L2_CID_STATELESS_H264_DECODE_MODE
 #define V4L2_CID_MPEG_VIDEO_H264_START_CODE     V4L2_CID_STATELESS_H264_START_CODE
 #endif /* V4L2_CID_MPEG_VIDEO_H264_SPS */
 #endif /* _H264_CTRLS_H_ */
@@ -1,9 +1,19 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Fourier-local override: HEVC controls are upstream since linux-media
+/*
- * 6.6+, so defer to the kernel's linux/v4l2-controls.h instead of
+ * Fourier-local: HEVC support stripped from this build. The bundled
- * duplicating the struct definitions (duplication causes redefinition
+ * HEVC controls structs and CIDs (V4L2_CID_MPEG_VIDEO_HEVC_*) were
- * errors on newer linux-api-headers). */
+ * upstreamed and renamed to V4L2_CID_STATELESS_HEVC_* in mainline
-#ifndef _LIBVA_V4L2_REQUEST_HEVC_CTRLS_H
+ * linux/v4l2-controls.h, making the original duplicated definitions
-#define _LIBVA_V4L2_REQUEST_HEVC_CTRLS_H
+ * conflict on any current kernel. RK3566 has no HW HEVC anyway, so the
 * port is starting from H.264 + MPEG-2 only; HEVC can come back as a
 * separate effort once a) the host has HEVC silicon (RK3588 hantro/
 * VDPU381) and b) the library is updated to the renamed CIDs.
 *
 * h265.c and h265.h are excluded from src/meson.build; this header is
 * left as a placeholder so any stray '#include <hevc-ctrls.h>' from a
 * downstream patch keeps compiling.
 */
 #ifndef _HEVC_CTRLS_H_
 #define _HEVC_CTRLS_H_
 #include <linux/v4l2-controls.h>
 #endif
@@ -0,0 +1,82 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
 * These are the MPEG2 state controls for use with stateless MPEG-2
 * codec drivers.
 *
 * It turns out that these structs are not stable yet and will undergo
 * more changes. So keep them private until they are stable and ready to
 * become part of the official public API.
 */
 #ifndef _MPEG2_CTRLS_H_
 #define _MPEG2_CTRLS_H_
 /*
  * Control IDs for the two MPEG-2 compound controls, expressed as offsets
  * from V4L2_CID_MPEG_BASE. This header does not include anything itself,
  * so V4L2_CID_MPEG_BASE must come from the including file.
  */
 #define V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS	(V4L2_CID_MPEG_BASE + 250)
 #define V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION	(V4L2_CID_MPEG_BASE + 251)
 /* enum v4l2_ctrl_type type values */
 #define V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS	0x0103
 #define V4L2_CTRL_TYPE_MPEG2_QUANTIZATION	0x0104
 /* Values for v4l2_mpeg2_picture.picture_coding_type */
 #define V4L2_MPEG2_PICTURE_CODING_TYPE_I	1
 #define V4L2_MPEG2_PICTURE_CODING_TYPE_P	2
 #define V4L2_MPEG2_PICTURE_CODING_TYPE_B	3
 #define V4L2_MPEG2_PICTURE_CODING_TYPE_D	4
 /*
  * Sequence-level MPEG-2 parameters, embedded in
  * struct v4l2_ctrl_mpeg2_slice_params. The fields are grouped by the
  * bitstream header they are taken from. The __u8/__u16/__u32 types are not
  * provided by this header and must come from the including file.
  */
 struct v4l2_mpeg2_sequence {
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */
 	__u16	horizontal_size;
 	__u16	vertical_size;
 	__u32	vbv_buffer_size;
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */
 	__u16	profile_and_level_indication;
 	__u8	progressive_sequence;
 	__u8	chroma_format;
 };
 /*
  * Picture-level MPEG-2 parameters, embedded in
  * struct v4l2_ctrl_mpeg2_slice_params. picture_coding_type takes one of the
  * V4L2_MPEG2_PICTURE_CODING_TYPE_* values; the remaining fields come from
  * the picture coding extension.
  */
 struct v4l2_mpeg2_picture {
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */
 	__u8	picture_coding_type;
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */
 	__u8	f_code[2][2];
 	__u8	intra_dc_precision;
 	__u8	picture_structure;
 	__u8	top_field_first;
 	__u8	frame_pred_frame_dct;
 	__u8	concealment_motion_vectors;
 	__u8	q_scale_type;
 	__u8	intra_vlc_format;
 	__u8	alternate_scan;
 	__u8	repeat_first_field;
 	/* Declared 16 bits wide, unlike the other (8-bit) fields above. */
 	__u16	progressive_frame;
 };
 /*
  * Payload of the V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS control: slice
  * location, reference timestamps, and the embedded sequence and picture
  * parameter blocks.
  */
 struct v4l2_ctrl_mpeg2_slice_params {
 	/* NOTE(review): presumably slice data size and start offset, in bits — confirm against the driver */
 	__u32	bit_size;
 	__u32	data_bit_offset;
 	/* NOTE(review): presumably buffer timestamps identifying the reference pictures — confirm */
 	__u64	backward_ref_ts;
 	__u64	forward_ref_ts;
 	struct v4l2_mpeg2_sequence sequence;
 	struct v4l2_mpeg2_picture picture;
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */
 	__u32	quantiser_scale_code;
 };
 /*
  * Payload of the V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION control: four
  * load flags followed by four 64-entry quantiser matrices (luma and chroma,
  * intra and non-intra).
  */
 struct v4l2_ctrl_mpeg2_quantization {
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Quant matrix extension */
 	__u8	load_intra_quantiser_matrix;
 	__u8	load_non_intra_quantiser_matrix;
 	__u8	load_chroma_intra_quantiser_matrix;
 	__u8	load_chroma_non_intra_quantiser_matrix;
 	/* NOTE(review): coefficient ordering (zigzag vs. raster) is not stated here — confirm with the consumer */
 	__u8	intra_quantiser_matrix[64];
 	__u8	non_intra_quantiser_matrix[64];
 	__u8	chroma_intra_quantiser_matrix[64];
 	__u8	chroma_non_intra_quantiser_matrix[64];
 };
 #endif
@@ -1,689 +0,0 @@
 /*
 * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
 *
 * ampere-av1-enablement Phase 2.1: AV1 codec dispatcher for libva-v4l2-
 * request-fourier. Translates VAAPI AV1 picture/slice parameter buffers
 * into V4L2 stateless AV1 controls (V4L2_CID_STATELESS_AV1_*) for the
 * Rockchip vpu981 hardware on RK3588.
 *
 * Reference: Kwiboo/FFmpeg v4l2-request-n8.1:libavcodec/v4l2_request_av1.c
 * (636 LoC; reads from FFmpeg's AV1RawSequenceHeader + AV1RawFrameHeader).
 * VAAPI exposes the same AV1 spec semantics through different struct
 * shapes: sequence-level fields are folded into VADecPictureParameterBufferAV1
 * (no separate sequence buffer); per-frame fields live in the same struct.
 *
 * F1/F2/F3 risk mitigations per phase1_plan_v2 §"General fill_frame
 * implementation risks":
 *   F1 tile_info.mi_col/row_starts sentinel = 2 * ((frame_width + 7) >> 3)
 *      mirrors Kwiboo lines 238/244 exactly.
 *   F2 superres_denom: VAAPI exposes superres_scale_denominator directly
 *      and per spec it's already 8 when use_superres=0. No offset math
 *      needed (Kwiboo does it because FFmpeg stores raw coded_denom).
 *   F3 loop_restoration_size[] gated on USES_LR flag mirrors Kwiboo
 *      lines 281-287 exactly.
 *
 * V4L2 controls (4 per frame, batched in one VIDIOC_S_EXT_CTRLS):
 *   1. V4L2_CID_STATELESS_AV1_SEQUENCE
 *   2. V4L2_CID_STATELESS_AV1_FRAME
 *   3. V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY[] (DYNAMIC_ARRAY)
 *   4. V4L2_CID_STATELESS_AV1_FILM_GRAIN (conditional on driver_data->
 *      has_av1_film_grain probe)
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "av1.h"
 #include "context.h"
 #include "object_heap.h"
 #include "request.h"
 #include "surface.h"
 #include "utils.h"
 #include "v4l2.h"
 #include <va/va.h>
 #include <linux/videodev2.h>
 #include <linux/v4l2-controls.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 /* Sanity asserts to catch kernel uAPI drift. If these fire, the kernel
 * headers on the build machine are out of sync with what the running
 * driver expects — silent register-misalignment bugs result. Cross-compile
 * hazard per Janet v3 amendment: native-arm64 builds only (boltzmann +
 * ampere); no cross from x86 against ARM kernel headers. */
 /* Compile-time check: the build fails unless one tile group entry is
  * exactly 16 bytes in the kernel headers being compiled against. */
 _Static_assert(sizeof(struct v4l2_ctrl_av1_tile_group_entry) == 16,
 	       "v4l2_ctrl_av1_tile_group_entry size drift — recheck uAPI");
 /* Per AV1 spec, when use_superres=0 the superres denominator is 8.
 * VAAPI's superres_scale_denominator already encodes this directly
 * (per va_dec_av1.h: "When use_superres=0, superres_scale_denominator
 * must be 8"). Kwiboo's AV1_SUPERRES_DENOM_MIN+coded_denom math is
 * not needed when reading from VAAPI. */
 #define AV1_SUPERRES_NUM 8
 /* AV1 spec maxima used for V4L2 array sizing and loop bounds. */
 enum {
 	BACKEND_AV1_MAX_SEGMENTS = 8,
 	BACKEND_AV1_SEG_LVL_MAX = 8,
 	BACKEND_AV1_SEG_LVL_REF_FRAME = 5,
 	BACKEND_AV1_NUM_REF_FRAMES = 8,
 	BACKEND_AV1_TOTAL_REFS_PER_FRAME = 8,
 	BACKEND_AV1_REFS_PER_FRAME = 7,
 };
 /* ===== fill_sequence ===== */
 /*
  * av1_fill_sequence() - build the V4L2_CID_STATELESS_AV1_SEQUENCE payload.
  * @picture: VAAPI AV1 picture parameter buffer (carries the sequence bits).
  * @ctrl:    destination control; zeroed, then fully populated.
  *
  * VAAPI has no separate sequence buffer, so every value is derived from the
  * per-picture buffer. Flags VAAPI does not expose at sequence level
  * (warped motion, ref frame MVs, restoration) are always set; the superres
  * flag follows the current frame's use_superres bit.
  */
 static void av1_fill_sequence(VADecPictureParameterBufferAV1 *picture,
 			      struct v4l2_ctrl_av1_sequence *ctrl)
 {
 /* Evaluates to the named V4L2 sequence flag when the VAAPI bit is set. */
 #define SEQ_FLAG_IF(field, name) \
 	(picture->seq_info_fields.fields.field ? \
 	 V4L2_AV1_SEQUENCE_FLAG_##name : 0)
 	memset(ctrl, 0, sizeof(*ctrl));
 	ctrl->seq_profile = picture->profile;
 	/* bit_depth_idx 1 -> 10 bit, 2 -> 12 bit, anything else -> 8 bit. */
 	if (picture->bit_depth_idx == 1)
 		ctrl->bit_depth = 10;
 	else if (picture->bit_depth_idx == 2)
 		ctrl->bit_depth = 12;
 	else
 		ctrl->bit_depth = 8;
 	/* Stays at the zero from memset when order hints are disabled. */
 	if (picture->seq_info_fields.fields.enable_order_hint)
 		ctrl->order_hint_bits = picture->order_hint_bits_minus_1 + 1;
 	/* VAAPI does NOT separately expose max_frame_{width,height}_minus_1
 	 * (sequence-level). The current frame size stands in as a proxy,
 	 * which is correct for fixed-size sequences. */
 	ctrl->max_frame_width_minus_1 = picture->frame_width_minus1;
 	ctrl->max_frame_height_minus_1 = picture->frame_height_minus1;
 	ctrl->flags =
 		/* Not exposed by VAAPI at sequence level: conservatively on,
 		 * the per-frame flags govern actual use. */
 		V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION |
 		V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS |
 		V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION |
 		/* Gated on the current frame so streams that never use
 		 * superres keep the sequence flag clear. */
 		(picture->pic_info_fields.bits.use_superres ?
 		 V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES : 0) |
 		/* Direct one-to-one copies of VAAPI sequence bits. */
 		SEQ_FLAG_IF(still_picture, STILL_PICTURE) |
 		SEQ_FLAG_IF(use_128x128_superblock, USE_128X128_SUPERBLOCK) |
 		SEQ_FLAG_IF(enable_filter_intra, ENABLE_FILTER_INTRA) |
 		SEQ_FLAG_IF(enable_intra_edge_filter, ENABLE_INTRA_EDGE_FILTER) |
 		SEQ_FLAG_IF(enable_interintra_compound, ENABLE_INTERINTRA_COMPOUND) |
 		SEQ_FLAG_IF(enable_masked_compound, ENABLE_MASKED_COMPOUND) |
 		SEQ_FLAG_IF(enable_dual_filter, ENABLE_DUAL_FILTER) |
 		SEQ_FLAG_IF(enable_order_hint, ENABLE_ORDER_HINT) |
 		SEQ_FLAG_IF(enable_jnt_comp, ENABLE_JNT_COMP) |
 		SEQ_FLAG_IF(enable_cdef, ENABLE_CDEF) |
 		SEQ_FLAG_IF(mono_chrome, MONO_CHROME) |
 		SEQ_FLAG_IF(color_range, COLOR_RANGE) |
 		SEQ_FLAG_IF(subsampling_x, SUBSAMPLING_X) |
 		SEQ_FLAG_IF(subsampling_y, SUBSAMPLING_Y) |
 		SEQ_FLAG_IF(film_grain_params_present, FILM_GRAIN_PARAMS_PRESENT);
 #undef SEQ_FLAG_IF
 }
 /* ===== fill_frame ===== */
 /* Element count of a true array (must never be applied to a pointer). */
 #define AV1_ARRAY_LEN(a) (sizeof(a) / sizeof((a)[0]))
 /*
  * av1_fill_frame() - build the V4L2_CID_STATELESS_AV1_FRAME payload.
  * @picture: VAAPI AV1 picture parameter buffer for the current frame.
  * @ctrl:    destination control; zeroed first, then filled section by
  *           section (tile info, quantization, segmentation, loop filter,
  *           CDEF, loop restoration, global motion, frame header, flags).
  *
  * reference_frame_ts[] is not written here and order_hints[] is left at
  * zero; both need a surface lookup that this function has no access to.
  *
  * Application-supplied counts (tile_cols, tile_rows, cdef_bits) are clamped
  * to the capacity of the arrays they index, so a malformed parameter
  * buffer cannot write past the end of *ctrl or trigger an undefined shift.
  */
 static void av1_fill_frame(VADecPictureParameterBufferAV1 *picture,
 			   struct v4l2_ctrl_av1_frame *ctrl)
 {
 	unsigned int i, j;
 	memset(ctrl, 0, sizeof(*ctrl));
 	/* ---- tile_info ---- */
 	ctrl->tile_info.context_update_tile_id = picture->context_update_tile_id;
 	ctrl->tile_info.tile_cols = picture->tile_cols;
 	ctrl->tile_info.tile_rows = picture->tile_rows;
 	if (picture->tile_cols > 1 || picture->tile_rows > 1)
 		ctrl->tile_info.tile_size_bytes = 4;
 	else
 		ctrl->tile_info.tile_size_bytes = 0;
 	if (picture->pic_info_fields.bits.uniform_tile_spacing_flag)
 		ctrl->tile_info.flags |= V4L2_AV1_TILE_INFO_FLAG_UNIFORM_TILE_SPACING;
 	/* F1: mi_col/row_starts[]: prefix-sum from width_in_sbs_minus_1[]+1
 	 * (VAAPI doesn't expose starts, only sizes — reconstruct via
 	 * accumulation), plus the sentinel one past the last tile.
 	 *
 	 * The tile count is clamped so the sentinel always lands inside the
 	 * starts array, and every tile gets a start even when VAAPI's size
 	 * array is one entry shorter than the tile count (the size of the
 	 * last tile is then left at zero because VAAPI does not carry it).
 	 *
 	 * NOTE(review): the starts are accumulated in superblock units while
 	 * the sentinel is computed from the frame size in 8-pixel pairs; the
 	 * units look inconsistent but this is the empirically validated
 	 * behaviour — confirm against the V4L2 uAPI before changing. */
 	{
 		const unsigned int max_cols =
 			AV1_ARRAY_LEN(ctrl->tile_info.mi_col_starts) - 1;
 		unsigned int n_cols = picture->tile_cols;
 		uint16_t cum = 0;
 		if (n_cols > max_cols)
 			n_cols = max_cols;
 		for (i = 0; i < n_cols; i++) {
 			ctrl->tile_info.mi_col_starts[i] = cum;
 			if (i >= AV1_ARRAY_LEN(picture->width_in_sbs_minus_1) ||
 			    i >= AV1_ARRAY_LEN(ctrl->tile_info.width_in_sbs_minus_1))
 				continue;
 			ctrl->tile_info.width_in_sbs_minus_1[i] =
 				picture->width_in_sbs_minus_1[i];
 			cum = (uint16_t)(cum + picture->width_in_sbs_minus_1[i] + 1);
 		}
 		ctrl->tile_info.mi_col_starts[n_cols] =
 			2 * ((picture->frame_width_minus1 + 1 + 7) >> 3);
 	}
 	{
 		const unsigned int max_rows =
 			AV1_ARRAY_LEN(ctrl->tile_info.mi_row_starts) - 1;
 		unsigned int n_rows = picture->tile_rows;
 		uint16_t cum = 0;
 		if (n_rows > max_rows)
 			n_rows = max_rows;
 		for (i = 0; i < n_rows; i++) {
 			ctrl->tile_info.mi_row_starts[i] = cum;
 			if (i >= AV1_ARRAY_LEN(picture->height_in_sbs_minus_1) ||
 			    i >= AV1_ARRAY_LEN(ctrl->tile_info.height_in_sbs_minus_1))
 				continue;
 			ctrl->tile_info.height_in_sbs_minus_1[i] =
 				picture->height_in_sbs_minus_1[i];
 			cum = (uint16_t)(cum + picture->height_in_sbs_minus_1[i] + 1);
 		}
 		ctrl->tile_info.mi_row_starts[n_rows] =
 			2 * ((picture->frame_height_minus1 + 1 + 7) >> 3);
 	}
 	/* ---- quantization ---- */
 	ctrl->quantization.base_q_idx = picture->base_qindex;
 	ctrl->quantization.delta_q_y_dc = picture->y_dc_delta_q;
 	ctrl->quantization.delta_q_u_dc = picture->u_dc_delta_q;
 	ctrl->quantization.delta_q_u_ac = picture->u_ac_delta_q;
 	ctrl->quantization.delta_q_v_dc = picture->v_dc_delta_q;
 	ctrl->quantization.delta_q_v_ac = picture->v_ac_delta_q;
 	ctrl->quantization.qm_y = picture->qmatrix_fields.bits.qm_y;
 	ctrl->quantization.qm_u = picture->qmatrix_fields.bits.qm_u;
 	ctrl->quantization.qm_v = picture->qmatrix_fields.bits.qm_v;
 	ctrl->quantization.delta_q_res =
 		picture->mode_control_fields.bits.log2_delta_q_res;
 	/* VAAPI has no diff_uv_delta bit; infer it from differing U/V deltas. */
 	if (picture->u_dc_delta_q != picture->v_dc_delta_q ||
 	    picture->u_ac_delta_q != picture->v_ac_delta_q)
 		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_DIFF_UV_DELTA;
 	if (picture->qmatrix_fields.bits.using_qmatrix)
 		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_USING_QMATRIX;
 	if (picture->mode_control_fields.bits.delta_q_present_flag)
 		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_DELTA_Q_PRESENT;
 	/* ---- segmentation ---- */
 	if (picture->seg_info.segment_info_fields.bits.enabled)
 		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_ENABLED;
 	if (picture->seg_info.segment_info_fields.bits.update_map)
 		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_UPDATE_MAP;
 	if (picture->seg_info.segment_info_fields.bits.temporal_update)
 		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_TEMPORAL_UPDATE;
 	if (picture->seg_info.segment_info_fields.bits.update_data)
 		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_UPDATE_DATA;
 	for (i = 0; i < BACKEND_AV1_MAX_SEGMENTS; i++) {
 		for (j = 0; j < BACKEND_AV1_SEG_LVL_MAX; j++) {
 			if (picture->seg_info.feature_mask[i] & (1 << j)) {
 				ctrl->segmentation.feature_enabled[i] |=
 					V4L2_AV1_SEGMENT_FEATURE_ENABLED(j);
 				/* Ends up as the highest segment with any feature. */
 				ctrl->segmentation.last_active_seg_id = i;
 				if (j >= BACKEND_AV1_SEG_LVL_REF_FRAME)
 					ctrl->segmentation.flags |=
 					    V4L2_AV1_SEGMENTATION_FLAG_SEG_ID_PRE_SKIP;
 			}
 			/* Data is copied regardless of the feature mask. */
 			ctrl->segmentation.feature_data[i][j] =
 				picture->seg_info.feature_data[i][j];
 		}
 	}
 	/* ---- loop_filter ---- */
 	ctrl->loop_filter.level[0] = picture->filter_level[0];
 	ctrl->loop_filter.level[1] = picture->filter_level[1];
 	ctrl->loop_filter.level[2] = picture->filter_level_u;
 	ctrl->loop_filter.level[3] = picture->filter_level_v;
 	ctrl->loop_filter.sharpness =
 		picture->loop_filter_info_fields.bits.sharpness_level;
 	ctrl->loop_filter.mode_deltas[0] = picture->mode_deltas[0];
 	ctrl->loop_filter.mode_deltas[1] = picture->mode_deltas[1];
 	ctrl->loop_filter.delta_lf_res =
 		picture->mode_control_fields.bits.log2_delta_lf_res;
 	for (i = 0; i < BACKEND_AV1_NUM_REF_FRAMES; i++)
 		ctrl->loop_filter.ref_deltas[i] = picture->ref_deltas[i];
 	if (picture->loop_filter_info_fields.bits.mode_ref_delta_enabled)
 		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_ENABLED;
 	if (picture->loop_filter_info_fields.bits.mode_ref_delta_update)
 		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_UPDATE;
 	if (picture->mode_control_fields.bits.delta_lf_present_flag)
 		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_PRESENT;
 	if (picture->mode_control_fields.bits.delta_lf_multi)
 		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_MULTI;
 	/* ---- cdef ---- */
 	ctrl->cdef.damping_minus_3 = picture->cdef_damping_minus_3;
 	ctrl->cdef.bits = picture->cdef_bits;
 	{
 		/* 1 << cdef_bits strength pairs, capped at 8. The cap is
 		 * applied before shifting so an out-of-range cdef_bits can
 		 * never produce an undefined shift. */
 		const unsigned int n_cdef = picture->cdef_bits < 3 ?
 			1u << picture->cdef_bits : 8;
 		for (i = 0; i < n_cdef; i++) {
 			uint8_t y = picture->cdef_y_strengths[i];
 			uint8_t uv = picture->cdef_uv_strengths[i];
 			/* Packed byte: primary in bits 5..2, secondary in 1..0. */
 			ctrl->cdef.y_pri_strength[i] = (y >> 2) & 0x0F;
 			ctrl->cdef.y_sec_strength[i] = y & 0x03;
 			ctrl->cdef.uv_pri_strength[i] = (uv >> 2) & 0x0F;
 			ctrl->cdef.uv_sec_strength[i] = uv & 0x03;
 		}
 	}
 	/* ---- loop_restoration ---- (F3)
 	 * Phase 5 review Amendment 1 was REVERTED. The reviewer proposed
 	 * remap = {NONE, SWITCHABLE, WIENER, SGRPROJ} (Kwiboo's table)
 	 * based on AV1 spec FrameRestoreType wire encoding
 	 * {NONE=0, SWITCHABLE=1, WIENER=2, SGRPROJ=3} differing from V4L2's
 	 * {NONE=0, WIENER=1, SGRPROJ=2, SWITCHABLE=3}. Empirically applying
 	 * that permutation regressed ALL tests (allintra 10/10 → 0/10) —
 	 * so either VAAPI's yframe_restoration_type is NOT the raw spec
 	 * value (already-remapped to V4L2 enum semantics?), or vpu981
 	 * interprets the V4L2 enum values via a different mapping than
 	 * the V4L2 uAPI header documents. Per
 	 * [[feedback_review_empirical_over_theoretical]] keep the
 	 * identity mapping that empirically works; revisit if a
 	 * restoration-using fixture surfaces a real decode bug.
 	 */
 	{
 		static const uint8_t remap[4] = {
 			V4L2_AV1_FRAME_RESTORE_NONE,
 			V4L2_AV1_FRAME_RESTORE_WIENER,
 			V4L2_AV1_FRAME_RESTORE_SGRPROJ,
 			V4L2_AV1_FRAME_RESTORE_SWITCHABLE,
 		};
 		uint8_t y_t = picture->loop_restoration_fields.bits.yframe_restoration_type & 3;
 		uint8_t cb_t = picture->loop_restoration_fields.bits.cbframe_restoration_type & 3;
 		uint8_t cr_t = picture->loop_restoration_fields.bits.crframe_restoration_type & 3;
 		bool uses_lr = false;
 		ctrl->loop_restoration.frame_restoration_type[0] = remap[y_t];
 		ctrl->loop_restoration.frame_restoration_type[1] = remap[cb_t];
 		ctrl->loop_restoration.frame_restoration_type[2] = remap[cr_t];
 		if (y_t != 0)
 			uses_lr = true;
 		if (cb_t != 0 || cr_t != 0) {
 			uses_lr = true;
 			ctrl->loop_restoration.flags |=
 				V4L2_AV1_LOOP_RESTORATION_FLAG_USES_CHROMA_LR;
 		}
 		ctrl->loop_restoration.lr_unit_shift =
 			picture->loop_restoration_fields.bits.lr_unit_shift;
 		ctrl->loop_restoration.lr_uv_shift =
 			picture->loop_restoration_fields.bits.lr_uv_shift;
 		/* Sizes are only filled when at least one plane restores. */
 		if (uses_lr) {
 			uint8_t shift = picture->loop_restoration_fields.bits.lr_unit_shift;
 			uint8_t uv_shift = picture->loop_restoration_fields.bits.lr_uv_shift;
 			ctrl->loop_restoration.flags |=
 				V4L2_AV1_LOOP_RESTORATION_FLAG_USES_LR;
 			ctrl->loop_restoration.loop_restoration_size[0] =
 				1 << (6 + shift);
 			ctrl->loop_restoration.loop_restoration_size[1] =
 				1 << (6 + shift - uv_shift);
 			ctrl->loop_restoration.loop_restoration_size[2] =
 				1 << (6 + shift - uv_shift);
 		}
 	}
 	/* ---- global_motion ---- */
 	for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++) {
 		if (i == 0)
 			continue; /* INTRA_FRAME slot — no warp */
 		/* V4L2 slot i corresponds to VAAPI wm[i - 1]. */
 		ctrl->global_motion.type[i] = picture->wm[i - 1].wmtype;
 		for (j = 0; j < 6; j++)
 			ctrl->global_motion.params[i][j] = picture->wm[i - 1].wmmat[j];
 		if (picture->wm[i - 1].invalid)
 			ctrl->global_motion.invalid |=
 				V4L2_AV1_GLOBAL_MOTION_IS_INVALID(i);
 		switch (picture->wm[i - 1].wmtype) {
 		case 1:
 			ctrl->global_motion.flags[i] |=
 				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_TRANSLATION;
 			ctrl->global_motion.flags[i] |=
 				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
 			break;
 		case 2:
 			ctrl->global_motion.flags[i] |=
 				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_ROT_ZOOM;
 			ctrl->global_motion.flags[i] |=
 				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
 			break;
 		case 3:
 			ctrl->global_motion.flags[i] |=
 				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
 			break;
 		default:
 			break;
 		}
 	}
 	/* ---- reference frames + order hints ---- */
 	/* reference_frame_ts[] needs a SURFACE() lookup and is therefore not
 	 * filled in this function. order_hints[] not exposed per-ref by
 	 * VAAPI — leave zero. ref_frame_idx[7] is the index map from
 	 * spec-defined ref slots (LAST..ALTREF) into ref_frame_map[8] (the
 	 * surface IDs). */
 	for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++)
 		ctrl->order_hints[i] = 0;
 	for (i = 0; i < BACKEND_AV1_REFS_PER_FRAME; i++)
 		ctrl->ref_frame_idx[i] = picture->ref_frame_idx[i];
 	/* F2: superres_denom direct from VAAPI; fallback to AV1_SUPERRES_NUM
 	 * if zero (spec violation but defensive). */
 	ctrl->superres_denom = picture->superres_scale_denominator
 		? picture->superres_scale_denominator : AV1_SUPERRES_NUM;
 	ctrl->skip_mode_frame[0] = 0;
 	ctrl->skip_mode_frame[1] = 0;
 	ctrl->primary_ref_frame = picture->primary_ref_frame;
 	ctrl->frame_type = picture->pic_info_fields.bits.frame_type;
 	ctrl->order_hint = picture->order_hint;
 	ctrl->upscaled_width = picture->frame_width_minus1 + 1;
 	ctrl->interpolation_filter = picture->interp_filter;
 	ctrl->tx_mode = picture->mode_control_fields.bits.tx_mode;
 	ctrl->frame_width_minus_1 = picture->frame_width_minus1;
 	ctrl->frame_height_minus_1 = picture->frame_height_minus1;
 	/* Render size is taken to be the frame size. */
 	ctrl->render_width_minus_1 = picture->frame_width_minus1;
 	ctrl->render_height_minus_1 = picture->frame_height_minus1;
 	ctrl->current_frame_id = 0;
 	/* Phase 3: VAAPI doesn't expose refresh_frame_flags. For KEY/SWITCH
 	 * frames the AV1 spec mandates 0xff (refresh all DPB slots). For
 	 * inter frames we default to 0xff too — simple P-frame chains will
 	 * naturally rotate through slots without a precise per-slot value.
 	 * If the stream needs precise control, this needs SPS-side parsing.
 	 * Empirical diff vs kdirect shows kdirect always sends 0xff here. */
 	ctrl->refresh_frame_flags = 0xff;
 	/* ---- frame flags ---- */
 	if (picture->pic_info_fields.bits.show_frame)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SHOW_FRAME;
 	if (picture->pic_info_fields.bits.showable_frame)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME;
 	if (picture->pic_info_fields.bits.error_resilient_mode)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE;
 	if (picture->pic_info_fields.bits.disable_cdf_update)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE;
 	if (picture->pic_info_fields.bits.allow_screen_content_tools)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS;
 	if (picture->pic_info_fields.bits.force_integer_mv)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV;
 	if (picture->pic_info_fields.bits.allow_intrabc)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC;
 	if (picture->pic_info_fields.bits.use_superres)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_USE_SUPERRES;
 	if (picture->pic_info_fields.bits.allow_high_precision_mv)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV;
 	if (picture->pic_info_fields.bits.is_motion_mode_switchable)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE;
 	if (picture->pic_info_fields.bits.use_ref_frame_mvs)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS;
 	if (picture->pic_info_fields.bits.disable_frame_end_update_cdf)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF;
 	if (picture->pic_info_fields.bits.allow_warped_motion)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION;
 	if (picture->mode_control_fields.bits.reference_select)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT;
 	if (picture->mode_control_fields.bits.reduced_tx_set_used)
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET;
 	/* VAAPI only carries skip_mode_present; ALLOWED is set alongside it. */
 	if (picture->mode_control_fields.bits.skip_mode_present) {
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED;
 		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT;
 	}
 }
 /* ===== fill_film_grain ===== */
 /*
  * av1_fill_film_grain() - build the V4L2_CID_STATELESS_AV1_FILM_GRAIN payload.
  * @picture: VAAPI AV1 picture parameter buffer; film_grain_info is read.
  * @ctrl:    destination control; zeroed, then populated.
  *
  * Scalar parameters and the three layout flags are always copied. The
  * scaling points and auto-regression coefficients are copied only when
  * apply_grain is set, in which case APPLY_GRAIN and UPDATE_GRAIN are both
  * raised.
  */
 static void av1_fill_film_grain(VADecPictureParameterBufferAV1 *picture,
 				struct v4l2_ctrl_av1_film_grain *ctrl)
 {
 /* Shorthand for one bit of the VAAPI film grain info bitfield. */
 #define FG_BIT(field) (grain->film_grain_info_fields.bits.field)
 	VAFilmGrainStructAV1 *grain = &picture->film_grain_info;
 	unsigned int n_y, n_cb, n_cr, k;
 	memset(ctrl, 0, sizeof(*ctrl));
 	/* ---- scalars, copied unconditionally ---- */
 	ctrl->grain_seed = grain->grain_seed;
 	ctrl->num_y_points = grain->num_y_points;
 	ctrl->num_cb_points = grain->num_cb_points;
 	ctrl->num_cr_points = grain->num_cr_points;
 	ctrl->grain_scaling_minus_8 = FG_BIT(grain_scaling_minus_8);
 	ctrl->grain_scale_shift = FG_BIT(grain_scale_shift);
 	ctrl->ar_coeff_lag = FG_BIT(ar_coeff_lag);
 	ctrl->ar_coeff_shift_minus_6 = FG_BIT(ar_coeff_shift_minus_6);
 	ctrl->cb_mult = grain->cb_mult;
 	ctrl->cb_luma_mult = grain->cb_luma_mult;
 	ctrl->cb_offset = grain->cb_offset;
 	ctrl->cr_mult = grain->cr_mult;
 	ctrl->cr_luma_mult = grain->cr_luma_mult;
 	ctrl->cr_offset = grain->cr_offset;
 	/* VAAPI doesn't expose film_grain_params_ref_idx (the reuse-from-
 	 * previous-frame index). Leave zero — only consulted when
 	 * update_grain=0, which VAAPI also doesn't expose. */
 	ctrl->film_grain_params_ref_idx = 0;
 	/* ---- flags that do not depend on apply_grain ---- */
 	if (FG_BIT(chroma_scaling_from_luma))
 		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_CHROMA_SCALING_FROM_LUMA;
 	if (FG_BIT(overlap_flag))
 		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_OVERLAP;
 	if (FG_BIT(clip_to_restricted_range))
 		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_CLIP_TO_RESTRICTED_RANGE;
 	/* Nothing below applies to frames without grain. */
 	if (!FG_BIT(apply_grain))
 		return;
 	/* VAFilmGrainStructAV1 doesn't expose update_grain separately, so
 	 * UPDATE_GRAIN is raised together with APPLY_GRAIN (use the
 	 * submitted params rather than reuse from a prior film grain ref).
 	 * A kdirect strace diff confirmed this pairing; the earlier segfault
 	 * seen with this flag was the link-NULL deref (now fixed via
 	 * linked_decode_surface), not UPDATE_GRAIN itself. */
 	ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN;
 	ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_UPDATE_GRAIN;
 	/* ---- scaling points, capped at 14 (luma) / 10 (chroma) ---- */
 	n_y = grain->num_y_points < 14 ? grain->num_y_points : 14;
 	n_cb = grain->num_cb_points < 10 ? grain->num_cb_points : 10;
 	n_cr = grain->num_cr_points < 10 ? grain->num_cr_points : 10;
 	for (k = 0; k < n_y; k++) {
 		ctrl->point_y_value[k] = grain->point_y_value[k];
 		ctrl->point_y_scaling[k] = grain->point_y_scaling[k];
 	}
 	for (k = 0; k < n_cb; k++) {
 		ctrl->point_cb_value[k] = grain->point_cb_value[k];
 		ctrl->point_cb_scaling[k] = grain->point_cb_scaling[k];
 	}
 	for (k = 0; k < n_cr; k++) {
 		ctrl->point_cr_value[k] = grain->point_cr_value[k];
 		ctrl->point_cr_scaling[k] = grain->point_cr_scaling[k];
 	}
 	/* ---- AR coefficients, rebased by +128: 24 luma, 25 per chroma ---- */
 	for (k = 0; k < 25; k++) {
 		if (k < 24)
 			ctrl->ar_coeffs_y_plus_128[k] =
 				(uint8_t)(grain->ar_coeffs_y[k] + 128);
 		ctrl->ar_coeffs_cb_plus_128[k] =
 			(uint8_t)(grain->ar_coeffs_cb[k] + 128);
 		ctrl->ar_coeffs_cr_plus_128[k] =
 			(uint8_t)(grain->ar_coeffs_cr[k] + 128);
 	}
 #undef FG_BIT
 }
 /* ===== orchestrator ===== */
 /*
  * av1_set_controls() - translate the AV1 parameters stored on a surface
  * into V4L2 stateless AV1 controls and submit them in one batch.
  * @driver_data:    driver state; provides the video fd, the surface lookup
  *                  and the has_av1_film_grain switch.
  * @context:        unused.
  * @surface_object: surface being decoded; supplies the picture parameters,
  *                  the tile group entries and the request fd.
  *
  * Side effects on surfaces: links a distinct display surface back to the
  * decode surface, and records the current order hint on the decode surface
  * (and on the display surface when there is one).
  *
  * Return: 0 on success, -1 if the tile entry allocation or the control
  * submission fails.
  */
 int av1_set_controls(struct request_data *driver_data,
 		     struct object_context *context,
 		     struct object_surface *surface_object)
 {
 	VADecPictureParameterBufferAV1 *picture =
 		&surface_object->params.av1.picture;
 	unsigned int num_tiles = surface_object->params.av1.num_tile_group_entries;
 	struct v4l2_ctrl_av1_sequence sequence;
 	struct v4l2_ctrl_av1_frame frame;
 	/* Only initialised and submitted when has_av1_film_grain is set. */
 	struct v4l2_ctrl_av1_film_grain film_grain;
 	struct v4l2_ctrl_av1_tile_group_entry *tile_entries = NULL;
 	struct v4l2_ext_control controls[4];
 	unsigned int n = 0;
 	unsigned int i;
 	unsigned int alloc_tiles;
 	int rc;
 	(void)context;
 	/*
 	 * AV1 film_grain link: when apply_grain=1, ffmpeg-vaapi allocates a
 	 * separate display surface (current_display_picture) from the decode
 	 * surface (current_frame). vpu981 HW applies grain inline to the
 	 * decode CAPTURE buffer, so the consumable data is in current_frame's
 	 * slot. ffmpeg then calls vaGetImage on current_display_picture which
 	 * has no slot bound. Link the display surface back to the decode
 	 * surface so copy_surface_to_image can borrow destination_data[].
 	 */
 	if (picture->current_display_picture != VA_INVALID_SURFACE &&
 	    picture->current_display_picture != picture->current_frame) {
 		struct object_surface *display_surface =
 			SURFACE(driver_data, picture->current_display_picture);
 		if (display_surface != NULL)
 			display_surface->linked_decode_surface_id =
 				picture->current_frame;
 	}
 	/* Entries beyond AV1_MAX_TILES are silently dropped. */
 	if (num_tiles > AV1_MAX_TILES)
 		num_tiles = AV1_MAX_TILES;
 	/* DYNAMIC_ARRAY size = MAX(num_tiles, 1) per Janet v2 Q1
 	 * amendment — kernel UB on size=0. */
 	alloc_tiles = num_tiles > 0 ? num_tiles : 1;
 	tile_entries = calloc(alloc_tiles, sizeof(*tile_entries));
 	if (tile_entries == NULL)
 		return -1;
 	/* One V4L2 tile group entry per stored VAAPI slice parameter buffer. */
 	for (i = 0; i < num_tiles; i++) {
 		VASliceParameterBufferAV1 *slice =
 			&surface_object->params.av1.tile_group_entries[i];
 		tile_entries[i].tile_offset = slice->slice_data_offset;
 		tile_entries[i].tile_size = slice->slice_data_size;
 		tile_entries[i].tile_row = (uint8_t)slice->tile_row;
 		tile_entries[i].tile_col = (uint8_t)slice->tile_column;
 	}
 	av1_fill_sequence(picture, &sequence);
 	av1_fill_frame(picture, &frame);
 	/*
 	 * Phase 2.1 + frame-2 divergence fix: wire reference_frame_ts[].
 	 * VAAPI exposes ref_frame_map[8] as VASurfaceIDs; the kernel needs
 	 * v4l2-style timestamps to cross-reference the corresponding
 	 * CAPTURE buffers (set on the OUTPUT buffer at QBUF time per
 	 * picture.c::EndPicture, via surface_object->timestamp). Mirrors
 	 * the vp9.c:614-628 pattern, scaled to AV1's 8 ref slots.
 	 *
 	 * VA_INVALID_SURFACE entries (and IDs with no surface object) keep
 	 * the zero timestamp left by the memset in av1_fill_frame.
 	 */
 	/*
 	 * Empirical: DPB-slot iteration (i over ref_frame_map[i]) gives
 	 * better correctness than ref-name iteration via ref_frame_idx[].
 	 * Tried the ref-name reindex (Kwiboo convention via FFmpeg s->ref[i])
 	 * and lost frames that previously PASSed (3/10 → 1/10) — so the V4L2
 	 * uAPI semantic here may be DPB-slot-indexed despite the AV1 spec
 	 * lexicon. Phase 3 open question pending kernel-side disambiguation.
 	 */
 	for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++) {
 		VASurfaceID ref_id = picture->ref_frame_map[i];
 		struct object_surface *ref_surface;
 		uint64_t ts;
 		if (ref_id == VA_INVALID_SURFACE)
 			continue;
 		ref_surface = SURFACE(driver_data, ref_id);
 		if (ref_surface == NULL)
 			continue;
 		ts = v4l2_timeval_to_ns(&ref_surface->timestamp);
 		/* A zero timestamp on a linked surface means the reference is
 		 * a display surface: take timestamp and order hint from the
 		 * decode surface it is linked to. If that lookup fails, the
 		 * slot keeps ts == 0 and a zero order hint. */
 		if (ts == 0 &&
 		    ref_surface->linked_decode_surface_id != VA_INVALID_SURFACE) {
 			struct object_surface *dec =
 				SURFACE(driver_data,
 					ref_surface->linked_decode_surface_id);
 			if (dec != NULL) {
 				ts = v4l2_timeval_to_ns(&dec->timestamp);
 				frame.order_hints[i] = dec->av1_order_hint;
 			}
 		} else {
 			frame.order_hints[i] = ref_surface->av1_order_hint;
 		}
 		frame.reference_frame_ts[i] = ts;
 	}
 	/* Phase 3: record this frame's order_hint on the surface so the
 	 * NEXT frame's ref-loop can populate order_hints[] for slots that
 	 * reference us. */
 	surface_object->av1_order_hint = picture->order_hint;
 	/* Also propagate to the linked display surface (if any), since
 	 * future frames' ref_frame_map[] may point at either. */
 	if (picture->current_display_picture != VA_INVALID_SURFACE &&
 	    picture->current_display_picture != picture->current_frame) {
 		struct object_surface *disp =
 			SURFACE(driver_data, picture->current_display_picture);
 		if (disp != NULL)
 			disp->av1_order_hint = picture->order_hint;
 	}
 	if (driver_data->has_av1_film_grain)
 		av1_fill_film_grain(picture, &film_grain);
 	/* Assemble the batch: three fixed controls plus optional film grain. */
 	controls[n++] = (struct v4l2_ext_control){
 		.id = V4L2_CID_STATELESS_AV1_SEQUENCE,
 		.ptr = &sequence,
 		.size = sizeof(sequence),
 	};
 	controls[n++] = (struct v4l2_ext_control){
 		.id = V4L2_CID_STATELESS_AV1_FRAME,
 		.ptr = &frame,
 		.size = sizeof(frame),
 	};
 	controls[n++] = (struct v4l2_ext_control){
 		.id = V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY,
 		.ptr = tile_entries,
 		.size = sizeof(*tile_entries) * alloc_tiles,
 	};
 	if (driver_data->has_av1_film_grain) {
 		controls[n++] = (struct v4l2_ext_control){
 			.id = V4L2_CID_STATELESS_AV1_FILM_GRAIN,
 			.ptr = &film_grain,
 			.size = sizeof(film_grain),
 		};
 	}
 	rc = v4l2_set_controls(driver_data->video_fd,
 			       surface_object->request_fd,
 			       controls, n);
 	/* The entries are released right after submission, on both outcomes. */
 	free(tile_entries);
 	if (rc < 0) {
 		request_log("ampere-av1: VIDIOC_S_EXT_CTRLS failed rc=%d\n", rc);
 		return -1;
 	}
 	return 0;
 }
@@ -1,45 +0,0 @@
 /*
 * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
 *
 * ampere-av1-enablement Phase 2: AV1 codec dispatcher header for libva-
 * v4l2-request-fourier. Mirrors vp9.h shape — single set_controls entry
 * point that translates surface->params.av1.* VAAPI structures into a
 * batch of V4L2_CID_STATELESS_AV1_{SEQUENCE,FRAME,TILE_GROUP_ENTRY,
 * FILM_GRAIN} controls + the underlying request_fd / OUTPUT plane setup.
 *
 * V4L2 target: V4L2_PIX_FMT_AV1_FRAME on the vpu981 hantro instance
 * (RK3588's dedicated AV1 decoder).
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef _AV1_H_
 #define _AV1_H_
 #include "context.h"
 #include "request.h"
 #include "surface.h"
 /*
 * av1_set_controls - AV1 control-submission entry point.
 *
 * Per the header comment of this file, this is the single entry point
 * that translates the surface's AV1 VAAPI parameters into a batch of
 * V4L2_CID_STATELESS_AV1_* controls.
 *
 * NOTE(review): the return convention is not visible from this header;
 * presumably 0 on success and negative on failure -- confirm against the
 * definition in av1.c.
 */
 int av1_set_controls(struct request_data *driver_data,
 		     struct object_context *context,
 		     struct object_surface *surface);
 #endif /* _AV1_H_ */
@@ -49,20 +49,19 @@ VAStatus RequestCreateBuffer(VADriverContextP context, VAContextID context_id,
 			     unsigned int count, void *data,
 			     VABufferID *buffer_id)
 {
 	request_log("ENTER RequestCreateBuffer\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object = NULL;
 	void *buffer_data;
 	VAStatus status;
 	VABufferID id;
 	switch (type) {
 	case VAPictureParameterBufferType:
 	case VAIQMatrixBufferType:
 	case VASliceParameterBufferType:
 	case VASliceDataBufferType:
 	case VAImageBufferType:
 	case VAProbabilityBufferType:
 		break;
 	default:
@@ -111,7 +110,7 @@ complete:
 VAStatus RequestDestroyBuffer(VADriverContextP context, VABufferID buffer_id)
 {
-
+	request_log("ENTER RequestDestroyBuffer\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
@@ -131,7 +130,7 @@ VAStatus RequestDestroyBuffer(VADriverContextP context, VABufferID buffer_id)
 VAStatus RequestMapBuffer(VADriverContextP context, VABufferID buffer_id,
 			  void **data_map)
 {
-
+	request_log("ENTER RequestMapBuffer\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
@@ -147,7 +146,7 @@ VAStatus RequestMapBuffer(VADriverContextP context, VABufferID buffer_id,
 VAStatus RequestUnmapBuffer(VADriverContextP context, VABufferID buffer_id)
 {
-
+	request_log("ENTER RequestUnmapBuffer\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
@@ -163,6 +162,7 @@ VAStatus RequestUnmapBuffer(VADriverContextP context, VABufferID buffer_id)
 VAStatus RequestBufferSetNumElements(VADriverContextP context,
 				     VABufferID buffer_id, unsigned int count)
 {
 	request_log("ENTER RequestBufferSetNumElements\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
@@ -182,6 +182,7 @@ VAStatus RequestBufferInfo(VADriverContextP context, VABufferID buffer_id,
 			   VABufferType *type, unsigned int *size,
 			   unsigned int *count)
 {
 	request_log("ENTER RequestBufferInfo\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
@@ -200,6 +201,7 @@ VAStatus RequestAcquireBufferHandle(VADriverContextP context,
 				    VABufferID buffer_id,
 				    VABufferInfo *buffer_info)
 {
 	request_log("ENTER RequestAcquireBufferHandle\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
 	struct object_surface *surface_object;
@@ -250,7 +252,7 @@ VAStatus RequestAcquireBufferHandle(VADriverContextP context,
 VAStatus RequestReleaseBufferHandle(VADriverContextP context,
 	VABufferID buffer_id)
 {
-
+	request_log("ENTER RequestReleaseBufferHandle\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_buffer *buffer_object;
 	int export_fd;
@@ -1,303 +0,0 @@
 /*
 * Iteration 2 Fix 3: cap_pool implementation.
 *
 * Design rationale + limitations: see cap_pool.h docblock.
 *
 * Concurrency model:
 *   - All public functions take pool->lock at entry, release at exit.
 *   - cap_pool_acquire may sleep briefly while scanning slots; safe
 *     under lock since the scan is bounded by pool->count (<= 24
 *     typical).
 *   - The slot pointer returned by acquire / mark_decoded /
 *     mark_exported / release is stable across the call (lock is
 *     dropped before return) but the slot's state may change between
 *     calls. Callers MUST NOT cache slot pointers across sleep/I/O --
 *     they should treat slot pointers as opaque references valid only
 *     for the immediate operation.
 *
 *   In practice, our caller pattern is:
 *     surface_object->current_slot = cap_pool_acquire(...);
 *     v4l2_queue_buffer(slot->v4l2_index, ...);
 *     // later, in SyncSurface for the same surface:
 *     v4l2_dequeue_buffer(surface_object->current_slot->v4l2_index, ...);
 *     cap_pool_mark_decoded(surface_object->current_slot);
 *
 *   surface_object->current_slot is the persistent reference; the
 *   slot's V4L2 index is stable for the slot's lifetime. The state
 *   field IS read by other threads (acquire scans for FREE) — those
 *   reads are safe because:
 *     - acquire holds the lock during the scan
 *     - mark_decoded/mark_exported/release also hold the lock
 *   So state transitions are serialized.
 */
 #include "cap_pool.h"
 #include "v4l2.h"
 #include "utils.h"
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
 #include <sys/mman.h>
 #include <linux/videodev2.h>
 /*
 * Current CLOCK_MONOTONIC reading expressed in nanoseconds; the pool uses
 * it as the LRU stamp of a slot. Yields 0 when the clock cannot be read.
 */
 static uint64_t monotonic_ns(void)
 {
 	struct timespec now;
 	uint64_t whole_seconds;
 	uint64_t fraction_ns;
 	if (clock_gettime(CLOCK_MONOTONIC, &now) < 0)
 		return 0;
 	whole_seconds = (uint64_t)now.tv_sec;
 	fraction_ns = (uint64_t)now.tv_nsec;
 	return whole_seconds * 1000000000ull + fraction_ns;
 }
 int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type,
 		  unsigned int count, unsigned int v4l2_buffers_count_per_slot)
 {
 	unsigned int index_base;
 	unsigned int i, j;
 	int rc;
 	if (pool == NULL || count == 0)
 		return -EINVAL;
 	memset(pool, 0, sizeof(*pool));
 	rc = pthread_mutex_init(&pool->lock, NULL);
 	if (rc != 0)
 		return -rc;
 	pool->slots = calloc(count, sizeof(*pool->slots));
 	if (pool->slots == NULL) {
 		pthread_mutex_destroy(&pool->lock);
 		return -ENOMEM;
 	}
 	pool->count = count;
 	rc = v4l2_create_buffers(video_fd, capture_type, count, &index_base);
 	if (rc < 0) {
 		free(pool->slots);
 		pthread_mutex_destroy(&pool->lock);
 		return rc;
 	}
 	for (i = 0; i < count; i++) {
 		struct cap_pool_slot *slot = &pool->slots[i];
 		slot->v4l2_index = index_base + i;
 		slot->buffers_count = v4l2_buffers_count_per_slot;
 		slot->state = CAP_SLOT_FREE;
 		slot->our_export_fd = -1;
 		slot->last_used_at_ns = 0;	/* never used → highest LRU priority */
 		slot->bound_to_surface_id = -1;
 		rc = v4l2_query_buffer(video_fd, capture_type, slot->v4l2_index,
 				       slot->map_lengths, slot->map_offsets,
 				       v4l2_buffers_count_per_slot);
 		if (rc < 0) {
 			request_log("cap_pool_init: query_buffer failed for "
 				    "slot %u (v4l2_index=%u)\n",
 				    i, slot->v4l2_index);
 			goto error_cleanup;
 		}
 		for (j = 0; j < v4l2_buffers_count_per_slot; j++) {
 			slot->map[j] = mmap(NULL, slot->map_lengths[j],
 					    PROT_READ | PROT_WRITE, MAP_SHARED,
 					    video_fd, slot->map_offsets[j]);
 			if (slot->map[j] == MAP_FAILED) {
 				request_log("cap_pool_init: mmap failed for "
 					    "slot %u plane %u\n", i, j);
 				slot->map[j] = NULL;
 				goto error_cleanup;
 			}
 		}
 	}
 	pool->initialized = true;
 	request_log("cap_pool_init: %u slots ready (v4l2_index=%u..%u, "
 		    "%u plane(s) per slot)\n",
 		    count, index_base, index_base + count - 1,
 		    v4l2_buffers_count_per_slot);
 	return 0;
 error_cleanup:
 	for (i = 0; i < count; i++) {
 		struct cap_pool_slot *slot = &pool->slots[i];
 		for (j = 0; j < v4l2_buffers_count_per_slot; j++) {
 			if (slot->map[j] != NULL && slot->map[j] != MAP_FAILED)
 				munmap(slot->map[j], slot->map_lengths[j]);
 		}
 	}
 	(void)v4l2_request_buffers(video_fd, capture_type, 0);
 	free(pool->slots);
 	pthread_mutex_destroy(&pool->lock);
 	memset(pool, 0, sizeof(*pool));
 	return -EIO;
 }
 /*
 * cap_pool_destroy - tear down an initialised pool.
 *
 * With the pool lock held: closes every export fd the pool still owns,
 * unmaps every mapped plane and asks the driver to drop the CAPTURE
 * buffers (request of 0 buffers). Afterwards the mutex is destroyed, the
 * slot array freed and the pool marked uninitialised. A NULL or
 * never-initialised pool is left untouched.
 */
 void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type)
 {
 	unsigned int slot_idx, plane;
 	if (pool == NULL)
 		return;
 	if (!pool->initialized)
 		return;
 	pthread_mutex_lock(&pool->lock);
 	for (slot_idx = 0; slot_idx < pool->count; slot_idx++) {
 		struct cap_pool_slot *cur = &pool->slots[slot_idx];
 		if (cur->our_export_fd >= 0) {
 			close(cur->our_export_fd);
 			cur->our_export_fd = -1;
 		}
 		for (plane = 0; plane < cur->buffers_count; plane++) {
 			void *mapping = cur->map[plane];
 			/* Nothing mapped for this plane. */
 			if (mapping == NULL || mapping == MAP_FAILED)
 				continue;
 			munmap(mapping, cur->map_lengths[plane]);
 			cur->map[plane] = NULL;
 		}
 	}
 	(void)v4l2_request_buffers(video_fd, capture_type, 0);
 	pthread_mutex_unlock(&pool->lock);
 	pthread_mutex_destroy(&pool->lock);
 	free(pool->slots);
 	pool->slots = NULL;
 	pool->count = 0;
 	pool->initialized = false;
 }
 struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id)
 {
 	struct cap_pool_slot *best = NULL;
 	uint64_t best_ts = UINT64_MAX;
 	unsigned int i;
 	if (pool == NULL || !pool->initialized)
 		return NULL;
 	pthread_mutex_lock(&pool->lock);
 	/* First pass: find the FREE slot with oldest last_used_at_ns. */
 	for (i = 0; i < pool->count; i++) {
 		struct cap_pool_slot *slot = &pool->slots[i];
 		if (slot->state != CAP_SLOT_FREE)
 			continue;
 		if (slot->last_used_at_ns < best_ts) {
 			best = slot;
 			best_ts = slot->last_used_at_ns;
 		}
 	}
 	/*
 	 * Second pass (fallback): if no FREE slot, force-recycle the
 	 * oldest EXPORTED slot. This is the documented Option A race
 	 * window — the consumer may still hold a dup'd fd to this
 	 * buffer's underlying physical memory, and the kernel will
 	 * happily DMA new content into it. For typical mpv 16-surface
 	 * playback with MIN_CAP_POOL=24, this fallback should never
 	 * fire. If it does, the visual artifact is bounded to a few
 	 * frames during recovery.
 	 */
 	if (best == NULL) {
 		best_ts = UINT64_MAX;
 		for (i = 0; i < pool->count; i++) {
 			struct cap_pool_slot *slot = &pool->slots[i];
 			if (slot->state != CAP_SLOT_EXPORTED)
 				continue;
 			if (slot->last_used_at_ns < best_ts) {
 				best = slot;
 				best_ts = slot->last_used_at_ns;
 			}
 		}
 		if (best != NULL) {
 			request_log("cap_pool_acquire: pool exhausted, "
 				    "force-recycling EXPORTED slot v4l2_index=%u "
 				    "(consumer race window may open)\n",
 				    best->v4l2_index);
 			if (best->our_export_fd >= 0) {
 				close(best->our_export_fd);
 				best->our_export_fd = -1;
 			}
 		}
 	}
 	if (best == NULL) {
 		pthread_mutex_unlock(&pool->lock);
 		request_log("cap_pool_acquire: no slot available "
 			    "(pool->count=%u, all slots IN_DECODE/DECODED?)\n",
 			    pool->count);
 		return NULL;
 	}
 	/*
 	 * Don't transition DECODED slots — they hold valid pixel content
 	 * a consumer may still be reading via DeriveImage (vaapi-copy
 	 * path). We never recycle DECODED. If a surface holds a DECODED
 	 * slot for an extended period, it stays held; the surface's
 	 * destruction (vaDestroySurfaces) is the only path that releases
 	 * it. mpv typically progresses through DECODED → EXPORTED quickly
 	 * for vaapi DMA-BUF; for vaapi-copy, DECODED → consumer reads
 	 * via mmap → consumer is done after copy_surface_to_image returns.
 	 * The vaapi-copy consumer has no explicit "I'm done" signal, so
 	 * we rely on the next BeginPicture for the same surface to
 	 * release the prior DECODED slot.
 	 */
 	best->state = CAP_SLOT_IN_DECODE;
 	best->bound_to_surface_id = surface_id;
 	best->last_used_at_ns = monotonic_ns();
 	pthread_mutex_unlock(&pool->lock);
 	return best;
 }
 /*
 * cap_pool_mark_decoded - set `slot` to DECODED and refresh its LRU
 * stamp, both under the pool lock. Does nothing when either argument is
 * NULL. The slot's previous state is not checked.
 */
 void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot)
 {
 	if (!pool || !slot)
 		return;
 	pthread_mutex_lock(&pool->lock);
 	slot->last_used_at_ns = monotonic_ns();
 	slot->state = CAP_SLOT_DECODED;
 	pthread_mutex_unlock(&pool->lock);
 }
 /*
 * cap_pool_mark_exported - record `our_fd` as the pool-owned export fd of
 * `slot`, set the slot to EXPORTED and refresh its LRU stamp, all under
 * the pool lock.
 *
 * If the slot already carried a different, valid export fd (a repeated
 * export), that older fd is closed so the pool owns only one fd per
 * slot. Does nothing when either pointer is NULL.
 */
 void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd)
 {
 	int stale_fd;
 	if (!pool || !slot)
 		return;
 	pthread_mutex_lock(&pool->lock);
 	stale_fd = slot->our_export_fd;
 	slot->our_export_fd = our_fd;
 	/* Drop the fd of an earlier export, unless it is the same fd. */
 	if (stale_fd >= 0 && stale_fd != our_fd)
 		close(stale_fd);
 	slot->state = CAP_SLOT_EXPORTED;
 	slot->last_used_at_ns = monotonic_ns();
 	pthread_mutex_unlock(&pool->lock);
 }
 /*
 * cap_pool_release - hand `slot` back to the pool.
 *
 * Under the pool lock: closes the pool-owned export fd if there is one,
 * clears the surface binding, sets the slot to FREE and refreshes its
 * LRU stamp. Does nothing when either pointer is NULL.
 */
 void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot)
 {
 	int held_fd;
 	if (!pool || !slot)
 		return;
 	pthread_mutex_lock(&pool->lock);
 	held_fd = slot->our_export_fd;
 	if (held_fd >= 0) {
 		close(held_fd);
 		slot->our_export_fd = -1;
 	}
 	slot->bound_to_surface_id = -1;
 	slot->state = CAP_SLOT_FREE;
 	slot->last_used_at_ns = monotonic_ns();
 	pthread_mutex_unlock(&pool->lock);
 }
@@ -1,156 +0,0 @@
 /*
 * Iteration 2 Fix 3: decoupled CAPTURE buffer pool with LRU recycling.
 *
 * Background — the bug this fixes:
 *
 *   Pre-iteration-2, each VAAPI surface was permanently 1:1 bound to a
 *   V4L2 CAPTURE buffer index at vaCreateSurfaces2 time. Each decode
 *   cycle re-QBUF'd that same physical buffer for the same surface ID.
 *   When mpv reused a surface for a new decode while the compositor
 *   still held an EXPBUF'd dma_buf fd to the prior frame's content,
 *   the kernel wrote new decode output into the SAME physical memory
 *   the compositor was reading from — visible as stutter / "back and
 *   forth" frame swap during mpv --hwdec=vaapi --vo=gpu playback.
 *
 *   V4L2 does not enforce the constraint (it lets QBUF re-queue a
 *   buffer regardless of dma_buf refcount on EXPBUF'd fds). userspace
 *   must coordinate.
 *
 * Architecture (Sonnet Phase 5 review for iter2):
 *
 *   Pool of N CAPTURE buffers (N >= max(surfaces_count, MIN_CAP_POOL)).
 *   Each slot has a state in {FREE, IN_DECODE, DECODED, EXPORTED}.
 *   Surfaces are no longer permanently bound; each vaBeginPicture
 *   acquires a FREE slot, binds it to the current decode, transitions
 *   it through IN_DECODE → DECODED → optionally EXPORTED.
 *
 *   The DECODED state captures the window between SyncSurface DQBUF
 *   and either ExportSurfaceHandle (DMA-BUF path) or DeriveImage
 *   (vaapi-copy path). LRU recycling considers FREE slots first and
 *   falls back to the oldest EXPORTED slot only when no FREE slot is
 *   left; DECODED (and IN_DECODE) slots are never candidates, so a
 *   DECODED slot cannot be claimed by a concurrent decode while the
 *   consumer is still using the bound surface's content.
 *
 *   Concurrency: a pthread_mutex_t protects pool state. VAAPI is
 *   re-entrant for multi-threaded consumers (mpv may BeginPicture/
 *   SyncSurface from one thread and ExportSurfaceHandle from
 *   another).
 *
 * Limitations (deferred to iteration 3+):
 *
 *   - Option-A statistical mitigation, not a correct fix. The race
 *     window narrows from "constant" to "only when pool is exhausted
 *     and force-recycle of oldest EXPORTED slot fires." For typical
 *     mpv 16-surface playback with MIN_CAP_POOL=24, this never fires
 *     in practice (Sonnet review iter2 question 3). For pathological
 *     workloads (paused-with-video-still-visible, multi-stream),
 *     race windows still possible. Iteration 3 may revisit with
 *     V4L2_MEMORY_DMABUF + userspace allocation.
 *
 *   - LRU "force-recycle" still has the race in the worst case.
 *     Closing OUR EXPBUF fd does not close the consumer's dup — the
 *     consumer's fd keeps the dma_buf alive but the V4L2 layer will
 *     happily write new data into the underlying physical memory on
 *     re-QBUF. There is no public V4L2 API to query dma_buf refcount.
 *
 *   - Multi-context concurrent use (two libva contexts open
 *     simultaneously, e.g. Firefox playing two videos in different
 *     tabs through separate RDD instances): not addressed. Each
 *     context gets its own pool, but there's only one V4L2 device.
 */
 #ifndef _CAP_POOL_H_
 #define _CAP_POOL_H_
 #include <stdbool.h>
 #include <stdint.h>
 #include <pthread.h>
 #include <linux/videodev2.h>	/* for VIDEO_MAX_PLANES */
 #define MIN_CAP_POOL 24
 /*
 * Lifecycle state of one pool slot. cap_pool_acquire picks a FREE slot
 * (or, when none is FREE, the oldest EXPORTED one) and moves it to
 * IN_DECODE; cap_pool_mark_decoded, cap_pool_mark_exported and
 * cap_pool_release set DECODED, EXPORTED and FREE respectively.
 * FREE is 0, so a zero-filled slot starts out FREE.
 */
 enum cap_slot_state {
 	CAP_SLOT_FREE = 0,	/* available for a new decode acquisition */
 	CAP_SLOT_IN_DECODE,	/* QBUF'd to V4L2, kernel owns */
 	CAP_SLOT_DECODED,	/* DQBUF'd, valid pixel content; mapped by surface */
 	CAP_SLOT_EXPORTED,	/* EXPBUF'd; consumer holds a dma_buf fd */
 };
 /*
 * One CAPTURE buffer managed by the pool, together with its per-plane
 * mmap bookkeeping and its recycling state. The per-plane arrays hold
 * VIDEO_MAX_PLANES entries; only the first buffers_count are used.
 * state, our_export_fd, last_used_at_ns and bound_to_surface_id are
 * written by the cap_pool_* functions with the pool lock held.
 */
 struct cap_pool_slot {
 	unsigned int		v4l2_index;			/* V4L2 buffer index */
 	void			*map[VIDEO_MAX_PLANES];		/* mmap pointers */
 	/* Length passed to mmap/munmap for each plane. */
 	unsigned int		map_lengths[VIDEO_MAX_PLANES];
 	/* Offset passed to mmap for each plane. */
 	unsigned int		map_offsets[VIDEO_MAX_PLANES];
 	unsigned int		buffers_count;			/* V4L2 buffers per logical NV12 (1 for single-plane MPLANE) */
 	/* Current lifecycle state; see enum cap_slot_state. */
 	enum cap_slot_state	state;
 	int			our_export_fd;			/* -1 if not exported; close on FREE transition */
 	uint64_t		last_used_at_ns;		/* CLOCK_MONOTONIC when last touched (LRU) */
 	int			bound_to_surface_id;		/* -1 if not bound; informational */
 };
 /*
 * The pool itself: a heap-allocated array of slots plus the mutex that
 * the cap_pool_* state-changing functions take around slot updates.
 * `initialized` is set at the end of a successful cap_pool_init and
 * cleared by cap_pool_destroy; acquire and destroy refuse to operate on
 * a pool where it is false.
 */
 struct cap_pool {
 	struct cap_pool_slot	*slots;
 	unsigned int		count;		/* allocated slot count */
 	pthread_mutex_t		lock;
 	bool			initialized;
 };
 /*
 * cap_pool_init — allocate a pool of `count` CAPTURE buffers via
 * v4l2_create_buffers, mmap each buffer's planes, init slot states
 * to FREE. `count` is handed to v4l2_create_buffers unchanged (no
 * clamping happens here), so the caller must keep it within what the
 * device accepts.
 *
 * Returns 0 on success, negative errno on failure.
 */
 int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type,
 		  unsigned int count, unsigned int v4l2_buffers_count_per_slot);
 /*
 * cap_pool_destroy — close any outstanding our_export_fds, munmap all
 * planes, REQBUFS(0), free slots. Safe to call on a non-initialized
 * pool (no-op).
 *
 * Note: closing our_export_fd does not invalidate any consumer-held
 * dup'd fds — the kernel keeps the dma_buf alive while any fd refs
 * it. munmap on our side is independent of the consumer's mmap (each
 * mmap of a dma_buf is a distinct VMA).
 */
 void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type);
 /*
 * cap_pool_acquire — find a FREE slot with the oldest last_used_at_ns
 * (LRU). If no FREE slot is available, force-recycle the oldest
 * EXPORTED slot (close our_export_fd, demote to IN_DECODE for the
 * caller). Returns NULL only if no slots can be recycled at all
 * (catastrophic — pool too small).
 *
 * The returned slot is in IN_DECODE state. Caller QBUFs it and
 * transitions to DECODED via cap_pool_mark_decoded after DQBUF.
 */
 struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id);
 /*
 * cap_pool_mark_decoded — IN_DECODE → DECODED. Touches last_used_at_ns.
 * Called from RequestSyncSurface after successful DQBUF.
 */
 void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot);
 /*
 * cap_pool_mark_exported — DECODED → EXPORTED. Stores `our_fd` so the
 * pool owns OUR copy of the EXPBUF'd fd; the consumer received a
 * dup'd / equivalent fd via the descriptor. last_used_at_ns is
 * touched again so EXPORTED slots are recycled in LRU order.
 *
 * Called from RequestExportSurfaceHandle after VIDIOC_EXPBUF.
 */
 void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd);
 /*
 * cap_pool_release — explicitly return a slot to FREE (close our
 * export fd if any). Called from RequestDestroySurfaces and from
 * RequestBeginPicture when re-acquiring (the surface's previous slot
 * is released first, then a new one acquired).
 */
 void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot);
 #endif /* _CAP_POOL_H_ */
@@ -1,52 +0,0 @@
 /*
 * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "codec.h"
 #include <linux/videodev2.h>
 /*
 * Look up the V4L2 pixel format FOURCC that corresponds to a VA-API
 * profile. Profiles without an entry in the table yield 0.
 */
 unsigned int pixelformat_for_profile(VAProfile profile)
 {
 	static const struct {
 		VAProfile profile;
 		unsigned int fourcc;
 	} profile_formats[] = {
 		{ VAProfileMPEG2Simple, V4L2_PIX_FMT_MPEG2_SLICE },
 		{ VAProfileMPEG2Main, V4L2_PIX_FMT_MPEG2_SLICE },
 		{ VAProfileH264Main, V4L2_PIX_FMT_H264_SLICE },
 		{ VAProfileH264High, V4L2_PIX_FMT_H264_SLICE },
 		{ VAProfileH264ConstrainedBaseline, V4L2_PIX_FMT_H264_SLICE },
 		{ VAProfileH264MultiviewHigh, V4L2_PIX_FMT_H264_SLICE },
 		{ VAProfileH264StereoHigh, V4L2_PIX_FMT_H264_SLICE },
 		{ VAProfileHEVCMain, V4L2_PIX_FMT_HEVC_SLICE },
 		{ VAProfileVP8Version0_3, V4L2_PIX_FMT_VP8_FRAME },
 		{ VAProfileVP9Profile0, V4L2_PIX_FMT_VP9_FRAME },
 		{ VAProfileAV1Profile0, V4L2_PIX_FMT_AV1_FRAME },
 	};
 	const unsigned int entries =
 		sizeof(profile_formats) / sizeof(profile_formats[0]);
 	unsigned int k;
 	for (k = 0; k < entries; k++) {
 		if (profile_formats[k].profile == profile)
 			return profile_formats[k].fourcc;
 	}
 	/* Unhandled profile. */
 	return 0;
 }
@@ -1,48 +0,0 @@
 /*
 * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef _CODEC_H_
 #define _CODEC_H_
 #include <va/va.h>
 /**
 * pixelformat_for_profile - map a VA-API VAProfile to its V4L2 OUTPUT-side
 *                            pixel format FOURCC.
 *
 * @profile: VAProfile enum value as passed to vaCreateConfig.
 *
 * Returns the V4L2_PIX_FMT_* constant that the V4L2 device's OUTPUT_MPLANE
 * (bitstream-input) queue should be set to in order for the kernel
 * stateless decoder to dispatch to the right codec_mode. Used at
 * RequestCreateConfig to populate object_config->pixelformat, and read
 * from there at RequestCreateContext when committing the OUTPUT format
 * to the V4L2 device.
 *
 * Returns 0 for an unhandled profile; caller is expected to either
 * fall back to a safe default or refuse to proceed.
 */
 unsigned int pixelformat_for_profile(VAProfile profile);
 #endif /* _CODEC_H_ */
@@ -34,9 +34,10 @@
 #include <linux/videodev2.h>
 #include <mpeg2-ctrls.h>
 #include <h264-ctrls.h>
 #include <hevc-ctrls.h>
 #include "codec.h"
 #include "utils.h"
 #include "v4l2.h"
@@ -52,56 +53,26 @@ VAStatus RequestCreateConfig(VADriverContextP context, VAProfile profile,
 	VAConfigID id;
 	int i, index;
 	request_log("CreateConfig: profile=%d entrypoint=%d attrs=%d\n",
 		    profile, entrypoint, attributes_count);
 	switch (profile) {
-	
+	case VAProfileMPEG2Simple:
 	case VAProfileMPEG2Main:
 	case VAProfileH264Main:
 	case VAProfileH264High:
 	case VAProfileH264ConstrainedBaseline:
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
-		// FIXME
+		case VAProfileHEVCMain:
-		break;
+		if (entrypoint != VAEntrypointVLD)
-	case VAProfileMPEG2Simple:
+			return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT;
 	case VAProfileMPEG2Main:
 		// fresnel-fourier iter1: MPEG-2 enabled. Same shape as H.264
 		// above — no profile-specific config validation in the libva
 		// backend; validation happens at vaCreateContext / control
 		// submission time.
 		break;
 	case VAProfileHEVCMain:
 		// fresnel-fourier iter2: HEVC enabled. Same shape as H.264/
 		// MPEG-2 above — no profile-specific config validation in the
 		// libva backend; validation happens at vaCreateContext / control
 		// submission time.
 		break;
 	case VAProfileVP8Version0_3:
 		// fresnel-fourier iter3: VP8 enabled. Same shape as iter1+iter2
 		// above — no profile-specific config validation in the libva
 		// backend; validation happens at vaCreateContext / control
 		// submission time.
 		break;
 	case VAProfileVP9Profile0:
 		// fresnel-fourier iter4: VP9 Profile 0 enabled on rkvdec.
 		// Same shape — no profile-specific validation here.
 		break;
 	case VAProfileAV1Profile0:
 		// ampere-av1-enablement: AV1 Profile 0 enabled on vpu981.
 		// Same shape — no profile-specific validation here.
 		break;
 	default:
 		return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 	}
 	/*
 	 * iter38: retarget the active V4L2 device to whichever physical
 	 * decoder (rkvdec or hantro-vpu on RK3399) serves this codec profile.
 	 * Safe no-op when the right device is already active. When a switch
 	 * is needed, output/capture pools and the video_format cache are
 	 * torn down so the next RequestCreateContext rebuilds them on the
 	 * new device.
 	 */
 	(void)request_switch_device_for_profile(driver_data, profile);
 	if (attributes_count > V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES)
 		attributes_count = V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES;
@@ -112,16 +83,6 @@ VAStatus RequestCreateConfig(VADriverContextP context, VAProfile profile,
 	config_object->profile = profile;
 	config_object->entrypoint = entrypoint;
 	/*
 	 * iter5b-β: cache the V4L2 OUTPUT-side FOURCC for this profile so
 	 * context.c::RequestCreateContext can read it without re-running
 	 * the profile→pixelformat mapping. Wires up the previously-dead
 	 * pixelformat field at config.h:46. The switch above already
 	 * rejected unsupported profiles via VA_STATUS_ERROR_UNSUPPORTED_PROFILE,
 	 * so pixelformat_for_profile here returns non-zero for every
 	 * profile that reaches this assignment.
 	 */
 	config_object->pixelformat = pixelformat_for_profile(profile);
 	config_object->attributes[0].type = VAConfigAttribRTFormat;
 	config_object->attributes[0].value = VA_RT_FORMAT_YUV420;
 	config_object->attributes_count = 1;
@@ -153,31 +114,6 @@ VAStatus RequestDestroyConfig(VADriverContextP context, VAConfigID config_id)
 	return VA_STATUS_SUCCESS;
 }
 /*
 * iter38: report whether `fmt` is offered by at least one of the driver's
 * V4L2 device fds -- the active video_fd plus the rkvdec, hantro and
 * vpu981 fds. Fds that are negative (not open) are skipped. Each open fd
 * is probed on VIDEO_OUTPUT first and then on VIDEO_OUTPUT_MPLANE.
 */
 static bool any_fd_supports_output_format(struct request_data *driver_data,
 					  unsigned int fmt)
 {
 	const int candidates[] = {
 		driver_data->video_fd,
 		driver_data->video_fd_rkvdec,
 		driver_data->video_fd_hantro,
 		driver_data->video_fd_vpu981,
 	};
 	const unsigned int total = sizeof(candidates) / sizeof(candidates[0]);
 	unsigned int n;
 	for (n = 0; n < total; n++) {
 		int fd = candidates[n];
 		if (fd < 0)
 			continue;
 		if (v4l2_find_format(fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, fmt) ||
 		    v4l2_find_format(fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, fmt))
 			return true;
 	}
 	return false;
 }
 VAStatus RequestQueryConfigProfiles(VADriverContextP context,
 				    VAProfile *profiles, int *profiles_count)
 {
@@ -185,14 +121,24 @@ VAStatus RequestQueryConfigProfiles(VADriverContextP context,
 	unsigned int index = 0;
 	bool found;
-	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_MPEG2_SLICE);
+	found = v4l2_find_format(driver_data->video_fd,
-	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 2)) {
+				 V4L2_BUF_TYPE_VIDEO_OUTPUT,
 				 V4L2_PIX_FMT_MPEG2_SLICE) ||
 		v4l2_find_format(driver_data->video_fd,
 				 V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE,
 				 V4L2_PIX_FMT_MPEG2_SLICE);
 	if (found && index < (V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES - 2)) {
 		profiles[index++] = VAProfileMPEG2Simple;
 		profiles[index++] = VAProfileMPEG2Main;
 	}
-	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_H264_SLICE);
+	found = v4l2_find_format(driver_data->video_fd,
-	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 5)) {
+				 V4L2_BUF_TYPE_VIDEO_OUTPUT,
 				 V4L2_PIX_FMT_H264_SLICE) ||
 		v4l2_find_format(driver_data->video_fd,
 				 V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE,
 				 V4L2_PIX_FMT_H264_SLICE);
 	if (found && index < (V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES - 5)) {
 		profiles[index++] = VAProfileH264Main;
 		profiles[index++] = VAProfileH264High;
 		profiles[index++] = VAProfileH264ConstrainedBaseline;
@@ -200,29 +146,15 @@ VAStatus RequestQueryConfigProfiles(VADriverContextP context,
 		profiles[index++] = VAProfileH264StereoHigh;
 	}
-	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_HEVC_SLICE);
+	found = v4l2_find_format(driver_data->video_fd,
-	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
+				 V4L2_BUF_TYPE_VIDEO_OUTPUT,
 				 V4L2_PIX_FMT_HEVC_SLICE) ||
 		v4l2_find_format(driver_data->video_fd,
 				 V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE,
 				 V4L2_PIX_FMT_HEVC_SLICE);
 	if (found && index < (V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES - 1))
 		profiles[index++] = VAProfileHEVCMain;
 	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_VP8_FRAME);
 	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
 		profiles[index++] = VAProfileVP8Version0_3;
 	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_VP9_FRAME);
 	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
 		profiles[index++] = VAProfileVP9Profile0;
 	/*
 	 * ampere-av1-enablement: AV1 routes to vpu981 (advertised via the
 	 * new video_fd_vpu981 slot). V4L2_REQUEST_MAX_PROFILES=11 is now
 	 * EXACTLY full with this addition. Future profile additions
 	 * require bumping that constant + verifying libva consumers'
 	 * profiles[] sizing.
 	 */
 	found = any_fd_supports_output_format(driver_data, V4L2_PIX_FMT_AV1_FRAME);
 	if (found && index < (V4L2_REQUEST_MAX_PROFILES - 1))
 		profiles[index++] = VAProfileAV1Profile0;
 	*profiles_count = index;
 	return VA_STATUS_SUCCESS;
@@ -233,6 +165,8 @@ VAStatus RequestQueryConfigEntrypoints(VADriverContextP context,
 				       VAEntrypoint *entrypoints,
 				       int *entrypoints_count)
 {
 	request_log("QueryConfigEntrypoints: profile=%d\n", profile);
 	switch (profile) {
 	case VAProfileMPEG2Simple:
 	case VAProfileMPEG2Main:
@@ -242,9 +176,6 @@ VAStatus RequestQueryConfigEntrypoints(VADriverContextP context,
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
 	case VAProfileHEVCMain:
 	case VAProfileVP8Version0_3:
 	case VAProfileVP9Profile0:
 	case VAProfileAV1Profile0:
 		entrypoints[0] = VAEntrypointVLD;
 		*entrypoints_count = 1;
 		break;
@@ -295,6 +226,9 @@ VAStatus RequestGetConfigAttributes(VADriverContextP context, VAProfile profile,
 {
 	unsigned int i;
 	request_log("GetConfigAttributes: profile=%d entrypoint=%d count=%d\n",
 		    profile, entrypoint, attributes_count);
 	for (i = 0; i < attributes_count; i++) {
 		switch (attributes[i].type) {
 		case VAConfigAttribRTFormat:
@@ -43,7 +43,6 @@ struct object_config {
 	VAEntrypoint entrypoint;
 	VAConfigAttrib attributes[V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES];
 	int attributes_count;
 	unsigned int pixelformat;
 };
 VAStatus RequestCreateConfig(VADriverContextP context, VAProfile profile,
@@ -29,7 +29,6 @@
 #include "request.h"
 #include "surface.h"
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
@@ -40,6 +39,8 @@
 #include <linux/videodev2.h>
 #include <mpeg2-ctrls.h>
 #include <h264-ctrls.h>
 #include <hevc-ctrls.h>
 #include "utils.h"
@@ -54,322 +55,43 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 {
 	struct request_data *driver_data = context->pDriverData;
 	struct object_config *config_object;
 	struct object_surface *surface_object;
 	struct object_context *context_object = NULL;
 	struct video_format *video_format;
-	unsigned int destination_sizes[VIDEO_MAX_PLANES];
+	unsigned int length;
-	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
+	unsigned int offset;
-	unsigned int destination_planes_count;
+	void *source_data = MAP_FAILED;
 	unsigned int format_width, format_height;
 	unsigned int pixelformat;
 	VASurfaceID *ids = NULL;
 	VAContextID id;
 	VAStatus status;
 	unsigned int output_type, capture_type;
-	unsigned int j;
+	unsigned int pixelformat;
-	bool found;
+	unsigned int index_base;
 	unsigned int index;
 	unsigned int i;
 	int rc;
 	/*
 	 * iter5b-β: CreateContext owns the V4L2 OUTPUT-side device-format
 	 * lifecycle (S_FMT, CAPTURE-format probe, cap_pool_init, per-surface
 	 * destination_* fill). Pre-β these lived in CreateSurfaces2 with a
 	 * resolution-change gate; β moves them here because (a) config_id
 	 * is known so the right OUTPUT pixel format can be derived from
 	 * the bound profile, and (b) STREAMON happens at the end of this
 	 * function, so the queue is never streaming when we do S_FMT.
 	 *
 	 * DestroyContext is the only per-session teardown site under β
 	 * (no in-CreateSurfaces2 teardown branch). It STREAMOFFs both
 	 * queues, calls request_pool_destroy + cap_pool_destroy, and
 	 * REQBUFS(0) — leaving the V4L2 device in a clean slate for the
 	 * next CreateContext.
 	 */
 	config_object = CONFIG(driver_data, config_id);
 	if (config_object == NULL) {
 		status = VA_STATUS_ERROR_INVALID_CONFIG;
 		goto error;
 	}
 	pixelformat = config_object->pixelformat;
 	if (pixelformat == 0) {
 		/*
 		 * Defensive: CreateConfig rejects unhandled profiles, so
 		 * pixelformat is always non-zero by the time we get here.
 		 * Belt-and-suspenders.
 		 */
 		status = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 		goto error;
 	}
 	/*
 	 * Probe the CAPTURE-side V4L2 format. video_format is a static
 	 * pointer into video.c's formats[]; it stays valid for the life of
 	 * the driver_data and is cached across CreateContext cycles. The
 	 * probe doesn't require any prior S_FMT — v4l2_find_format
 	 * enumerates the device's supported formats directly.
 	 */
 	if (!driver_data->video_format) {
 		video_format = NULL;
 		found = v4l2_find_format(driver_data->video_fd,
 					 V4L2_BUF_TYPE_VIDEO_CAPTURE,
 					 V4L2_PIX_FMT_SUNXI_TILED_NV12);
 		if (found)
 			video_format = video_format_find(V4L2_PIX_FMT_SUNXI_TILED_NV12);
 		found = v4l2_find_format(driver_data->video_fd,
 					 V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
 					 V4L2_PIX_FMT_NV12);
 		if (found)
 			video_format = video_format_find(V4L2_PIX_FMT_NV12);
 		if (video_format == NULL) {
 			status = VA_STATUS_ERROR_OPERATION_FAILED;
 			goto error;
 		}
 		driver_data->video_format = video_format;
 	}
 	video_format = driver_data->video_format;
 	if (video_format == NULL) {
 		request_log("CreateContext: video_format is NULL\n");
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	}
 	output_type = v4l2_type_video_output(video_format->v4l2_mplane);
 	capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);
 	request_log("CreateContext: ENTER config=%u mplane=%d output_type=%u capture_type=%u surfaces_count=%d flags=0x%x\n",
 		    config_id, video_format->v4l2_mplane, output_type,
 		    capture_type, surfaces_count, flags);
-	/*
+	config_object = CONFIG(driver_data, config_id);
-	 * Commit the OUTPUT pixel format. picture_width/picture_height
+	if (config_object == NULL) {
-	 * are the kernel-facing dimensions for this decode session. With
+		request_log("CreateContext: config_object NULL for config_id %u\n",
-	 * profile-derived pixelformat, hantro's CAPTURE-format derivation
+			    config_id);
-	 * dispatches to the right codec_mode (pre-β hardcoded H264_SLICE
+		status = VA_STATUS_ERROR_INVALID_CONFIG;
 	 * meant hantro silently substituted MPEG2_DECODER for HEVC/VP8/VP9
 	 * → all-zero CAPTURE; rkvdec silently dropped HEVC/VP9 → same
 	 * outcome).
 	 */
 	rc = v4l2_set_format(driver_data->video_fd, output_type, pixelformat,
 			     picture_width, picture_height);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
-
+	request_log("CreateContext: profile=%d width=%d height=%d\n",
-	/*
+		    config_object->profile, picture_width, picture_height);
 	 * iter15 α-19: explicit S_FMT on CAPTURE for rkvdec.
 	 *
 	 * Original iter5b-β comment: "Do NOT VIDIOC_S_FMT on CAPTURE — hantro
 	 * reads the SPS from OUTPUT to set CAPTURE shape internally."
 	 *
 	 * Empirical finding at iter15 Phase 3 (2026-05-14): kdirect (ffmpeg-
 	 * v4l2request) does S_FMT on CAPTURE side after S_FMT(OUTPUT),
 	 * then CREATE_BUFS for CAPTURE. libva's old G_FMT-only path skipped
 	 * the S_FMT call. For hantro this was deliberate (works); for rkvdec
 	 * (HEVC + H.264 + VP9 on RK3399) the absence of explicit S_FMT puts
 	 * the driver into a state where it does NOT commit the chosen NV12
 	 * pixel format properly — and the resulting decode silently writes
 	 * garbage or zero for HEVC + H.264 (Bug 4 + Bug 5).
 	 *
 	 * Per [[feedback-per-driver-kludge-gating]]: this driver-specific
 	 * difference should be gated on driver_kind. For now use a single
 	 * always-on S_FMT call as the safe move: kdirect proves S_FMT
 	 * CAPTURE works on both hantro AND rkvdec (it's the reference path).
 	 * The iter5b-β comment is preserved-but-amended below.
 	 *
 	 * Sequence: S_FMT OUTPUT (above) → S_FMT CAPTURE (this) → G_FMT
 	 * CAPTURE (sanity read-back, matches what S_FMT committed).
 	 */
 	{
 		unsigned int capture_pixelformat = V4L2_PIX_FMT_NV12;
 		rc = v4l2_set_format(driver_data->video_fd, capture_type,
 				     capture_pixelformat, picture_width,
 				     picture_height);
 		if (rc < 0) {
 			/* Non-fatal: if the kernel rejects S_FMT CAPTURE (some
 			 * older hantro variants), fall through to G_FMT. */
 			request_log("iter15 α-19: S_FMT CAPTURE failed (continuing): %s\n",
 				    strerror(errno));
 		}
 	}
 	rc = v4l2_get_format(driver_data->video_fd, capture_type, &format_width,
 			     &format_height, destination_bytesperlines,
 			     destination_sizes, NULL);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
 	/*
 	 * iter25 α-25: synthetic-SPS injection to pre-seed ctx->image_fmt
 	 * before CAPTURE buffer allocation.
 	 *
 	 * Root cause (iter17→iter24 kernel-printk chain): rkvdec_s_ctrl for
 	 * HEVC_SPS / H264_SPS calls get_image_fmt() and, if the resolved
 	 * image_fmt differs from the cached ctx->image_fmt (default
 	 * RKVDEC_IMG_FMT_ANY), tries to reset the CAPTURE format. The reset
 	 * returns -EBUSY when vb2_is_busy(CAPTURE_queue) — i.e. any CAPTURE
 	 * buffer is allocated.
 	 *
 	 * libva (iter5b-β CAPTURE pool) pre-allocates 24 CAPTURE buffers
 	 * via cap_pool_init below — before any per-frame S_EXT_CTRLS
 	 * arrives. So the first real HEVC_SPS at decode time fails with
 	 * -EBUSY in try_or_set_cluster, breaks v4l2_ctrl_request_setup's
 	 * outer loop, and leaves ctx->ctrl_hdl[SPS..DECODE_PARAMS] at all-
 	 * zero contents. rkvdec_hevc_run reads zero, hardware sees w=0
 	 * h=0, decoded CAPTURE is all-zero (Bug 5 + Bug 4).
 	 *
 	 * Fix: while CAPTURE is still empty (before cap_pool_init), inject
 	 * a synthetic SPS containing the profile's chroma + bit_depth so
 	 * rkvdec_s_ctrl resolves image_fmt and updates ctx->image_fmt
 	 * before vb2_is_busy can return true. From then on, per-frame
 	 * SPS submissions with matching profile parameters see
 	 * image_fmt_changed=false → skip reset → commit succeeds.
 	 *
 	 * Gated by config->profile: only HEVC and H.264 paths set
 	 * get_image_fmt in their rkvdec coded_fmt_desc->ops; VP9 / MPEG-2 /
 	 * VP8 are unaffected (rkvdec_s_ctrl returns 0 immediately when
 	 * get_image_fmt is NULL, or those codecs are routed to hantro).
 	 *
 	 * Failure is best-effort: if the kernel returns -EBUSY/-EINVAL here
 	 * (e.g. driver doesn't expose the control on this DT path), we fall
 	 * through and may still hit the original bug for that codec — but
 	 * the device-init DECODE_MODE + START_CODE block below ALSO uses
 	 * void-cast best-effort, so this is consistent with prior pattern.
 	 */
 	{
 		switch (config_object->profile) {
 		case VAProfileHEVCMain: {
 			struct v4l2_ctrl_hevc_sps dummy_sps;
 			struct v4l2_ext_control dummy_ctrl;
 			memset(&dummy_sps, 0, sizeof(dummy_sps));
 			dummy_sps.chroma_format_idc = 1; /* 4:2:0 */
 			dummy_sps.bit_depth_luma_minus8 = 0; /* 8-bit */
 			dummy_sps.bit_depth_chroma_minus8 = 0;
 			dummy_sps.pic_width_in_luma_samples = picture_width;
 			dummy_sps.pic_height_in_luma_samples = picture_height;
 			dummy_ctrl.id = V4L2_CID_STATELESS_HEVC_SPS;
 			dummy_ctrl.ptr = &dummy_sps;
 			dummy_ctrl.size = sizeof(dummy_sps);
 			(void)v4l2_set_controls(driver_data->video_fd, -1,
 						&dummy_ctrl, 1);
 			break;
 		}
 		case VAProfileH264Main:
 		case VAProfileH264High:
 		case VAProfileH264ConstrainedBaseline:
 		case VAProfileH264MultiviewHigh:
 		case VAProfileH264StereoHigh: {
 			struct v4l2_ctrl_h264_sps dummy_sps;
 			struct v4l2_ext_control dummy_ctrl;
 			memset(&dummy_sps, 0, sizeof(dummy_sps));
 			dummy_sps.chroma_format_idc = 1; /* 4:2:0 */
 			dummy_sps.bit_depth_luma_minus8 = 0;
 			dummy_sps.bit_depth_chroma_minus8 = 0;
 			dummy_sps.pic_width_in_mbs_minus1 =
 				(picture_width + 15) / 16 - 1;
 			dummy_sps.pic_height_in_map_units_minus1 =
 				(picture_height + 15) / 16 - 1;
 			dummy_sps.profile_idc = 100; /* High */
 			dummy_sps.level_idc = 41;
 			/*
 			 * FRAME_MBS_ONLY required: rkvdec_h264_validate_sps
 			 * doubles height for non-frame-mbs-only streams to
 			 * compute frame-height from field-height. Without
 			 * this flag, dummy with (height_in_map_units+1)*16
 			 * = 1088 doubles to 2176 > coded_fmt 1080 → -EINVAL.
 			 */
 			dummy_sps.flags = V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY;
 			dummy_ctrl.id = V4L2_CID_STATELESS_H264_SPS;
 			dummy_ctrl.ptr = &dummy_sps;
 			dummy_ctrl.size = sizeof(dummy_sps);
 			(void)v4l2_set_controls(driver_data->video_fd, -1,
 						&dummy_ctrl, 1);
 			break;
 		}
 		default:
 			break;
 		}
 	}
 	destination_planes_count = video_format->planes_count;
 	/*
 	 * Initialize the CAPTURE buffer pool (cap_pool). Pool size =
 	 * max(surfaces_count, MIN_CAP_POOL). The headroom gives LRU
 	 * recycling enough margin to never reuse a buffer within the
 	 * consumer's compositor-hold window for typical playback
 	 * patterns. cap_pool_init does the V4L2 CREATE_BUFS + per-slot
 	 * mmap.
 	 *
 	 * `pool->initialized` is reset to false by cap_pool_destroy in
 	 * DestroyContext; subsequent CreateContext re-inits at the new
 	 * resolution.
 	 */
 	if (!driver_data->capture_pool.initialized) {
 		unsigned int pool_count = surfaces_count > MIN_CAP_POOL ?
 					  surfaces_count : MIN_CAP_POOL;
 		rc = cap_pool_init(&driver_data->capture_pool,
 				   driver_data->video_fd, capture_type,
 				   pool_count, video_format->v4l2_buffers_count);
 		if (rc < 0) {
 			status = VA_STATUS_ERROR_ALLOCATION_FAILED;
 			goto error;
 		}
 	}
 	/*
 	 * Compute format-uniform destination_* values. Same for all
 	 * surfaces of this format; written once per surface, never
 	 * changed by BeginPicture's slot acquisition.
 	 */
 	if (video_format->v4l2_buffers_count == 1) {
 		destination_sizes[0] = destination_bytesperlines[0] *
 				       format_height;
 		for (j = 1; j < destination_planes_count; j++)
 			destination_sizes[j] = destination_sizes[0] / 2;
 	}
 	/*
 	 * iter5b-β Commit D: cache the format-uniform CAPTURE geometry
 	 * in driver_data. CreateSurfaces2 calls AFTER this CreateContext
 	 * (ffmpeg vaapi-copy late-surface-allocation case) will lazy-fill
 	 * via surface_fill_format_uniform(); the surface_heap walk below
 	 * fills surfaces that pre-existed when CreateContext fired.
 	 */
 	driver_data->fmt_planes_count = destination_planes_count;
 	driver_data->fmt_buffers_count = video_format->v4l2_buffers_count;
 	driver_data->fmt_format_height = format_height;
 	for (j = 0; j < destination_planes_count; j++) {
 		driver_data->fmt_sizes[j] = destination_sizes[j];
 		driver_data->fmt_bytesperlines[j] =
 			destination_bytesperlines[j];
 	}
 	driver_data->fmt_valid = true;
 	/*
 	 * Walk the surface_heap (not just surfaces_ids[]) to populate
 	 * destination_* on every existing surface. Pre-Commit-D we walked
 	 * surfaces_ids[], which is empty for ffmpeg vaapi-copy consumers
 	 * that call vaCreateContext with surfaces_count=0 — those surfaces
 	 * exist in the heap but aren't in the param array. Walking the
 	 * heap catches both flows. Late-created surfaces (after this
 	 * CreateContext) fill via surface_fill_format_uniform in
 	 * CreateSurfaces2's per-surface init.
 	 */
 	{
 		struct object_surface *surface_iter;
 		int heap_iter;
 		surface_iter = (struct object_surface *)
 			object_heap_first(&driver_data->surface_heap,
 					  &heap_iter);
 		while (surface_iter != NULL) {
 			surface_fill_format_uniform(driver_data, surface_iter);
 			surface_iter = (struct object_surface *)
 				object_heap_next(&driver_data->surface_heap,
 						 &heap_iter);
 		}
 	}
 	id = object_heap_allocate(&driver_data->context_heap);
 	context_object = CONTEXT(driver_data, id);
@@ -378,153 +100,103 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 		goto error;
 	}
 	memset(&context_object->dpb, 0, sizeof(context_object->dpb));
 	context_object->timestamp_counter = 0;	/* iter9 α-7 */
-	/*
+	switch (config_object->profile) {
-	 * Initialize the OUTPUT (bitstream-input) buffer pool. Sized by
+
-	 * codec pipeline depth (4 H.264 frames in flight is sufficient
+	case VAProfileMPEG2Simple:
-	 * for current hantro/rkvdec scheduling); independent of caller-
+	case VAProfileMPEG2Main:
-	 * supplied surfaces_count. Pool is owned by driver_data so it
+		pixelformat = V4L2_PIX_FMT_MPEG2_SLICE;
-	 * outlives any single context destroy/recreate cycle.
+		break;
-	 *
+
-	 * This replaces the prior per-surface OUTPUT loop, which (a)
+	case VAProfileH264Main:
-	 * created an empty queue when surfaces_count==0 (ffmpeg vaapi-
+	case VAProfileH264High:
-	 * copy path) and (b) only populated surface->source_* for
+	case VAProfileH264ConstrainedBaseline:
-	 * surfaces present at vaCreateContext time, NULL-derefing on
+	case VAProfileH264MultiviewHigh:
-	 * surfaces created later.
+	case VAProfileH264StereoHigh:
-	 */
+		pixelformat = V4L2_PIX_FMT_H264_SLICE;
-	/*
+		break;
-	 * iter6: pool size 16 gives comfortable headroom over typical H.264
+
-	 * MaxDpbFrames (16) for any consumer that pipelines decode requests.
+	case VAProfileHEVCMain:
-	 * Each slot owns its own request_fd (REINIT'd per use).
+		pixelformat = V4L2_PIX_FMT_HEVC_SLICE;
-	 */
+		break;
-	rc = request_pool_init(&driver_data->output_pool,
+
-			       driver_data->video_fd, driver_data->media_fd,
+	default:
-			       output_type, 16);
+		status = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 		goto error;
 	}
 	rc = v4l2_set_format(driver_data->video_fd, output_type, pixelformat,
 			     picture_width, picture_height);
 	if (rc < 0) {
 		request_log("CreateContext: S_FMT(OUTPUT) failed for pixelformat 0x%x\n",
 			    pixelformat);
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
 	rc = v4l2_create_buffers(driver_data->video_fd, output_type,
 				 surfaces_count, &index_base);
 	if (rc < 0) {
 		request_log("CreateContext: CREATE_BUFS failed surfaces=%d\n",
 			    surfaces_count);
 		status = VA_STATUS_ERROR_ALLOCATION_FAILED;
 		goto error;
 	}
 	request_log("CreateContext: S_FMT + CREATE_BUFS ok, index_base=%u\n",
 		    index_base);
 	/*
 	 * The surfaces_ids array was allocated by the caller, and we have
 	 * no guarantee about its lifetime. Copy it so that its lifespan is
 	 * under our control.
 	 */
-	if (surfaces_count > 0) {
+	ids = malloc(surfaces_count * sizeof(VASurfaceID));
-		ids = malloc(surfaces_count * sizeof(VASurfaceID));
+	if (ids == NULL) {
-		if (ids == NULL) {
+		status = VA_STATUS_ERROR_ALLOCATION_FAILED;
 		goto error;
 	}
 	memcpy(ids, surfaces_ids, surfaces_count * sizeof(VASurfaceID));
 	for (i = 0; i < surfaces_count; i++) {
 		index = index_base + i;
 		surface_object = SURFACE(driver_data, surfaces_ids[i]);
 		if (surface_object == NULL) {
 			status = VA_STATUS_ERROR_INVALID_SURFACE;
 			goto error;
 		}
 		rc = v4l2_query_buffer(driver_data->video_fd, output_type,
 				       index, &length, &offset, 1);
 		if (rc < 0) {
 			request_log("CreateContext: QUERYBUF idx=%u failed\n", index);
 			status = VA_STATUS_ERROR_ALLOCATION_FAILED;
 			goto error;
 		}
-		memcpy(ids, surfaces_ids,
+		source_data = mmap(NULL, length, PROT_READ | PROT_WRITE,
-		       surfaces_count * sizeof(VASurfaceID));
+				   MAP_SHARED, driver_data->video_fd, offset);
 		if (source_data == MAP_FAILED) {
 			request_log("CreateContext: mmap len=%u offset=%u failed\n",
 				    length, offset);
 			status = VA_STATUS_ERROR_ALLOCATION_FAILED;
 			goto error;
 		}
 		surface_object->source_index = index;
 		surface_object->source_data = source_data;
 		surface_object->source_size = length;
 	}
-	/*
+	/* Fourier-local: defer STREAMON until first frame is actually queued.
-	 * Stateless H.264 device-wide controls. The kernel V4L2 stateless
+	 * The V4L2 stateless protocol requires OUTPUT format + at least one
-	 * framework requires DECODE_MODE and START_CODE be set on the
+	 * queued slice with SPS/PPS controls attached before STREAMON will
-	 * device fd (request_fd=-1) before VIDIOC_STREAMON; per-request
+	 * succeed on hantro. The bootlin library was written for sunxi-cedrus
-	 * controls (SPS/PPS/etc.) attached to a request_fd come later.
+	 * which used a different protocol with no such ordering constraint.
-	 *
+	 * Just being able to vaCreateContext() without erroring lets us see
-	 * hantro-vpu via rockchip,rk3568-vpu DT compatible (covers RK3568
+	 * what the next stage of the call chain expects. */
-	 * and RK3566 — PineTab2 silicon — since they're close enough)
+	request_log("CreateContext: deferred STREAMON until first QBUF\n");
 	 * accepts only DECODE_MODE_FRAME_BASED.
 	 * START_CODE_ANNEX_B preserves leading 0x00000001 in the slice
 	 * payload that h264.c assembles. Errors here are not fatal: not
 	 * every backing driver supports both controls (e.g. cedrus may
 	 * default to SLICE_BASED without exposing DECODE_MODE).
 	 */
 	{
 		struct v4l2_ext_control dev_ctrls[2] = {
 			{
 				.id = V4L2_CID_STATELESS_H264_DECODE_MODE,
 				.value = V4L2_STATELESS_H264_DECODE_MODE_FRAME_BASED,
 			},
 			{
 				.id = V4L2_CID_STATELESS_H264_START_CODE,
 				.value = V4L2_STATELESS_H264_START_CODE_ANNEX_B,
 			},
 		};
 		(void)v4l2_set_controls(driver_data->video_fd, -1,
 					dev_ctrls, 2);
 	}
 	/*
 	 * iter2: HEVC device-wide controls. Same best-effort pattern as
 	 * H.264 above — separate batched call so a kernel that does not
 	 * advertise HEVC controls (e.g. hantro-vpu-dec on RK3568/RK3399)
 	 * silently fails on this batch without invalidating the H.264
 	 * batch. rkvdec on RK3399 advertises HEVC and accepts FRAME_BASED
 	 * + ANNEX_B (only supported menu values per Phase 0 v4l2_inventory).
 	 */
 	{
 		struct v4l2_ext_control hevc_dev_ctrls[2] = {
 			{
 				.id = V4L2_CID_STATELESS_HEVC_DECODE_MODE,
 				.value = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
 			},
 			{
 				.id = V4L2_CID_STATELESS_HEVC_START_CODE,
 				.value = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
 			},
 		};
 		(void)v4l2_set_controls(driver_data->video_fd, -1,
 					hevc_dev_ctrls, 2);
 	}
 	/*
 	 * Mirror the ANNEX_B start-code mode set on the device above
 	 * into context_object->h264_start_code so picture.c::
 	 * codec_store_buffer prepends 0x00 0x00 0x01 to each slice
 	 * payload it copies into the OUTPUT buffer. Without this, the
 	 * kernel — which we just told to expect ANNEX_B — sees a raw
 	 * NAL stream with no start codes, fails to find slice
 	 * boundaries, and emits a zeroed CAPTURE buffer (visually a
 	 * flat dark-green frame).
 	 *
 	 * iter4 fix: this start-code prepend is ANNEX-B-specific and
 	 * applies to H.264 and HEVC ONLY. MPEG-2, VP8, and VP9 use raw
 	 * frame bitstreams without start codes — prepending 0x00 0x00 0x01
 	 * to a VP9 uncompressed header produces a frame_marker mismatch
 	 * (kernel reads 0x00 instead of 0x10), the rkvdec driver silently
 	 * fails to find a valid frame, and the CAPTURE slot stays at its
 	 * cap_pool init pattern (a dim 0x4c green). Phase 7 verification
 	 * caught this for VP9; iter1+iter3 transitive proof masked it for
 	 * MPEG-2/VP8 because those iters compared payload bytes, not
 	 * decoded pixels.
 	 *
 	 * h264_get_controls() exists for this purpose but is never
 	 * called in the current code path; the planned probe-then-set
 	 * commit will replace this hardcoded assignment with a runtime
 	 * read of the kernel's accepted START_CODE value.
 	 */
 	switch (config_object->profile) {
 	case VAProfileH264Main:
 	case VAProfileH264High:
 	case VAProfileH264ConstrainedBaseline:
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
 	case VAProfileHEVCMain:
 		context_object->h264_start_code = true;
 		break;
 	default:
 		context_object->h264_start_code = false;
 		break;
 	}
 	rc = v4l2_set_stream(driver_data->video_fd, output_type, true);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
 	rc = v4l2_set_stream(driver_data->video_fd, capture_type, true);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
 	context_object->config_id = config_id;
 	context_object->render_surface_id = VA_INVALID_ID;
@@ -536,10 +208,15 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	*context_id = id;
 	request_log("CreateContext: SUCCESS context_id=%u\n", id);
 	status = VA_STATUS_SUCCESS;
 	goto complete;
 error:
 	if (source_data != MAP_FAILED)
 		munmap(source_data, length);
 	if (ids != NULL)
 		free(ids);
@@ -591,51 +268,13 @@ VAStatus RequestDestroyContext(VADriverContextP context, VAContextID context_id)
 	object_heap_free(&driver_data->context_heap,
 			 (struct object_base *)context_object);
 	/*
 	 * iter5b-β: tear down the OUTPUT pool (mmap unmaps) BEFORE
 	 * REQBUFS(0) frees the kernel-side buffers. Pre-β this was done
 	 * only by surface.c's resolution-change branch — which β removed.
 	 * Without this here, the next CreateContext's request_pool_init
 	 * sees pool->initialized=true with stale slot pointers, returns
 	 * 0 without re-CREATE_BUFS, and the next QBUF EINVALs because
 	 * the slots reference buffer indices that no longer exist
 	 * (Phase 5 v2 review CRIT-2).
 	 */
 	if (driver_data->output_pool.initialized)
 		request_pool_destroy(&driver_data->output_pool);
 	rc = v4l2_request_buffers(driver_data->video_fd, output_type, 0);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;
-	/*
+	rc = v4l2_request_buffers(driver_data->video_fd, capture_type, 0);
-	 * Iter2 Fix 3 (still relevant under β): cap_pool owns the
+	if (rc < 0)
-	 * CAPTURE buffers' mmaps + any outstanding our_export_fds. Tear
+		return VA_STATUS_ERROR_OPERATION_FAILED;
 	 * it down (which also issues REQBUFS(0) on CAPTURE), so the next
 	 * CreateContext cycle sees a clean slate.
 	 */
 	cap_pool_destroy(&driver_data->capture_pool, driver_data->video_fd,
 			 capture_type);
 	/*
 	 * iter5b-β: driver_data->video_format is a static-ref pointer
 	 * into video.c's formats[]; it stays valid for the life of the
 	 * driver_data and intentionally survives DestroyContext cycles.
 	 * The next CreateContext's `if (!driver_data->video_format)`
 	 * guard skips the probe — correct, because the device's CAPTURE
 	 * format menu doesn't change.
 	 *
 	 * The pre-β surface_reset_format_cache() call here is removed:
 	 * β doesn't have a last_output_{width,height,pixelformat} cache
 	 * (those fields are deleted). Each CreateContext is a fresh
 	 * S_FMT(OUTPUT) cycle.
 	 *
 	 * Commit D: invalidate the format-uniform cache so a CreateSurfaces2
 	 * call between DestroyContext and the next CreateContext doesn't
 	 * lazy-fill with stale geometry from the now-torn-down session.
 	 * The next CreateContext re-populates the cache.
 	 */
 	driver_data->fmt_valid = false;
 	return VA_STATUS_SUCCESS;
 }
@@ -27,9 +27,6 @@
 #ifndef _CONTEXT_H_
 #define _CONTEXT_H_
 #include <stdbool.h>
 #include <stdint.h>
 #include <va/va_backend.h>
 #include "object_heap.h"
@@ -53,27 +50,6 @@ struct object_context {
 	/* H264 only */
 	struct h264_dpb dpb;
 	bool h264_start_code;
 	/*
 	 * iter9 α-7: monotonic per-context timestamp counter (us). Replaces
 	 * gettimeofday in EndPicture so DPB.reference_ts / OUTPUT QBUF ts
 	 * are small values matching ffmpeg-v4l2request's pattern. Placed
 	 * here (object_context) not driver_data per Phase 5 IMP-1 to avoid
 	 * cross-context collisions.
 	 */
 	uint64_t timestamp_counter;
 	/* fresnel-fourier iter4: VP9 loop-filter delta state, persisted across
 	 * frames per kernel UAPI <linux/v4l2-controls.h>:2578 ("If this syntax
 	 * element is not present in the bitstream, users should pass its last
 	 * value.") and VP9 spec defaults from FFmpeg vp9.c:666-671. Reset on
 	 * keyframe / error-resilient / intra-only via vp9_lf.initialized=false. */
 	struct {
 		int8_t ref_deltas[4];
 		int8_t mode_deltas[2];
 		bool initialized;
 	} vp9_lf;
 };
 VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
@@ -28,18 +28,17 @@
 #include <assert.h>
 #include <limits.h>
 #include <string.h>
 #include <stdio.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <linux/videodev2.h>
 #include <h264-ctrls.h>
 #include "request.h"
 #include "utils.h"
 #include "surface.h"
 #include "v4l2.h"
 #include "h264_slice_header.h"
 enum h264_slice_type {
 	H264_SLICE_P    = 0,
@@ -97,8 +96,7 @@ static struct h264_dpb_entry *dpb_find_entry(struct object_context *context)
 }
 static struct h264_dpb_entry *dpb_lookup(struct object_context *context,
-					 VAPictureH264 *pic, unsigned int *idx,
+					 VAPictureH264 *pic, unsigned int *idx)
 					 unsigned char *fields)
 {
 	unsigned int i;
@@ -112,16 +110,6 @@ static struct h264_dpb_entry *dpb_lookup(struct object_context *context,
 			if (idx)
 				*idx = i;
 			if (fields) {
 				//if (entry->pic.TopFieldOrderCnt < entry->pic.BottomFieldOrderCnt) {
 				//	*fields = V4L2_H264_TOP_FIELD_REF;
 				//} else if (entry->pic.TopFieldOrderCnt > entry->pic.BottomFieldOrderCnt) {
 				//	*fields = V4L2_H264_BOTTOM_FIELD_REF;
 				//} else {
 					*fields = V4L2_H264_FRAME_REF;
 				//}
 			}
 			return entry;
 		}
 	}
@@ -143,7 +131,7 @@ static void dpb_insert(struct object_context *context, VAPictureH264 *pic,
 	if (is_picture_null(pic))
 		return;
-	if (dpb_lookup(context, pic, NULL, NULL))
+	if (dpb_lookup(context, pic, NULL))
 		return;
 	if (!entry)
@@ -178,7 +166,7 @@ static void dpb_update(struct object_context *context,
 		if (is_picture_null(pic))
 			continue;
-		entry = dpb_lookup(context, pic, NULL, NULL);
+		entry = dpb_lookup(context, pic, NULL);
 		if (entry) {
 			entry->age = context->dpb.age;
 			entry->used = true;
@@ -188,61 +176,10 @@ static void dpb_update(struct object_context *context,
 	}
 }
 /*
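 * Strip ffmpeg-vaapi's POC sentinel (historical rationale: as of
 * iter8 α-2 the helper below returns valid POCs unchanged; see the
 * note in its body).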
 *
 * ffmpeg's H264POCContext initialises prev_poc_msb to (1 << 16) =
 * 0x10000 in libavcodec/h264dec.c (lines 301 and 444 of v8.0). After
 * an IDR the idr() helper resets prev_poc_msb to that same sentinel.
 * ff_h264_init_poc (libavcodec/h264_parse.c lines 296-305) then
 * computes pc->poc_msb as prev_poc_msb when the slice header's
 * poc_lsb hasn't wrapped — which is the typical case for normal
 * content. The sentinel leaks into field_poc[] and from there into
 * VAPictureH264.TopFieldOrderCnt / BottomFieldOrderCnt at
 * libavcodec/vaapi_h264.c::fill_vaapi_pic.
 *
 * Working VAAPI backends (intel-iHD, i965 verified empirically on
 * meitner 2026-05-02) tolerate the high word — they either mask it
 * or treat POCs as relative comparisons. V4L2 stateless H.264
 * driver-side consumers (hantro_h264.c::prepare_table feeds the
 * value direct to tbl->poc[]) need the spec value, so we strip the
 * sentinel here at the libva-v4l2-request boundary.
 *
 * Detection by bit-16-set rather than blind subtraction so that a
 * future ffmpeg version that fixes the sentinel leak degrades
 * gracefully. POC values for non-degenerate H.264 content rarely
 * exceed 16 bits; bit 16 set is a strong signal of the sentinel.
 *
 * Empty DPB slots (VA_PICTURE_H264_INVALID) carry POC=0 by
 * libavcodec/vaapi_h264.c::init_vaapi_pic and need no fix-up.
 */
 static inline int32_t h264_strip_ffmpeg_poc_sentinel(int32_t poc, uint32_t flags)
 {
 	/*
 	 * Empty DPB slots (VA_PICTURE_H264_INVALID) always report POC 0.
 	 *
 	 * iter8 α-2: every other POC is handed back untouched, so despite
 	 * the helper's name nothing is stripped here for now. The
 	 * sentinel subtraction existed for hantro, whose prepare_table
 	 * fed the value into tbl->poc[] unmasked. rkvdec instead writes
 	 * the POC to MMIO with writel_relaxed (rkvdec-h264.c:975-978) and
 	 * RKVDEC_CUR_POC is a plain 32-bit passthrough. kdirect
 	 * (ffmpeg-v4l2request) submits the sentinel-encoded value as-is
 	 * and decodes correctly; stripping it in libva caused the 16x32
 	 * partial-fill symptom tracked as Bug 4. Hantro+H.264 is not
 	 * exercised on RK3399 (hantro-dec does not advertise H.264
 	 * there), so reinstating a per-driver strip is left as iter9
 	 * work should the need ever surface.
 	 */
 	return (flags & VA_PICTURE_H264_INVALID) ? 0 : poc;
 }
 static void h264_fill_dpb(struct request_data *data,
 			  struct object_context *context,
 			  VAPictureParameterBufferH264 *VAPicture,
 			  struct v4l2_ctrl_h264_decode_params *decode)
 {
 	const int max_frame_num =
 		1 << (VAPicture->seq_fields.bits.log2_max_frame_num_minus4 + 4);
 	const int cur_frame_num = (int)VAPicture->frame_num;
 	int i;
 	for (i = 0; i < H264_DPB_SIZE; i++) {
@@ -252,26 +189,7 @@ static void h264_fill_dpb(struct request_data *data,
 			SURFACE(data, entry->pic.picture_id);
 		uint64_t timestamp;
-		/*
+		if (!entry->valid)
 		 * Skip entries no longer referenced by the consumer's
 		 * VAPictureParameterBufferH264.ReferenceFrames[]. dpb_update()
 		 * clears `used` for all entries then re-marks only those in the
 		 * current ReferenceFrames list; entries with valid=true but
 		 * used=false are stale (a frame the libva consumer has retired
 		 * from its DPB).
 		 *
 		 * Without this skip, our V4L2 dpb[] grows monotonically until
 		 * H264_DPB_SIZE; by frame_num=10 it carries 7+ entries while
 		 * SPS.max_num_ref_frames may be 4. The kernel reflist builder /
 		 * cluster validator rejects the request with EINVAL once the
 		 * count exceeds the SPS contract — which iter1+iter2+iter3
 		 * surfaced as the "frame-11 EINVAL" carryover. iter4 fix:
 		 * report only currently-used entries to match FFmpeg's
 		 * libavcodec/v4l2_request_h264.c::fill_dpb behaviour (which
 		 * iterates h->short_ref[] / h->long_ref[] — exactly the
 		 * currently-referenced set).
 		 */
 		if (!entry->valid || !entry->used)
 			continue;
 		if (surface) {
@@ -280,47 +198,8 @@ static void h264_fill_dpb(struct request_data *data,
 		}
 		dpb->frame_num = entry->pic.frame_idx;
-
+		dpb->top_field_order_cnt = entry->pic.TopFieldOrderCnt;
-		/*
+		dpb->bottom_field_order_cnt = entry->pic.BottomFieldOrderCnt;
 		 * Per ext-ctrls-codec-stateless.rst, dpb[].pic_num must
 		 * equal the H.264 spec's PicNum (8-28) for short-term refs
 		 * or LongTermPicNum (8-29) for long-term refs.
 		 *
 		 * For frames (not field-coded), PicNum = FrameNumWrap.
 		 * FrameNumWrap = (frame_num > cur_frame_num)
 		 *                ? frame_num - max_frame_num
 		 *                : frame_num
 		 * (per spec section 8.2.4.1, frame_num wraparound).
 		 *
 		 * VAAPI convention (libavcodec/vaapi_h264.c::fill_vaapi_pic
 		 * line 64): VAPictureH264.frame_idx holds long_term_frame_idx
 		 * for long-term refs and frame_num for short-term refs. So
 		 * for long-term entries we copy frame_idx straight through
 		 * as LongTermPicNum.
 		 *
 		 * fourier's previous code set pic_num to picture_id (the
 		 * VAAPI surface id) which is unrelated to H.264 PicNum;
 		 * mediatek's vdec_h264_req_common.c::dst_entry->pic_num is
 		 * one consumer that fails on that. Hantro doesn't read
 		 * pic_num at all (uses reference_ts for ref resolution),
 		 * which is why fourier's wrong value never surfaced on
 		 * PineTab2 (RK3566 via hantro/rk3568-vpu).
 		 */
 		if (entry->pic.flags & VA_PICTURE_H264_LONG_TERM_REFERENCE) {
 			dpb->pic_num = entry->pic.frame_idx;
 		} else {
 			int frame_num = (int)entry->pic.frame_idx;
 			dpb->pic_num = (frame_num > cur_frame_num)
 				? frame_num - max_frame_num
 				: frame_num;
 		}
 		dpb->top_field_order_cnt =
 			h264_strip_ffmpeg_poc_sentinel(entry->pic.TopFieldOrderCnt,
 						       entry->pic.flags);
 		dpb->bottom_field_order_cnt =
 			h264_strip_ffmpeg_poc_sentinel(entry->pic.BottomFieldOrderCnt,
 						       entry->pic.flags);
 		dpb->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID;
@@ -329,27 +208,6 @@ static void h264_fill_dpb(struct request_data *data,
 		if (entry->pic.flags & VA_PICTURE_H264_LONG_TERM_REFERENCE)
 			dpb->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM;
 		/*
 		 * Mark this DPB entry as a frame reference (both top + bottom
 		 * fields). The kernel's v4l2_h264_init_reflist_builder iterates
 		 * dpb[] and skips entries whose `fields` member is zero — they
 		 * count as "no valid field reference for this entry." For
 		 * frame-coded streams (BBB and most desktop H.264) every
 		 * reference is a frame reference; per UAPI doc
 		 * (ext-ctrls-codec-stateless.rst), fields must be set to
 		 * V4L2_H264_FRAME_REF (= TOP|BOTTOM) for frames.
 		 *
 		 * Cross-reference: FFmpeg libavcodec/v4l2_request_h264.c::
 		 * fill_dpb_entry sets entry->fields from pic->reference; for
 		 * frames pic->reference includes V4L2_H264_FRAME_REF. Without
 		 * this, P-slices that need to walk the reference list (the
 		 * first one in BBB is at frame 11) hit "no valid refs" inside
 		 * the kernel's reflist builder and S_EXT_CTRLS rejects the
 		 * whole request with EINVAL (error_idx == count, the kernel's
 		 * "application bug" sentinel).
 		 */
 		dpb->fields = V4L2_H264_FRAME_REF;
 	}
 }
@@ -361,126 +219,13 @@ static void h264_va_picture_to_v4l2(struct request_data *driver_data,
 				    struct v4l2_ctrl_h264_pps *pps,
 				    struct v4l2_ctrl_h264_sps *sps)
 {
-	unsigned char *b;
+	h264_fill_dpb(driver_data, context, decode);
 	unsigned char nal_ref_idc;
 	unsigned char nal_unit_type;
-	/* Extract missing nal_ref_idc and nal_unit_type */
+	/* num_slices is no longer carried by v4l2_ctrl_h264_decode_params; the
-	b = surface->source_data;
+	 * kernel infers slice count from the number of slice_params controls
-	if (context->h264_start_code)
+	 * queued (one per slice via the request API). */
-		b += 3;
+	decode->top_field_order_cnt = VAPicture->CurrPic.TopFieldOrderCnt;
-	nal_ref_idc = (b[0] >> 5) & 0x3;
+	decode->bottom_field_order_cnt = VAPicture->CurrPic.BottomFieldOrderCnt;
 	nal_unit_type = b[0] & 0x1f;
 	/*
 	 * Bit-parse the slice_header() to recover fields VAAPI doesn't
 	 * forward and that hantro G1 hardware reads out of DECODE_PARAMS:
 	 *
 	 *   - dec_ref_pic_marking_bit_size  -> G1_REG_DEC_CTRL5_REFPIC_MK_LEN
 	 *   - idr_pic_id                    -> G1_REG_DEC_CTRL5_IDR_PIC_ID
 	 *   - pic_order_cnt_bit_size        -> G1_REG_DEC_CTRL6_POC_LENGTH
 	 *   - pic_order_cnt_lsb / delta_pic_order_cnt_* (used by hantro
 	 *     reference-list builder for poc_type=0/1 inter prediction)
 	 *
 	 * Without these set correctly, hantro's hardware bitstream parser
 	 * walks past zero bits, lands on garbage, decodes zero pixels —
 	 * the all-zero CAPTURE output observed during 2026-05-04 Phase 0.
 	 *
 	 * Spec: ITU-T H.264 §7.3.3 slice_header. Cross-reference (proven
 	 * working): FFmpeg libavcodec/h264_slice.c populates
 	 * H264SliceContext::ref_pic_marking_bit_size and
 	 * pic_order_cnt_bit_size by the same bit-precise parse.
 	 */
 	{
 		const struct h264_slice_header_context sh_ctx = {
 			.separate_colour_plane_flag =
 				(VAPicture->seq_fields.bits.residual_colour_transform_flag != 0),
 			.log2_max_frame_num_minus4 =
 				VAPicture->seq_fields.bits.log2_max_frame_num_minus4,
 			.frame_mbs_only_flag =
 				(VAPicture->seq_fields.bits.frame_mbs_only_flag != 0),
 			.pic_order_cnt_type =
 				VAPicture->seq_fields.bits.pic_order_cnt_type,
 			.log2_max_pic_order_cnt_lsb_minus4 =
 				VAPicture->seq_fields.bits.log2_max_pic_order_cnt_lsb_minus4,
 			.delta_pic_order_always_zero_flag =
 				(VAPicture->seq_fields.bits.delta_pic_order_always_zero_flag != 0),
 			.bottom_field_pic_order_in_frame_present_flag =
 				(VAPicture->pic_fields.bits.pic_order_present_flag != 0),
 			.redundant_pic_cnt_present_flag =
 				(VAPicture->pic_fields.bits.redundant_pic_cnt_present_flag != 0),
 			.weighted_pred_flag =
 				(VAPicture->pic_fields.bits.weighted_pred_flag != 0),
 			.weighted_bipred_idc =
 				VAPicture->pic_fields.bits.weighted_bipred_idc,
 			.num_ref_idx_l0_default_active_minus1 =
 				surface->params.h264.slice.num_ref_idx_l0_active_minus1,
 			.num_ref_idx_l1_default_active_minus1 =
 				surface->params.h264.slice.num_ref_idx_l1_active_minus1,
 			.chroma_format_idc =
 				VAPicture->seq_fields.bits.chroma_format_idc,
 			.bit_depth_luma_minus8 =
 				VAPicture->bit_depth_luma_minus8,
 			.bit_depth_chroma_minus8 =
 				VAPicture->bit_depth_chroma_minus8,
 			.nal_unit_type = nal_unit_type,
 			.nal_ref_idc   = nal_ref_idc,
 		};
 		struct h264_slice_header_info sh = { 0 };
 		unsigned char *nal_payload = b + 1; /* past NAL header byte */
 		size_t nal_payload_len = surface->slices_size -
 			(size_t)((nal_payload) - (unsigned char *)surface->source_data);
 		int sh_rc = h264_parse_slice_header(nal_payload, nal_payload_len,
 						    &sh_ctx, &sh);
 		if (sh_rc == 0) {
 			decode->idr_pic_id		= sh.idr_pic_id;
 			decode->pic_order_cnt_lsb	= sh.pic_order_cnt_lsb;
 			decode->delta_pic_order_cnt_bottom = sh.delta_pic_order_cnt_bottom;
 			decode->delta_pic_order_cnt0	= sh.delta_pic_order_cnt0;
 			decode->delta_pic_order_cnt1	= sh.delta_pic_order_cnt1;
 			decode->pic_order_cnt_bit_size	= sh.pic_order_cnt_bit_size;
 			decode->dec_ref_pic_marking_bit_size = sh.dec_ref_pic_marking_bit_size;
 		} else {
 			request_log("slice_header parse FAILED rc=%d "
 				    "(payload_len=%zu) — DECODE_PARAMS bit_size "
 				    "fields left zero, hantro will likely produce zeros\n",
 				    sh_rc, nal_payload_len);
 		}
 	}
 	h264_fill_dpb(driver_data, context, VAPicture, decode);
 	/*
 	 * Populate every V4L2_CID_STATELESS_H264_DECODE_PARAMS field
 	 * we can derive from VAAPI's pre-parsed VAPictureParameterBuffer
 	 * + bitstream byte. Cross-reference: GStreamer
 	 * gstv4l2codech264dec.c::gst_v4l2_codec_h264_dec_fill_decoder_params
 	 * (lines 632-678).
 	 *
 	 * Fields not derivable from VAAPI (idr_pic_id, pic_order_cnt_lsb,
 	 * delta_pic_order_cnt_*, dec_ref_pic_marking_bit_size,
 	 * pic_order_cnt_bit_size, slice_group_change_cycle) require a
 	 * full slice_header() bit-level parse, which libva-v4l2-request
 	 * does not currently do. They are left at zero-init and the
 	 * kernel-side hantro-vpu may compute them itself when scanning
 	 * the OUTPUT bitstream — a hypothesis verified empirically by
 	 * running this patch and inspecting the CAPTURE buffer.
 	 */
 	decode->nal_ref_idc = nal_ref_idc;
 	decode->frame_num = VAPicture->frame_num;
 	decode->top_field_order_cnt =
 		h264_strip_ffmpeg_poc_sentinel(VAPicture->CurrPic.TopFieldOrderCnt,
 					       VAPicture->CurrPic.flags);
 	decode->bottom_field_order_cnt =
 		h264_strip_ffmpeg_poc_sentinel(VAPicture->CurrPic.BottomFieldOrderCnt,
 					       VAPicture->CurrPic.flags);
 	if (nal_unit_type == 5)
 		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC;
 	if (VAPicture->pic_fields.bits.field_pic_flag)
 		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC;
 	if (VAPicture->CurrPic.flags & VA_PICTURE_H264_BOTTOM_FIELD)
 		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD;
 	pps->weighted_bipred_idc =
 		VAPicture->pic_fields.bits.weighted_bipred_idc;
@@ -513,7 +258,6 @@ static void h264_va_picture_to_v4l2(struct request_data *driver_data,
 	if (VAPicture->pic_fields.bits.redundant_pic_cnt_present_flag)
 		pps->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT;
 	sps->max_num_ref_frames = VAPicture->num_ref_frames;
 	sps->chroma_format_idc = VAPicture->seq_fields.bits.chroma_format_idc;
 	sps->bit_depth_luma_minus8 = VAPicture->bit_depth_luma_minus8;
 	sps->bit_depth_chroma_minus8 = VAPicture->bit_depth_chroma_minus8;
@@ -560,32 +304,6 @@ static void h264_va_matrix_to_v4l2(struct request_data *driver_data,
 	       sizeof(v4l2_matrix->scaling_list_8x8[3]));
 }
 /*
 * H.264 spec default scaling matrices: Flat_4x4_16 and Flat_8x8_16
 * (every entry = 16). When sps_scaling_matrix_present_flag and
 * pps_scaling_matrix_present_flag are both false, the bitstream
 * carries no explicit scaling lists and the decoder uses these
 * flat defaults — matching ITU-T H.264 (08/2024) §7.4.2.1.1.1
 * (sequence scaling) and §7.4.2.2 (picture scaling).
 *
 * Why we always provide the matrix: hantro G1's set_params reads
 * pps->flags & V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT to drive
 * the G1_REG_DEC_CTRL2_TYPE1_QUANT_E hardware bit. FFmpeg's
 * v4l2_request_h264.c always submits the SCALING_MATRIX control
 * with the spec default when the bitstream omits explicit lists,
 * and always sets the SCALING_MATRIX_PRESENT flag (commit
 * comment: "FFmpeg always provide a scaling matrix"). We mirror
 * that so the kernel sees a consistent control set across drivers.
 */
 static void h264_default_flat_scaling_matrix(
 	struct v4l2_ctrl_h264_scaling_matrix *v4l2_matrix)
 {
 	/* Flat defaults: every coefficient of every list is 16. */
 	const int flat_coefficient = 16;
 	const size_t bytes_4x4 = sizeof(v4l2_matrix->scaling_list_4x4);
 	const size_t bytes_8x8 = sizeof(v4l2_matrix->scaling_list_8x8);
 	/* The two fills are independent; each covers its whole array. */
 	memset(v4l2_matrix->scaling_list_8x8, flat_coefficient, bytes_8x8);
 	memset(v4l2_matrix->scaling_list_4x4, flat_coefficient, bytes_4x4);
 }
 static void h264_copy_pred_table(struct v4l2_h264_weight_factors *factors,
 				 unsigned int num_refs,
 				 int16_t luma_weight[32],
@@ -613,11 +331,12 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 				  VASliceParameterBufferH264 *VASlice,
 				  VAPictureParameterBufferH264 *VAPicture,
 				  struct v4l2_ctrl_h264_slice_params *slice,
-				  struct v4l2_ctrl_h264_pred_weights *weights)
+				  struct v4l2_ctrl_h264_pred_weights *pred_weights)
 {
 	/* Slice byte size is now communicated via the OUTPUT buffer's
 	 * bytesused at QBUF time; the v4l2_ctrl_h264_slice_params struct
 	 * was slimmed during upstreaming and no longer carries it. */
 	slice->header_bit_size = VASlice->slice_data_bit_offset;
 	//if (context->h264_start_code)	
 	//	slice->header_bit_size += 3 * 8;
 	slice->first_mb_in_slice = VASlice->first_mb_in_slice;
 	slice->slice_type = VASlice->slice_type;
 	slice->cabac_init_idc = VASlice->cabac_init_idc;
@@ -638,14 +357,13 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 			VAPictureH264 *pic = &VASlice->RefPicList0[i];
 			struct h264_dpb_entry *entry;
 			unsigned int idx;
 			unsigned char fields;
-			entry = dpb_lookup(context, pic, &idx, &fields);
+			entry = dpb_lookup(context, pic, &idx);
 			if (!entry)
 				continue;
 			slice->ref_pic_list0[i].index = idx;
-			slice->ref_pic_list0[i].fields = fields;
+			slice->ref_pic_list0[i].fields = V4L2_H264_FRAME_REF;
 		}
 	}
@@ -659,28 +377,29 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 			VAPictureH264 *pic = &VASlice->RefPicList1[i];
 			struct h264_dpb_entry *entry;
 			unsigned int idx;
 			unsigned char fields;
-			entry = dpb_lookup(context, pic, &idx, &fields);
+			entry = dpb_lookup(context, pic, &idx);
 			if (!entry)
 				continue;
 			slice->ref_pic_list1[i].index = idx;
-			slice->ref_pic_list1[i].fields = fields;
+			slice->ref_pic_list1[i].fields = V4L2_H264_FRAME_REF;
 		}
 	}
 	if (VASlice->direct_spatial_mv_pred_flag)
 		slice->flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED;
-	weights->chroma_log2_weight_denom =
+	/* Predicted weight table moved out of slice_params into its own
 	 * V4L2_CID_STATELESS_H264_PRED_WEIGHTS control during upstreaming. */
 	pred_weights->chroma_log2_weight_denom =
 		VASlice->chroma_log2_weight_denom;
-	weights->luma_log2_weight_denom =
+	pred_weights->luma_log2_weight_denom =
 		VASlice->luma_log2_weight_denom;
 	if (((VASlice->slice_type % 5) == H264_SLICE_P) ||
 	    ((VASlice->slice_type % 5) == H264_SLICE_B))
-		h264_copy_pred_table(&weights->weight_factors[0],
+		h264_copy_pred_table(&pred_weights->weight_factors[0],
 				     slice->num_ref_idx_l0_active_minus1 + 1,
 				     VASlice->luma_weight_l0,
 				     VASlice->luma_offset_l0,
@@ -688,7 +407,7 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 				     VASlice->chroma_offset_l0);
 	if ((VASlice->slice_type % 5) == H264_SLICE_B)
-		h264_copy_pred_table(&weights->weight_factors[1],
+		h264_copy_pred_table(&pred_weights->weight_factors[1],
 				     slice->num_ref_idx_l1_active_minus1 + 1,
 				     VASlice->luma_weight_l1,
 				     VASlice->luma_offset_l1,
@@ -696,130 +415,21 @@ static void h264_va_slice_to_v4l2(struct request_data *driver_data,
 				     VASlice->chroma_offset_l1);
 }
 /*
 * Read back the device's H.264 decode-mode and start-code controls and
 * record in context->h264_start_code whether Annex B start codes are in
 * use. Both the slice-based and the frame-based decode mode are accepted;
 * any other value of either control is logged and rejected.
 *
 * Returns VA_STATUS_SUCCESS, or VA_STATUS_ERROR_OPERATION_FAILED when the
 * control query fails or reports an unsupported value.
 */
 int h264_get_controls(struct request_data *driver_data,
 		      struct object_context *context)
 {
 	enum { CTRL_DECODE_MODE, CTRL_START_CODE, CTRL_COUNT };
 	struct v4l2_ext_control controls[CTRL_COUNT] = {
 		[CTRL_DECODE_MODE] = {
 			.id = V4L2_CID_STATELESS_H264_DECODE_MODE,
 		},
 		[CTRL_START_CODE] = {
 			.id = V4L2_CID_STATELESS_H264_START_CODE,
 		},
 	};
 	/* Query outside of any request (request fd of -1). */
 	if (v4l2_get_controls(driver_data->video_fd, -1, controls,
 			      CTRL_COUNT) < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	const int decode_mode = controls[CTRL_DECODE_MODE].value;
 	const int start_code = controls[CTRL_START_CODE].value;
 	/* The decode mode is only validated here, not stored. */
 	if (decode_mode != V4L2_STATELESS_H264_DECODE_MODE_SLICE_BASED &&
 	    decode_mode != V4L2_STATELESS_H264_DECODE_MODE_FRAME_BASED) {
 		request_log("Unsupported decode mode\n");
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	}
 	if (start_code != V4L2_STATELESS_H264_START_CODE_NONE &&
 	    start_code != V4L2_STATELESS_H264_START_CODE_ANNEX_B) {
 		request_log("Unsupported start code\n");
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	}
 	/* Only written once both controls have been validated. */
 	context->h264_start_code =
 		(start_code == V4L2_STATELESS_H264_START_CODE_ANNEX_B);
 	return VA_STATUS_SUCCESS;
 }
 /*
 * Translate a VA-API H.264 profile into the numeric value stored in
 * sps.profile_idc. Profiles without an entry here yield 0.
 */
 static inline __u8 h264_profile_to_idc(VAProfile profile)
 {
 	__u8 idc = 0;
 	switch (profile) {
 	case VAProfileH264ConstrainedBaseline:
 		idc = 66;
 		break;
 	case VAProfileH264Main:
 		idc = 77;
 		break;
 	case VAProfileH264High:
 		idc = 100;
 		break;
 	case VAProfileH264MultiviewHigh:
 		idc = 118;
 		break;
 	case VAProfileH264StereoHigh:
 		idc = 128;
 		break;
 	default:
 		/* Unmapped profile: keep the 0 fallback. */
 		break;
 	}
 	return idc;
 }
 /*
 * Derive sps.level_idc from the encoded frame size in macroblocks per
 * H.264 Annex A.3 (Table A-1) MaxFS thresholds. Each level's MaxFS is
 * the maximum encoded frame size in MBs the level supports; we pick
 * the smallest level whose MaxFS contains the actual frame size
 * (except the 8192-MB tier, which reports Level 4.1 rather than 4.0).
 *
 * Level decoding for the V4L2 control: level_idc = level * 10
 *   Level 1.0 → 10, Level 4.1 → 41, Level 5.1 → 51, Level 6.0 → 60.
 *
 * VAAPI does not expose the bitstream's actual level_idc on the
 * decode side (VAPictureParameterBufferH264 has no such field) — see
 * va.h. The H.264 SPS NAL is parsed client-side by ffmpeg-vaapi /
 * mpv and only slice data is forwarded in VASliceDataBuffer, so a
 * SPS-NAL byte parser is not viable at this layer.
 *
 * Without framerate we cannot also check MaxMBPS / MaxBR / MaxCPB.
 * That gap is acceptable in practice: consumers that push
 * temporally-dense streams (high MBPS) almost always also push
 * spatially-large frames (high MaxFS), so frame-size-based level
 * selection over-allocates on the temporal axis but never
 * under-allocates a level the consumer relies on for correct
 * decode-resource sizing.
 *
 * Picks for typical content:
 *   1080p (8160 MBs) → Level 4.1 (level_idc = 41)
 *   4K   (32400 MBs) → Level 5.1 (level_idc = 51)
 *   8K  (138240 MBs) → Level 6.0 (level_idc = 60)
 *
 * Replaces the hardcoded level_idc=51 from patch 0013.
 */
 static inline __u8 h264_derive_level_idc(unsigned int width_in_mbs,
 					 unsigned int height_in_mbs)
 {
 	/*
 	 * Widen before multiplying: the caller passes "minus1 + 1" values
 	 * that can each reach 65536, and 65536 * 65536 wraps to 0 in a
 	 * 32-bit unsigned int, which would be reported as Level 1.0
 	 * instead of falling through to the ceiling below.
 	 */
 	const unsigned long long frame_size_mbs =
 		(unsigned long long)width_in_mbs * height_in_mbs;
 	/* Thresholds are MaxFS values; the first tier that fits wins. */
 	if (frame_size_mbs <= 99)     return 10;  /* Level 1.0 */
 	if (frame_size_mbs <= 396)    return 11;  /* Level 1.1 - 2.0 */
 	if (frame_size_mbs <= 792)    return 21;  /* Level 2.1 */
 	if (frame_size_mbs <= 1620)   return 22;  /* Level 2.2 - 3.0 */
 	if (frame_size_mbs <= 3600)   return 31;  /* Level 3.1 */
 	if (frame_size_mbs <= 5120)   return 32;  /* Level 3.2 */
 	if (frame_size_mbs <= 8192)   return 41;  /* Level 4.0 - 4.1 */
 	if (frame_size_mbs <= 8704)   return 42;  /* Level 4.2 */
 	if (frame_size_mbs <= 22080)  return 50;  /* Level 5.0 */
 	if (frame_size_mbs <= 36864)  return 51;  /* Level 5.1 - 5.2 */
 	if (frame_size_mbs <= 139264) return 60;  /* Level 6.0 - 6.2 */
 	return 62;                                /* > Level 6 ceiling */
 }
 int h264_set_controls(struct request_data *driver_data,
 		      struct object_context *context,
 		      VAProfile profile,
 		      struct object_surface *surface)
 {
 	struct v4l2_ctrl_h264_scaling_matrix matrix = { 0 };
 	struct v4l2_ctrl_h264_decode_params decode = { 0 };
 	struct v4l2_ctrl_h264_slice_params slice = { 0 };
-	struct v4l2_ctrl_h264_pred_weights weights = { 0 };
+	struct v4l2_ctrl_h264_pred_weights pred_weights = { 0 };
 	struct v4l2_ctrl_h264_pps pps = { 0 };
 	struct v4l2_ctrl_h264_sps sps = { 0 };
 	struct h264_dpb_entry *output;
 	int rc;
 	output = dpb_lookup(context, &surface->params.h264.picture.CurrPic,
-			    NULL, NULL);
+			    NULL);
 	if (!output)
 		output = dpb_find_entry(context);
@@ -830,171 +440,44 @@ int h264_set_controls(struct request_data *driver_data,
 	h264_va_picture_to_v4l2(driver_data, context, surface,
 				&surface->params.h264.picture,
 				&decode, &pps, &sps);
-
+	h264_va_matrix_to_v4l2(driver_data, context,
-	/*
+			       &surface->params.h264.matrix, &matrix);
 	 * Populate the scaling matrix unconditionally: from VAAPI's
 	 * VAIQMatrixBufferH264 when the consumer sent one this frame
 	 * (matrix_set), otherwise from the H.264 spec flat defaults.
 	 * Submitted to the kernel as V4L2_CID_STATELESS_H264_SCALING_MATRIX
 	 * for every request — required for FFmpeg/hantro contract parity
 	 * (see h264_default_flat_scaling_matrix() docblock).
 	 */
 	if (surface->params.h264.matrix_set)
 		h264_va_matrix_to_v4l2(driver_data, context,
 				       &surface->params.h264.matrix, &matrix);
 	else
 		h264_default_flat_scaling_matrix(&matrix);
 	h264_va_slice_to_v4l2(driver_data, context,
 			      &surface->params.h264.slice,
-			      &surface->params.h264.picture, &slice, &weights);
+			      &surface->params.h264.picture, &slice,
 			      &pred_weights);
-	/*
+	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-	 * Mirror SCALING_MATRIX_PRESENT in PPS flags. Hantro G1 set_params
+			      V4L2_CID_STATELESS_H264_DECODE_PARAMS, &decode,
-	 * gates its G1_REG_DEC_CTRL2_TYPE1_QUANT_E register bit on this;
+			      sizeof(decode));
-	 * FFmpeg sets it unconditionally with the comment "FFmpeg always
+	if (rc < 0)
-	 * provide a scaling matrix." We submit the matrix always (above),
+		return VA_STATUS_ERROR_OPERATION_FAILED;
 	 * so the flag must be set always to match.
 	 */
 	pps.flags |= V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT;
-	/*
+	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-	 * Populate pps->num_ref_idx_l0/l1_default_active_minus1. Hantro G1
+			      V4L2_CID_STATELESS_H264_SLICE_PARAMS, &slice,
-	 * writes both into G1_REG_DEC_CTRL6_REFIDX0_ACTIVE / REFIDX1_ACTIVE
+			      sizeof(slice));
-	 * MMIO registers (via "(field) + 1", so an uninitialized 0 here
+	if (rc < 0)
-	 * would advertise "1 active reference per list" to hardware, wrong
+		return VA_STATUS_ERROR_OPERATION_FAILED;
 	 * for I/IDR frames with 0 refs and wrong for B frames with >1).
 	 *
 	 * VAAPI's VAPictureParameterBufferH264 does not carry the parsed
 	 * PPS num_ref_idx_l*_default_active_minus1 fields — those are in
 	 * the bitstream's PPS NAL which VAAPI consumers parse client-side
 	 * but don't forward. The closest available source is VASlice's
 	 * num_ref_idx_l*_active_minus1, which is the per-slice override
 	 * defaulting to the PPS value (H.264 §7.4.3 num_ref_idx_active_
 	 * override_flag). For most streams these values match; mismatch
 	 * only on streams with explicit per-slice overrides.
 	 *
 	 * For IDR frames (no references), the values are not used by
 	 * hantro's reference list builder, so a wrong value here is
 	 * harmless. For inter frames it matters and slice-derived is
 	 * the best we can do without a full PPS-NAL parser.
 	 */
 	pps.num_ref_idx_l0_default_active_minus1 =
 		surface->params.h264.slice.num_ref_idx_l0_active_minus1;
 	pps.num_ref_idx_l1_default_active_minus1 =
 		surface->params.h264.slice.num_ref_idx_l1_active_minus1;
-	/*
+	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-	 * Derive PFRAME / BFRAME flags in v4l2_ctrl_h264_decode_params.flags
+			      V4L2_CID_STATELESS_H264_PRED_WEIGHTS,
-	 * from VASliceParameterBufferH264.slice_type. VAAPI's slice_type
+			      &pred_weights, sizeof(pred_weights));
-	 * matches the H.264 spec slice_type semantic: 0=P, 1=B, 2=I, 3=SP,
+	if (rc < 0)
-	 * 4=SI; values 5..9 mean "all slices in the picture have this
+		return VA_STATUS_ERROR_OPERATION_FAILED;
 	 * slice_type" (mod 5 yields the underlying type). VAAPI consumers
 	 * (ffmpeg, mpv) populate this for every slice; in FRAME_BASED mode
 	 * we only see the most-recent slice's params, but slice_type is
 	 * uniform across a single coded picture for our purposes.
 	 *
 	 * Kernel consumers that read these flags: tegra-vde
 	 * (drivers/media/platform/nvidia/tegra-vde/h264.c lines 783-799 of
 	 * 6.19.x) selects the inter-frame decode kernel. Hantro / rkvdec /
 	 * cedrus / mediatek / qcom-iris-stateless do not consume them.
 	 * Setting them keeps the libva-v4l2-request fork upstreamable
 	 * across drivers without affecting hantro behaviour.
 	 *
 	 * Cross-reference: ext-ctrls-codec-stateless.rst Decode Parameters
 	 * Flags — V4L2_H264_DECODE_PARAM_FLAG_PFRAME / _BFRAME.
 	 */
 	switch (surface->params.h264.slice.slice_type % 5) {
 	case H264_SLICE_P:
 		decode.flags |= V4L2_H264_DECODE_PARAM_FLAG_PFRAME;
 		break;
 	case H264_SLICE_B:
 		decode.flags |= V4L2_H264_DECODE_PARAM_FLAG_BFRAME;
 		break;
 	default:
 		/* I / SP / SI: no extra flag. */
 		break;
 	}
-	sps.profile_idc = h264_profile_to_idc(profile);
+	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
 			      V4L2_CID_STATELESS_H264_PPS, &pps, sizeof(pps));
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;
-	/*
+	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-	 * Derive level_idc from encoded frame size per H.264 Annex A.3.
+			      V4L2_CID_STATELESS_H264_SPS, &sps, sizeof(sps));
-	 * VAAPI doesn't expose level_idc on the decode side (see
+	if (rc < 0)
-	 * h264_derive_level_idc()'s docblock for the rationale); we pick
+		return VA_STATUS_ERROR_OPERATION_FAILED;
 	 * the smallest level whose MaxFS contains the picture dimensions.
 	 * Replaces patch 0013's intermediate hardcode of 51.
 	 */
 	sps.level_idc = h264_derive_level_idc(
 		(unsigned int)surface->params.h264.picture.picture_width_in_mbs_minus1 + 1u,
 		(unsigned int)surface->params.h264.picture.picture_height_in_mbs_minus1 + 1u);
-	/*
+	rc = v4l2_set_control(driver_data->video_fd, surface->request_fd,
-	 * Build the per-request control list incrementally:
+			      V4L2_CID_STATELESS_H264_SCALING_MATRIX, &matrix,
-	 *   - SPS, PPS, DECODE_PARAMS, SCALING_MATRIX: always required.
+			      sizeof(matrix));
 	 *     Hantro G1 reads the SCALING_MATRIX_PRESENT flag from PPS to
 	 *     gate hardware register G1_REG_DEC_CTRL2_TYPE1_QUANT_E and
 	 *     reads the matrix entries directly into hardware tables when
 	 *     decoding. FFmpeg always submits the matrix (with spec-default
 	 *     flat values when no explicit lists are in the bitstream); we
 	 *     match that — see h264_default_flat_scaling_matrix() docblock.
 	 *     Earlier patch 0012 made SCALING_MATRIX submission conditional
 	 *     on VAAPI's VAIQMatrixBuffer arrival; that was corpus-correct
 	 *     (bbb has no explicit scaling lists) but inconsistent with the
 	 *     hantro contract — replaced 2026-05-04.
 	 *   - SLICE_PARAMS: SLICE_BASED only. Kernel doc
 	 *     ext-ctrls-codec-stateless.rst (FRAME_BASED entry):
 	 *     "When this mode is selected, the
 	 *     V4L2_CID_STATELESS_H264_SLICE_PARAMS control shall not be
 	 *     set." Submitting it under FRAME_BASED triggers cluster-
 	 *     validation EINVAL at error_idx=count.
 	 *   - PRED_WEIGHTS: SLICE_BASED + V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED.
 	 *
 	 * Patch 0002 unconditionally sets the device to FRAME_BASED,
 	 * so slice_based is hardcoded false here. When the planned
 	 * probe-then-set commit lands, this becomes
 	 *     context->decode_mode == V4L2_STATELESS_H264_DECODE_MODE_SLICE_BASED.
 	 */
 	struct v4l2_ext_control controls[6] = { 0 };
 	unsigned int num_controls = 0;
 	const bool slice_based = false; /* TODO: probe via context->decode_mode */
 	controls[num_controls].id = V4L2_CID_STATELESS_H264_SPS;
 	controls[num_controls].p_h264_sps = &sps;
 	controls[num_controls].size = sizeof(sps);
 	num_controls++;
 	controls[num_controls].id = V4L2_CID_STATELESS_H264_PPS;
 	controls[num_controls].p_h264_pps = &pps;
 	controls[num_controls].size = sizeof(pps);
 	num_controls++;
 	controls[num_controls].id = V4L2_CID_STATELESS_H264_DECODE_PARAMS;
 	controls[num_controls].p_h264_decode_params = &decode;
 	controls[num_controls].size = sizeof(decode);
 	num_controls++;
 	controls[num_controls].id = V4L2_CID_STATELESS_H264_SCALING_MATRIX;
 	controls[num_controls].p_h264_scaling_matrix = &matrix;
 	controls[num_controls].size = sizeof(matrix);
 	num_controls++;
 	if (slice_based) {
 		controls[num_controls].id = V4L2_CID_STATELESS_H264_SLICE_PARAMS;
 		controls[num_controls].p_h264_slice_params = &slice;
 		controls[num_controls].size = sizeof(slice);
 		num_controls++;
 		if (V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED(&pps, &slice)) {
 			controls[num_controls].id = V4L2_CID_STATELESS_H264_PRED_WEIGHTS;
 			controls[num_controls].ptr = &weights;
 			controls[num_controls].size = sizeof(weights);
 			num_controls++;
 		}
 	}
 	rc = v4l2_set_controls(driver_data->video_fd, surface->request_fd,
 			       controls, num_controls);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;
@@ -51,11 +51,8 @@ struct h264_dpb {
 	unsigned int age;
 };
 int h264_get_controls(struct request_data *driver_data,
 		      struct object_context *context);
 int h264_set_controls(struct request_data *data,
 		      struct object_context *context,
 		      VAProfile profile,
 		      struct object_surface *surface);
 #endif
@@ -1,361 +0,0 @@
 /*
 * H.264 slice header bit-parser implementation.
 *
 * Implements just enough of ITU-T Rec. H.264 (08/2024) §7.3.3
 * slice_header to populate the V4L2 DECODE_PARAMS bit-position
 * fields (idr_pic_id, pic_order_cnt_lsb, delta_pic_order_cnt_*,
 * pic_order_cnt_bit_size, dec_ref_pic_marking_bit_size).
 *
 * Skips through ref_pic_list_modification() and pred_weight_table()
 * because dec_ref_pic_marking() (whose bit length we need) comes
 * after them. MVC extensions (nal_unit_type 20/21) are not handled
 * — this fork strips MVC alongside HEVC.
 */
 #include "h264_slice_header.h"
 #include <errno.h>
 #include <string.h>
 /*
 * Minimal RBSP bit reader. Reads bits MSB-first. Tracks bit_pos for
 * caller use (e.g. computing the size of a syntax element by
 * pre/post bit_pos delta).
 */
 struct br {
 	const uint8_t	*data;		/* bytes to read from */
 	size_t		length;		/* size of data, in bytes */
 	size_t		bit_pos;	/* absolute index of the next bit */
 	bool		error;		/* set on overrun or malformed code;
 					 * never cleared by the readers */
 };
 /*
 * Read n bits MSB-first and return them right-aligned. Running off the
 * end of the buffer sets b->error and returns 0; bits consumed before
 * the overrun stay consumed.
 */
 static uint32_t br_read_u(struct br *b, unsigned n)
 {
 	uint32_t v = 0;
 	while (n--) {
 		if (b->bit_pos >= b->length * 8) {
 			b->error = true;
 			return 0;
 		}
 		v = (v << 1) | ((b->data[b->bit_pos >> 3] >>
 				 (7 - (b->bit_pos & 7))) & 1u);
 		b->bit_pos++;
 	}
 	return v;
 }
 /*
 * Read one unsigned Exp-Golomb code, ue(v): count leading zero bits up
 * to the first 1 bit, then read that many suffix bits.
 *
 * Returns 0 with b->error set when the buffer ends mid-code or when the
 * zero prefix reaches 32 bits. A 32-bit prefix cannot be represented in
 * the uint32_t result, so it is reported as malformed instead of being
 * silently decoded as 0.
 */
 static uint32_t br_read_ue(struct br *b)
 {
 	unsigned zeros = 0;
 	while (br_read_u(b, 1) == 0) {
 		if (b->error)
 			return 0;
 		if (++zeros >= 32) {
 			b->error = true;
 			return 0;
 		}
 	}
 	if (zeros == 0)
 		return 0;
 	return (1u << zeros) - 1u + br_read_u(b, zeros);
 }
 /*
 * Read one signed Exp-Golomb code, se(v): the unsigned code k maps to
 * +(k + 1) / 2 when k is odd and to -(k / 2) when k is even.
 * Error reporting is inherited from br_read_ue().
 */
 static int32_t br_read_se(struct br *b)
 {
 	uint32_t v = br_read_ue(b);
 	if (v & 1u)
 		return (int32_t)((v + 1u) >> 1);
 	return -(int32_t)(v >> 1);
 }
 /*
 * RBSP unescape: strip emulation prevention bytes (after every
 * 0x00 0x00 in the encoded stream, an extra 0x03 is inserted to
 * prevent byte-aligned start-code emulation; we strip those before
 * bit-parsing). Output buffer must be at least as large as input.
 *
 * Slice headers are short (<100 bits typically), so we unescape
 * only the first H264_SLICE_HEADER_SCAN_BYTES = 64 input bytes.
 * That covers any realistic slice header including
 * dec_ref_pic_marking() and a generous safety margin.
 */
 #define H264_SLICE_HEADER_SCAN_BYTES 64
 /*
 * Copy up to H264_SLICE_HEADER_SCAN_BYTES input bytes from in to out,
 * dropping every 0x03 byte that follows two or more consecutive 0x00
 * bytes. Returns the number of bytes written to out.
 */
 static size_t rbsp_unescape(uint8_t *out, const uint8_t *in,
 			    size_t in_len)
 {
 	const size_t limit = (in_len > H264_SLICE_HEADER_SCAN_BYTES) ?
 			     H264_SLICE_HEADER_SCAN_BYTES : in_len;
 	size_t written = 0;
 	size_t pos = 0;
 	int zeros_seen = 0;
 	while (pos < limit) {
 		const uint8_t byte = in[pos++];
 		if (byte == 0x03 && zeros_seen >= 2) {
 			/* emulation prevention byte: drop it */
 			zeros_seen = 0;
 			continue;
 		}
 		out[written++] = byte;
 		if (byte == 0x00)
 			zeros_seen++;
 		else
 			zeros_seen = 0;
 	}
 	return written;
 }
 /*
 * §7.3.3.1 ref_pic_list_modification() — skip past it without
 * keeping any values. Length depends on slice_type and the loop
 * terminator modification_of_pic_nums_idc == 3.
 */
 static void skip_ref_pic_list_modification(struct br *b,
 					   uint32_t slice_type)
 {
 	uint32_t st_mod5 = slice_type % 5;
 	if (st_mod5 != 2 && st_mod5 != 4) {
 		/* P, SP, B */
 		uint32_t ref_pic_list_modification_flag_l0 = br_read_u(b, 1);
 		if (ref_pic_list_modification_flag_l0) {
 			uint32_t mod_idc;
 			do {
 				mod_idc = br_read_ue(b);
 				if (mod_idc == 0 || mod_idc == 1)
 					br_read_ue(b); /* abs_diff_pic_num_minus1 */
 				else if (mod_idc == 2)
 					br_read_ue(b); /* long_term_pic_num */
 				if (b->error)
 					return;
 			} while (mod_idc != 3);
 		}
 	}
 	if (st_mod5 == 1) {
 		/* B */
 		uint32_t ref_pic_list_modification_flag_l1 = br_read_u(b, 1);
 		if (ref_pic_list_modification_flag_l1) {
 			uint32_t mod_idc;
 			do {
 				mod_idc = br_read_ue(b);
 				if (mod_idc == 0 || mod_idc == 1)
 					br_read_ue(b);
 				else if (mod_idc == 2)
 					br_read_ue(b);
 				if (b->error)
 					return;
 			} while (mod_idc != 3);
 		}
 	}
 }
 /*
 * §7.3.3.2 pred_weight_table() — skip past it. Length depends on
 * the active reference counts and chroma_format_idc.
 */
 static void skip_pred_weight_table(struct br *b,
 				   uint32_t slice_type,
 				   uint8_t chroma_format_idc,
 				   uint8_t bit_depth_luma_minus8,
 				   uint8_t bit_depth_chroma_minus8,
 				   uint32_t num_ref_idx_l0_active_minus1,
 				   uint32_t num_ref_idx_l1_active_minus1)
 {
 	uint32_t i, j;
 	uint32_t st_mod5 = slice_type % 5;
 	(void)bit_depth_luma_minus8;
 	(void)bit_depth_chroma_minus8;
 	br_read_ue(b); /* luma_log2_weight_denom */
 	if (chroma_format_idc != 0)
 		br_read_ue(b); /* chroma_log2_weight_denom */
 	for (i = 0; i <= num_ref_idx_l0_active_minus1 && !b->error; i++) {
 		uint32_t luma_weight_l0_flag = br_read_u(b, 1);
 		if (luma_weight_l0_flag) {
 			br_read_se(b); /* luma_weight_l0 */
 			br_read_se(b); /* luma_offset_l0 */
 		}
 		if (chroma_format_idc != 0) {
 			uint32_t chroma_weight_l0_flag = br_read_u(b, 1);
 			if (chroma_weight_l0_flag) {
 				for (j = 0; j < 2; j++) {
 					br_read_se(b);
 					br_read_se(b);
 				}
 			}
 		}
 	}
 	if (st_mod5 == 1) {
 		for (i = 0; i <= num_ref_idx_l1_active_minus1 && !b->error; i++) {
 			uint32_t luma_weight_l1_flag = br_read_u(b, 1);
 			if (luma_weight_l1_flag) {
 				br_read_se(b);
 				br_read_se(b);
 			}
 			if (chroma_format_idc != 0) {
 				uint32_t chroma_weight_l1_flag = br_read_u(b, 1);
 				if (chroma_weight_l1_flag) {
 					for (j = 0; j < 2; j++) {
 						br_read_se(b);
 						br_read_se(b);
 					}
 				}
 			}
 		}
 	}
 }
 /*
 * Parse the leading part of a slice header, through
 * dec_ref_pic_marking(), and fill *out with the values and bit sizes
 * found there.
 *
 * @nal_payload / @nal_payload_length: bytes to parse; they are passed
 *     through rbsp_unescape() into a local scratch buffer before any
 *     bit is read.
 * @ctx: SPS/PPS/NAL values that drive the parse. It is dereferenced
 *     unconditionally, so it must not be NULL.
 * @out: zeroed on entry, before any validation, then filled
 *     field by field. It must not be NULL.
 *
 * Returns 0 on success, -EINVAL when the payload is NULL, empty, or
 * unescapes to fewer than 2 bytes, and -EIO when the bit reader
 * reported an error.
 */
 int h264_parse_slice_header(const uint8_t *nal_payload,
 			    size_t nal_payload_length,
 			    const struct h264_slice_header_context *ctx,
 			    struct h264_slice_header_info *out)
 {
 	uint8_t unescaped[H264_SLICE_HEADER_SCAN_BYTES];
 	size_t unescaped_len;
 	struct br b = { 0 };
 	bool idr_pic_flag = (ctx->nal_unit_type == 5);
 	uint32_t slice_type;
 	uint32_t num_ref_idx_l0_active_minus1;
 	uint32_t num_ref_idx_l1_active_minus1;
 	size_t pic_order_cnt_start;
 	size_t pic_order_cnt_end;
 	size_t dec_ref_pic_marking_start;
 	size_t dec_ref_pic_marking_end;
 	bool field_pic_flag = false;
 	/* Zero the result first so every early return leaves it cleared. */
 	memset(out, 0, sizeof(*out));
 	if (!nal_payload || nal_payload_length == 0)
 		return -EINVAL;
 	unescaped_len = rbsp_unescape(unescaped, nal_payload,
 				      nal_payload_length);
 	if (unescaped_len < 2)
 		return -EINVAL;
 	b.data = unescaped;
 	b.length = unescaped_len;
 	b.bit_pos = 0;
 	b.error = false;
 	/*
 	 * slice_header() per §7.3.3. Individual reads below are not
 	 * checked one by one; b.error is tested after the skip helpers,
 	 * inside the marking loop, and once more before returning.
 	 */
 	out->first_mb_in_slice		= br_read_ue(&b);
 	slice_type			= br_read_ue(&b);
 	out->slice_type			= slice_type;
 	out->pic_parameter_set_id	= br_read_ue(&b);
 	if (ctx->separate_colour_plane_flag)
 		(void)br_read_u(&b, 2); /* colour_plane_id */
 	out->frame_num = br_read_u(&b, ctx->log2_max_frame_num_minus4 + 4u);
 	if (!ctx->frame_mbs_only_flag) {
 		field_pic_flag = (br_read_u(&b, 1) != 0);
 		if (field_pic_flag)
 			(void)br_read_u(&b, 1); /* bottom_field_flag */
 	}
 	if (idr_pic_flag)
 		out->idr_pic_id = (uint16_t)br_read_ue(&b);
 	/*
 	 * pic_order_cnt syntax — measure bit length from the start of
 	 * pic_order_cnt_lsb / delta_pic_order_cnt[0] to the end of
 	 * delta_pic_order_cnt_bottom / delta_pic_order_cnt[1]. This is
 	 * what V4L2 calls pic_order_cnt_bit_size and what hantro G1
 	 * writes into G1_REG_DEC_CTRL6_POC_LENGTH.
 	 */
 	pic_order_cnt_start = b.bit_pos;
 	if (ctx->pic_order_cnt_type == 0) {
 		out->pic_order_cnt_lsb = (uint16_t)br_read_u(
 			&b, ctx->log2_max_pic_order_cnt_lsb_minus4 + 4u);
 		if (ctx->bottom_field_pic_order_in_frame_present_flag &&
 		    !field_pic_flag)
 			out->delta_pic_order_cnt_bottom = br_read_se(&b);
 	} else if (ctx->pic_order_cnt_type == 1 &&
 		   !ctx->delta_pic_order_always_zero_flag) {
 		out->delta_pic_order_cnt0 = br_read_se(&b);
 		if (ctx->bottom_field_pic_order_in_frame_present_flag &&
 		    !field_pic_flag)
 			out->delta_pic_order_cnt1 = br_read_se(&b);
 	}
 	pic_order_cnt_end = b.bit_pos;
 	out->pic_order_cnt_bit_size = (uint32_t)(pic_order_cnt_end -
 						 pic_order_cnt_start);
 	if (ctx->redundant_pic_cnt_present_flag)
 		(void)br_read_ue(&b); /* redundant_pic_cnt */
 	if (slice_type % 5 == 1) /* B */
 		(void)br_read_u(&b, 1); /* direct_spatial_mv_pred_flag */
 	/* Start from the PPS defaults; the slice may override them below. */
 	num_ref_idx_l0_active_minus1 = ctx->num_ref_idx_l0_default_active_minus1;
 	num_ref_idx_l1_active_minus1 = ctx->num_ref_idx_l1_default_active_minus1;
 	{
 		uint32_t st = slice_type % 5;
 		if (st == 0 || st == 3 || st == 1) {
 			/* P, SP, B */
 			uint32_t override = br_read_u(&b, 1);
 			if (override) {
 				num_ref_idx_l0_active_minus1 = br_read_ue(&b);
 				if (st == 1)
 					num_ref_idx_l1_active_minus1 = br_read_ue(&b);
 			}
 		}
 	}
 	skip_ref_pic_list_modification(&b, slice_type);
 	if (b.error)
 		return -EIO;
 	{
 		uint32_t st = slice_type % 5;
 		/* The table is present for weighted P/SP, or B with bipred idc 1. */
 		bool do_pwt =
 			(ctx->weighted_pred_flag && (st == 0 || st == 3)) ||
 			(ctx->weighted_bipred_idc == 1 && st == 1);
 		if (do_pwt) {
 			skip_pred_weight_table(&b, slice_type,
 					       ctx->chroma_format_idc,
 					       ctx->bit_depth_luma_minus8,
 					       ctx->bit_depth_chroma_minus8,
 					       num_ref_idx_l0_active_minus1,
 					       num_ref_idx_l1_active_minus1);
 			if (b.error)
 				return -EIO;
 		}
 	}
 	/*
 	 * dec_ref_pic_marking() per §7.3.3.3 — measure bit length;
 	 * hantro G1 writes this into G1_REG_DEC_CTRL5_REFPIC_MK_LEN.
 	 */
 	dec_ref_pic_marking_start = b.bit_pos;
 	if (ctx->nal_ref_idc != 0) {
 		if (idr_pic_flag) {
 			(void)br_read_u(&b, 1); /* no_output_of_prior_pics_flag */
 			(void)br_read_u(&b, 1); /* long_term_reference_flag */
 		} else {
 			uint32_t adaptive = br_read_u(&b, 1);
 			if (adaptive) {
 				uint32_t mmco;
 				/* Each operation code selects its operands; code 0 ends the list. */
 				do {
 					mmco = br_read_ue(&b);
 					if (mmco == 1 || mmco == 3)
 						br_read_ue(&b); /* difference_of_pic_nums_minus1 */
 					if (mmco == 2)
 						br_read_ue(&b); /* long_term_pic_num */
 					if (mmco == 3 || mmco == 6)
 						br_read_ue(&b); /* long_term_frame_idx */
 					if (mmco == 4)
 						br_read_ue(&b); /* max_long_term_frame_idx_plus1 */
 					if (b.error)
 						return -EIO;
 				} while (mmco != 0);
 			}
 		}
 	}
 	dec_ref_pic_marking_end = b.bit_pos;
 	out->dec_ref_pic_marking_bit_size =
 		(uint32_t)(dec_ref_pic_marking_end - dec_ref_pic_marking_start);
 	/* Catches any read failure that was not tested at the point of the read. */
 	if (b.error)
 		return -EIO;
 	return 0;
 }
@@ -1,95 +0,0 @@
 /*
 * H.264 slice header bit-parser for libva-v4l2-request.
 *
 * Extracts the slice-header bit-position and value fields that
 * V4L2_CID_STATELESS_H264_DECODE_PARAMS requires (idr_pic_id,
 * pic_order_cnt_lsb, delta_pic_order_cnt_*, pic_order_cnt_bit_size,
 * dec_ref_pic_marking_bit_size). VAAPI's pre-parsed
 * VAPictureParameterBufferH264 / VASliceParameterBufferH264 do not
 * carry these — they live only in the bitstream's slice_header()
 * syntax. Hantro G1 (drivers/media/platform/verisilicon/
 * hantro_g1_h264_dec.c::set_params) writes the bit_size fields
 * directly into MMIO registers G1_REG_DEC_CTRL5_REFPIC_MK_LEN and
 * G1_REG_DEC_CTRL6_POC_LENGTH; with zeros the hardware bitstream
 * parser walks past zero bits, lands on garbage, decodes nothing.
 *
 * Spec reference: ITU-T Rec. H.264 (08/2024) §7.3.3 slice_header
 * and §7.3.3.1 ref_pic_list_modification, §7.3.3.2 pred_weight_table,
 * §7.3.3.3 dec_ref_pic_marking.
 *
 * Cross-reference (proven working on hantro): FFmpeg's
 * libavcodec/h264_slice.c populates H264SliceContext::ref_pic_marking_
 * bit_size and pic_order_cnt_bit_size from its bit-precise slice
 * header parse, then v4l2_request_h264.c forwards them.
 */
 #ifndef H264_SLICE_HEADER_H
 #define H264_SLICE_HEADER_H
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 /*
 * Inputs to h264_parse_slice_header() that are not in the slice header
 * itself but decide which syntax elements are present and how wide
 * they are.
 */
 struct h264_slice_header_context {
 	/* From SPS (the active SPS at slice-time). */
 	bool		separate_colour_plane_flag;
 	uint8_t		log2_max_frame_num_minus4;
 	bool		frame_mbs_only_flag;
 	uint8_t		pic_order_cnt_type;
 	uint8_t		log2_max_pic_order_cnt_lsb_minus4;
 	bool		delta_pic_order_always_zero_flag;
 	/* From PPS (the active PPS at slice-time). */
 	bool		bottom_field_pic_order_in_frame_present_flag;
 	bool		redundant_pic_cnt_present_flag;
 	bool		weighted_pred_flag;
 	uint8_t		weighted_bipred_idc;
 	uint8_t		num_ref_idx_l0_default_active_minus1;
 	uint8_t		num_ref_idx_l1_default_active_minus1;
 	/* Used only when a pred_weight_table() has to be skipped. */
 	uint8_t		chroma_format_idc;
 	uint8_t		bit_depth_luma_minus8;
 	uint8_t		bit_depth_chroma_minus8;
 	/* From the NAL unit header (already extracted by the caller). */
 	uint8_t		nal_unit_type;
 	uint8_t		nal_ref_idc;
 };
 /*
 * Result of h264_parse_slice_header(). The parser zeroes the whole
 * struct first, so a field whose syntax element is absent from the
 * slice stays 0.
 */
 struct h264_slice_header_info {
 	uint16_t	idr_pic_id;
 	uint16_t	pic_order_cnt_lsb;
 	int32_t		delta_pic_order_cnt_bottom;
 	int32_t		delta_pic_order_cnt0;
 	int32_t		delta_pic_order_cnt1;
 	/* Sizes in bits, measured as bit-position deltas during the parse. */
 	uint32_t	pic_order_cnt_bit_size;
 	uint32_t	dec_ref_pic_marking_bit_size;
 	/* Diagnostic — useful for cross-checking VAAPI vs bitstream values. */
 	uint32_t	first_mb_in_slice;
 	uint32_t	slice_type;
 	uint32_t	pic_parameter_set_id;
 	uint32_t	frame_num;
 };
 /*
 * Parse slice_header() up to dec_ref_pic_marking() (inclusive) of
 * the H.264 RBSP slice_layer_without_partitioning_rbsp() syntax,
 * extracting the V4L2 DECODE_PARAMS fields. Returns 0 on success,
 * negative errno-shaped value on parse failure (insufficient data,
 * malformed exp-Golomb, etc.).
 *
 * @nal_payload: pointer to the byte AFTER the NAL header byte
 *               (i.e. start of the RBSP proper; caller has already
 *               skipped any ANNEX_B start code and the 1-byte
 *               nal_unit_header). Will be RBSP-unescaped internally
 *               before parsing.
 * @nal_payload_length: bytes available at @nal_payload.
 * @ctx: SPS/PPS/NAL context required to drive the parse.
 * @out: filled on success. All fields zero-initialized first.
 */
 int h264_parse_slice_header(const uint8_t *nal_payload,
 			    size_t nal_payload_length,
 			    const struct h264_slice_header_context *ctx,
 			    struct h264_slice_header_info *out);
 #endif /* H264_SLICE_HEADER_H */
@@ -27,12 +27,6 @@
 #ifndef _H265_H_
 #define _H265_H_
 /* Maximum number of slices per frame the libva backend will accumulate
 * before submitting to the kernel (kernel HEVC slice_params dynamic-array
 * accepts up to 600 entries per Phase 0 V4L2 inventory; 64 is a
 * conservative cap for typical fixtures + safety bound). */
 #define HEVC_MAX_SLICES_PER_FRAME 64
 struct object_context;
 struct object_surface;
 struct request_data;
@@ -1,14 +0,0 @@
 /* Stub for <gst/base/base-prelude.h> — GStreamer base-lib prelude.
 * In upstream GStreamer, this sets up the GstBaseExport macro + GObject
 * boilerplate. We bypass all of that and provide only what our four
 * vendored .c files actually need (gst_compat.h's typedefs).
 *
 * Crucially we also #define GST_BASE_API to nothing so the function
 * declarations in gstbitreader.h / gstbytereader.h drop the
 * dllimport / visibility attribute prefix.
 */
 #ifndef LIBVA_V4L2_REQUEST_FOURIER_BASE_PRELUDE_STUB
 #define LIBVA_V4L2_REQUEST_FOURIER_BASE_PRELUDE_STUB
 #include "gst_compat.h"
 #define GST_BASE_API
 #endif
@@ -1,307 +0,0 @@
 /* GStreamer
 *
 * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #define GST_BIT_READER_DISABLE_INLINES
 #include "gstbitreader.h"
 #include <string.h>
 /**
 * SECTION:gstbitreader
 * @title: GstBitReader
 * @short_description: Reads any number of bits from a memory buffer
 * @symbols:
 * - gst_bit_reader_skip_unchecked
 * - gst_bit_reader_skip_to_byte_unchecked
 * - gst_bit_reader_get_bits_uint8_unchecked
 * - gst_bit_reader_peek_bits_uint8_unchecked
 * - gst_bit_reader_get_bits_uint16_unchecked
 * - gst_bit_reader_peek_bits_uint16_unchecked
 * - gst_bit_reader_get_bits_uint32_unchecked
 * - gst_bit_reader_peek_bits_uint32_unchecked
 * - gst_bit_reader_get_bits_uint64_unchecked
 * - gst_bit_reader_peek_bits_uint64_unchecked
 *
 * #GstBitReader provides a bit reader that can read any number of bits
 * from a memory buffer. It provides functions for reading any number of bits
 * into 8, 16, 32 and 64 bit variables.
 */
 /**
 * gst_bit_reader_new: (skip)
 * @data: (array length=size): Data from which the #GstBitReader
 *   should read
 * @size: Size of @data in bytes
 *
 * Create a new #GstBitReader instance, which will read from @data.
 *
 * Free-function: gst_bit_reader_free
 *
 * Returns: (transfer full): a new #GstBitReader instance
 */
 GstBitReader *
 gst_bit_reader_new (const guint8 * data, guint size)
 {
   GstBitReader *reader;
   /* Allocated with g_new0(), so only data and size need assigning. */
   reader = g_new0 (GstBitReader, 1);
   reader->size = size;
   reader->data = data;
   return reader;
 }
 /**
 * gst_bit_reader_free:
 * @reader: (in) (transfer full): a #GstBitReader instance
 *
 * Frees a #GstBitReader instance, which was previously allocated by
 * gst_bit_reader_new().
 */
 void
 gst_bit_reader_free (GstBitReader * reader)
 {
   g_return_if_fail (reader != NULL);
   /* Only the reader itself is released; reader->data is left untouched. */
   g_free (reader);
 }
 /**
 * gst_bit_reader_init:
 * @reader: a #GstBitReader instance
 * @data: (in) (array length=size): data from which the bit reader should read
 * @size: Size of @data in bytes
 *
 * Initializes a #GstBitReader instance to read from @data. This function
 * can be called on already initialized instances.
 */
 void
 gst_bit_reader_init (GstBitReader * reader, const guint8 * data, guint size)
 {
   g_return_if_fail (reader != NULL);
   /* Rewind to the first bit of the new buffer. */
   reader->bit = 0;
   reader->byte = 0;
   reader->size = size;
   reader->data = data;
 }
 /**
 * gst_bit_reader_set_pos:
 * @reader: a #GstBitReader instance
 * @pos: The new position in bits
 *
 * Sets the new position of a #GstBitReader instance to @pos in bits.
 *
 * Returns: %TRUE if the position could be set successfully, %FALSE
 * otherwise.
 */
 gboolean
 gst_bit_reader_set_pos (GstBitReader * reader, guint pos)
 {
   g_return_val_if_fail (reader != NULL, FALSE);
   /* A position equal to the total bit count (one past the end) is accepted. */
   if (pos <= reader->size * 8) {
     reader->bit = pos % 8;
     reader->byte = pos / 8;
     return TRUE;
   }
   return FALSE;
 }
 /**
 * gst_bit_reader_get_pos:
 * @reader: a #GstBitReader instance
 *
 * Returns the current position of a #GstBitReader instance in bits.
 *
 * Returns: The current position of @reader in bits.
 */
 guint
 gst_bit_reader_get_pos (const GstBitReader * reader)
 {
  return _gst_bit_reader_get_pos_inline (reader);
 }
 /**
 * gst_bit_reader_get_remaining:
 * @reader: a #GstBitReader instance
 *
 * Returns the remaining number of bits of a #GstBitReader instance.
 *
 * Returns: The remaining number of bits of @reader instance.
 */
 guint
 gst_bit_reader_get_remaining (const GstBitReader * reader)
 {
  return _gst_bit_reader_get_remaining_inline (reader);
 }
 /**
 * gst_bit_reader_get_size:
 * @reader: a #GstBitReader instance
 *
 * Returns the total number of bits of a #GstBitReader instance.
 *
 * Returns: The total number of bits of @reader instance.
 */
 guint
 gst_bit_reader_get_size (const GstBitReader * reader)
 {
  return _gst_bit_reader_get_size_inline (reader);
 }
 /**
 * gst_bit_reader_skip:
 * @reader: a #GstBitReader instance
 * @nbits: the number of bits to skip
 *
 * Skips @nbits bits of the #GstBitReader instance.
 *
 * Returns: %TRUE if @nbits bits could be skipped, %FALSE otherwise.
 */
 gboolean
 gst_bit_reader_skip (GstBitReader * reader, guint nbits)
 {
   /* Out-of-line entry point; the work is done by the inline helper. */
   const gboolean skipped = _gst_bit_reader_skip_inline (reader, nbits);
   return skipped;
 }
 /**
 * gst_bit_reader_skip_to_byte:
 * @reader: a #GstBitReader instance
 *
 * Skips until the next byte.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 gboolean
 gst_bit_reader_skip_to_byte (GstBitReader * reader)
 {
   /* Out-of-line entry point; the work is done by the inline helper. */
   const gboolean aligned = _gst_bit_reader_skip_to_byte_inline (reader);
   return aligned;
 }
 /**
 * gst_bit_reader_get_bits_uint8:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint8 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val and update the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /**
 * gst_bit_reader_get_bits_uint16:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint16 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val and update the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /**
 * gst_bit_reader_get_bits_uint32:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint32 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val and update the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /**
 * gst_bit_reader_get_bits_uint64:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint64 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val and update the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /**
 * gst_bit_reader_peek_bits_uint8:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint8 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val but keep the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /**
 * gst_bit_reader_peek_bits_uint16:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint16 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val but keep the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /**
 * gst_bit_reader_peek_bits_uint32:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint32 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val but keep the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /**
 * gst_bit_reader_peek_bits_uint64:
 * @reader: a #GstBitReader instance
 * @val: (out): Pointer to a #guint64 to store the result
 * @nbits: number of bits to read
 *
 * Read @nbits bits into @val but keep the current position.
 *
 * Returns: %TRUE if successful, %FALSE otherwise.
 */
 /*
 * Emits the out-of-line gst_bit_reader_peek_bits_uint<bits>() and
 * gst_bit_reader_get_bits_uint<bits>() functions. Each one forwards to
 * the matching _inline helper from gstbitreader.h. Instantiated below
 * for 8, 16, 32 and 64 bits.
 */
 #define GST_BIT_READER_READ_BITS(bits) \
 gboolean \
 gst_bit_reader_peek_bits_uint##bits (const GstBitReader *reader, guint##bits *val, guint nbits) \
 { \
  return _gst_bit_reader_peek_bits_uint##bits##_inline (reader, val, nbits); \
 } \
 \
 gboolean \
 gst_bit_reader_get_bits_uint##bits (GstBitReader *reader, guint##bits *val, guint nbits) \
 { \
  return _gst_bit_reader_get_bits_uint##bits##_inline (reader, val, nbits); \
 }
 GST_BIT_READER_READ_BITS (8);
 GST_BIT_READER_READ_BITS (16);
 GST_BIT_READER_READ_BITS (32);
 GST_BIT_READER_READ_BITS (64);
@@ -1,328 +0,0 @@
 /* GStreamer
 *
 * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
 #ifndef __GST_BIT_READER_H__
 #define __GST_BIT_READER_H__
 #include <gst/gst.h>
 #include <gst/base/base-prelude.h>
 /* FIXME: inline functions */
 G_BEGIN_DECLS
 #define GST_BIT_READER(reader) ((GstBitReader *) (reader))
 /**
 * GstBitReader:
 * @data: (array length=size): Data from which the bit reader will
 *   read
 * @size: Size of @data in bytes
 * @byte: Current byte position
 * @bit: Bit position in the current byte
 *
 * A bit reader instance.
 */
 typedef struct {
  const guint8 *data;  /* Buffer being read; the reader never writes through it */
  guint size;  /* Size of data in bytes */
  guint byte;  /* Byte position */
  guint bit;   /* Bit position in the current byte */
  /* < private > */
  gpointer _gst_reserved[GST_PADDING];
 } GstBitReader;
 GST_BASE_API
 GstBitReader *  gst_bit_reader_new              (const guint8 *data, guint size) G_GNUC_MALLOC;
 GST_BASE_API
 void            gst_bit_reader_free             (GstBitReader *reader);
 GST_BASE_API
 void            gst_bit_reader_init             (GstBitReader *reader, const guint8 *data, guint size);
 GST_BASE_API
 gboolean        gst_bit_reader_set_pos          (GstBitReader *reader, guint pos);
 GST_BASE_API
 guint           gst_bit_reader_get_pos          (const GstBitReader *reader);
 GST_BASE_API
 guint           gst_bit_reader_get_remaining    (const GstBitReader *reader);
 GST_BASE_API
 guint           gst_bit_reader_get_size         (const GstBitReader *reader);
 GST_BASE_API
 gboolean        gst_bit_reader_skip             (GstBitReader *reader, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_skip_to_byte     (GstBitReader *reader);
 GST_BASE_API
 gboolean        gst_bit_reader_get_bits_uint8   (GstBitReader *reader, guint8 *val, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_get_bits_uint16  (GstBitReader *reader, guint16 *val, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_get_bits_uint32  (GstBitReader *reader, guint32 *val, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_get_bits_uint64  (GstBitReader *reader, guint64 *val, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_peek_bits_uint8  (const GstBitReader *reader, guint8 *val, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_peek_bits_uint16 (const GstBitReader *reader, guint16 *val, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_peek_bits_uint32 (const GstBitReader *reader, guint32 *val, guint nbits);
 GST_BASE_API
 gboolean        gst_bit_reader_peek_bits_uint64 (const GstBitReader *reader, guint64 *val, guint nbits);
 /**
 * GST_BIT_READER_INIT:
 * @data: Data from which the #GstBitReader should read
 * @size: Size of @data in bytes
 *
 * A #GstBitReader must be initialized with this macro, before it can be
 * used. This macro can be used to initialize a variable, but it cannot
 * be assigned to a variable. In that case you have to use
 * gst_bit_reader_init().
 */
 #define GST_BIT_READER_INIT(data, size) {data, size, 0, 0}
 /* Unchecked variants */
 static inline void
 gst_bit_reader_skip_unchecked (GstBitReader * reader, guint nbits)
 {
   /* Advance by nbits without checking against reader->size. */
   const guint bit_total = reader->bit + nbits;
   reader->byte += bit_total / 8;
   reader->bit = bit_total % 8;
 }
 static inline void
 gst_bit_reader_skip_to_byte_unchecked (GstBitReader * reader)
 {
   /* Already on a byte boundary: nothing to do. */
   if (reader->bit == 0)
     return;
   reader->byte++;
   reader->bit = 0;
 }
 /*
 * Emits gst_bit_reader_peek_bits_uint<bits>_unchecked() and
 * gst_bit_reader_get_bits_uint<bits>_unchecked(). peek assembles nbits
 * from the current position, taking the most significant unread bits
 * of each byte first, and leaves the reader untouched; get does the
 * same and then advances the reader by nbits. Neither compares the
 * position against reader->size. Instantiated for 8, 16, 32 and 64
 * bits, after which the generator macro is undefined.
 */
 #define __GST_BIT_READER_READ_BITS_UNCHECKED(bits) \
 static inline guint##bits \
 gst_bit_reader_peek_bits_uint##bits##_unchecked (const GstBitReader *reader, guint nbits) \
 { \
  guint##bits ret = 0; \
  const guint8 *data; \
  guint byte, bit; \
  \
  data = reader->data; \
  byte = reader->byte; \
  bit = reader->bit; \
  \
  while (nbits > 0) { \
    guint toread = MIN (nbits, 8 - bit); \
    \
    ret <<= toread; \
    ret |= (data[byte] & (0xff >> bit)) >> (8 - toread - bit); \
    \
    bit += toread; \
    if (bit >= 8) { \
      byte++; \
      bit = 0; \
    } \
    nbits -= toread; \
  } \
  \
  return ret; \
 } \
 \
 static inline guint##bits \
 gst_bit_reader_get_bits_uint##bits##_unchecked (GstBitReader *reader, guint nbits) \
 { \
  guint##bits ret; \
  \
  ret = gst_bit_reader_peek_bits_uint##bits##_unchecked (reader, nbits); \
  \
  gst_bit_reader_skip_unchecked (reader, nbits); \
  \
  return ret; \
 }
 __GST_BIT_READER_READ_BITS_UNCHECKED (8)
 __GST_BIT_READER_READ_BITS_UNCHECKED (16)
 __GST_BIT_READER_READ_BITS_UNCHECKED (32)
 __GST_BIT_READER_READ_BITS_UNCHECKED (64)
 #undef __GST_BIT_READER_READ_BITS_UNCHECKED
 /* unchecked variants -- do not use */
 static inline guint
 _gst_bit_reader_get_size_unchecked (const GstBitReader * reader)
 {
   /* Buffer size converted from bytes to bits. */
   const guint size_bits = reader->size * 8;
   return size_bits;
 }
 static inline guint
 _gst_bit_reader_get_pos_unchecked (const GstBitReader * reader)
 {
   /* Whole bytes consumed, in bits, plus the offset inside the current byte. */
   const guint whole_bytes_bits = reader->byte * 8;
   return whole_bytes_bits + reader->bit;
 }
 static inline guint
 _gst_bit_reader_get_remaining_unchecked (const GstBitReader * reader)
 {
   /* Total bits in the buffer minus the bits already consumed. */
   const guint total_bits = reader->size * 8;
   const guint consumed_bits = reader->byte * 8 + reader->bit;
   return total_bits - consumed_bits;
 }
 /* inlined variants -- do not use directly */
 static inline guint
 _gst_bit_reader_get_size_inline (const GstBitReader * reader)
 {
  g_return_val_if_fail (reader != NULL, 0);
  return _gst_bit_reader_get_size_unchecked (reader);
 }
 static inline guint
 _gst_bit_reader_get_pos_inline (const GstBitReader * reader)
 {
  g_return_val_if_fail (reader != NULL, 0);
  return _gst_bit_reader_get_pos_unchecked (reader);
 }
 static inline guint
 _gst_bit_reader_get_remaining_inline (const GstBitReader * reader)
 {
  g_return_val_if_fail (reader != NULL, 0);
  return _gst_bit_reader_get_remaining_unchecked (reader);
 }
 static inline gboolean
 _gst_bit_reader_skip_inline (GstBitReader * reader, guint nbits)
 {
   g_return_val_if_fail (reader != NULL, FALSE);
   /* Skip only when at least nbits are still available. */
   if (nbits <= _gst_bit_reader_get_remaining_unchecked (reader)) {
     gst_bit_reader_skip_unchecked (reader, nbits);
     return TRUE;
   }
   return FALSE;
 }
 static inline gboolean
 _gst_bit_reader_skip_to_byte_inline (GstBitReader * reader)
 {
   g_return_val_if_fail (reader != NULL, FALSE);
   /* Refuse only when the byte position is already beyond the buffer size. */
   if (reader->byte <= reader->size) {
     gst_bit_reader_skip_to_byte_unchecked (reader);
     return TRUE;
   }
   return FALSE;
 }
 /*
 * Emits the checked _gst_bit_reader_get_bits_uint<bits>_inline() and
 * _gst_bit_reader_peek_bits_uint<bits>_inline() helpers. Both fail the
 * precondition checks (returning FALSE) when reader or val is NULL or
 * when nbits exceeds the result width, and return FALSE when fewer
 * than nbits remain in the reader. Otherwise they store the value
 * from the matching _unchecked function in *val and return TRUE.
 * Instantiated for 8, 16, 32 and 64 bits, after which the generator
 * macro is undefined.
 */
 #define __GST_BIT_READER_READ_BITS_INLINE(bits) \
 static inline gboolean \
 _gst_bit_reader_get_bits_uint##bits##_inline (GstBitReader *reader, guint##bits *val, guint nbits) \
 { \
  g_return_val_if_fail (reader != NULL, FALSE); \
  g_return_val_if_fail (val != NULL, FALSE); \
  g_return_val_if_fail (nbits <= bits, FALSE); \
  \
  if (_gst_bit_reader_get_remaining_unchecked (reader) < nbits) \
    return FALSE; \
 \
  *val = gst_bit_reader_get_bits_uint##bits##_unchecked (reader, nbits); \
  return TRUE; \
 } \
 \
 static inline gboolean \
 _gst_bit_reader_peek_bits_uint##bits##_inline (const GstBitReader *reader, guint##bits *val, guint nbits) \
 { \
  g_return_val_if_fail (reader != NULL, FALSE); \
  g_return_val_if_fail (val != NULL, FALSE); \
  g_return_val_if_fail (nbits <= bits, FALSE); \
  \
  if (_gst_bit_reader_get_remaining_unchecked (reader) < nbits) \
    return FALSE; \
 \
  *val = gst_bit_reader_peek_bits_uint##bits##_unchecked (reader, nbits); \
  return TRUE; \
 }
 __GST_BIT_READER_READ_BITS_INLINE (8)
 __GST_BIT_READER_READ_BITS_INLINE (16)
 __GST_BIT_READER_READ_BITS_INLINE (32)
 __GST_BIT_READER_READ_BITS_INLINE (64)
 #undef __GST_BIT_READER_READ_BITS_INLINE
 #ifndef GST_BIT_READER_DISABLE_INLINES
 #define gst_bit_reader_get_size(reader) \
    _gst_bit_reader_get_size_inline (reader)
 #define gst_bit_reader_get_pos(reader) \
    _gst_bit_reader_get_pos_inline (reader)
 #define gst_bit_reader_get_remaining(reader) \
    _gst_bit_reader_get_remaining_inline (reader)
 /* we use defines here so we can add the G_LIKELY() */
 #define gst_bit_reader_skip(reader, nbits)\
    G_LIKELY (_gst_bit_reader_skip_inline(reader, nbits))
 #define gst_bit_reader_skip_to_byte(reader)\
    G_LIKELY (_gst_bit_reader_skip_to_byte_inline(reader))
 #define gst_bit_reader_get_bits_uint8(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_get_bits_uint8_inline (reader, val, nbits))
 #define gst_bit_reader_get_bits_uint16(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_get_bits_uint16_inline (reader, val, nbits))
 #define gst_bit_reader_get_bits_uint32(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_get_bits_uint32_inline (reader, val, nbits))
 #define gst_bit_reader_get_bits_uint64(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_get_bits_uint64_inline (reader, val, nbits))
 #define gst_bit_reader_peek_bits_uint8(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_peek_bits_uint8_inline (reader, val, nbits))
 #define gst_bit_reader_peek_bits_uint16(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_peek_bits_uint16_inline (reader, val, nbits))
 #define gst_bit_reader_peek_bits_uint32(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_peek_bits_uint32_inline (reader, val, nbits))
 #define gst_bit_reader_peek_bits_uint64(reader, val, nbits) \
    G_LIKELY (_gst_bit_reader_peek_bits_uint64_inline (reader, val, nbits))
 #endif
 G_END_DECLS
 #endif /* __GST_BIT_READER_H__ */
@@ -1,67 +0,0 @@
 /* Stub for <gst/base/gstbitwriter.h>.
 *
 * The vendored nalutils.c uses GstBitWriter for NAL emulation-prevention
 * byte INSERTION during write-side (encoder) operations. The libva
 * backend never invokes those paths — we only PARSE NAL units, never
 * write them. The functions must still compile + link though, so we
 * stub them with abort() runtime guards: if any future code path
 * accidentally invokes a writer function, we fail-fast instead of
 * silently corrupting.
 *
 * Header surface mirrors upstream gstbitwriter.h minimally — enough
 * for nalutils.c to compile.
 */
 #ifndef LIBVA_V4L2_REQUEST_FOURIER_GSTBITWRITER_STUB
 #define LIBVA_V4L2_REQUEST_FOURIER_GSTBITWRITER_STUB
 #include "gst_compat.h"
 /* Minimal GstBitWriter record for this stub header. None of the stub
 * functions in this file read or write these fields (every one of them
 * calls abort()); only the GST_BIT_WRITER_BIT_SIZE and
 * GST_BIT_WRITER_DATA accessor macros touch bit_size and data.
 * NOTE(review): field meanings below are inferred from their names and
 * from upstream gstbitwriter.h — confirm before relying on them. */
 typedef struct {
    guint8 *data;           /* presumably the output buffer */
    guint  bit_size;        /* presumably the number of bits written */
    guint  bit_capacity;    /* presumably the buffer capacity in bits */
    gboolean auto_grow;     /* presumably: grow the buffer on demand */
    gboolean owned;         /* presumably: the writer owns @data */
 } GstBitWriter;
 /* Stub for gst_bit_writer_init(): write-side paths are not expected to
 * run, so any call terminates the process through abort(). */
 static inline void
 gst_bit_writer_init(GstBitWriter *bw)
 {
    (void)bw;   /* intentionally unused */
    abort();
 }
 /* Stub for gst_bit_writer_init_with_size(): ignores every argument and
 * terminates the process through abort(). */
 static inline void
 gst_bit_writer_init_with_size(GstBitWriter *bw, guint size, gboolean fixed)
 {
    /* All parameters are intentionally unused. */
    (void)bw;
    (void)size;
    (void)fixed;
    abort();
 }
 /* Stub for gst_bit_writer_reset(): never returns; any call terminates
 * the process through abort(). */
 static inline void
 gst_bit_writer_reset(GstBitWriter *bw)
 {
    (void)bw;   /* intentionally unused */
    abort();
 }
 /* Stub for gst_bit_writer_put_bits_uint8(): no bits are written and no
 * value is ever returned, because abort() terminates the process first. */
 static inline gboolean
 gst_bit_writer_put_bits_uint8(GstBitWriter *bw, guint8 value, guint nbits)
 {
    /* All parameters are intentionally unused. */
    (void)bw;
    (void)value;
    (void)nbits;
    abort();
 }
 /* Stub for gst_bit_writer_align_bytes(): performs no alignment and never
 * returns; abort() terminates the process. */
 static inline gboolean
 gst_bit_writer_align_bytes(GstBitWriter *bw, guint8 trailing_bit)
 {
    /* Both parameters are intentionally unused. */
    (void)bw;
    (void)trailing_bit;
    abort();
 }
 /* Stub for gst_bit_writer_get_data(): never yields a pointer; abort()
 * terminates the process on any call. */
 static inline guint8 *
 gst_bit_writer_get_data(GstBitWriter *bw)
 {
    (void)bw;   /* intentionally unused */
    abort();
 }
 /* Stub for gst_bit_writer_get_size(): never yields a size; abort()
 * terminates the process on any call. */
 static inline guint
 gst_bit_writer_get_size(const GstBitWriter *bw)
 {
    (void)bw;   /* intentionally unused */
    abort();
 }
 /* Stub for gst_bit_writer_reset_and_get_size(): resets nothing and never
 * returns; abort() terminates the process. */
 static inline guint
 gst_bit_writer_reset_and_get_size(GstBitWriter *bw)
 {
    (void)bw;   /* intentionally unused */
    abort();
 }
 /* Stub for gst_bit_writer_reset_and_get_data(): resets nothing and never
 * returns; abort() terminates the process. */
 static inline guint8 *
 gst_bit_writer_reset_and_get_data(GstBitWriter *bw)
 {
    (void)bw;   /* intentionally unused */
    abort();
 }
 /* Stub for gst_bit_writer_put_bits_uint16(): no bits are written and no
 * value is ever returned, because abort() terminates the process first. */
 static inline gboolean
 gst_bit_writer_put_bits_uint16(GstBitWriter *bw, guint16 value, guint nbits)
 {
    /* All parameters are intentionally unused. */
    (void)bw;
    (void)value;
    (void)nbits;
    abort();
 }
 /* Stub for gst_bit_writer_put_bits_uint32(): no bits are written and no
 * value is ever returned, because abort() terminates the process first. */
 static inline gboolean
 gst_bit_writer_put_bits_uint32(GstBitWriter *bw, guint32 value, guint nbits)
 {
    /* All parameters are intentionally unused. */
    (void)bw;
    (void)value;
    (void)nbits;
    abort();
 }
 /* Stub for gst_bit_writer_put_bytes(): copies nothing and never returns;
 * abort() terminates the process. */
 static inline gboolean
 gst_bit_writer_put_bytes(GstBitWriter *bw, const guint8 *data, guint nbytes)
 {
    /* All parameters are intentionally unused. */
    (void)bw;
    (void)data;
    (void)nbytes;
    abort();
 }
 /* Field accessors: expand to a direct member access on the GstBitWriter
 * pointed to by @bw. The argument is parenthesised, so any pointer
 * expression may be passed. */
 #define GST_BIT_WRITER_BIT_SIZE(bw)  ((bw)->bit_size)
 #define GST_BIT_WRITER_DATA(bw)      ((bw)->data)
 #endif
@@ -1,684 +0,0 @@
 /* GStreamer byte reader
 *
 * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>.
 * Copyright (C) 2009 Tim-Philipp Müller <tim centricular net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
 #ifndef __GST_BYTE_READER_H__
 #define __GST_BYTE_READER_H__
 #include <gst/gst.h>
 #include <gst/base/base-prelude.h>
 G_BEGIN_DECLS
 /* Plain C cast of @reader to GstByteReader *; performs no runtime check. */
 #define GST_BYTE_READER(reader) ((GstByteReader *) (reader))
 /**
 * GstByteReader:
 * @data: (array length=size): Data from which the byte reader will
 *   read
 * @size: Size of @data in bytes
 * @byte: Current byte position
 *
 * A byte reader instance.
 */
 typedef struct {
  const guint8 *data;
  guint size;
  guint byte;  /* Byte position */
  /* < private > */
  /* Reserved slots; presumably ABI padding for future extension. */
  gpointer _gst_reserved[GST_PADDING];
 } GstByteReader;
 GST_BASE_API
 GstByteReader * gst_byte_reader_new             (const guint8 *data, guint size) G_GNUC_MALLOC;
 GST_BASE_API
 void            gst_byte_reader_free            (GstByteReader *reader);
 GST_BASE_API
 void            gst_byte_reader_init            (GstByteReader *reader, const guint8 *data, guint size);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_sub_reader (GstByteReader * reader,
                                                 GstByteReader * sub_reader,
                                                 guint           size);
 GST_BASE_API
 gboolean        gst_byte_reader_get_sub_reader  (GstByteReader * reader,
                                                 GstByteReader * sub_reader,
                                                 guint           size);
 GST_BASE_API
 gboolean        gst_byte_reader_set_pos         (GstByteReader *reader, guint pos);
 GST_BASE_API
 guint           gst_byte_reader_get_pos         (const GstByteReader *reader);
 GST_BASE_API
 guint           gst_byte_reader_get_remaining   (const GstByteReader *reader);
 GST_BASE_API
 guint           gst_byte_reader_get_size        (const GstByteReader *reader);
 GST_BASE_API
 gboolean        gst_byte_reader_skip            (GstByteReader *reader, guint nbytes);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint8       (GstByteReader *reader, guint8 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int8        (GstByteReader *reader, gint8 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint16_le   (GstByteReader *reader, guint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int16_le    (GstByteReader *reader, gint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint16_be   (GstByteReader *reader, guint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int16_be    (GstByteReader *reader, gint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint24_le   (GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int24_le    (GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint24_be   (GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int24_be    (GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint32_le   (GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int32_le    (GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint32_be   (GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int32_be    (GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint64_le   (GstByteReader *reader, guint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int64_le    (GstByteReader *reader, gint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_uint64_be   (GstByteReader *reader, guint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_int64_be    (GstByteReader *reader, gint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint8      (const GstByteReader *reader, guint8 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int8       (const GstByteReader *reader, gint8 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint16_le  (const GstByteReader *reader, guint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int16_le   (const GstByteReader *reader, gint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint16_be  (const GstByteReader *reader, guint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int16_be   (const GstByteReader *reader, gint16 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint24_le  (const GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int24_le   (const GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint24_be  (const GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int24_be   (const GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint32_le  (const GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int32_le   (const GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint32_be  (const GstByteReader *reader, guint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int32_be   (const GstByteReader *reader, gint32 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint64_le  (const GstByteReader *reader, guint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int64_le   (const GstByteReader *reader, gint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_uint64_be  (const GstByteReader *reader, guint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_int64_be   (const GstByteReader *reader, gint64 *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_float32_le  (GstByteReader *reader, gfloat *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_float32_be  (GstByteReader *reader, gfloat *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_float64_le  (GstByteReader *reader, gdouble *val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_float64_be  (GstByteReader *reader, gdouble *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_float32_le (const GstByteReader *reader, gfloat *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_float32_be (const GstByteReader *reader, gfloat *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_float64_le (const GstByteReader *reader, gdouble *val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_float64_be (const GstByteReader *reader, gdouble *val);
 GST_BASE_API
 gboolean        gst_byte_reader_dup_data        (GstByteReader * reader, guint size, guint8       ** val);
 GST_BASE_API
 gboolean        gst_byte_reader_get_data        (GstByteReader * reader, guint size, const guint8 ** val);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_data       (const GstByteReader * reader, guint size, const guint8 ** val);
 /* Convenience alias: the unsuffixed name forwards to the UTF-8 variant. */
 #define gst_byte_reader_dup_string(reader,str) \
    gst_byte_reader_dup_string_utf8(reader,str)
 GST_BASE_API
 gboolean        gst_byte_reader_dup_string_utf8  (GstByteReader * reader, gchar   ** str);
 GST_BASE_API
 gboolean        gst_byte_reader_dup_string_utf16 (GstByteReader * reader, guint16 ** str);
 GST_BASE_API
 gboolean        gst_byte_reader_dup_string_utf32 (GstByteReader * reader, guint32 ** str);
 /* Convenience alias: the unsuffixed name forwards to the UTF-8 variant. */
 #define gst_byte_reader_skip_string(reader) \
    gst_byte_reader_skip_string_utf8(reader)
 GST_BASE_API
 gboolean        gst_byte_reader_skip_string_utf8  (GstByteReader * reader);
 GST_BASE_API
 gboolean        gst_byte_reader_skip_string_utf16 (GstByteReader * reader);
 GST_BASE_API
 gboolean        gst_byte_reader_skip_string_utf32 (GstByteReader * reader);
 /* Convenience aliases: the unsuffixed get/peek names forward to the
 * UTF-8 variants declared just below. */
 #define gst_byte_reader_get_string(reader,str) \
    gst_byte_reader_get_string_utf8(reader,str)
 #define gst_byte_reader_peek_string(reader,str) \
    gst_byte_reader_peek_string_utf8(reader,str)
 GST_BASE_API
 gboolean        gst_byte_reader_get_string_utf8    (GstByteReader * reader, const gchar ** str);
 GST_BASE_API
 gboolean        gst_byte_reader_peek_string_utf8   (const GstByteReader * reader, const gchar ** str);
 GST_BASE_API
 guint           gst_byte_reader_masked_scan_uint32 (const GstByteReader * reader,
                                                    guint32               mask,
                                                    guint32               pattern,
                                                    guint                 offset,
                                                    guint                 size);
 GST_BASE_API
 guint           gst_byte_reader_masked_scan_uint32_peek (const GstByteReader * reader,
                                                         guint32 mask,
                                                         guint32 pattern,
                                                         guint offset,
                                                         guint size,
                                                         guint32 * value);
 /**
 * GST_BYTE_READER_INIT:
 * @data: Data from which the #GstByteReader should read
 * @size: Size of @data in bytes
 *
 * A #GstByteReader must be initialized before it can be used. This macro
 * expands to a brace-enclosed initializer (data, size, byte position 0),
 * so it can be used to initialize a variable at its declaration, but it
 * cannot be assigned to an existing variable. In that case you have to
 * use gst_byte_reader_init().
 */
 #define GST_BYTE_READER_INIT(data, size) {data, size, 0}
 /* unchecked variants */
 /* Move the read position of @reader forward by @nbytes bytes. No bounds
 * check is made against the reader's size. */
 static inline void
 gst_byte_reader_skip_unchecked (GstByteReader * reader, guint nbytes)
 {
   reader->byte = reader->byte + nbytes;
 }
 /*
 * __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(bits, type, lower, upper, adj)
 *
 * Generates a pair of static inline accessors for one value format:
 *
 *   gst_byte_reader_peek_<lower>_unchecked(): reads a value with
 *     GST_READ_<upper> at data + byte, runs the @adj statement(s) on the
 *     local `val`, and returns it without moving the read position.
 *   gst_byte_reader_get_<lower>_unchecked(): same value, then advances
 *     the read position by @bits / 8 bytes.
 *
 * Neither accessor checks the remaining size. @adj is empty (written as an
 * empty comment) for every format except the signed 24-bit ones.
 */
 #define __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(bits,type,lower,upper,adj) \
 \
 static inline type \
 gst_byte_reader_peek_##lower##_unchecked (const GstByteReader * reader) \
 { \
  type val = (type) GST_READ_##upper (reader->data + reader->byte); \
  adj \
  return val; \
 } \
 \
 static inline type \
 gst_byte_reader_get_##lower##_unchecked (GstByteReader * reader) \
 { \
  type val = gst_byte_reader_peek_##lower##_unchecked (reader); \
  reader->byte += bits / 8; \
  return val; \
 }
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(8,guint8,uint8,UINT8,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(8,gint8,int8,UINT8,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,guint16,uint16_le,UINT16_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,guint16,uint16_be,UINT16_BE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,gint16,int16_le,UINT16_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(16,gint16,int16_be,UINT16_BE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,guint32,uint32_le,UINT32_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,guint32,uint32_be,UINT32_BE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gint32,int32_le,UINT32_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gint32,int32_be,UINT32_BE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,guint32,uint24_le,UINT24_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,guint32,uint24_be,UINT24_BE,/* */)
 /* fix up the sign for 24-bit signed ints stored in 32-bit signed ints */
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,gint32,int24_le,UINT24_LE,
    if (val & 0x00800000) val |= 0xff000000;)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(24,gint32,int24_be,UINT24_BE,
    if (val & 0x00800000) val |= 0xff000000;)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,guint64,uint64_le,UINT64_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,guint64,uint64_be,UINT64_BE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gint64,int64_le,UINT64_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gint64,int64_be,UINT64_BE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gfloat,float32_le,FLOAT_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(32,gfloat,float32_be,FLOAT_BE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gdouble,float64_le,DOUBLE_LE,/* */)
 __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED(64,gdouble,float64_be,DOUBLE_BE,/* */)
 /* Retire the generator under its real name so it does not leak into
 * every file that includes this header. */
 #undef __GST_BYTE_READER_GET_PEEK_BITS_UNCHECKED
 /* Return the address of the byte at the current read position of
 * @reader. The position is left untouched and no bounds check is made. */
 static inline const guint8 *
 gst_byte_reader_peek_data_unchecked (const GstByteReader * reader)
 {
   const guint8 *cursor = reader->data + reader->byte;

   return cursor;
 }
 /* Return the address of the current read position, then advance the
 * position by @size bytes. No bounds check is made. */
 static inline const guint8 *
 gst_byte_reader_get_data_unchecked (GstByteReader * reader, guint size)
 {
   const guint8 *start = gst_byte_reader_peek_data_unchecked (reader);

   gst_byte_reader_skip_unchecked (reader, size);
   return start;
 }
 /* Copy the next @size bytes of @reader into a newly g_malloc()ed buffer
 * and advance the read position past them. No bounds check is made.
 *
 * Returns: the new buffer. For @size == 0 this is whatever g_malloc (0)
 * returns (NULL per the GLib documentation), and nothing is copied. */
 static inline guint8 *
 gst_byte_reader_dup_data_unchecked (GstByteReader * reader, guint size)
 {
  gconstpointer data = gst_byte_reader_get_data_unchecked (reader, size);
  guint8 *dup_data = (guint8 *) g_malloc (size);
  /* Skip the copy for an empty request: passing the NULL result of
   * g_malloc (0) to memcpy() is undefined behaviour before C23, even
   * with a length of zero. */
  if (size > 0)
    memcpy (dup_data, data, size);
  return dup_data;
 }
 /* Unchecked variants that should not be used */
 /* Return the current byte position of @reader; @reader is dereferenced
 * without a NULL check. */
 static inline guint
 _gst_byte_reader_get_pos_unchecked (const GstByteReader * reader)
 {
   const guint position = reader->byte;

   return position;
 }
 /* Return size minus the current byte position, i.e. the number of bytes
 * left to read. @reader is dereferenced without a NULL check. */
 static inline guint
 _gst_byte_reader_get_remaining_unchecked (const GstByteReader * reader)
 {
   const guint total = reader->size;
   const guint consumed = reader->byte;

   return total - consumed;
 }
 /* Return the total size in bytes recorded in @reader; @reader is
 * dereferenced without a NULL check. */
 static inline guint
 _gst_byte_reader_get_size_unchecked (const GstByteReader * reader)
 {
   const guint total = reader->size;

   return total;
 }
 /* inlined variants (do not use directly) */
 static inline guint
 _gst_byte_reader_get_remaining_inline (const GstByteReader * reader)
 {
  g_return_val_if_fail (reader != NULL, 0);
  return _gst_byte_reader_get_remaining_unchecked (reader);
 }
 static inline guint
 _gst_byte_reader_get_size_inline (const GstByteReader * reader)
 {
  g_return_val_if_fail (reader != NULL, 0);
  return _gst_byte_reader_get_size_unchecked (reader);
 }
 /*
 * __GST_BYTE_READER_GET_PEEK_BITS_INLINE(bits, type, name)
 *
 * Generates the checked counterparts of the *_unchecked accessors:
 *
 *   _gst_byte_reader_peek_<name>_inline (reader, val)
 *   _gst_byte_reader_get_<name>_inline (reader, val)
 *
 * Each one guards against NULL @reader / @val with g_return_val_if_fail,
 * returns FALSE when fewer than @bits / 8 bytes remain, and otherwise
 * stores the value obtained from the matching *_unchecked accessor in
 * *val and returns TRUE.
 */
 #define __GST_BYTE_READER_GET_PEEK_BITS_INLINE(bits,type,name) \
 \
 static inline gboolean \
 _gst_byte_reader_peek_##name##_inline (const GstByteReader * reader, type * val) \
 { \
  g_return_val_if_fail (reader != NULL, FALSE); \
  g_return_val_if_fail (val != NULL, FALSE); \
  \
  if (_gst_byte_reader_get_remaining_unchecked (reader) < (bits / 8)) \
    return FALSE; \
 \
  *val = gst_byte_reader_peek_##name##_unchecked (reader); \
  return TRUE; \
 } \
 \
 static inline gboolean \
 _gst_byte_reader_get_##name##_inline (GstByteReader * reader, type * val) \
 { \
  g_return_val_if_fail (reader != NULL, FALSE); \
  g_return_val_if_fail (val != NULL, FALSE); \
  \
  if (_gst_byte_reader_get_remaining_unchecked (reader) < (bits / 8)) \
    return FALSE; \
 \
  *val = gst_byte_reader_get_##name##_unchecked (reader); \
  return TRUE; \
 }
 /* One instantiation per value format. */
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(8,guint8,uint8)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(8,gint8,int8)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,guint16,uint16_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,guint16,uint16_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,gint16,int16_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(16,gint16,int16_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,guint32,uint32_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,guint32,uint32_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gint32,int32_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gint32,int32_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,guint32,uint24_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,guint32,uint24_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,gint32,int24_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(24,gint32,int24_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,guint64,uint64_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,guint64,uint64_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gint64,int64_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gint64,int64_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gfloat,float32_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(32,gfloat,float32_be)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gdouble,float64_le)
 __GST_BYTE_READER_GET_PEEK_BITS_INLINE(64,gdouble,float64_be)
 /* The generator is only needed for the instantiations above. */
 #undef __GST_BYTE_READER_GET_PEEK_BITS_INLINE
 /* Unless the includer defines GST_BYTE_READER_DISABLE_INLINES, map the
 * public gst_byte_reader_* names onto the static inline helpers via
 * macros. Some helpers named here (_gst_byte_reader_init_inline,
 * _gst_byte_reader_get_pos_inline) are defined further down in this
 * header; that is fine because a macro is only expanded where it is used. */
 #ifndef GST_BYTE_READER_DISABLE_INLINES
 /* Initialisation and plain accessors: forwarded as-is. */
 #define gst_byte_reader_init(reader,data,size) \
    _gst_byte_reader_init_inline(reader,data,size)
 #define gst_byte_reader_get_remaining(reader) \
    _gst_byte_reader_get_remaining_inline(reader)
 #define gst_byte_reader_get_size(reader) \
    _gst_byte_reader_get_size_inline(reader)
 #define gst_byte_reader_get_pos(reader) \
    _gst_byte_reader_get_pos_inline(reader)
 /* The aliases below are macros rather than functions so that the result
 * can be wrapped in G_LIKELY(): success is hinted to the compiler as the
 * expected outcome at every call site. */
 /* Integer getters. */
 #define gst_byte_reader_get_uint8(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint8_inline(reader,val))
 #define gst_byte_reader_get_int8(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int8_inline(reader,val))
 #define gst_byte_reader_get_uint16_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint16_le_inline(reader,val))
 #define gst_byte_reader_get_int16_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int16_le_inline(reader,val))
 #define gst_byte_reader_get_uint16_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint16_be_inline(reader,val))
 #define gst_byte_reader_get_int16_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int16_be_inline(reader,val))
 #define gst_byte_reader_get_uint24_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint24_le_inline(reader,val))
 #define gst_byte_reader_get_int24_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int24_le_inline(reader,val))
 #define gst_byte_reader_get_uint24_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint24_be_inline(reader,val))
 #define gst_byte_reader_get_int24_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int24_be_inline(reader,val))
 #define gst_byte_reader_get_uint32_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint32_le_inline(reader,val))
 #define gst_byte_reader_get_int32_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int32_le_inline(reader,val))
 #define gst_byte_reader_get_uint32_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint32_be_inline(reader,val))
 #define gst_byte_reader_get_int32_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int32_be_inline(reader,val))
 #define gst_byte_reader_get_uint64_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint64_le_inline(reader,val))
 #define gst_byte_reader_get_int64_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int64_le_inline(reader,val))
 #define gst_byte_reader_get_uint64_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_uint64_be_inline(reader,val))
 #define gst_byte_reader_get_int64_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_int64_be_inline(reader,val))
 /* Integer peekers. */
 #define gst_byte_reader_peek_uint8(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint8_inline(reader,val))
 #define gst_byte_reader_peek_int8(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int8_inline(reader,val))
 #define gst_byte_reader_peek_uint16_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint16_le_inline(reader,val))
 #define gst_byte_reader_peek_int16_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int16_le_inline(reader,val))
 #define gst_byte_reader_peek_uint16_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint16_be_inline(reader,val))
 #define gst_byte_reader_peek_int16_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int16_be_inline(reader,val))
 #define gst_byte_reader_peek_uint24_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint24_le_inline(reader,val))
 #define gst_byte_reader_peek_int24_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int24_le_inline(reader,val))
 #define gst_byte_reader_peek_uint24_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint24_be_inline(reader,val))
 #define gst_byte_reader_peek_int24_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int24_be_inline(reader,val))
 #define gst_byte_reader_peek_uint32_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint32_le_inline(reader,val))
 #define gst_byte_reader_peek_int32_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int32_le_inline(reader,val))
 #define gst_byte_reader_peek_uint32_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint32_be_inline(reader,val))
 #define gst_byte_reader_peek_int32_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int32_be_inline(reader,val))
 #define gst_byte_reader_peek_uint64_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint64_le_inline(reader,val))
 #define gst_byte_reader_peek_int64_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int64_le_inline(reader,val))
 #define gst_byte_reader_peek_uint64_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_uint64_be_inline(reader,val))
 #define gst_byte_reader_peek_int64_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_int64_be_inline(reader,val))
 /* Floating-point getters. */
 #define gst_byte_reader_get_float32_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_float32_le_inline(reader,val))
 #define gst_byte_reader_get_float32_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_float32_be_inline(reader,val))
 #define gst_byte_reader_get_float64_le(reader,val) \
    G_LIKELY(_gst_byte_reader_get_float64_le_inline(reader,val))
 #define gst_byte_reader_get_float64_be(reader,val) \
    G_LIKELY(_gst_byte_reader_get_float64_be_inline(reader,val))
 /* Floating-point peekers. */
 #define gst_byte_reader_peek_float32_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_float32_le_inline(reader,val))
 #define gst_byte_reader_peek_float32_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_float32_be_inline(reader,val))
 #define gst_byte_reader_peek_float64_le(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_float64_le_inline(reader,val))
 #define gst_byte_reader_peek_float64_be(reader,val) \
    G_LIKELY(_gst_byte_reader_peek_float64_be_inline(reader,val))
 #endif /* GST_BYTE_READER_DISABLE_INLINES */
 /* Point @reader at the @size bytes starting at @data and rewind the read
 * position to 0. A NULL @reader fails the g_return_if_fail() guard and
 * nothing is written. */
 static inline void
 _gst_byte_reader_init_inline (GstByteReader * reader, const guint8 * data, guint size)
 {
   g_return_if_fail (reader != NULL);

   /* Start reading from the first byte of the new buffer. */
   reader->byte = 0;
   reader->size = size;
   reader->data = data;
 }
 /* Set up @sub_reader to cover the next @size bytes of @reader, starting
 * at @reader's current position, without advancing @reader.
 *
 * Returns: FALSE if either pointer fails its guard or fewer than @size
 * bytes remain (in which case @sub_reader is not modified); TRUE
 * otherwise. */
 static inline gboolean
 _gst_byte_reader_peek_sub_reader_inline (GstByteReader * reader,
    GstByteReader * sub_reader, guint size)
 {
   g_return_val_if_fail (reader != NULL, FALSE);
   g_return_val_if_fail (sub_reader != NULL, FALSE);

   if (_gst_byte_reader_get_remaining_unchecked (reader) >= size) {
     /* The sub-reader views the parent's bytes in place; nothing is copied. */
     sub_reader->size = size;
     sub_reader->byte = 0;
     sub_reader->data = reader->data + reader->byte;
     return TRUE;
   }

   return FALSE;
 }
 /* Same as _gst_byte_reader_peek_sub_reader_inline(), but on success
 * @reader is also advanced past the @size bytes handed to @sub_reader.
 *
 * Returns: TRUE on success, FALSE (with @reader not advanced) when the
 * peek step fails. */
 static inline gboolean
 _gst_byte_reader_get_sub_reader_inline (GstByteReader * reader,
    GstByteReader * sub_reader, guint size)
 {
   if (_gst_byte_reader_peek_sub_reader_inline (reader, sub_reader, size)) {
     gst_byte_reader_skip_unchecked (reader, size);
     return TRUE;
   }

   return FALSE;
 }
 /* Checked wrapper around gst_byte_reader_dup_data_unchecked().
 *
 * Returns: FALSE if a pointer guard fails, if @size exceeds the reader's
 * total size, or if fewer than @size bytes remain; otherwise stores the
 * duplicated buffer in *val and returns TRUE. */
 static inline gboolean
 _gst_byte_reader_dup_data_inline (GstByteReader * reader, guint size, guint8 ** val)
 {
   g_return_val_if_fail (reader != NULL, FALSE);
   g_return_val_if_fail (val != NULL, FALSE);

   if (G_UNLIKELY (size > reader->size))
     return FALSE;
   if (G_UNLIKELY (_gst_byte_reader_get_remaining_unchecked (reader) < size))
     return FALSE;

   *val = gst_byte_reader_dup_data_unchecked (reader, size);
   return TRUE;
 }
 /* Checked wrapper around gst_byte_reader_get_data_unchecked().
 *
 * Returns: FALSE if a pointer guard fails, if @size exceeds the reader's
 * total size, or if fewer than @size bytes remain; otherwise stores the
 * pointer to the data in *val, advances the reader and returns TRUE. */
 static inline gboolean
 _gst_byte_reader_get_data_inline (GstByteReader * reader, guint size, const guint8 ** val)
 {
   g_return_val_if_fail (reader != NULL, FALSE);
   g_return_val_if_fail (val != NULL, FALSE);

   if (G_UNLIKELY (size > reader->size))
     return FALSE;
   if (G_UNLIKELY (_gst_byte_reader_get_remaining_unchecked (reader) < size))
     return FALSE;

   *val = gst_byte_reader_get_data_unchecked (reader, size);
   return TRUE;
 }
 /* Checked wrapper around gst_byte_reader_peek_data_unchecked().
 *
 * Returns: FALSE if a pointer guard fails, if @size exceeds the reader's
 * total size, or if fewer than @size bytes remain; otherwise stores the
 * pointer to the current position in *val (the reader is not advanced)
 * and returns TRUE. */
 static inline gboolean
 _gst_byte_reader_peek_data_inline (const GstByteReader * reader, guint size, const guint8 ** val)
 {
   g_return_val_if_fail (reader != NULL, FALSE);
   g_return_val_if_fail (val != NULL, FALSE);

   if (G_UNLIKELY (size > reader->size))
     return FALSE;
   if (G_UNLIKELY (_gst_byte_reader_get_remaining_unchecked (reader) < size))
     return FALSE;

   *val = gst_byte_reader_peek_data_unchecked (reader);
   return TRUE;
 }
 static inline guint
 _gst_byte_reader_get_pos_inline (const GstByteReader * reader)
 {
  g_return_val_if_fail (reader != NULL, 0);
  return _gst_byte_reader_get_pos_unchecked (reader);
 }
 /* Advance @reader by @nbytes bytes if that many bytes remain.
 *
 * Returns: FALSE (position unchanged) if @reader fails its guard or fewer
 * than @nbytes bytes remain; TRUE once the position has been advanced. */
 static inline gboolean
 _gst_byte_reader_skip_inline (GstByteReader * reader, guint nbytes)
 {
   g_return_val_if_fail (reader != NULL, FALSE);

   if (G_UNLIKELY (_gst_byte_reader_get_remaining_unchecked (reader) < nbytes))
     return FALSE;

   reader->byte = reader->byte + nbytes;
   return TRUE;
 }
 /* Second batch of inline aliases, for the data and skip helpers defined
 * just above. As with the value accessors, the result is wrapped in
 * G_LIKELY() so success is hinted as the expected outcome. */
 #ifndef GST_BYTE_READER_DISABLE_INLINES
 #define gst_byte_reader_dup_data(reader,size,val) \
    G_LIKELY(_gst_byte_reader_dup_data_inline(reader,size,val))
 #define gst_byte_reader_get_data(reader,size,val) \
    G_LIKELY(_gst_byte_reader_get_data_inline(reader,size,val))
 #define gst_byte_reader_peek_data(reader,size,val) \
    G_LIKELY(_gst_byte_reader_peek_data_inline(reader,size,val))
 #define gst_byte_reader_skip(reader,nbytes) \
    G_LIKELY(_gst_byte_reader_skip_inline(reader,nbytes))
 #endif /* GST_BYTE_READER_DISABLE_INLINES */
 G_END_DECLS
 #endif /* __GST_BYTE_READER_H__ */
@@ -1,9 +0,0 @@
 /* Stub for <gst/codecparsers/codecparsers-prelude.h>.
 * Same shape as base-prelude.h — drop the GObject boilerplate and define
 * the GST_CODEC_PARSERS_API export macro to nothing.
 */
 #ifndef LIBVA_V4L2_REQUEST_FOURIER_CODECPARSERS_PRELUDE_STUB
 #define LIBVA_V4L2_REQUEST_FOURIER_CODECPARSERS_PRELUDE_STUB
 #include "gst_compat.h"
 #define GST_CODEC_PARSERS_API
 #endif
@@ -1,545 +0,0 @@
 /* Gstreamer
 * Copyright (C) <2011> Intel Corporation
 * Copyright (C) <2011> Collabora Ltd.
 * Copyright (C) <2011> Thibault Saunier <thibault.saunier@collabora.com>
 *
 * Some bits C-c,C-v'ed and s/4/3 from h264parse and videoparsers/h264parse.c:
 *    Copyright (C) <2010> Mark Nauwelaerts <mark.nauwelaerts@collabora.co.uk>
 *    Copyright (C) <2010> Collabora Multimedia
 *    Copyright (C) <2010> Nokia Corporation
 *
 *    (C) 2005 Michal Benes <michal.benes@itonis.tv>
 *    (C) 2008 Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
 /*
 * Common code for NAL parsing from h264 and h265 parsers.
 */
 #ifdef HAVE_CONFIG_H
 #  include "config.h"
 #endif
 #include "nalutils.h"
 /****** Nal parser ******/
 /* Bind @nr to the @size bytes at @data and reset all reader state. */
 void
 nal_reader_init (NalReader * nr, const guint8 * data, guint size)
 {
  /* input buffer */
  nr->data = data;
  nr->size = size;
  /* read position and counters */
  nr->byte = 0;
  nr->bits_in_cache = 0;
  nr->n_epb = 0;
  /* fill with something other than 0 to detect emulation prevention bytes */
  nr->cache = 0xff;
  nr->epb_cache = 0xff;
  nr->first_byte = 0xff;
 }
 /* Ensure at least @nbits bits are buffered in the reader, loading bytes from
 * nr->data as needed. Bytes recognised as emulation prevention bytes are
 * counted in nr->n_epb and dropped from the bit stream.
 * Returns FALSE when the input does not hold enough data; bytes consumed
 * before the failure stay consumed. */
 gboolean
 nal_reader_read (NalReader * nr, guint nbits)
 {
  /* Early bound check in bits: current position plus the bits still missing
   * from the cache must not exceed the buffer size. */
  if (G_UNLIKELY (nr->byte * 8 + (nbits - nr->bits_in_cache) > nr->size * 8)) {
    GST_DEBUG ("Can not read %u bits, bits in cache %u, Byte * 8 %u, size in "
        "bits %u", nbits, nr->bits_in_cache, nr->byte * 8, nr->size * 8);
    return FALSE;
  }
  while (nr->bits_in_cache < nbits) {
    guint8 byte;
  next_byte:
    /* re-checked on every byte because skipped emulation prevention bytes
     * consume input without adding bits to the cache */
    if (G_UNLIKELY (nr->byte >= nr->size))
      return FALSE;
    byte = nr->data[nr->byte++];
    nr->epb_cache = (nr->epb_cache << 8) | byte;
    /* check if the byte is an emulation_prevention_three_byte, i.e. the
     * last three bytes read are 00 00 03 */
    if ((nr->epb_cache & 0xffffff) == 0x3) {
      nr->n_epb++;
      goto next_byte;
    }
    /* the previous newest byte moves into the wide cache; the byte just
     * read becomes first_byte */
    nr->cache = (nr->cache << 8) | nr->first_byte;
    nr->first_byte = byte;
    nr->bits_in_cache += 8;
  }
  return TRUE;
 }
 /* Discard @nbits bits. Only valid for a bit count that fits in the cache
   (asserted against the width of nr->cache); use nal_reader_skip_long()
   for arbitrary counts. */
 gboolean
 nal_reader_skip (NalReader * nr, guint nbits)
 {
  g_assert (nbits <= 8 * sizeof (nr->cache));
  if (G_LIKELY (nal_reader_read (nr, nbits))) {
    nr->bits_in_cache -= nbits;
    return TRUE;
  }
  return FALSE;
 }
 /* Generic version to skip any number of bits.
 * The skip is split into chunks: the first chunk is nbits % skip_size bits
 * (possibly zero) and every later chunk is exactly skip_size bits.
 * Returns FALSE as soon as one chunk cannot be skipped. */
 gboolean
 nal_reader_skip_long (NalReader * nr, guint nbits)
 {
  /* Leave out enough bits in the cache once we are finished */
  const guint skip_size = 4 * sizeof (nr->cache);
  guint remaining = nbits;
  /* odd-sized remainder goes first so the rest is a multiple of skip_size */
  nbits %= skip_size;
  while (remaining > 0) {
    if (!nal_reader_skip (nr, nbits))
      return FALSE;
    remaining -= nbits;
    nbits = skip_size;
  }
  return TRUE;
 }
 /* Bit position of the reader: bits loaded so far minus the bits still
 * waiting in the cache. */
 guint
 nal_reader_get_pos (const NalReader * nr)
 {
  guint loaded_bits = nr->byte * 8;
  return loaded_bits - nr->bits_in_cache;
 }
 /* Number of bits still available: unread input bytes plus cached bits. */
 guint
 nal_reader_get_remaining (const NalReader * nr)
 {
  guint unread_bytes = nr->size - nr->byte;
  return unread_bytes * 8 + nr->bits_in_cache;
 }
 /* Number of emulation prevention bytes skipped so far. */
 guint
 nal_reader_get_epb_count (const NalReader * nr)
 {
  guint skipped = nr->n_epb;
  return skipped;
 }
 #define NAL_READER_READ_BITS(bits) \
 gboolean \
 nal_reader_get_bits_uint##bits (NalReader *nr, guint##bits *val, guint nbits) \
 { \
  guint shift; \
  \
  if (!nal_reader_read (nr, nbits)) \
    return FALSE; \
  \
  /* bring the required bits down and truncate */ \
  shift = nr->bits_in_cache - nbits; \
  *val = nr->first_byte >> shift; \
  \
  *val |= nr->cache << (8 - shift); \
  /* mask out required bits */ \
  if (nbits < bits) \
    *val &= ((guint##bits)1 << nbits) - 1; \
  \
  nr->bits_in_cache = shift; \
  \
  return TRUE; \
 } \
 NAL_READER_READ_BITS (8);
 NAL_READER_READ_BITS (16);
 NAL_READER_READ_BITS (32);
 /* Generates nal_reader_peek_bits_uint<bits>(): same result as the matching
 * get_bits function, but it runs on a by-value copy of the reader so the
 * caller's reader is left unchanged. */
 #define NAL_READER_PEEK_BITS(bits) \
 gboolean \
 nal_reader_peek_bits_uint##bits (const NalReader *nr, guint##bits *val, guint nbits) \
 { \
  NalReader tmp; \
  \
  tmp = *nr; \
  return nal_reader_get_bits_uint##bits (&tmp, val, nbits); \
 }
 NAL_READER_PEEK_BITS (8);
 /* Read one unsigned Exp-Golomb code: count leading zero bits (i), skip the
 * terminating 1 bit, read i more bits and return 2^i - 1 + those bits in
 * *val. Returns FALSE when the input runs out or when more than 31 leading
 * zeros are found. On failure *val is left untouched, but bits already
 * consumed stay consumed. */
 gboolean
 nal_reader_get_ue (NalReader * nr, guint32 * val)
 {
  guint i = 0;
  guint8 bit;
  guint32 value;
  if (G_UNLIKELY (!nal_reader_get_bits_uint8 (nr, &bit, 1)))
    return FALSE;
  while (bit == 0) {
    i++;
    if (G_UNLIKELY (!nal_reader_get_bits_uint8 (nr, &bit, 1)))
      return FALSE;
  }
  if (G_UNLIKELY (i > 31))
    return FALSE;
  if (G_UNLIKELY (!nal_reader_get_bits_uint32 (nr, &value, i)))
    return FALSE;
  /* shift an unsigned 1: i may be 31, and 1 << 31 on a signed int is
   * undefined behaviour */
  *val = ((guint32) 1 << i) - 1 + value;
  return TRUE;
 }
 /* Read one signed Exp-Golomb code: an odd unsigned code k maps to
 * k / 2 + 1, an even one maps to -(k / 2). */
 gboolean
 nal_reader_get_se (NalReader * nr, gint32 * val)
 {
  guint32 code;
  if (G_UNLIKELY (!nal_reader_get_ue (nr, &code)))
    return FALSE;
  *val = (code & 1) ? (code / 2) + 1 : -(code / 2);
  return TRUE;
 }
 /* TRUE when no bits are pending in the cache. */
 gboolean
 nal_reader_is_byte_aligned (NalReader * nr)
 {
  return (nr->bits_in_cache == 0) ? TRUE : FALSE;
 }
 /* more_rbsp_data() test. Works on a local copy of the reader, so the
 * caller's reader is not advanced. Returns FALSE when nothing remains or
 * when only the stop bit followed by zero bits remains. */
 gboolean
 nal_reader_has_more_data (NalReader * nr)
 {
  NalReader nr_tmp;
  guint remaining, nbits;
  guint8 rbsp_stop_one_bit, zero_bits;
  remaining = nal_reader_get_remaining (nr);
  if (remaining == 0)
    return FALSE;
  /* from here on `nr` points at the scratch copy */
  nr_tmp = *nr;
  nr = &nr_tmp;
  /* The spec defines that more_rbsp_data() searches for the last bit
     equal to 1, and that it is the rbsp_stop_one_bit. Subsequent bits
     until byte boundary is reached shall be zero.
     This means that more_rbsp_data() is FALSE if the next bit is 1
     and the remaining bits until byte boundary are zero. One way to
     be sure that this bit was the very last one, is that every other
     bit after we reached byte boundary are also set to zero.
     Otherwise, if the next bit is 0 or if there are non-zero bits
     afterwards, then we have more_rbsp_data() */
  if (!nal_reader_get_bits_uint8 (nr, &rbsp_stop_one_bit, 1))
    return FALSE;
  if (!rbsp_stop_one_bit)
    return TRUE;
  /* first chunk: the bits left in the current byte (may be 0); every
   * following chunk is a whole byte */
  nbits = --remaining % 8;
  while (remaining > 0) {
    if (!nal_reader_get_bits_uint8 (nr, &zero_bits, nbits))
      return FALSE;
    if (zero_bits != 0)
      return TRUE;
    remaining -= nbits;
    nbits = 8;
  }
  return FALSE;
 }
 /***********  end of nal parser ***************/
 /* Scan the first @size bytes of @data for the 00 00 01 pattern and return
 * whatever gst_byte_reader_masked_scan_uint32() reports, converted to gint. */
 gint
 scan_for_start_codes (const guint8 * data, guint size)
 {
  GstByteReader reader;
  gint offset;
  gst_byte_reader_init (&reader, data, size);
  /* NALU not empty, so we can at least expect 1 (even 2) bytes following sc;
   * the mask ignores the fourth byte of each 32-bit window */
  offset = gst_byte_reader_masked_scan_uint32 (&reader, 0xffffff00, 0x00000100,
      0, size);
  return offset;
 }
 /* Prepare @nw for writing one NAL unit. Accepted prefix sizes are 2..4
 * bytes in packetized mode and 3 or 4 bytes otherwise; anything else is
 * rejected by the guard and leaves @nw untouched. */
 void
 nal_writer_init (NalWriter * nw, guint nal_prefix_size, gboolean packetized)
 {
  g_return_if_fail (nw != NULL);
  g_return_if_fail ((packetized && nal_prefix_size > 1 && nal_prefix_size < 5)
      || (!packetized && (nal_prefix_size == 3 || nal_prefix_size == 4)));
  nw->packetized = packetized;
  nw->nal_prefix_size = nal_prefix_size;
  gst_bit_writer_init (&nw->bw);
 }
 /* Reset the embedded bit writer, then zero the whole NalWriter. */
 void
 nal_writer_reset (NalWriter * nw)
 {
  g_return_if_fail (nw != NULL);
  gst_bit_writer_reset (&nw->bw);
  memset (nw, 0, sizeof (*nw));
 }
 /* Append the RBSP trailing bits: a single 1 bit followed by zero bits up to
 * the next byte boundary. Returns FALSE if either write fails. */
 gboolean
 nal_writer_do_rbsp_trailing_bits (NalWriter * nw)
 {
  g_return_val_if_fail (nw != NULL, FALSE);
  if (gst_bit_writer_put_bits_uint8 (&nw->bw, 1, 1)) {
    if (gst_bit_writer_align_bytes (&nw->bw, 0))
      return TRUE;
    GST_WARNING ("Cannot put align bits");
    return FALSE;
  }
  GST_WARNING ("Cannot put trailing bits");
  return FALSE;
 }
 /* Build the final NAL byte buffer from the bits accumulated in nw->bw:
 * write the prefix, copy the payload while inserting 0x03 emulation
 * prevention bytes, and for packetized output overwrite the prefix with the
 * big-endian payload length. Stores the total byte count in *ret_size and
 * returns the buffer obtained from gst_bit_writer_reset_and_get_data().
 *
 * NOTE(review): return values of the gst_bit_writer_put_bits_uint8() calls
 * are not checked, and `data` is written to in the packetized branch before
 * any NULL check — confirm the bit writer cannot fail here. */
 static gpointer
 nal_writer_create_nal_data (NalWriter * nw, guint32 * ret_size)
 {
  GstBitWriter bw;
  gint i;
  guint8 *src, *dst;
  gsize size;
  gpointer data;
  /* scan to put emulation_prevention_three_byte */
  size = GST_BIT_WRITER_BIT_SIZE (&nw->bw) >> 3;
  src = GST_BIT_WRITER_DATA (&nw->bw);
  gst_bit_writer_init_with_size (&bw, size + nw->nal_prefix_size, FALSE);
  /* prefix: (nal_prefix_size - 1) zero bytes followed by 0x01 */
  for (i = 0; i < nw->nal_prefix_size - 1; i++)
    gst_bit_writer_put_bits_uint8 (&bw, 0, 8);
  gst_bit_writer_put_bits_uint8 (&bw, 1, 8);
  for (i = 0; i < size; i++) {
    guint pos = (GST_BIT_WRITER_BIT_SIZE (&bw) >> 3);
    dst = GST_BIT_WRITER_DATA (&bw);
    /* two zero payload bytes already written and the next byte is 0..3:
     * insert 0x03 first. The pos bound keeps the prefix bytes out of the
     * comparison. */
    if (pos >= nw->nal_prefix_size + 2 &&
        dst[pos - 2] == 0 && dst[pos - 1] == 0 && src[i] <= 0x3) {
      gst_bit_writer_put_bits_uint8 (&bw, 0x3, 8);
    }
    gst_bit_writer_put_bits_uint8 (&bw, src[i], 8);
  }
  *ret_size = bw.bit_size >> 3;
  data = gst_bit_writer_reset_and_get_data (&bw);
  if (nw->packetized) {
    /* payload length excludes the prefix itself */
    size = *ret_size - nw->nal_prefix_size;
    switch (nw->nal_prefix_size) {
      case 1:
        GST_WRITE_UINT8 (data, size);
        break;
      case 2:
        GST_WRITE_UINT16_BE (data, size);
        break;
      case 3:
        GST_WRITE_UINT24_BE (data, size);
        break;
      case 4:
        GST_WRITE_UINT32_BE (data, size);
        break;
      default:
        g_assert_not_reached ();
        break;
    }
  }
  return data;
 }
 /* Finish the NAL unit and return it wrapped in a GstMemory that releases
 * the buffer with g_free. Returns NULL when nothing was written, when the
 * trailing bits cannot be added, or when the NAL data cannot be created.
 * The embedded bit writer is reset on every path. */
 GstMemory *
 nal_writer_reset_and_get_memory (NalWriter * nw)
 {
  guint32 size = 0;
  GstMemory *ret = NULL;
  gpointer data;
  g_return_val_if_fail (nw != NULL, NULL);
  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) >> 3) == 0) {
    GST_WARNING ("No written byte");
    goto done;
  }
  /* not byte aligned: add the RBSP trailing bits before packaging */
  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) & 0x7) != 0) {
    GST_WARNING ("Written stream is not byte aligned");
    if (!nal_writer_do_rbsp_trailing_bits (nw))
      goto done;
  }
  data = nal_writer_create_nal_data (nw, &size);
  if (!data) {
    GST_WARNING ("Failed to create nal data");
    goto done;
  }
  ret = gst_memory_new_wrapped (0, data, size, 0, size, data, g_free);
 done:
  gst_bit_writer_reset (&nw->bw);
  return ret;
 }
 /* Finish the NAL unit and return the raw byte buffer, storing its length in
 * *ret_size (0 on failure). Returns NULL when nothing was written, when the
 * trailing bits cannot be added, or when the NAL data cannot be created.
 * The embedded bit writer is reset on every path.
 * NOTE(review): the caller presumably owns the returned buffer and frees it
 * with g_free — confirm against gst_bit_writer_reset_and_get_data(). */
 guint8 *
 nal_writer_reset_and_get_data (NalWriter * nw, guint32 * ret_size)
 {
  guint32 size = 0;
  guint8 *data = NULL;
  g_return_val_if_fail (nw != NULL, NULL);
  g_return_val_if_fail (ret_size != NULL, NULL);
  *ret_size = 0;
  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) >> 3) == 0) {
    GST_WARNING ("No written byte");
    goto done;
  }
  /* not byte aligned: add the RBSP trailing bits before packaging */
  if ((GST_BIT_WRITER_BIT_SIZE (&nw->bw) & 0x7) != 0) {
    GST_WARNING ("Written stream is not byte aligned");
    if (!nal_writer_do_rbsp_trailing_bits (nw))
      goto done;
  }
  data = nal_writer_create_nal_data (nw, &size);
  if (!data) {
    GST_WARNING ("Failed to create nal data");
    goto done;
  }
  *ret_size = size;
 done:
  gst_bit_writer_reset (&nw->bw);
  return data;
 }
 /* Append the low @nbits bits of @value; TRUE on success. */
 gboolean
 nal_writer_put_bits_uint8 (NalWriter * nw, guint8 value, guint nbits)
 {
  g_return_val_if_fail (nw != NULL, FALSE);
  return gst_bit_writer_put_bits_uint8 (&nw->bw, value, nbits) ? TRUE : FALSE;
 }
 /* Append the low @nbits bits of @value; TRUE on success. */
 gboolean
 nal_writer_put_bits_uint16 (NalWriter * nw, guint16 value, guint nbits)
 {
  g_return_val_if_fail (nw != NULL, FALSE);
  return gst_bit_writer_put_bits_uint16 (&nw->bw, value, nbits) ? TRUE : FALSE;
 }
 /* Append the low @nbits bits of @value; TRUE on success. */
 gboolean
 nal_writer_put_bits_uint32 (NalWriter * nw, guint32 value, guint nbits)
 {
  g_return_val_if_fail (nw != NULL, FALSE);
  return gst_bit_writer_put_bits_uint32 (&nw->bw, value, nbits) ? TRUE : FALSE;
 }
 /* Append @nbytes bytes from @data; @data must be non-NULL and @nbytes
 * non-zero. TRUE on success. */
 gboolean
 nal_writer_put_bytes (NalWriter * nw, const guint8 * data, guint nbytes)
 {
  g_return_val_if_fail (nw != NULL, FALSE);
  g_return_val_if_fail (data != NULL, FALSE);
  g_return_val_if_fail (nbytes != 0, FALSE);
  return gst_bit_writer_put_bytes (&nw->bw, data, nbytes) ? TRUE : FALSE;
 }
 /* Write @value as an unsigned Exp-Golomb code: `leading_zeros` zero bits
 * followed by value + 1 in `rest` bits, both as computed by
 * count_exp_golomb_bits(). Returns FALSE on a write failure or when @value
 * cannot be encoded. */
 gboolean
 nal_writer_put_ue (NalWriter * nw, guint32 value)
 {
  guint leading_zeros;
  guint rest;
  g_return_val_if_fail (nw != NULL, FALSE);
  /* value + 1 must fit in 32 bits. For 0xFFFFFFFF it wraps to 0, which makes
   * count_exp_golomb_bits() report a zero-length code; reject it here instead
   * of passing a zero bit count to the bit writer. */
  if (G_UNLIKELY ((guint32) (value + 1) == 0))
    return FALSE;
  count_exp_golomb_bits (value, &leading_zeros, &rest);
  /* write leading zeros */
  if (leading_zeros) {
    if (!nal_writer_put_bits_uint32 (nw, 0, leading_zeros))
      return FALSE;
  }
  /* write the rest */
  if (!nal_writer_put_bits_uint32 (nw, value + 1, rest))
    return FALSE;
  return TRUE;
 }
 /* Compute the two parts of the Exp-Golomb code of @value
 * (https://en.wikipedia.org/wiki/Exponential-Golomb_coding):
 *   *rest          = number of significant bits of value + 1
 *   *leading_zeros = *rest - 1, or 0 when *rest is not above 1
 * Either output pointer may be NULL. Always returns TRUE. */
 gboolean
 count_exp_golomb_bits (guint32 value, guint * leading_zeros, guint * rest)
 {
  guint nbits = 0;
  guint32 v;
  /* count bits of value + 1 */
  for (v = value + 1; v != 0; v >>= 1)
    nbits++;
  if (leading_zeros)
    *leading_zeros = (nbits > 1) ? nbits - 1 : 0;
  if (rest)
    *rest = nbits;
  return TRUE;
 }
@@ -1,269 +0,0 @@
 /* Gstreamer
 * Copyright (C) <2011> Intel Corporation
 * Copyright (C) <2011> Collabora Ltd.
 * Copyright (C) <2011> Thibault Saunier <thibault.saunier@collabora.com>
 *
 * Some bits C-c,C-v'ed and s/4/3 from h264parse and videoparsers/h264parse.c:
 *    Copyright (C) <2010> Mark Nauwelaerts <mark.nauwelaerts@collabora.co.uk>
 *    Copyright (C) <2010> Collabora Multimedia
 *    Copyright (C) <2010> Nokia Corporation
 *
 *    (C) 2005 Michal Benes <michal.benes@itonis.tv>
 *    (C) 2008 Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
 /**
 * Common code for NAL parsing from h264 and h265 parsers.
 */
 #ifdef HAVE_CONFIG_H
 #  include "config.h"
 #endif
 #include <gst/base/gstbytereader.h>
 #include <gst/base/gstbitreader.h>
 #include <gst/base/gstbitwriter.h>
 /* Bit reader over a byte buffer that skips emulation prevention bytes while
 * reading. Initialise with nal_reader_init(). */
 typedef struct
 {
  const guint8 *data;           /* Input buffer (not owned by the reader) */
  guint size;                   /* Size of data, in bytes */
  guint n_epb;                  /* Number of emulation prevention bytes */
  guint byte;                   /* Byte position */
  guint bits_in_cache;          /* bitpos in the cache of next bit */
  guint8 first_byte;            /* Most recently loaded byte; older bytes are shifted into cache */
  guint32 epb_cache;            /* cache 3 bytes to check emulation prevention bytes */
  guint64 cache;                /* cached bytes */
 } NalReader;
 /* NAL unit writer: a bit writer plus the framing parameters applied when
 * the final buffer is produced. Initialise with nal_writer_init(). */
 typedef struct
 {
  GstBitWriter bw;              /* Accumulates the unescaped payload bits */
  guint nal_prefix_size;        /* Prefix length in bytes */
  gboolean packetized;          /* TRUE: prefix holds the big-endian payload length */
 } NalWriter;
 /* ---- NalReader API (library-internal) ----
 * All gboolean functions return FALSE when the requested bits are not
 * available. */
 G_GNUC_INTERNAL
 void nal_reader_init (NalReader * nr, const guint8 * data, guint size);
 G_GNUC_INTERNAL
 gboolean nal_reader_read (NalReader * nr, guint nbits);
 G_GNUC_INTERNAL
 gboolean nal_reader_skip (NalReader * nr, guint nbits);
 G_GNUC_INTERNAL
 gboolean nal_reader_skip_long (NalReader * nr, guint nbits);
 G_GNUC_INTERNAL
 guint nal_reader_get_pos (const NalReader * nr);
 G_GNUC_INTERNAL
 guint nal_reader_get_remaining (const NalReader * nr);
 G_GNUC_INTERNAL
 guint nal_reader_get_epb_count (const NalReader * nr);
 G_GNUC_INTERNAL
 gboolean nal_reader_is_byte_aligned (NalReader * nr);
 G_GNUC_INTERNAL
 gboolean nal_reader_has_more_data (NalReader * nr);
 /* Declares nal_reader_get_bits_uint<bits>() */
 #define NAL_READER_READ_BITS_H(bits) \
 G_GNUC_INTERNAL \
 gboolean nal_reader_get_bits_uint##bits (NalReader *nr, guint##bits *val, guint nbits)
 NAL_READER_READ_BITS_H (8);
 NAL_READER_READ_BITS_H (16);
 NAL_READER_READ_BITS_H (32);
 /* Declares nal_reader_peek_bits_uint<bits>(); only the 8-bit variant is
 * declared here */
 #define NAL_READER_PEEK_BITS_H(bits) \
 G_GNUC_INTERNAL \
 gboolean nal_reader_peek_bits_uint##bits (const NalReader *nr, guint##bits *val, guint nbits)
 NAL_READER_PEEK_BITS_H (8);
 G_GNUC_INTERNAL
 gboolean nal_reader_get_ue (NalReader * nr, guint32 * val);
 G_GNUC_INTERNAL
 gboolean nal_reader_get_se (NalReader * nr, gint32 * val);
 /* Range-check helpers. On a violation they log a warning and `goto error`,
 * so the enclosing function must provide an `error:` label. The arguments
 * are pasted unparenthesized and may be evaluated more than once: pass
 * simple expressions without side effects. */
 #define CHECK_ALLOWED_MAX_WITH_DEBUG(dbg, val, max) { \
  if (val > max) { \
    GST_WARNING ("value for '" dbg "' greater than max. value: %d, max %d", \
                     val, max); \
    goto error; \
  } \
 }
 #define CHECK_ALLOWED_MAX(val, max) \
  CHECK_ALLOWED_MAX_WITH_DEBUG (G_STRINGIFY (val), val, max)
 #define CHECK_ALLOWED_WITH_DEBUG(dbg, val, min, max) { \
  if (val < min || val > max) { \
    GST_WARNING ("value for '" dbg "' not in allowed range. value: %d, range %d-%d", \
                     val, min, max); \
    goto error; \
  } \
 }
 #define CHECK_ALLOWED(val, min, max) \
  CHECK_ALLOWED_WITH_DEBUG (G_STRINGIFY (val), val, min, max)
 /* Fixed-width read helpers: read @nbits bits into the lvalue @val (its
 * address is taken) and `goto error` on failure, so the enclosing function
 * must provide an `error:` label. */
 #define READ_UINT8(nr, val, nbits) { \
  if (!nal_reader_get_bits_uint8 (nr, &val, nbits)) { \
    GST_WARNING ("failed to read uint8 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
    goto error; \
  } \
 }
 #define READ_UINT16(nr, val, nbits) { \
  if (!nal_reader_get_bits_uint16 (nr, &val, nbits)) { \
  GST_WARNING ("failed to read uint16 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
    goto error; \
  } \
 }
 #define READ_UINT32(nr, val, nbits) { \
  if (!nal_reader_get_bits_uint32 (nr, &val, nbits)) { \
  GST_WARNING ("failed to read uint32 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
    goto error; \
  } \
 }
 /* 64-bit variant of the READ_UINTn helpers: read @nbits bits into the
 * lvalue @val and `goto error` on failure.
 * NOTE(review): this header declares only the 8/16/32-bit
 * nal_reader_get_bits_uint* functions, so expanding this macro needs a
 * 64-bit reader from elsewhere — confirm before using it. */
 #define READ_UINT64(nr, val, nbits) { \
  if (!nal_reader_get_bits_uint64 (nr, &val, nbits)) { \
    GST_WARNING ("failed to read uint64 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
    goto error; \
  } \
 }
 /* Exp-Golomb read helpers. All of them `goto error` on failure, so the
 * enclosing function must provide an `error:` label. The *_ALLOWED / *_MAX
 * variants read into a block-local `tmp`, range-check it, and only then
 * assign to @val; a caller expression named `tmp` would be shadowed. */
 #define READ_UE(nr, val) { \
  if (!nal_reader_get_ue (nr, &val)) { \
    GST_WARNING ("failed to read UE for '" G_STRINGIFY (val) "'"); \
    goto error; \
  } \
 }
 #define READ_UE_ALLOWED(nr, val, min, max) { \
  guint32 tmp; \
  READ_UE (nr, tmp); \
  CHECK_ALLOWED_WITH_DEBUG (G_STRINGIFY (val), tmp, min, max); \
  val = tmp; \
 }
 #define READ_UE_MAX(nr, val, max) { \
  guint32 tmp; \
  READ_UE (nr, tmp); \
  CHECK_ALLOWED_MAX_WITH_DEBUG (G_STRINGIFY (val), tmp, max); \
  val = tmp; \
 }
 #define READ_SE(nr, val) { \
  if (!nal_reader_get_se (nr, &val)) { \
    GST_WARNING ("failed to read SE for '" G_STRINGIFY (val) "'"); \
    goto error; \
  } \
 }
 #define READ_SE_ALLOWED(nr, val, min, max) { \
  gint32 tmp; \
  READ_SE (nr, tmp); \
  CHECK_ALLOWED_WITH_DEBUG (G_STRINGIFY (val), tmp, min, max); \
  val = tmp; \
 }
 /* Returns the result of a masked scan for the 00 00 01 start-code pattern
 * within the first @size bytes of @data. */
 G_GNUC_INTERNAL
 gint scan_for_start_codes (const guint8 * data, guint size);
 /* ---- NalWriter API (library-internal) ---- */
 G_GNUC_INTERNAL
 void nal_writer_init (NalWriter * nw, guint nal_prefix_size, gboolean packetized);
 G_GNUC_INTERNAL
 void nal_writer_reset (NalWriter * nw);
 G_GNUC_INTERNAL
 gboolean nal_writer_do_rbsp_trailing_bits (NalWriter * nw);
 /* The two reset_and_get functions return NULL on failure and reset the
 * embedded bit writer in every case. */
 G_GNUC_INTERNAL
 GstMemory * nal_writer_reset_and_get_memory (NalWriter * nw);
 G_GNUC_INTERNAL
 guint8 * nal_writer_reset_and_get_data (NalWriter * nw, guint32 * ret_size);
 G_GNUC_INTERNAL
 gboolean nal_writer_put_bits_uint8 (NalWriter * nw, guint8 value, guint nbits);
 G_GNUC_INTERNAL
 gboolean nal_writer_put_bits_uint16 (NalWriter * nw, guint16 value, guint nbits);
 G_GNUC_INTERNAL
 gboolean nal_writer_put_bits_uint32 (NalWriter * nw, guint32 value, guint nbits);
 G_GNUC_INTERNAL
 gboolean nal_writer_put_bytes (NalWriter * nw, const guint8 * data, guint nbytes);
 G_GNUC_INTERNAL
 gboolean nal_writer_put_ue (NalWriter * nw, guint32 value);
 /* Either output pointer may be NULL; always returns TRUE. */
 G_GNUC_INTERNAL
 gboolean count_exp_golomb_bits (guint32 value, guint * leading_zeros, guint * rest);
 /* Fixed-width write helpers: write the low @nbits bits of @val and
 * `goto error` on failure, so the enclosing function must provide an
 * `error:` label. */
 #define WRITE_UINT8(nw, val, nbits) { \
  if (!nal_writer_put_bits_uint8 (nw, val, nbits)) { \
    GST_WARNING ("failed to write uint8 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
    goto error; \
  } \
 }
 #define WRITE_UINT16(nw, val, nbits) { \
  if (!nal_writer_put_bits_uint16 (nw, val, nbits)) { \
    GST_WARNING ("failed to write uint16 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
    goto error; \
  } \
 }
 #define WRITE_UINT32(nw, val, nbits) { \
  if (!nal_writer_put_bits_uint32 (nw, val, nbits)) { \
    GST_WARNING ("failed to write uint32 for '" G_STRINGIFY (val) "', nbits: %d", nbits); \
    goto error; \
  } \
 }
 /* Write @nbytes bytes from @data and `goto error` on failure, so the
 * enclosing function must provide an `error:` label. The warning names the
 * @data expression (the macro has no `val` parameter) and reports the count
 * as bytes. */
 #define WRITE_BYTES(nw, data, nbytes) { \
  if (!nal_writer_put_bytes (nw, data, nbytes)) { \
    GST_WARNING ("failed to write bytes for '" G_STRINGIFY (data) "', nbytes: %d", nbytes); \
    goto error; \
  } \
 }
 /* Write @val as an unsigned Exp-Golomb code and `goto error` on failure, so
 * the enclosing function must provide an `error:` label. */
 #define WRITE_UE(nw, val) { \
  if (!nal_writer_put_ue (nw, val)) { \
    GST_WARNING ("failed to write ue for '" G_STRINGIFY (val) "'"); \
    goto error; \
  } \
 }
 /* Integer division of @a by @b rounded up; @b must be non-zero.
 * http://blog.pkh.me/p/36-figuring-out-round%2C-floor-and-ceil-with-integer-division.html */
 static inline guint32 div_ceil (guint32 a, guint32 b)
 {
  guint32 quotient;
  g_assert (b > 0);
  quotient = a / b;
  if (a % b != 0)
    quotient++;
  return quotient;
 }
@@ -1,10 +0,0 @@
 /* Stub for <gst/glib-compat-private.h>.
 * In upstream GStreamer this provides backwards-compat shims for older
 * GLib versions (g_memdup2 polyfill being the load-bearing one).
 * Our gst_compat.h already defines g_memdup2 as a static inline, so
 * we just include the shim.
 */
 #ifndef LIBVA_V4L2_REQUEST_FOURIER_GLIB_COMPAT_PRIVATE_STUB
 #define LIBVA_V4L2_REQUEST_FOURIER_GLIB_COMPAT_PRIVATE_STUB
 #include "gst_compat.h"
 #endif
@@ -1,10 +0,0 @@
 /* Stub for <gst/gst.h> — redirects to the project's gst_compat shim.
 * The vendored GStreamer 1.28.2 H.265 parser was originally built against
 * full GStreamer; we only need the GLib type aliases + memory helpers +
 * macro stubs, all provided by gst_compat.h. Original gst.h would pull
 * in GObject + GstObject + the entire framework, which we don't link.
 */
 #ifndef LIBVA_V4L2_REQUEST_FOURIER_GST_H_STUB
 #define LIBVA_V4L2_REQUEST_FOURIER_GST_H_STUB
 #include "gst_compat.h"
 #endif
@@ -1,145 +0,0 @@
 /*
 * gst_compat.c — GArray implementation for the vendored GStreamer parser.
 *
 * Scope: minimal subset of GArray API exercised by gsth265parser.c
 * (g_array_new, g_array_sized_new, g_array_append_vals + the
 * g_array_append_val macro, g_array_index macro, g_array_set_size,
 * g_array_set_clear_func, g_array_free, g_array_unref).
 *
 * Non-thread-safe (matches GArray's documented semantics — GArray is
 * not thread-safe in upstream GLib either, callers must serialize).
 *
 * License: MIT (matches backend's COPYING.MIT).
 */
 #include "gst_compat.h"
 /* ===== internal helpers ===== */
 /* Ensure @array can hold at least @new_capacity elements.
 * Capacity grows by doubling (starting at 4) for amortized O(1) appends.
 * When the array was created with `clear`, the newly added storage is
 * zero-filled. Returns FALSE, leaving the array unchanged, when the
 * allocation fails or the byte size would overflow size_t. */
 static gboolean
 garray_grow(GArray *array, guint new_capacity)
 {
    if (new_capacity <= array->capacity)
        return TRUE;
    guint cap = array->capacity > 0 ? array->capacity : 4;
    while (cap < new_capacity) {
        /* doubling would wrap guint to 0 and never terminate; fall back to
         * the exact request */
        if (cap > (guint)-1 / 2) {
            cap = new_capacity;
            break;
        }
        cap *= 2;
    }
    /* reject requests whose byte size does not fit in size_t */
    if (array->element_size != 0 && cap > SIZE_MAX / array->element_size)
        return FALSE;
    /* keep array->data intact until realloc is known to have succeeded */
    char *new_data = realloc(array->data, (size_t)cap * array->element_size);
    if (new_data == NULL)
        return FALSE;
    if (array->clear) {
        memset(new_data + (size_t)array->capacity * array->element_size, 0,
               (size_t)(cap - array->capacity) * array->element_size);
    }
    array->data = new_data;
    array->capacity = cap;
    return TRUE;
 }
 /* ===== public API ===== */
 /* Create an empty array of @element_size-byte elements with room for
 * @reserved_size of them. Returns NULL on allocation failure.
 * @zero_terminated is accepted for API compatibility but ignored: the
 * vendored parser does not use the trailing zero-element sentinel. */
 GArray *
 g_array_sized_new(gboolean zero_terminated, gboolean clear,
                  guint element_size, guint reserved_size)
 {
    (void)zero_terminated;
    GArray *array = calloc(1, sizeof(*array));
    if (array == NULL)
        return NULL;
    array->clear = clear;
    array->element_size = element_size;
    if (reserved_size == 0 || garray_grow(array, reserved_size))
        return array;
    free(array);
    return NULL;
 }
 /* Create an empty array with no storage reserved up front. */
 GArray *
 g_array_new(gboolean zero_terminated, gboolean clear, guint element_size)
 {
    const guint no_reservation = 0;
    return g_array_sized_new(zero_terminated, clear, element_size,
                             no_reservation);
 }
 /* Resize @array to @length elements and return it.
 * Shrinking runs the clear function (if set) on each dropped element;
 * growing zero-fills the new elements when the array was created with
 * `clear`. If storage cannot be grown the array is returned unchanged. */
 GArray *
 g_array_set_size(GArray *array, guint length)
 {
    if (length > array->capacity && !garray_grow(array, length))
        return array;
    if (length < array->len) {
        if (array->clear_func != NULL) {
            for (guint idx = length; idx < array->len; idx++)
                array->clear_func(array->data + (size_t)idx * array->element_size);
        }
    } else if (length > array->len && array->clear) {
        memset(array->data + (size_t)array->len * array->element_size, 0,
               (size_t)(length - array->len) * array->element_size);
    }
    array->len = length;
    return array;
 }
 /* Append @len elements copied from @data and return @array.
 * The array is returned unchanged when @len is 0, when the new length would
 * not fit in a guint, or when storage cannot be grown. */
 GArray *
 g_array_append_vals(GArray *array, gconstpointer data, guint len)
 {
    if (len == 0)
        return array;
    /* array->len + len must not wrap: a wrapped sum would pass the capacity
     * check below and let memcpy write past the allocation */
    if (len > (guint)-1 - array->len)
        return array;
    if (!garray_grow(array, array->len + len))
        return array;
    memcpy(array->data + (size_t)array->len * array->element_size,
           data, (size_t)len * array->element_size);
    array->len += len;
    return array;
 }
 /* Register @clear_func, which the array calls with a pointer to an element
 * when that element is dropped by g_array_set_size() or g_array_free().
 * Pass NULL to remove it. */
 void
 g_array_set_clear_func(GArray *array, void (*clear_func)(gpointer))
 {
    array->clear_func = clear_func;
 }
 /* Free the GArray wrapper and, when @free_segment is TRUE, its element
 * storage as well.
 *
 * With @free_segment TRUE the clear function (if set) runs on every element
 * before the storage is released, and NULL is returned. With @free_segment
 * FALSE ownership of the storage passes to the caller, so the elements are
 * left untouched (matching GLib) and the storage pointer is returned; the
 * caller releases it with free(). A NULL @array is a no-op. */
 gchar *
 g_array_free(GArray *array, gboolean free_segment)
 {
    if (array == NULL)
        return NULL;
    gchar *segment = array->data;
    if (free_segment) {
        if (array->clear_func != NULL) {
            for (guint i = 0; i < array->len; i++)
                array->clear_func(array->data + (size_t)i * array->element_size);
        }
        free(array->data);
        segment = NULL;
    }
    free(array);
    return segment;
 }
 /* Drop the array. There is no reference count in this shim: the array and
 * its storage are freed immediately, which is fine because the backend never
 * shares a GArray between owners. Always returns NULL. */
 GArray *
 g_array_unref(GArray *array)
 {
    (void)g_array_free(array, TRUE);
    return NULL;
 }
@@ -1,463 +0,0 @@
 /*
 * gst_compat.h — minimal GLib/GStreamer compatibility shim for vendored
 * GStreamer 1.28.2 H.265 parser + bitreader + bytereader + nalutils.
 *
 * Strategy: provide #defines / typedefs for the GLib API surface those
 * 4 vendored files use, so they can compile against libc + libv4l2 only
 * (no glib2 / gst-base linkage). Vendored .c files are NOT modified
 * directly; instead this header is force-included via the Makefile's
 * `-include` flag on the vendored translation units.
 *
 * Coverage scoped to what gsth265parser.c + nalutils.c + gstbitreader.c
 * + gstbytereader.c actually call. Surveyed in
 * ampere-kernel-decoders phase4 step 2 prep — see
 * ~/src/ampere-kernel-decoders/phase4_plan_iter2.md and the survey
 * commit message for the empirical inventory.
 *
 * License: this shim is original work, MIT (matching the backend's
 * COPYING.MIT). The vendored .c files keep their LGPL v2.1+ headers
 * verbatim.
 */
 #ifndef LIBVA_V4L2_REQUEST_FOURIER_GST_COMPAT_H
 #define LIBVA_V4L2_REQUEST_FOURIER_GST_COMPAT_H
 #include <assert.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 /* ===== GLib type aliases =====
 *
 * Fixed-width names map onto <stdint.h>; size types map onto <stddef.h>.
 * gboolean is C99 bool here.
 * NOTE(review): upstream GLib typedefs gboolean to gint, so sizeof(gboolean)
 * differs from a real GLib build — confirm no struct layout is shared with
 * code compiled against real GLib. */
 typedef bool          gboolean;
 typedef char          gchar;
 typedef unsigned char guchar;
 typedef int           gint;
 typedef int8_t        gint8;
 typedef int16_t       gint16;
 typedef int32_t       gint32;
 typedef int64_t       gint64;
 typedef unsigned int  guint;
 typedef uint8_t       guint8;
 typedef uint16_t      guint16;
 typedef uint32_t      guint32;
 typedef uint64_t      guint64;
 typedef size_t        gsize;
 typedef ptrdiff_t     gssize;
 typedef void *        gpointer;
 typedef const void *  gconstpointer;
 typedef double        gdouble;
 typedef float         gfloat;
 /* GLib's gint64 / guint64 formatting is platform-conditional; for our
 * aarch64 ALARM target we don't need the full G_*_FORMAT machinery, but
 * gstbytereader uses G_GSIZE_FORMAT in a debug-only printf. */
 #define G_GSIZE_FORMAT "zu"
 #ifndef TRUE
 # define TRUE  true
 #endif
 #ifndef FALSE
 # define FALSE false
 #endif
 /* ===== memory =====
 *
 * Thin mappings onto libc. Unlike GLib, these return NULL on allocation
 * failure instead of aborting, so callers must check the result. */
 #define g_malloc(n)         malloc((size_t)(n))
 #define g_malloc0(n)        calloc(1, (size_t)(n))
 #define g_realloc(p, n)     realloc((p), (size_t)(n))
 /* g_free needs to be addressable (passed as a function-pointer arg by
 * nalutils.c::gst_memory_new_wrapped — even though that call site is
 * dead code we don't invoke, it must compile). Plain `free` is
 * compatible: signature is `void (void *)` either way. */
 #define g_free              free
 /* g_new goes through calloc so the n * sizeof(type) multiplication is
 * overflow-checked by libc (NULL on overflow) instead of silently wrapping
 * to a short allocation. The memory comes back zeroed, which callers of
 * g_new may not rely on but cannot be harmed by. */
 #define g_new(type, n)      ((type *)calloc((size_t)(n), sizeof(type)))
 #define g_new0(type, n)     ((type *)calloc((size_t)(n), sizeof(type)))
 #define g_slice_new(type)   ((type *)malloc(sizeof(type)))
 #define g_slice_new0(type)  ((type *)calloc(1, sizeof(type)))
 #define g_slice_free(type, p)        free(p)
 #define g_slice_free1(size, p)       free(p)
 /* g_clear_pointer: evaluates `pp` exactly once, sets *pp to NULL before
 * calling freefn, and skips the call when *pp is already NULL (GLib
 * semantics). */
 #define g_clear_pointer(pp, freefn) \
        do { \
            __typeof__(pp) g_cp_pp_ = (pp); \
            __typeof__(*g_cp_pp_) g_cp_p_ = *g_cp_pp_; \
            *g_cp_pp_ = NULL; \
            if (g_cp_p_ != NULL) \
                freefn(g_cp_p_); \
        } while (0)
 /* g_memdup2 — GLib's 64-bit-safe memdup, used by gstbytereader.
 * Returns a malloc'd copy of @byte_size bytes from @mem, or NULL when @mem
 * is NULL, @byte_size is 0, or the allocation fails. The caller frees it. */
 static inline gpointer
 g_memdup2(gconstpointer mem, gsize byte_size)
 {
    void *dup = NULL;
    if (mem != NULL && byte_size != 0) {
        dup = malloc(byte_size);
        if (dup != NULL)
            memcpy(dup, mem, byte_size);
    }
    return dup;
 }
 /* g_strcmp0 — NULL-safe strcmp. Used by gsth265parser in profile-name lookup.
 * NULL sorts before any non-NULL string; two NULLs compare equal. */
 static inline int
 g_strcmp0(const char *a, const char *b)
 {
    if (a == NULL)
        return (b == NULL) ? 0 : -1;
    if (b == NULL)
        return 1;
    return (a == b) ? 0 : strcmp(a, b);
 }
 /* ===== asserts / return-guards =====
 *
 * Per ampere-kernel-decoders iter2 Phase 2 §"new failure modes" #5:
 * g_assert must NOT abort the process. It becomes a no-op here;
 * malformed bitstream is caught by the explicit parse-result returns
 * the parser already implements.
 *
 * g_return_if_fail / g_return_val_if_fail propagate as the original
 * GLib semantics (early return with optional value).
 *
 * Because g_assert discards its argument, the asserted expression is never
 * evaluated: do not rely on side effects inside it.
 *
 * NOTE(review): g_assert_not_reached() expands to __builtin_unreachable(),
 * which is undefined behaviour if the path IS reached — the compiler may
 * assume it never happens. That is stricter than a no-op; confirm every
 * call site is truly unreachable for untrusted input. */
 #define g_assert(cond)                ((void)0)
 #define g_assert_not_reached()        __builtin_unreachable()
 #define g_return_if_fail(cond)        do { if (!(cond)) return; } while (0)
 #define g_return_val_if_fail(cond, v) do { if (!(cond)) return (v); } while (0)
 /* ===== GStreamer logging — no-ops =====
 *
 * The parser is heavy on debug logging. We compile all of it out;
 * the backend's own logging (request_log/error_log) wraps the parser
 * calls and reports parse-failure return codes from there.
 *
 * The variadic macros discard their arguments entirely, so format strings
 * and argument expressions are neither type-checked nor evaluated; a
 * variable used only in a log call may trigger an unused-variable warning. */
 #define GST_DISABLE_GST_DEBUG 1
 #define GST_DEBUG_CATEGORY_STATIC(name)
 #define GST_DEBUG_CATEGORY_INIT(...) ((void)0)
 #define GST_DEBUG_CATEGORY_GET(...)  ((void)0)
 #define GST_DEBUG(...)               ((void)0)
 #define GST_INFO(...)                ((void)0)
 #define GST_WARNING(...)             ((void)0)
 #define GST_ERROR(...)               ((void)0)
 #define GST_LOG(...)                 ((void)0)
 #define GST_FIXME(...)               ((void)0)
 #define GST_MEMDUMP(...)             ((void)0)
 #define GST_CAT_DEFAULT              (NULL)
 /* ===== compiler / language helpers =====
 *
 * GCC/Clang builtins and attributes are used directly. G_GNUC_INTERNAL,
 * the deprecation markers and G_BEGIN_DECLS / G_END_DECLS expand to
 * nothing. */
 #define G_LIKELY(x)        __builtin_expect(!!(x), 1)
 #define G_UNLIKELY(x)      __builtin_expect(!!(x), 0)
 #define G_GNUC_UNUSED      __attribute__((unused))
 #define G_GNUC_INTERNAL
 #define G_GNUC_MALLOC      __attribute__((malloc))
 #define G_GNUC_NORETURN    __attribute__((noreturn))
 #define G_GNUC_DEPRECATED
 #define G_GNUC_DEPRECATED_FOR(x)
 #define G_GNUC_PURE        __attribute__((pure))
 #define G_GNUC_CONST       __attribute__((const))
 #define G_GNUC_PRINTF(a, b) __attribute__((format(printf, a, b)))
 #define G_BEGIN_DECLS
 #define G_END_DECLS
 #define G_N_ELEMENTS(arr)  (sizeof(arr) / sizeof((arr)[0]))
 /* G_STMT_START { ... } G_STMT_END wraps a macro body in do { } while (0) */
 #define G_STMT_START       do
 #define G_STMT_END         while (0)
 /* two-level expansion so that macro arguments are expanded before being
 * turned into a string */
 #define G_STRINGIFY(x)     G_STRINGIFY_(x)
 #define G_STRINGIFY_(x)    #x
 /* GStreamer ABI-padding slot count; upstream uses 4 reserved gpointers
 * at the end of public structs for future ABI extension. We replicate
 * the size so struct layout matches what gst_byte_reader_init / friends
 * write into. */
 #define GST_PADDING        4
 #define GST_PADDING_LARGE  20
 /* Public-symbol visibility. The original GST_*_API macros expand to
 * extern + dllimport on Windows and would mark symbols public on Linux
 * ELF builds. The backend's shared module uses -fvisibility=hidden and
 * the vendored functions are never called from outside h265_parser/, so
 * no export annotation is wanted here: GST_API expands to nothing, and
 * the EXPORT / IMPORT variants reduce to a plain `extern` with no
 * visibility attribute. */
 #define GST_API
 #define GST_API_EXPORT     extern
 #define GST_API_IMPORT     extern
 /* ===== Opaque GStreamer pipeline types =====
 *
 * GstBuffer + GstMemory are referenced by encoder-side dead-code
 * functions in gsth265parser.c (gst_h265_parser_insert_sei_hevc).
 * We never call those; declaring them as opaque structs lets the
 * function pointers / declarations compile, and the linker keeps the
 * dead-code .text section even though it's unreachable.
 *
 * If you ever need to actually USE GstBuffer in this tree, replace
 * these opaque decls with the project's own buffer abstraction; do not
 * try to vendor in libgst itself. */
 typedef struct _GstBuffer GstBuffer;
 typedef struct _GstMemory GstMemory;
 /* Forward declaration only at this point: struct _GstMapInfo receives a
 * full body further down in this header so that `info->data` and
 * `info->size` in the dead-code SEI-insert path compile. */
 typedef struct _GstMapInfo GstMapInfo;  /* used only by dead code in gsth265parser SEI insert */
 /* GLib min/max constants — dead-code unsigned-overflow guards in
 * gsth265parser.c.
 *
 * Each constant is cast to the matching GLib integer typedef so it has
 * the same type as upstream's definition. Because of the casts these
 * are usable in ordinary expressions but not in #if directives. */
 #define G_MAXUINT8   ((guint8)0xFF)
 #define G_MAXUINT16  ((guint16)0xFFFF)
 #define G_MAXUINT32  ((guint32)0xFFFFFFFFU)
 #define G_MAXUINT64  ((guint64)0xFFFFFFFFFFFFFFFFULL)
 #define G_MAXINT8    ((gint8)0x7F)
 #define G_MAXINT16   ((gint16)0x7FFF)
 #define G_MAXINT32   ((gint32)0x7FFFFFFF)
 #define G_MAXINT64   ((gint64)0x7FFFFFFFFFFFFFFFLL)
 #define G_MININT8    ((gint8)(-0x80))
 #define G_MININT16   ((gint16)(-0x8000))
 /* Written as -MAX - 1 (the form GLib itself uses): the literal
 * 0x80000000 does not fit in a 32-bit int, so it has type unsigned int
 * and negating it never produces a negative value — the old spelling
 * only reached INT32_MIN through an implementation-defined
 * unsigned-to-signed conversion in the cast. */
 #define G_MININT32   ((gint32)(-G_MAXINT32 - 1))
 #define G_MAXSIZE    ((gsize)-1)
 /* GLib function-pointer typedefs used by g_list_* APIs (which our
 * gst_compat declares as abort-stubs). They show up in code paths
 * we never invoke but must compile. */
 /* Destructor callback: receives the element being released. */
 typedef void (*GDestroyNotify)(gpointer data);
 /* Two-element comparator returning an int.
 * NOTE(review): presumably the usual negative/zero/positive ordering
 * convention — nothing in this header calls it, so confirm upstream. */
 typedef int  (*GCompareFunc)(gconstpointer a, gconstpointer b);
 /* Same as GCompareFunc with an extra caller-supplied context pointer. */
 typedef int  (*GCompareDataFunc)(gconstpointer a, gconstpointer b, gpointer user_data);
 /* GstMapFlags — passed to gst_memory_map / gst_buffer_map. Dead-code.
 * Plain bit values; the map stubs in this header take them as `int`. */
 #define GST_MAP_READ      (1 << 0)
 #define GST_MAP_WRITE     (1 << 1)
 #define GST_MAP_READWRITE (GST_MAP_READ | GST_MAP_WRITE)
 /* Dead-code stubs for buffer / memory mapping (only referenced by
 * gst_h265_parser_insert_sei_hevc which we never call). The compile
 * needs declarations + addressable functions; abort on call. */
 static inline gboolean
 gst_memory_map(GstMemory *mem G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED,
               int flags G_GNUC_UNUSED) { abort(); }
 static inline void
 gst_memory_unmap(GstMemory *mem G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED) { abort(); }
 static inline gboolean
 gst_buffer_map(GstBuffer *buf G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED,
               int flags G_GNUC_UNUSED) { abort(); }
 static inline void
 gst_buffer_unmap(GstBuffer *buf G_GNUC_UNUSED, GstMapInfo *info G_GNUC_UNUSED) { abort(); }
 static inline GstBuffer *
 gst_buffer_new(void) { abort(); }
 static inline gboolean
 gst_buffer_copy_into(GstBuffer *dst G_GNUC_UNUSED, GstBuffer *src G_GNUC_UNUSED,
                     int flags G_GNUC_UNUSED, gsize offset G_GNUC_UNUSED,
                     gssize size G_GNUC_UNUSED) { abort(); }
 static inline void
 gst_buffer_append_memory(GstBuffer *buf G_GNUC_UNUSED, GstMemory *mem G_GNUC_UNUSED) { abort(); }
 static inline GstMemory *
 gst_memory_ref(GstMemory *mem G_GNUC_UNUSED) { abort(); }
 static inline void
 gst_memory_unref(GstMemory *mem G_GNUC_UNUSED) { abort(); }
 static inline GstMemory *
 gst_memory_copy(GstMemory *mem G_GNUC_UNUSED, gssize offset G_GNUC_UNUSED, gssize size G_GNUC_UNUSED) { abort(); }
 static inline void
 gst_clear_buffer(GstBuffer **buf) { *buf = NULL; }
 #define GST_IS_BUFFER(b) (false)
 /* GstBufferCopyFlags — used only by gst_buffer_copy_into in dead code. */
 #define GST_BUFFER_COPY_METADATA  (1 << 0)
 #define GST_BUFFER_COPY_MEMORY    (1 << 1)
 #define GST_BUFFER_COPY_DEEP      (1 << 2)
 /* gst_util_ceil_log2(n): smallest k such that (1 << k) >= n, i.e.
 * ceil(log2(n)); both n == 0 and n == 1 yield 0.
 *
 * Referenced from gst_h265_slice_parse_ref_pic_list_modification in
 * gsth265parser.c. The libva backend only drives parse_sps and never
 * reaches that slice-header code, but the symbol must still resolve,
 * and a working implementation is simpler than another abort stub. */
 static inline guint
 gst_util_ceil_log2(guint32 n)
 {
    /* ceil(log2(n)) is the bit width of n - 1, which is 32 minus its
     * leading-zero count. The ternary keeps __builtin_clz away from a
     * zero argument (n == 1) and from the wrapped n - 1 when n == 0. */
    return (n > 1) ? (guint)(32 - __builtin_clz(n - 1)) : 0;
 }
 /* GstMapInfo's real definition is in <gst/gstmemory.h>; we need at
 * least enough to make `info->data` / `info->size` compile.
 *
 * Nothing in this header ever fills one in: both gst_memory_map and
 * gst_buffer_map are abort stubs.
 * NOTE(review): the field list looks like a mirror of upstream's
 * GstMapInfo — confirm against <gst/gstmemory.h> before relying on
 * the layout. */
 struct _GstMapInfo {
    GstMemory *memory;
    int        flags;      /* presumably GST_MAP_* bits — confirm */
    guint8    *data;       /* accessed as info->data by the vendored code */
    gsize      size;       /* accessed as info->size by the vendored code */
    gsize      maxsize;
    gpointer   user_data[4];
    gpointer   _gst_reserved[GST_PADDING];  /* ABI padding slots (GST_PADDING entries) */
 };
 /* gst_memory_new_wrapped: placeholder for the GStreamer allocator entry
 * point. nalutils.c references it from the SEI-insertion path, which
 * the libva backend never runs; reaching it terminates the process via
 * abort(). All arguments are ignored. */
 static inline GstMemory *
 gst_memory_new_wrapped(int flags, gpointer data, gsize maxsize,
                       gsize offset, gsize size, gpointer user_data,
                       void (*notify)(gpointer))
 {
    /* Explicitly discard every parameter to keep -Wunused quiet. */
    (void)flags;
    (void)data;
    (void)maxsize;
    (void)offset;
    (void)size;
    (void)user_data;
    (void)notify;
    abort();
 }
 /* ===== byte-order read / write macros =====
 *
 * Stand-ins for the readers GStreamer ships in <gst/gstutils.h>. Each
 * macro assembles its value one byte at a time from `data`, so the
 * result is fixed by the _LE / _BE suffix alone and `data` needs no
 * particular alignment. `data` is evaluated once per byte read, so do
 * not pass an expression with side effects.
 *
 * The float / double readers upstream also provides are never invoked
 * by the parser; addressable stubs for them follow this block so the
 * function table in gstbytereader.h compiles. */
 /* Helper: byte `idx` of `data`, widened to `type`, moved up `shift` bits. */
 #define GST_COMPAT_READ_BYTE_(type, data, idx, shift)           \
    ((type)((const guint8 *)(data))[(idx)] << (shift))
 #define GST_READ_UINT8(data)                                    \
    (((const guint8 *)(data))[0])
 #define GST_READ_UINT16_LE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint16, data, 0, 0)       |          \
    GST_COMPAT_READ_BYTE_(guint16, data, 1, 8))
 #define GST_READ_UINT16_BE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint16, data, 0, 8)       |          \
    GST_COMPAT_READ_BYTE_(guint16, data, 1, 0))
 #define GST_READ_UINT24_LE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint32, data, 0, 0)       |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 1, 8)       |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 2, 16))
 #define GST_READ_UINT24_BE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint32, data, 0, 16)      |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 1, 8)       |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 2, 0))
 #define GST_READ_UINT32_LE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint32, data, 0, 0)       |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 1, 8)       |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 2, 16)      |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 3, 24))
 #define GST_READ_UINT32_BE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint32, data, 0, 24)      |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 1, 16)      |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 2, 8)       |          \
    GST_COMPAT_READ_BYTE_(guint32, data, 3, 0))
 #define GST_READ_UINT64_LE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint64, data, 0, 0)       |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 1, 8)       |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 2, 16)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 3, 24)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 4, 32)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 5, 40)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 6, 48)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 7, 56))
 #define GST_READ_UINT64_BE(data) (                              \
    GST_COMPAT_READ_BYTE_(guint64, data, 0, 56)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 1, 48)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 2, 40)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 3, 32)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 4, 24)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 5, 16)      |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 6, 8)       |          \
    GST_COMPAT_READ_BYTE_(guint64, data, 7, 0))
 /* Floating-point readers. gstbytereader.h builds a function table that
 * takes the address of these, so they must be real functions, but the
 * parser never calls through them: each one ignores its argument and
 * abort()s. */
 static inline gfloat GST_READ_FLOAT_LE(const guint8 *data)
 {
    (void)data;
    abort();
 }
 static inline gfloat GST_READ_FLOAT_BE(const guint8 *data)
 {
    (void)data;
    abort();
 }
 static inline gdouble GST_READ_DOUBLE_LE(const guint8 *data)
 {
    (void)data;
    abort();
 }
 static inline gdouble GST_READ_DOUBLE_BE(const guint8 *data)
 {
    (void)data;
    abort();
 }
 /* Write side — nalutils.c writes-out SEI bytes (dead path for us but
 * must compile).
 *
 * Each macro stores `val` big-endian into the bytes starting at `data`
 * (8-bit: the single byte). Both arguments are latched into locals
 * first, so they are evaluated exactly once; the earlier form
 * re-evaluated them for every byte stored, which misbehaves for
 * arguments with side effects such as `p++`. Only the low 8/16/24/32
 * bits of `val` are written. The `gst_compat_wr_*_` locals are private
 * to the macros; do not pass arguments that use those names. */
 #define GST_WRITE_UINT8(data, val) do {                         \
    guint8 *gst_compat_wr_dst_ = (guint8 *)(data);              \
    const guint8 gst_compat_wr_val_ = (guint8)(val);            \
    gst_compat_wr_dst_[0] = gst_compat_wr_val_;                 \
 } while (0)
 #define GST_WRITE_UINT16_BE(data, val) do {                     \
    guint8 *gst_compat_wr_dst_ = (guint8 *)(data);              \
    const guint16 gst_compat_wr_val_ = (guint16)(val);          \
    gst_compat_wr_dst_[0] = (guint8)(gst_compat_wr_val_ >> 8);  \
    gst_compat_wr_dst_[1] = (guint8)(gst_compat_wr_val_);       \
 } while (0)
 #define GST_WRITE_UINT24_BE(data, val) do {                     \
    guint8 *gst_compat_wr_dst_ = (guint8 *)(data);              \
    const guint32 gst_compat_wr_val_ = (guint32)(val);          \
    gst_compat_wr_dst_[0] = (guint8)(gst_compat_wr_val_ >> 16); \
    gst_compat_wr_dst_[1] = (guint8)(gst_compat_wr_val_ >> 8);  \
    gst_compat_wr_dst_[2] = (guint8)(gst_compat_wr_val_);       \
 } while (0)
 #define GST_WRITE_UINT32_BE(data, val) do {                     \
    guint8 *gst_compat_wr_dst_ = (guint8 *)(data);              \
    const guint32 gst_compat_wr_val_ = (guint32)(val);          \
    gst_compat_wr_dst_[0] = (guint8)(gst_compat_wr_val_ >> 24); \
    gst_compat_wr_dst_[1] = (guint8)(gst_compat_wr_val_ >> 16); \
    gst_compat_wr_dst_[2] = (guint8)(gst_compat_wr_val_ >> 8);  \
    gst_compat_wr_dst_[3] = (guint8)(gst_compat_wr_val_);       \
 } while (0)
 /* Classic ternary MIN / MAX, defined only when no earlier header has
 * already provided them. Each argument may be evaluated twice, so do
 * not pass expressions with side effects. */
 #ifndef MIN
 # define MIN(a, b) ((a) < (b) ? (a) : (b))
 #endif
 #ifndef MAX
 # define MAX(a, b) ((a) > (b) ? (a) : (b))
 #endif
 /* ===== GArray =====
 *
 * Minimal growable-array type carrying the subset of the GLib GArray
 * API declared below. Only prototypes live in this header.
 * NOTE(review): the definitions are presumably in gst_compat.c —
 * confirm. The struct stores no zero-terminated flag, so how the
 * `zero_terminated` constructor argument is honoured cannot be told
 * from here. */
 typedef struct {
    char *data;                 /* exposed via g_array_index / GArray->data */
    guint len;                  /* element count */
    guint capacity;             /* allocated element slots */
    guint element_size;
    gboolean clear;             /* zero-fill on grow */
    void (*clear_func)(gpointer);
 } GArray;
 GArray *g_array_new(gboolean zero_terminated, gboolean clear, guint element_size);
 GArray *g_array_sized_new(gboolean zero_terminated, gboolean clear,
                          guint element_size, guint reserved_size);
 GArray *g_array_set_size(GArray *array, guint length);
 GArray *g_array_append_vals(GArray *array, gconstpointer data, guint len);
 void    g_array_set_clear_func(GArray *array, void (*clear_func)(gpointer));
 gchar  *g_array_free(GArray *array, gboolean free_segment);
 GArray *g_array_unref(GArray *array);
 /* Appends one element; takes the address of `v`, so `v` must be an
 * lvalue. */
 #define g_array_append_val(a, v) g_array_append_vals((a), &(v), 1)
 /* Element `i` viewed as type `t`; performs no bounds check. */
 #define g_array_index(a, t, i)   (((t *)(void *)(a)->data)[i])
 /* ===== GList — stubs that abort if reached =====
 *
 * gsth265parser.c calls g_list_prepend / g_list_sort / g_list_free_full
 * from paths the libva backend does not exercise for basic SPS parsing
 * (likely SEI message accumulation). Each stub abort()s so that any
 * future caller fails loudly instead of silently corrupting state.
 *
 * The node type is a complete struct rather than an opaque one so that
 * `list->data` still compiles; no GList is ever constructed here. */
 typedef struct _GList GList;
 struct _GList {
    gpointer data;
    GList   *next;
    GList   *prev;
 };
 static inline GList *
 g_list_prepend(GList *list, gpointer data)
 {
    (void)list;
    (void)data;
    abort();
 }
 static inline GList *
 g_list_sort(GList *list, int (*cmp)(gconstpointer, gconstpointer))
 {
    (void)list;
    (void)cmp;
    abort();
 }
 static inline void
 g_list_free_full(GList *list, void (*free_func)(gpointer))
 {
    (void)list;
    (void)free_func;
    abort();
 }
 /* ===== g_once_init_enter / g_once_init_leave =====
 *
 * GLib's lazy-init guards. The parser uses these for one-shot static
 * initialization (e.g. profile-name table). Our backend is single-
 * threaded at the parser-init site (driver_init), so we can simplify
 * to a plain run-once gate.
 *
 * There is no locking and no atomic access here: `enter` is true while
 * *loc is still 0 and `leave` stores `val` into *loc, so the gate only
 * latches when `val` is non-zero. */
 #define g_once_init_enter(loc)        (*(loc) == 0)
 #define g_once_init_leave(loc, val)   (*(loc) = (val))
 /* ===== conversions ===== */
 /* Stash a gint in a pointer-sized value and read it back, casting
 * through uintptr_t. GPOINTER_TO_INT keeps only as many bits as gint
 * holds, so it is only meaningful on values produced by
 * GINT_TO_POINTER. */
 #define GINT_TO_POINTER(i) ((gpointer)(uintptr_t)(gint)(i))
 #define GPOINTER_TO_INT(p) ((gint)(uintptr_t)(p))
 #endif  /* LIBVA_V4L2_REQUEST_FOURIER_GST_COMPAT_H */
@@ -1,90 +0,0 @@
 /*
 * v4l2-hevc-ext-controls.h — verbatim mirror of Linux 7.0+ V4L2 stateless
 * HEVC extended-SPS RPS control definitions, shipped as an internal
 * header so this libva backend can be built against pre-7.0
 * linux-api-headers packages (currently ampere ships 6.19-1).
 *
 * Upstream source: linux kernel, include/uapi/linux/v4l2-controls.h
 * As-of: Linux 7.0-rc3 (Detlev Casanova / Collabora "VDPU381/VDPU383"
 * series, see lkml.org/lkml/2026/1/9/1334). The two CIDs + two structs
 * + two flag macros below are byte-for-byte the kernel UAPI definitions.
 *
 * Once linux-api-headers >= 7.0 is the floor across the fleet, this
 * shim becomes redundant — `<linux/v4l2-controls.h>` will provide the
 * same symbols. The include order in h265.c is: this header BEFORE
 * <linux/v4l2-controls.h>, so when the system catches up, the macro
 * guards below silently no-op and we use the system definitions.
 *
 * License: MIT (matches backend's COPYING.MIT). Per LGPL § 3.b., the
 * kernel UAPI struct definitions themselves are excepted from the
 * kernel's overall GPL and may be copied verbatim into userspace
 * binaries without inheriting GPL.
 *
 * Rationale + iter2 plan: see
 *   ~/src/ampere-kernel-decoders/phase4_plan_iter2.md (§Step 3)
 *   ~/src/ampere-kernel-decoders/phase0_findings_iter2.md
 */
 #ifndef LIBVA_V4L2_REQUEST_FOURIER_V4L2_HEVC_EXT_CONTROLS_H
 #define LIBVA_V4L2_REQUEST_FOURIER_V4L2_HEVC_EXT_CONTROLS_H
 #include <linux/types.h>
 #include <linux/v4l2-controls.h>
 /*
 * Fallback control IDs. <linux/v4l2-controls.h> is already included at
 * this point, so a CID macro that is still undefined means the system
 * UAPI header predates these controls, and therefore also lacks the
 * matching struct. Record that in a private marker so the struct
 * fallbacks further down are emitted only in that case. The previous
 * guards tested V4L2_HEVC_EXT_SPS_*_RPS_DEFINED, which the kernel
 * header never defines, so a system header that ships the structs
 * would have hit a struct redefinition error.
 */
 #ifndef V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS
 # define V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS \
    (V4L2_CID_CODEC_STATELESS_BASE + 408)
 # define LIBVA_V4L2_REQUEST_FOURIER_NEED_HEVC_EXT_SPS_ST_RPS 1
 #endif
 #ifndef V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS
 # define V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS \
    (V4L2_CID_CODEC_STATELESS_BASE + 409)
 # define LIBVA_V4L2_REQUEST_FOURIER_NEED_HEVC_EXT_SPS_LT_RPS 1
 #endif
 /* Flag bits for the `flags` member of the two structs below. */
 #ifndef V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED
 # define V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED 0x1
 #endif
 #ifndef V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT
 # define V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT 0x1
 #endif
 /*
 * struct v4l2_ctrl_hevc_ext_sps_st_rps — HEVC short-term RPS parameters.
 *
 * Dynamic-size 1-dimension array. Number of elements is
 *   v4l2_ctrl_hevc_sps::num_short_term_ref_pic_sets
 * Can contain up to 65 elements (the H.265 spec § 7.4.3.2.1 maximum).
 *
 * Emitted only when the system header did not provide the ST_RPS
 * control. Defining V4L2_HEVC_EXT_SPS_ST_RPS_DEFINED beforehand still
 * suppresses the fallback, as before.
 */
 #if defined(LIBVA_V4L2_REQUEST_FOURIER_NEED_HEVC_EXT_SPS_ST_RPS) && \
     !defined(V4L2_HEVC_EXT_SPS_ST_RPS_DEFINED)
 # define V4L2_HEVC_EXT_SPS_ST_RPS_DEFINED 1
 struct v4l2_ctrl_hevc_ext_sps_st_rps {
 	__u8	delta_idx_minus1;
 	__u8	delta_rps_sign;
 	__u8	num_negative_pics;
 	__u8	num_positive_pics;
 	__u32	used_by_curr_pic;
 	__u32	use_delta_flag;
 	__u16	abs_delta_rps_minus1;
 	__u16	delta_poc_s0_minus1[16];
 	__u16	delta_poc_s1_minus1[16];
 	__u16	flags;
 };
 #endif
 /*
 * struct v4l2_ctrl_hevc_ext_sps_lt_rps — HEVC long-term RPS parameters.
 *
 * Dynamic-size 1-dimension array. Number of elements is
 *   v4l2_ctrl_hevc_sps::num_long_term_ref_pics_sps
 * Can contain up to 33 elements (the H.265 spec § 7.4.3.2.1 maximum).
 *
 * Emitted only when the system header did not provide the LT_RPS
 * control. Defining V4L2_HEVC_EXT_SPS_LT_RPS_DEFINED beforehand still
 * suppresses the fallback, as before.
 */
 #if defined(LIBVA_V4L2_REQUEST_FOURIER_NEED_HEVC_EXT_SPS_LT_RPS) && \
     !defined(V4L2_HEVC_EXT_SPS_LT_RPS_DEFINED)
 # define V4L2_HEVC_EXT_SPS_LT_RPS_DEFINED 1
 struct v4l2_ctrl_hevc_ext_sps_lt_rps {
 	__u16	lt_ref_pic_poc_lsb_sps;
 	__u16	flags;
 };
 #endif
 #endif  /* LIBVA_V4L2_REQUEST_FOURIER_V4L2_HEVC_EXT_CONTROLS_H */
@@ -31,13 +31,7 @@
 #include "video.h"
 #include <assert.h>
 #include <fcntl.h>
 #include <string.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 #include <linux/dma-buf.h>
 #include "tiled_yuv.h"
 #include "utils.h"
@@ -46,6 +40,7 @@
 VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,
 			    int width, int height, VAImage *image)
 {
 	request_log("ENTER RequestCreateImage\n");
 	struct request_data *driver_data = context->pDriverData;
 	unsigned int destination_sizes[VIDEO_MAX_PLANES];
 	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
@@ -131,7 +126,7 @@ VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,
 VAStatus RequestDestroyImage(VADriverContextP context, VAImageID image_id)
 {
-
+	request_log("ENTER RequestDestroyImage\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_image *image_object;
 	VAStatus status;
@@ -156,111 +151,12 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
 {
 	struct object_buffer *buffer_object;
 	unsigned int i;
 	int sync_fds[VIDEO_MAX_PLANES];
 	unsigned int n_sync_fds = 0;
 	buffer_object = BUFFER(driver_data, image->buf);
 	if (buffer_object == NULL)
 		return VA_STATUS_ERROR_INVALID_BUFFER;
 	for (i = 0; i < VIDEO_MAX_PLANES; i++)
 		sync_fds[i] = -1;
 	/*
 	 * iter13 α-17: explicit cache sync around the CAPTURE buffer read.
 	 *
 	 * The CAPTURE buffer is V4L2_MEMORY_MMAP and was mapped at
 	 * cap_pool_init time with cached attributes. Kernel decode writes to
 	 * the buffer via DMA, which doesn't propagate to the CPU's cache
 	 * observer for that virtual mapping. Reading from
 	 * surface_object->destination_data[] without an explicit cache
 	 * invalidation returns stale data — observed empirically as Bug 4
 	 * (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva went
 	 * through the SAME readback path that kdirect ffmpeg-v4l2request +
 	 * DRM_PRIME-mmap successfully reads (kdirect's drm-prime mmap
 	 * implicitly handles sync).
 	 *
 	 * DMA_BUF_IOCTL_SYNC(START | READ) makes the CPU mapping coherent
 	 * with the producing engine's writes; END releases the sync.
 	 * Per V4L2 + dma-buf spec, this is the userspace contract for
 	 * cached-mmap'd buffers (Tomasz Figa, linaro-mm-sig 2024-07-11).
 	 *
 	 * Requires a dma-buf fd: get one via VIDIOC_EXPBUF, sync, close.
 	 * Per-call cost is one ioctl pair + one fd open/close per plane.
 	 * Could be optimised by caching the EXPBUF fd on the cap_pool slot,
 	 * but doing it just-in-time keeps the lifecycle uncomplicated. The
 	 * EXPBUF fd's dup count doesn't affect the V4L2 buffer's underlying
 	 * pages; closing the fd is a no-op on memory.
 	 *
 	 * If EXPBUF fails (e.g., consumer-held EXPBUF prevents a second one
 	 * — only true for hantro G1 oddity), we skip the sync silently. The
 	 * existing pre-iter13 behavior is preserved on the error path.
 	 */
 	if (surface_object->current_slot != NULL &&
 	    driver_data->video_format != NULL) {
 		unsigned int capture_type =
 			v4l2_type_video_capture(driver_data->video_format->v4l2_mplane);
 		if (v4l2_export_buffer(driver_data->video_fd, capture_type,
 				       surface_object->destination_index,
 				       O_RDONLY, sync_fds,
 				       surface_object->destination_buffers_count) >= 0) {
 			n_sync_fds = surface_object->destination_buffers_count;
 			for (i = 0; i < n_sync_fds; i++) {
 				struct dma_buf_sync s = {
 					.flags = DMA_BUF_SYNC_START |
 						 DMA_BUF_SYNC_READ,
 				};
 				/* failure is non-fatal: we continue with the read */
 				(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
 			}
 		}
 	}
 	/*
 	 * AV1 film_grain: when this surface is the display surface of a
 	 * decode (current_display_picture != current_frame with apply_grain=1),
 	 * its slot is NULL because BeginPicture only fired on the decode
 	 * surface. Follow the back-link set in av1_set_controls and borrow
 	 * the decode surface's destination_data + sizes for the copy.
 	 */
 	if (surface_object->current_slot == NULL &&
 	    surface_object->linked_decode_surface_id != VA_INVALID_SURFACE) {
 		struct object_surface *decode_surface =
 			SURFACE(driver_data,
 				surface_object->linked_decode_surface_id);
 		if (decode_surface != NULL &&
 		    decode_surface->current_slot != NULL) {
 			/* Mirror the fields we read below. The surface heap
 			 * pointer is stable for the surface's lifetime; we
 			 * only need destination_data + destination_sizes +
 			 * destination_planes_count from it. */
 			surface_object->destination_planes_count =
 				decode_surface->destination_planes_count;
 			for (i = 0; i < decode_surface->destination_planes_count; i++) {
 				surface_object->destination_data[i] =
 					decode_surface->destination_data[i];
 				surface_object->destination_sizes[i] =
 					decode_surface->destination_sizes[i];
 			}
 		}
 	}
 	for (i = 0; i < surface_object->destination_planes_count; i++) {
 		/* AV1 Phase 3 diag: surface NULL-deref hunt. */
 		if (buffer_object->data == NULL ||
 		    surface_object->destination_data[i] == NULL) {
 			request_log("copy_surface_to_image NULL i=%u "
 				    "buf_data=%p dest_data=%p dest_size=%u "
 				    "planes=%u slot=%p linked=0x%x\n",
 				    i, (void *)buffer_object->data,
 				    (void *)surface_object->destination_data[i],
 				    surface_object->destination_sizes[i],
 				    surface_object->destination_planes_count,
 				    (void *)surface_object->current_slot,
 				    surface_object->linked_decode_surface_id);
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		}
 #ifdef __arm__
 		if (!video_format_is_linear(driver_data->video_format))
 			tiled_to_planar(surface_object->destination_data[i],
 					buffer_object->data + image->offsets[i],
@@ -268,22 +164,10 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
 					i == 0 ? image->height :
 						 image->height / 2);
 		else {
 #endif
 			memcpy(buffer_object->data + image->offsets[i],
 			       surface_object->destination_data[i],
 			       surface_object->destination_sizes[i]);
 #ifdef __arm__
 		}
 #endif
 	}
 	/* iter13 α-17: release cache sync. END pairs with each START. */
 	for (i = 0; i < n_sync_fds; i++) {
 		struct dma_buf_sync s = {
 			.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ,
 		};
 		(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
 		close(sync_fds[i]);
 	}
 	return VA_STATUS_SUCCESS;
@@ -292,13 +176,13 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
 VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
 			    VAImage *image)
 {
 	request_log("ENTER RequestDeriveImage\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;
 	struct object_buffer *buffer_object;
 	VAImageFormat format;
 	VAStatus status;
 	surface_object = SURFACE(driver_data, surface_id);
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
@@ -309,33 +193,16 @@ VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
 			return status;
 	}
 	/* Fully populate VAImageFormat to match QueryImageFormats output. */
 	memset(&format, 0, sizeof(format));
 	format.fourcc = VA_FOURCC_NV12;
 	format.byte_order = VA_LSB_FIRST;
 	format.bits_per_pixel = 12;
 	status = RequestCreateImage(context, &format, surface_object->width,
 				    surface_object->height, image);
 	if (status != VA_STATUS_SUCCESS)
 		return status;
-	/*
+	status = copy_surface_to_image (driver_data, surface_object, image);
-	 * Iter2 Fix 3: skip the surface→image copy when no CAPTURE slot is
+	if (status != VA_STATUS_SUCCESS)
-	 * bound. ffmpeg's av_hwframe_ctx_init probes vaDeriveImage on a
+		return status;
 	 * never-decoded surface to learn the format; it doesn't read the
 	 * data. With the cap_pool decoupling, destination_data[] is NULL
 	 * until BeginPicture binds a slot — copying from a NULL source
 	 * crashed in memcpy. The image's buffer remains zero-initialized;
 	 * subsequent post-decode DeriveImage on the same surface (after
 	 * BeginPicture has bound a slot) does the real copy.
 	 */
 	if (surface_object->current_slot != NULL) {
 		status = copy_surface_to_image (driver_data, surface_object,
 						image);
 		if (status != VA_STATUS_SUCCESS)
 			return status;
 	}
 	surface_object->status = VASurfaceReady;
@@ -348,25 +215,8 @@ VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
 VAStatus RequestQueryImageFormats(VADriverContextP context,
 				  VAImageFormat *formats, int *formats_count)
 {
-
+	request_log("ENTER RequestQueryImageFormats\n");
 	/*
 	 * Populate the VAImageFormat fully per VAAPI spec for NV12 —
 	 * not just .fourcc. Consumers (FFmpeg's hwcontext_vaapi, mpv,
 	 * Firefox) read .byte_order and .bits_per_pixel; leaving them
 	 * uninitialized inherits whatever caller-stack garbage is in
 	 * the buffer and produces non-deterministic behavior. Reference:
 	 * Mesa's gallium/frontends/va/image.c::vlVaQueryImageFormats and
 	 * intel-vaapi-driver's i965_drv_video.c — both publish NV12
 	 * with byte_order=VA_LSB_FIRST and bits_per_pixel=12.
 	 *
 	 * For YUV formats, depth/red_mask/green_mask/blue_mask/alpha_mask
 	 * are not meaningful (those describe RGB bit layouts); leave them
 	 * zeroed via memset before populating.
 	 */
 	memset(&formats[0], 0, sizeof(formats[0]));
 	formats[0].fourcc = VA_FOURCC_NV12;
 	formats[0].byte_order = VA_LSB_FIRST;
 	formats[0].bits_per_pixel = 12;
 	*formats_count = 1;
 	return VA_STATUS_SUCCESS;
@@ -375,6 +225,7 @@ VAStatus RequestQueryImageFormats(VADriverContextP context,
 VAStatus RequestSetImagePalette(VADriverContextP context, VAImageID image_id,
 				unsigned char *palette)
 {
 	request_log("ENTER RequestSetImagePalette\n");
 	return VA_STATUS_ERROR_UNIMPLEMENTED;
 }
@@ -382,12 +233,12 @@ VAStatus RequestGetImage(VADriverContextP context, VASurfaceID surface_id,
 			 int x, int y, unsigned int width, unsigned int height,
 			 VAImageID image_id)
 {
 	request_log("ENTER RequestGetImage\n");
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;
 	struct object_image *image_object;
 	VAImage *image;
 	surface_object = SURFACE(driver_data, surface_id);
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
@@ -409,5 +260,6 @@ VAStatus RequestPutImage(VADriverContextP context, VASurfaceID surface_id,
 			 int dst_x, int dst_y, unsigned int dst_width,
 			 unsigned int dst_height)
 {
 	request_log("ENTER RequestPutImage\n");
 	return VA_STATUS_ERROR_UNIMPLEMENTED;
 }
@@ -26,7 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
-#include <poll.h>
+#include <sys/select.h>
 #include <linux/media.h>
@@ -78,19 +78,19 @@ int media_request_queue(int request_fd)
 int media_request_wait_completion(int request_fd)
 {
-	/* poll() instead of select(): Firefox's RDD seccomp policy admits
+	struct timeval tv = { 0, 300000 };
-	 * poll/ppoll but not select/pselect6 (as of FF150). Functionally
+	fd_set except_fds;
 	 * equivalent here — the media request fd signals completion via
 	 * exceptional condition, mapped to POLLPRI for poll(). */
 	struct pollfd pfd = { .fd = request_fd, .events = POLLPRI };
 	int rc;
-	rc = poll(&pfd, 1, 300 /* ms */);
+	FD_ZERO(&except_fds);
 	FD_SET(request_fd, &except_fds);
 	rc = select(request_fd + 1, NULL, NULL, &except_fds, &tv);
 	if (rc == 0) {
 		request_log("Timeout when waiting for media request\n");
 		return -1;
 	} else if (rc < 0) {
-		request_log("Unable to poll media request: %s\n",
+		request_log("Unable to select media request: %s\n",
 			    strerror(errno));
 		return -1;
 	}
@@ -44,23 +44,7 @@ sources = [
 	'v4l2.c',
 	'mpeg2.c',
 	'h264.c',
-	'h264_slice_header.c',
+#	'h265.c'	# Fourier-local: HEVC stripped (see commit log)
 	'request_pool.c',
 	'cap_pool.c',
 	'h265.c',
 	'vp8.c',
 	'vp9.c',
 	'av1.c',
 	'codec.c',
 	# Vendored GStreamer 1.28.2 H.265 parser + utilities (LGPL v2.1+,
 	# see src/h265_parser/gst_compat.h for sourcing notes + per-iter2
 	# adaptation strategy).
 	'h265_parser/gst_compat.c',
 	'h265_parser/gst/base/gstbitreader.c',
 	'h265_parser/gst/base/gstbytereader.c',
 	'h265_parser/gst/codecparsers/nalutils.c',
 	'h265_parser/gst/codecparsers/gsth265parser.c'
 ]
 headers = [
@@ -80,39 +64,11 @@ headers = [
 	'v4l2.h',
 	'mpeg2.h',
 	'h264.h',
-	'h264_slice_header.h',
+#	'h265.h'	# Fourier-local: HEVC stripped (see commit log)
 	'request_pool.h',
 	'cap_pool.h',
 	'h265.h',
 	'vp8.h',
 	'vp9.h',
 	'av1.h',
 	'codec.h',
 	# Internal mirror of Linux 7.0 V4L2 HEVC EXT_SPS_*_RPS UAPI defs
 	# (allows building against pre-7.0 linux-api-headers; redundant
 	# once the host headers are 7.0+).
 	'hevc-ctrls/v4l2-hevc-ext-controls.h',
 	# Vendored GStreamer + project shim headers (see sources above).
 	'h265_parser/gst_compat.h',
 	'h265_parser/gst/gst.h',
 	'h265_parser/gst/glib-compat-private.h',
 	'h265_parser/gst/base/base-prelude.h',
 	'h265_parser/gst/base/gstbitreader.h',
 	'h265_parser/gst/base/gstbytereader.h',
 	'h265_parser/gst/base/gstbitwriter.h',
 	'h265_parser/gst/codecparsers/codecparsers-prelude.h',
 	'h265_parser/gst/codecparsers/gsth265parser.h',
 	'h265_parser/gst/codecparsers/nalutils.h'
 ]
 includes = [
-	include_directories('../include'),
+	include_directories('../include')
 	# Vendored GStreamer parser tree — the parser's #include <gst/base/...>
 	# style references resolve here via stub headers that redirect to
 	# gst_compat.h.
 	include_directories('h265_parser')
 ]
 cflags = [
@@ -23,34 +23,6 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 /*
 * fresnel-fourier iter1 Phase 6 commit B: rewrite against new split
 * V4L2_CID_STATELESS_MPEG2_{SEQUENCE,PICTURE,QUANTISATION} stateless
 * controls (mainline kernel <linux/v4l2-controls.h>:1985-2105).
 *
 * Replaces the staging-era V4L2_CID_MPEG_VIDEO_MPEG2_{SLICE_PARAMS,
 * QUANTIZATION} combined-struct API that the fork previously used
 * via include/mpeg2-ctrls.h (deleted in commit C).
 *
 * Per-frame submission: one batched VIDIOC_S_EXT_CTRLS with three
 * controls (12-byte SEQUENCE + 32-byte PICTURE + 256-byte QUANTISATION),
 * matching FFmpeg libavcodec/v4l2_request_mpeg2.c:130-155 reference
 * implementation. Verified empirically in fresnel-fourier Phase 0
 * cross-validator sweep and Phase 3 Baseline C verbatim payload.
 *
 * Quantisation matrix order: zigzag scanning order per kernel doc
 * v4l2-controls.h:2076. VAAPI VAIQMatrixBufferMPEG2 also stores in
 * zigzag scanning order (per VAAPI spec). Direct memcpy works; no
 * permutation in the libva backend. Kernel hantro_mpeg2.c::
 * hantro_mpeg2_dec_copy_qtable applies the zigzag-to-raster
 * permutation when copying to the hardware quantisation table.
 *
 * Default matrices (when iqmatrix_set==false): MPEG-2 spec defaults
 * per ISO/IEC 13818-2 Table 7-3, transcribed from Phase 3 Baseline C
 * QUANTISATION verbatim payload (256 bytes captured from
 * ffmpeg-v4l2request decode of bbb_720p10s_mpeg2.ts).
 */
 #include "mpeg2.h"
 #include "context.h"
 #include "request.h"
@@ -63,187 +35,120 @@
 #include <sys/mman.h>
 #include <linux/videodev2.h>
-#include <linux/v4l2-controls.h>
+#include <mpeg2-ctrls.h>
 #include "v4l2.h"
 /*
 * MPEG-2 default intra quantisation matrix in zigzag scanning order
 * (ISO/IEC 13818-2 Table 7-3, verified empirically against
 * fresnel-fourier Phase 3 Baseline C QUANTISATION payload bytes 0..63
 * from a ffmpeg-v4l2request decode of the BBB 720p10s MPEG-2 fixture).
 */
 static const __u8 mpeg2_default_intra_matrix[64] = {
 	  8,  16,  16,  19,  16,  19,  22,  22,
 	 22,  22,  22,  22,  26,  24,  26,  27,
 	 27,  27,  26,  26,  26,  26,  27,  27,
 	 27,  29,  29,  29,  34,  34,  34,  29,
 	 29,  29,  27,  27,  29,  29,  32,  32,
 	 34,  34,  37,  38,  37,  35,  35,  34,
 	 35,  38,  38,  40,  40,  40,  48,  48,
 	 46,  46,  56,  56,  58,  69,  69,  83,
 };
 /*
 * MPEG-2 default non-intra quantisation matrix is uniformly 16 in spec.
 * Verified against Phase 3 Baseline C QUANTISATION payload bytes
 * 64..127 (all 0x10 = 16). Same applies to chroma_non_intra
 * (bytes 192..255). Filled at runtime via memset rather than a
 * separate const array to keep the binary smaller.
 */
 int mpeg2_set_controls(struct request_data *driver_data,
 		       struct object_context *context_object,
 		       struct object_surface *surface_object)
 {
 	VAPictureParameterBufferMPEG2 *picture =
 		&surface_object->params.mpeg2.picture;
 	VASliceParameterBufferMPEG2 *slice =
 		&surface_object->params.mpeg2.slice;
 	VAIQMatrixBufferMPEG2 *iqmatrix =
 		&surface_object->params.mpeg2.iqmatrix;
 	bool iqmatrix_set = surface_object->params.mpeg2.iqmatrix_set;
-
+	struct v4l2_ctrl_mpeg2_slice_params slice_params;
-	/* Clause 2: v4l2_ctrl_mpeg2_sequence (12 bytes) */
+	struct v4l2_ctrl_mpeg2_quantization quantization;
 	struct v4l2_ctrl_mpeg2_sequence sequence;
 	/* Clause 3: v4l2_ctrl_mpeg2_picture (32 bytes; reserved[5] must be zero) */
 	struct v4l2_ctrl_mpeg2_picture pic;
 	/* Clause 4: v4l2_ctrl_mpeg2_quantisation (256 bytes) */
 	struct v4l2_ctrl_mpeg2_quantisation quant;
 	struct object_surface *forward_reference_surface;
 	struct object_surface *backward_reference_surface;
 	uint64_t timestamp;
 	unsigned int i;
 	int rc;
-	memset(&sequence, 0, sizeof sequence);
+	memset(&slice_params, 0, sizeof(slice_params));
 	memset(&pic, 0, sizeof pic);  /* zeros pic.reserved[5] per Clause 3 */
 	memset(&quant, 0, sizeof quant);
-	/* === Clause 2: SEQUENCE ===
+	slice_params.bit_size = surface_object->slices_size * 8;
-	 *
+	slice_params.data_bit_offset = 0;
-	 * VAAPI's VAPictureParameterBufferMPEG2 doesn't expose the
+
-	 * sequence-extension's progressive_sequence flag separately;
+	slice_params.sequence.horizontal_size = picture->horizontal_size;
-	 * use progressive_frame from the picture-coding extension as a
+	slice_params.sequence.vertical_size = picture->vertical_size;
-	 * proxy. They're identical for typical streams (BBB is
+	slice_params.sequence.vbv_buffer_size = SOURCE_SIZE_MAX;
-	 * progressive throughout).
+
-	 */
+	slice_params.sequence.profile_and_level_indication = 0;
-	sequence.horizontal_size = picture->horizontal_size;
+	slice_params.sequence.progressive_sequence = 0;
-	sequence.vertical_size = picture->vertical_size;
+	slice_params.sequence.chroma_format = 1; // 4:2:0
-	sequence.vbv_buffer_size = surface_object->source_size;
+
-	sequence.profile_and_level_indication = 0;  /* not exposed by VAAPI */
+	slice_params.picture.picture_coding_type = picture->picture_coding_type;
-	sequence.chroma_format = 1;  /* 4:2:0 — campaign codec scope */
+	slice_params.picture.f_code[0][0] = (picture->f_code >> 12) & 0x0f;
-	if (picture->picture_coding_extension.bits.progressive_frame)
+	slice_params.picture.f_code[0][1] = (picture->f_code >> 8) & 0x0f;
-		sequence.flags |= V4L2_MPEG2_SEQ_FLAG_PROGRESSIVE;
+	slice_params.picture.f_code[1][0] = (picture->f_code >> 4) & 0x0f;
 	slice_params.picture.f_code[1][1] = (picture->f_code >> 0) & 0x0f;
 	slice_params.picture.intra_dc_precision =
 		picture->picture_coding_extension.bits.intra_dc_precision;
 	slice_params.picture.picture_structure =
 		picture->picture_coding_extension.bits.picture_structure;
 	slice_params.picture.top_field_first =
 		picture->picture_coding_extension.bits.top_field_first;
 	slice_params.picture.frame_pred_frame_dct =
 		picture->picture_coding_extension.bits.frame_pred_frame_dct;
 	slice_params.picture.concealment_motion_vectors =
 		picture->picture_coding_extension.bits
 			.concealment_motion_vectors;
 	slice_params.picture.q_scale_type =
 		picture->picture_coding_extension.bits.q_scale_type;
 	slice_params.picture.intra_vlc_format =
 		picture->picture_coding_extension.bits.intra_vlc_format;
 	slice_params.picture.alternate_scan =
 		picture->picture_coding_extension.bits.alternate_scan;
 	slice_params.picture.repeat_first_field =
 		picture->picture_coding_extension.bits.repeat_first_field;
 	slice_params.picture.progressive_frame =
 		picture->picture_coding_extension.bits.progressive_frame;
 	slice_params.quantiser_scale_code = slice->quantiser_scale_code;
 	/* === Clause 3: PICTURE ===
 	 *
 	 * Behavioral correction vs. previous mpeg2.c at this iter1:
 	 * old code self-referenced surface_object->timestamp when the
 	 * VAAPI ref picture was VA_INVALID_ID. New code sets ts = 0 for
 	 * missing refs, matching kernel doc's 0-as-sentinel convention
 	 * (verified against Phase 3 Baseline C frame 1: I-frame has both
 	 * forward_ref_ts and backward_ref_ts == 0; FFmpeg
 	 * libavcodec/v4l2_request_mpeg2.c:98-108 uses same convention).
 	 */
 	forward_reference_surface =
 		SURFACE(driver_data, picture->forward_reference_picture);
-	if (forward_reference_surface != NULL)
+	if (forward_reference_surface == NULL)
-		pic.forward_ref_ts =
+		forward_reference_surface = surface_object;
-			v4l2_timeval_to_ns(&forward_reference_surface->timestamp);
+
 	timestamp = v4l2_timeval_to_ns(&forward_reference_surface->timestamp);
 	slice_params.forward_ref_ts = timestamp;
 	backward_reference_surface =
 		SURFACE(driver_data, picture->backward_reference_picture);
-	if (backward_reference_surface != NULL)
+	if (backward_reference_surface == NULL)
-		pic.backward_ref_ts =
+		backward_reference_surface = surface_object;
 			v4l2_timeval_to_ns(&backward_reference_surface->timestamp);
-	if (picture->picture_coding_extension.bits.top_field_first)
+	timestamp = v4l2_timeval_to_ns(&backward_reference_surface->timestamp);
-		pic.flags |= V4L2_MPEG2_PIC_FLAG_TOP_FIELD_FIRST;
+	slice_params.backward_ref_ts = timestamp;
 	if (picture->picture_coding_extension.bits.frame_pred_frame_dct)
 		pic.flags |= V4L2_MPEG2_PIC_FLAG_FRAME_PRED_DCT;
 	if (picture->picture_coding_extension.bits.concealment_motion_vectors)
 		pic.flags |= V4L2_MPEG2_PIC_FLAG_CONCEALMENT_MV;
 	if (picture->picture_coding_extension.bits.q_scale_type)
 		pic.flags |= V4L2_MPEG2_PIC_FLAG_Q_SCALE_TYPE;
 	if (picture->picture_coding_extension.bits.intra_vlc_format)
 		pic.flags |= V4L2_MPEG2_PIC_FLAG_INTRA_VLC;
 	if (picture->picture_coding_extension.bits.alternate_scan)
 		pic.flags |= V4L2_MPEG2_PIC_FLAG_ALT_SCAN;
 	if (picture->picture_coding_extension.bits.repeat_first_field)
 		pic.flags |= V4L2_MPEG2_PIC_FLAG_REPEAT_FIRST;
 	if (picture->picture_coding_extension.bits.progressive_frame)
 		pic.flags |= V4L2_MPEG2_PIC_FLAG_PROGRESSIVE;
-	pic.f_code[0][0] = (picture->f_code >> 12) & 0x0f;
+	rc = v4l2_set_control(driver_data->video_fd, surface_object->request_fd,
-	pic.f_code[0][1] = (picture->f_code >>  8) & 0x0f;
+			      V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS,
-	pic.f_code[1][0] = (picture->f_code >>  4) & 0x0f;
+			      &slice_params, sizeof(slice_params));
 	pic.f_code[1][1] = (picture->f_code >>  0) & 0x0f;
 	pic.picture_coding_type = picture->picture_coding_type;
 	pic.picture_structure =
 		picture->picture_coding_extension.bits.picture_structure;
 	pic.intra_dc_precision =
 		picture->picture_coding_extension.bits.intra_dc_precision;
 	/* pic.reserved[5] zeroed by memset above */
 	/* === Clause 4: QUANTISATION ===
 	 *
 	 * Kernel always reads all four matrices unconditionally
 	 * (no load_* flags in the new API; kernel hantro_mpeg2.c
 	 * doesn't synthesize defaults). When VAAPI's consumer didn't
 	 * send VAIQMatrixBufferType (iqmatrix_set==false), populate
 	 * with MPEG-2 spec default matrices.
 	 *
 	 * VAAPI VAIQMatrixBufferMPEG2 stores matrices in zigzag scanning
 	 * order (per VAAPI spec). Kernel expects zigzag scanning order
 	 * (per v4l2-controls.h:2076). Direct memcpy.
 	 */
 	if (iqmatrix_set) {
 		memcpy(quant.intra_quantiser_matrix,
 		       iqmatrix->intra_quantiser_matrix, 64);
 		memcpy(quant.non_intra_quantiser_matrix,
 		       iqmatrix->non_intra_quantiser_matrix, 64);
 		memcpy(quant.chroma_intra_quantiser_matrix,
 		       iqmatrix->chroma_intra_quantiser_matrix, 64);
 		memcpy(quant.chroma_non_intra_quantiser_matrix,
 		       iqmatrix->chroma_non_intra_quantiser_matrix, 64);
 	} else {
 		memcpy(quant.intra_quantiser_matrix,
 		       mpeg2_default_intra_matrix, 64);
 		memset(quant.non_intra_quantiser_matrix, 16, 64);
 		memcpy(quant.chroma_intra_quantiser_matrix,
 		       mpeg2_default_intra_matrix, 64);
 		memset(quant.chroma_non_intra_quantiser_matrix, 16, 64);
 	}
 	/* === Clause 1+5: batched submission ===
 	 *
 	 * One VIDIOC_S_EXT_CTRLS with all three controls. Matches
 	 * src/h264.c:986 pattern (single v4l2_set_controls call) and
 	 * FFmpeg ff_v4l2_request_decode_frame contract. Bound to the
 	 * surface's permanent request_fd (iter6 per-OUTPUT-slot binding;
 	 * picture.c:284 sets surface_object->request_fd at BeginPicture).
 	 */
 	struct v4l2_ext_control ctrls[3] = {
 		{
 			.id = V4L2_CID_STATELESS_MPEG2_SEQUENCE,
 			.ptr = &sequence,
 			.size = sizeof sequence,
 		},
 		{
 			.id = V4L2_CID_STATELESS_MPEG2_PICTURE,
 			.ptr = &pic,
 			.size = sizeof pic,
 		},
 		{
 			.id = V4L2_CID_STATELESS_MPEG2_QUANTISATION,
 			.ptr = &quant,
 			.size = sizeof quant,
 		},
 	};
 	rc = v4l2_set_controls(driver_data->video_fd,
 			       surface_object->request_fd,
 			       ctrls, 3);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	if (iqmatrix_set) {
 		quantization.load_intra_quantiser_matrix =
 			iqmatrix->load_intra_quantiser_matrix;
 		quantization.load_non_intra_quantiser_matrix =
 			iqmatrix->load_non_intra_quantiser_matrix;
 		quantization.load_chroma_intra_quantiser_matrix =
 			iqmatrix->load_chroma_intra_quantiser_matrix;
 		quantization.load_chroma_non_intra_quantiser_matrix =
 			iqmatrix->load_chroma_non_intra_quantiser_matrix;
 		for (i = 0; i < 64; i++) {
 			quantization.intra_quantiser_matrix[i] =
 				iqmatrix->intra_quantiser_matrix[i];
 			quantization.non_intra_quantiser_matrix[i] =
 				iqmatrix->non_intra_quantiser_matrix[i];
 			quantization.chroma_intra_quantiser_matrix[i] =
 				iqmatrix->chroma_intra_quantiser_matrix[i];
 			quantization.chroma_non_intra_quantiser_matrix[i] =
 				iqmatrix->chroma_non_intra_quantiser_matrix[i];
 		}
 		rc = v4l2_set_control(driver_data->video_fd,
 				      surface_object->request_fd,
 				      V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION,
 				      &quantization, sizeof(quantization));
 	}
 	return 0;
 }
@@ -34,13 +34,8 @@
 #include "h264.h"
 #include "h265.h"
 #include "mpeg2.h"
 #include "vp8.h"
 #include "vp9.h"
 #include "av1.h"
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
@@ -56,7 +51,6 @@
 #include "autoconfig.h"
 static VAStatus codec_store_buffer(struct request_data *driver_data,
 				   struct object_context *context,
 				   VAProfile profile,
 				   struct object_surface *surface_object,
 				   struct object_buffer *buffer_object)
@@ -69,47 +63,6 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 		 * RenderPicture), we can't use a V4L2 buffer directly
 		 * and have to copy from a regular buffer.
 		 */
 		if (context->h264_start_code) {
 			static const char start_code[3] = { 0x00, 0x00, 0x01 };
 			memcpy(surface_object->source_data +
 			       surface_object->slices_size,
 			       start_code, sizeof(start_code));
 			surface_object->slices_size += sizeof(start_code);
 		}
 		/*
 		 * iter33 α-30: VP8 OUTPUT buffer needs the uncompressed
 		 * frame header that ffmpeg-vaapi stripped before submitting
 		 * VASliceData. Hantro's vp8_dec_run reads OUTPUT[0..N] with
 		 * an assumed offset of 10 bytes (keyframe) or 3 bytes
 		 * (interframe) before the first_partition data — see
 		 * rockchip_vpu2_hw_vp8_dec.c:349.
 		 *
 		 * ffmpeg-vaapi (vaapi_vp8.c:191-192) strips
 		 *   header_size = 3 + 7 * s->keyframe
 		 * before submitting the slice data, so libva needs to
 		 * pre-pad the OUTPUT with that many bytes. Hantro only
 		 * uses these bytes for offset arithmetic, not parsing,
 		 * so zero-filled placeholder is sufficient.
 		 *
 		 * ffmpeg-v4l2request (kdirect path) does NOT strip the
 		 * header, hence its OUTPUT is byte-equal to SW reference
 		 * and decode works correctly. This is the only material
 		 * difference between the two front-ends for VP8.
 		 *
 		 * key_frame in VAAPI's pic_fields.bits is INVERTED:
 		 *   0 → keyframe, 1 → interframe.
 		 */
 		if (profile == VAProfileVP8Version0_3 &&
 		    surface_object->params.vp8.iqmatrix_set /* picture parsed by now */) {
 			unsigned int header_size =
 				surface_object->params.vp8.picture.pic_fields.bits.key_frame == 0 ?
 					10 : 3;
 			memset(surface_object->source_data +
 			       surface_object->slices_size,
 			       0, header_size);
 			surface_object->slices_size += header_size;
 		}
 		memcpy(surface_object->source_data +
 			       surface_object->slices_size,
 		       buffer_object->data,
@@ -138,33 +91,6 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			       sizeof(surface_object->params.h264.picture));
 			break;
 		case VAProfileHEVCMain:
 			memcpy(&surface_object->params.h265.picture,
 			       buffer_object->data,
 			       sizeof(surface_object->params.h265.picture));
 			break;
 		case VAProfileVP8Version0_3:
 			memcpy(&surface_object->params.vp8.picture,
 			       buffer_object->data,
 			       sizeof(surface_object->params.vp8.picture));
 			break;
 		case VAProfileVP9Profile0:
 			memcpy(&surface_object->params.vp9.picture,
 			       buffer_object->data,
 			       sizeof(surface_object->params.vp9.picture));
 			break;
 		case VAProfileAV1Profile0:
 			memcpy(&surface_object->params.av1.picture,
 			       buffer_object->data,
 			       sizeof(surface_object->params.av1.picture));
 			/* Reset per-frame tile group entry array on each new
 			 * picture parameter buffer (start of a new frame). */
 			surface_object->params.av1.num_tile_group_entries = 0;
 			break;
 		default:
 			break;
 		}
@@ -182,45 +108,6 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			       sizeof(surface_object->params.h264.slice));
 			break;
 		case VAProfileHEVCMain: {
 			unsigned int n = surface_object->params.h265.num_slices;
 			if (n < HEVC_MAX_SLICES_PER_FRAME) {
 				memcpy(&surface_object->params.h265.slices[n],
 				       buffer_object->data,
 				       sizeof(VASliceParameterBufferHEVC));
 				surface_object->params.h265.num_slices = n + 1;
 			}
 			/* Keep .slice mirror populated as last-slice ref for
 			 * h265_fill_pps which reads dependent_slice_segment_flag */
 			memcpy(&surface_object->params.h265.slice,
 			       buffer_object->data,
 			       sizeof(surface_object->params.h265.slice));
 			break;
 		}
 		case VAProfileVP8Version0_3:
 			memcpy(&surface_object->params.vp8.slice,
 			       buffer_object->data,
 			       sizeof(surface_object->params.vp8.slice));
 			break;
 		case VAProfileVP9Profile0:
 			memcpy(&surface_object->params.vp9.slice,
 			       buffer_object->data,
 			       sizeof(surface_object->params.vp9.slice));
 			break;
 		case VAProfileAV1Profile0: {
 			unsigned int n = surface_object->params.av1.num_tile_group_entries;
 			if (n < AV1_MAX_TILES) {
 				memcpy(&surface_object->params.av1.tile_group_entries[n],
 				       buffer_object->data,
 				       sizeof(VASliceParameterBufferAV1));
 				surface_object->params.av1.num_tile_group_entries = n + 1;
 			}
 			break;
 		}
 		default:
 			break;
 		}
@@ -244,35 +131,6 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			memcpy(&surface_object->params.h264.matrix,
 			       buffer_object->data,
 			       sizeof(surface_object->params.h264.matrix));
 			surface_object->params.h264.matrix_set = true;
 			break;
 		case VAProfileHEVCMain:
 			memcpy(&surface_object->params.h265.iqmatrix,
 			       buffer_object->data,
 			       sizeof(surface_object->params.h265.iqmatrix));
 			surface_object->params.h265.iqmatrix_set = true;
 			break;
 		case VAProfileVP8Version0_3:
 			memcpy(&surface_object->params.vp8.iqmatrix,
 			       buffer_object->data,
 			       sizeof(surface_object->params.vp8.iqmatrix));
 			surface_object->params.vp8.iqmatrix_set = true;
 			break;
 		default:
 			break;
 		}
 		break;
 	case VAProbabilityBufferType:
 		switch (profile) {
 		case VAProfileVP8Version0_3:
 			memcpy(&surface_object->params.vp8.probability,
 			       buffer_object->data,
 			       sizeof(surface_object->params.vp8.probability));
 			surface_object->params.vp8.probability_set = true;
 			break;
 		default:
@@ -307,34 +165,14 @@ static VAStatus codec_set_controls(struct request_data *driver_data,
 	case VAProfileH264ConstrainedBaseline:
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
-		rc = h264_set_controls(driver_data, context, profile,
+		rc = h264_set_controls(driver_data, context, surface_object);
 				       surface_object);
 		if (rc < 0)
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;
-	case VAProfileHEVCMain:
+	/* HEVC stripped: kernel V4L2_CID_MPEG_VIDEO_HEVC_* CIDs were renamed
-		rc = h265_set_controls(driver_data, context, surface_object);
+	 * to V4L2_CID_STATELESS_HEVC_* upstream, and ohm's hantro VPU has no
-		if (rc < 0)
+	 * HEVC support anyway. Falls through to the default case below. */
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;
 	case VAProfileVP8Version0_3:
 		rc = vp8_set_controls(driver_data, context, surface_object);
 		if (rc < 0)
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;
 	case VAProfileVP9Profile0:
 		rc = vp9_set_controls(driver_data, context, surface_object);
 		if (rc < 0)
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;
 	case VAProfileAV1Profile0:
 		rc = av1_set_controls(driver_data, context, surface_object);
 		if (rc < 0)
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		break;
 	default:
 		return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
@@ -349,9 +187,6 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id,
 	struct request_data *driver_data = context->pDriverData;
 	struct object_context *context_object;
 	struct object_surface *surface_object;
 	struct request_pool_slot *slot;
 	int slot_index;
 	context_object = CONTEXT(driver_data, context_id);
 	if (context_object == NULL)
@@ -361,115 +196,9 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id,
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
 	/* AV1 Phase 3 diag */
 	request_log("BeginPicture id=0x%x prev_slot=%p status=%d\n",
 		    surface_object->base.id,
 		    (void *)surface_object->current_slot,
 		    surface_object->status);
 	if (surface_object->status == VASurfaceRendering)
 		RequestSyncSurface(context, surface_id);
 	/*
 	 * Iter2 Fix 3: acquire a CAPTURE-pool slot for this decode cycle.
 	 * If the surface still holds a slot from a prior decode (DECODED
 	 * or EXPORTED — the consumer is done with it by definition since
 	 * we got back to BeginPicture for the same surface), release it
 	 * first. The new slot is bound and its V4L2 index + mmap pointers
 	 * are mirrored into surface_object->destination_* so the existing
 	 * QBUF/DQBUF/EXPBUF code paths see no behavioral change.
 	 *
 	 * AV1 Phase 3 finding: LIBVA_SKIP_REBIND=1 experiment (do NOT
 	 * unbind on rebind) did not improve PASS count for the av1_larger
 	 * film_grain stress vector — proving the iter2 Fix 3 release is
 	 * NOT the source of the inter-frame divergence. The issue is
 	 * deeper in ffmpeg-vaapi's AV1 hwaccel: per byte-equal OUTPUT
 	 * comparison with the patched-ffmpeg-v4l2request reference run
 	 * (LD_LIBRARY_PATH override on a debug libavcodec.so), 7/7 first
 	 * EndPicture submissions are byte-identical, libva has 2 EXTRA.
 	 */
 	if (surface_object->current_slot != NULL)
 		surface_unbind_slot(driver_data, surface_object);
 	/*
 	 * AV1 Phase 5 review Amendment 4: clear any stale
 	 * linked_decode_surface_id from a prior film_grain display→decode
 	 * link. If ffmpeg-vaapi recycles a former display surface as a
 	 * decode target, BeginPicture binds a fresh slot — but without
 	 * this reset, copy_surface_to_image's link-follow would still
 	 * borrow from the now-stale linked surface and serve wrong data.
 	 * Cleared unconditionally (cheap) so the next AV1 grain frame
 	 * re-establishes the link if needed.
 	 */
 	surface_object->linked_decode_surface_id = VA_INVALID_SURFACE;
 	{
 		struct cap_pool_slot *cap_slot =
 			cap_pool_acquire(&driver_data->capture_pool, surface_id);
 		if (cap_slot == NULL)
 			return VA_STATUS_ERROR_ALLOCATION_FAILED;
 		surface_bind_slot(surface_object, cap_slot);
 		/*
 		 * iter8 Phase 7 IMP-1 experiment: env-gated CAPTURE buffer
 		 * pre-zero. LIBVA_V4L2_ZERO_CAPTURE=1 wipes the slot's mmap'd
 		 * region before kernel decode. Discriminates "kernel writes
 		 * partial then aborts" from "kernel writes nothing and we
 		 * see stale residue."
 		 */
 		{
 			static const char *zero_env = NULL;
 			static bool zero_env_checked = false;
 			if (!zero_env_checked) {
 				zero_env = getenv("LIBVA_V4L2_ZERO_CAPTURE");
 				zero_env_checked = true;
 			}
 			if (zero_env != NULL && zero_env[0] == '1') {
 				unsigned int b;
 				for (b = 0; b < cap_slot->buffers_count; b++)
 					if (cap_slot->map[b] != NULL)
 						memset(cap_slot->map[b], 0,
 						       cap_slot->map_lengths[b]);
 			}
 		}
 	}
 	/*
 	 * Borrow an OUTPUT (bitstream-input) slot from the driver-wide
 	 * pool for the duration of this Begin/Render/End cycle. The
 	 * surface's source_* fields hold the borrow's mmap pointer/size/
 	 * V4L2 buffer index until RequestSyncSurface releases it after
 	 * VIDIOC_DQBUF.
 	 */
 	slot_index = request_pool_acquire(&driver_data->output_pool);
 	if (slot_index < 0)
 		return VA_STATUS_ERROR_ALLOCATION_FAILED;
 	slot = request_pool_slot(&driver_data->output_pool,
 				 (unsigned int)slot_index);
 	if (slot == NULL) {
 		request_pool_release(&driver_data->output_pool,
 				     (unsigned int)slot_index);
 		return VA_STATUS_ERROR_ALLOCATION_FAILED;
 	}
 	surface_object->source_index = slot->index;
 	surface_object->source_data = slot->data;
 	surface_object->source_size = slot->size;
 	/*
 	 * iter6: bind the slot's permanent request_fd to this surface for the
 	 * duration of the decode cycle. Replaces the iter4 close+alloc-per-
 	 * frame model. The fd is REINIT'd (not closed) at RequestSyncSurface,
 	 * so the kernel-side request object is reset in place — no fd-reuse
 	 * race with another slot's pending decode.
 	 */
 	surface_object->request_fd = slot->request_fd;
 	surface_object->slices_size = 0;
 	surface_object->slices_count = 0;
 	surface_object->params.h264.matrix_set = false;
 	surface_object->params.h265.num_slices = 0;
 	surface_object->params.vp8.iqmatrix_set = false;
 	surface_object->params.vp8.probability_set = false;
 	surface_object->status = VASurfaceRendering;
 	context_object->render_surface_id = surface_id;
@@ -505,8 +234,7 @@ VAStatus RequestRenderPicture(VADriverContextP context, VAContextID context_id,
 		if (buffer_object == NULL)
 			return VA_STATUS_ERROR_INVALID_BUFFER;
-		rc = codec_store_buffer(driver_data, context_object,
+		rc = codec_store_buffer(driver_data, config_object->profile,
 					config_object->profile,
 					surface_object, buffer_object);
 		if (rc != VA_STATUS_SUCCESS)
 			return rc;
@@ -547,75 +275,22 @@ VAStatus RequestEndPicture(VADriverContextP context, VAContextID context_id)
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
-	/*
+	gettimeofday(&surface_object->timestamp, NULL);
 	 * iter9 α-7: monotonic per-context counter instead of gettimeofday,
 	 * so DPB.reference_ts / OUTPUT QBUF ts stay small (matches
 	 * ffmpeg-v4l2request's pattern). Confirmed in iter30 sweep
 	 * (1×, 1000×, 1000000× multipliers all produce identical output);
 	 * the counter scheme works on both rkvdec and hantro vb2_find_buffer.
 	 */
 	context_object->timestamp_counter++;
 	surface_object->timestamp.tv_sec =
 		(time_t)(context_object->timestamp_counter / 1000000);
 	surface_object->timestamp.tv_usec =
 		(suseconds_t)(context_object->timestamp_counter % 1000000);
 	/*
 	 * iter6: request_fd was bound to the surface in BeginPicture from
 	 * the OUTPUT pool slot's permanent fd. Per-frame allocation is gone.
 	 */
 	request_fd = surface_object->request_fd;
-	if (request_fd < 0)
+	if (request_fd < 0) {
-		return VA_STATUS_ERROR_OPERATION_FAILED;
+		request_fd = media_request_alloc(driver_data->media_fd);
 		if (request_fd < 0)
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		surface_object->request_fd = request_fd;
 	}
 	rc = codec_set_controls(driver_data, context_object,
 				config_object->profile, surface_object);
 	if (rc != VA_STATUS_SUCCESS)
 		return rc;
 	/*
 	 * iter14 α-16: env-gated dump of OUTPUT bitstream bytes immediately
 	 * before QBUF. LIBVA_V4L2_DUMP_OUTPUT=<dir> writes source_data[0..
 	 * slices_size] to <dir>/output_<profile>_<surface>_<frame>.bin.
 	 * Discriminates whether libva writes the same H.264/HEVC slice bytes
 	 * as kdirect — if YES, Bug 4/5 are not in the OUTPUT-side; if NO,
 	 * narrow to which slice-write path produces the divergence.
 	 *
 	 * Off by default; no behavior change when env unset.
 	 */
 	{
 		static const char *dump_env = NULL;
 		static bool dump_env_checked = false;
 		if (!dump_env_checked) {
 			dump_env = getenv("LIBVA_V4L2_DUMP_OUTPUT");
 			dump_env_checked = true;
 		}
 		if (dump_env != NULL && dump_env[0] != '\0' &&
 		    surface_object->source_data != NULL &&
 		    surface_object->slices_size > 0) {
 			char path[256];
 			snprintf(path, sizeof(path),
 				 "%s/output_p%d_s%u_t%llu.bin",
 				 dump_env, (int)config_object->profile,
 				 (unsigned int)surface_object->base.id,
 				 (unsigned long long)context_object->timestamp_counter);
 			FILE *fp = fopen(path, "wb");
 			if (fp != NULL) {
 				size_t w = fwrite(surface_object->source_data,
 						  1, surface_object->slices_size,
 						  fp);
 				request_log("α-16: dumped %zu bytes to %s "
 					    "(slices_count=%u)\n",
 					    w, path,
 					    surface_object->slices_count);
 				fclose(fp);
 			} else {
 				request_log("α-16: fopen(%s) failed: %s\n",
 					    path, strerror(errno));
 			}
 		}
 	}
 	rc = v4l2_queue_buffer(driver_data->video_fd, -1, capture_type, NULL,
 			       surface_object->destination_index, 0,
 			       surface_object->destination_buffers_count);
@@ -25,12 +25,10 @@
 */
 #include "buffer.h"
 #include "cap_pool.h"
 #include "config.h"
 #include "context.h"
 #include "image.h"
 #include "picture.h"
 #include "request_pool.h"
 #include "subpicture.h"
 #include "surface.h"
@@ -41,9 +39,9 @@
 #include "request.h"
 #include "utils.h"
 #include "v4l2.h"
 #include "video.h"
 #include <assert.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -54,524 +52,8 @@
 #include <sys/ioctl.h>
 #include <linux/media.h>
 #include <linux/videodev2.h>
 #include "hevc-ctrls/v4l2-hevc-ext-controls.h"
 /*
 * fresnel-fourier iter4 Phase 6 commit Z + iter7 Phase 6 (B1a): device-path
 * auto-detect via media controller topology with decoder-entity discrimination.
 *
 * Pre-iter4 the backend hardcoded /dev/video0 + /dev/media0. On Linux 7.0 the
 * udev/probe order changed and rockchip-rga (an RGB color converter, no codec
 * support) now claims /dev/video0 — the legacy default returns an empty
 * profile list. iter4 commit Z replaced enumeration-order discovery with
 * media-topology discovery.
 *
 * iter7 (B1a): the iter4 walk treated the hantro-vpu driver name as a single
 * unit, but hantro-vpu registers BOTH encoder and decoder entities under one
 * /dev/mediaN on RK3399. iter4's "pick the first V4L_VIDEO interface" could
 * land on the encoder. iter7 walks ENTITIES looking for
 * MEDIA_ENT_F_PROC_VIDEO_DECODER, then follows the kernel's link graph
 * (data link from proc to IO entity, interface link from IO entity to V4L
 * interface) to the correct /dev/videoN.
 *
 * Two-pass to prefer rkvdec: pass 1 accepts only "rkvdec" (multi-codec
 * decoder, 3 of 5 codecs); pass 2 accepts any known decoder driver. On
 * RK3399 this makes auto-detect always pick rkvdec when available.
 *
 * iter4-B1b (multi-decoder routing — open BOTH rkvdec AND hantro from one
 * backend instance, dispatch per codec) is still deferred. Post-iter7 the
 * backend opens one decoder per process; MPEG-2/VP8 (hantro) still need
 * explicit LIBVA_V4L2_REQUEST_VIDEO_PATH override when iter7's pass-1
 * lands on rkvdec.
 *
 * Escape hatch: LIBVA_V4L2_REQUEST_NO_AUTODETECT=1 reverts to legacy
 * hardcoded /dev/video0 + /dev/media0 for callers that relied on it.
 */
 /*
 * NULL-terminated list of media-controller driver names. It is compared
 * against media_device_info.driver by find_decoder_device_by_driver() when
 * that function is called with want_driver == NULL.
 * NOTE(review): "sun4i_csi" reads like a camera-capture driver name rather
 * than a decoder — confirm it belongs in this list.
 */
 static const char * const known_decoder_drivers[] = {
 	"rkvdec",
 	"hantro-vpu",
 	"cedrus",
 	"sun4i_csi",
 	NULL
 };
 /*
 * Translate a character-device major:minor pair into a "/dev/<name>" path.
 *
 * The name is the last path component of the symlink target of
 * /sys/dev/char/<major>:<minor>.
 *
 * major, minor: device numbers to look up.
 * out, out_sz:  caller buffer receiving the NUL-terminated path.
 *
 * Returns 0 on success. Returns -1 when out is unusable, the sysfs link
 * cannot be read, or either the link target or the resulting path would
 * have been truncated (a truncated device path must never be reported as
 * a success, the caller would go on to open the wrong node).
 */
 static int resolve_dev_node(uint32_t major, uint32_t minor, char *out, size_t out_sz)
 {
 	char sysfs_path[64], target[256];
 	const char *base;
 	ssize_t n;
 	int len;
 	if (out == NULL || out_sz == 0)
 		return -1;
 	len = snprintf(sysfs_path, sizeof sysfs_path, "/sys/dev/char/%u:%u",
 		       (unsigned int)major, (unsigned int)minor);
 	if (len < 0 || (size_t)len >= sizeof sysfs_path)
 		return -1;
 	n = readlink(sysfs_path, target, sizeof target - 1);
 	if (n < 0)
 		return -1;
 	/* readlink() truncates silently; a completely filled buffer means
 	 * the final component (the part we need) may be cut short. */
 	if ((size_t)n >= sizeof target - 1)
 		return -1;
 	target[n] = '\0';
 	base = strrchr(target, '/');
 	base = base ? base + 1 : target;
 	len = snprintf(out, out_sz, "/dev/%s", base);
 	if (len < 0 || (size_t)len >= out_sz)
 		return -1;
 	return 0;
 }
 /*
 * iter7 B1a: walk topology graph from decoder-proc entity to its V4L_VIDEO
 * interface.
 *
 * media_fd is an open /dev/mediaN descriptor; video_out/video_out_sz is the
 * caller's buffer for the resolved /dev/videoN path.
 *
 * Returns 0 + sets video_out on success, -1 if the topology cannot be read
 * or this media device has no decoder entity that leads to a resolvable
 * V4L_VIDEO interface (e.g. encoder-only device).
 *
 * Algorithm (per Phase 5 review, empirically validated against
 * boltzmann:~/src/linux-rockchip):
 *   1. For each entity E with function == MEDIA_ENT_F_PROC_VIDEO_DECODER:
 *   2.   Find IO entity neighbors via DATA links (entity↔entity).
 *   3.   Find the V4L_VIDEO interface via INTERFACE links from those IO
 *        neighbors.
 *   4.   Resolve interface.devnode.major:minor to /dev/videoN.
 *
 * Two-call MEDIA_IOC_G_TOPOLOGY pattern (Phase 5 IMP-3): first call gets
 * counts; second call fills the four arrays after we allocate them.
 *
 * Link discrimination: the link type is a multi-bit field of flags
 * (MEDIA_LNK_FL_LINK_TYPE), so it is compared as a field instead of being
 * tested as a single bit. Data links carry MEDIA_LNK_FL_DATA_LINK,
 * interface links MEDIA_LNK_FL_INTERFACE_LINK; links of any other type are
 * skipped by both walks. source_id/sink_id ordering is not guaranteed —
 * check both endpoints.
 */
 static int find_decoder_video_node_via_topology(int media_fd,
 						char *video_out,
 						size_t video_out_sz)
 {
 	struct media_v2_topology topo;
 	struct media_v2_entity *entities = NULL;
 	struct media_v2_interface *interfaces = NULL;
 	struct media_v2_link *links = NULL;
 	struct media_v2_pad *pads = NULL;
 	int ret = -1;
 	unsigned int i, j;
 	/* First call: all array pointers are zero, only the counts come back. */
 	memset(&topo, 0, sizeof topo);
 	if (ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topo) < 0)
 		return -1;
 	if (topo.num_entities == 0 || topo.num_interfaces == 0 ||
 	    topo.num_links == 0 || topo.num_pads == 0)
 		return -1;
 	entities   = calloc(topo.num_entities,   sizeof *entities);
 	interfaces = calloc(topo.num_interfaces, sizeof *interfaces);
 	links      = calloc(topo.num_links,      sizeof *links);
 	pads       = calloc(topo.num_pads,       sizeof *pads);
 	if (!entities || !interfaces || !links || !pads)
 		goto out;
 	/* Second call: hand the kernel our buffers to fill. */
 	topo.ptr_entities   = (uintptr_t)entities;
 	topo.ptr_interfaces = (uintptr_t)interfaces;
 	topo.ptr_links      = (uintptr_t)links;
 	topo.ptr_pads       = (uintptr_t)pads;
 	if (ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topo) < 0)
 		goto out;
 	for (i = 0; i < topo.num_entities; i++) {
 		uint32_t proc_id;
 		uint32_t proc_pad_ids[16];
 		uint32_t io_entity_ids[16];
 		unsigned int proc_pad_count = 0;
 		unsigned int io_count = 0;
 		if (entities[i].function != MEDIA_ENT_F_PROC_VIDEO_DECODER)
 			continue;
 		proc_id = entities[i].id;
 		/* Step 2a: collect pads belonging to the proc entity. Data
 		 * links connect PADs, not entities directly. Pads beyond the
 		 * fixed-size table are ignored. */
 		for (j = 0; j < topo.num_pads; j++) {
 			if (pads[j].entity_id != proc_id)
 				continue;
 			if (proc_pad_count < (sizeof proc_pad_ids /
 					      sizeof proc_pad_ids[0]))
 				proc_pad_ids[proc_pad_count++] = pads[j].id;
 		}
 		/* Step 2b: walk data links. For each link with either endpoint
 		 * in proc_pad_ids[], the other endpoint is a pad belonging to
 		 * an IO neighbor. Resolve that pad's entity_id via pads[]. */
 		for (j = 0; j < topo.num_links; j++) {
 			uint32_t other_pad = 0;
 			unsigned int k;
 			if ((links[j].flags & MEDIA_LNK_FL_LINK_TYPE) !=
 			    MEDIA_LNK_FL_DATA_LINK)
 				continue;
 			for (k = 0; k < proc_pad_count; k++) {
 				if (links[j].source_id == proc_pad_ids[k])
 					other_pad = links[j].sink_id;
 				else if (links[j].sink_id == proc_pad_ids[k])
 					other_pad = links[j].source_id;
 				if (other_pad != 0)
 					break;
 			}
 			if (other_pad == 0)
 				continue;
 			/* Resolve other_pad to its entity_id. */
 			for (k = 0; k < topo.num_pads; k++) {
 				if (pads[k].id != other_pad)
 					continue;
 				if (io_count < (sizeof io_entity_ids /
 						sizeof io_entity_ids[0]))
 					io_entity_ids[io_count++] =
 						pads[k].entity_id;
 				break;
 			}
 		}
 		/* Step 3-4: find an interface link from any IO entity neighbor;
 		 * resolve devnode for the linked V4L_VIDEO interface.
 		 * Interface links connect interfaces↔entities directly (not
 		 * via pads), so source_id/sink_id is an entity ID on one side
 		 * and an interface ID on the other. */
 		for (j = 0; j < topo.num_links; j++) {
 			uint32_t intf_id = 0;
 			unsigned int k;
 			if ((links[j].flags & MEDIA_LNK_FL_LINK_TYPE) !=
 			    MEDIA_LNK_FL_INTERFACE_LINK)
 				continue;
 			for (k = 0; k < io_count; k++) {
 				if (links[j].source_id == io_entity_ids[k])
 					intf_id = links[j].sink_id;
 				else if (links[j].sink_id == io_entity_ids[k])
 					intf_id = links[j].source_id;
 				if (intf_id != 0)
 					break;
 			}
 			if (intf_id == 0)
 				continue;
 			for (k = 0; k < topo.num_interfaces; k++) {
 				if (interfaces[k].id != intf_id)
 					continue;
 				/* Non-video interface: give up on this link
 				 * and try the next one. */
 				if (interfaces[k].intf_type !=
 				    MEDIA_INTF_T_V4L_VIDEO)
 					break;
 				if (resolve_dev_node(
 					    interfaces[k].devnode.major,
 					    interfaces[k].devnode.minor,
 					    video_out, video_out_sz) == 0)
 					ret = 0;
 				break;
 			}
 			if (ret == 0)
 				goto out;
 		}
 	}
 out:
 	free(entities);
 	free(interfaces);
 	free(links);
 	free(pads);
 	return ret;
 }
 /*
 * iter7 B1a (documents find_codec_device(), defined further below):
 * two-pass walk of /dev/media0..N. Pass 1 accepts only "rkvdec"
 * (multi-codec decoder serving 3 of 5 codecs). Pass 2 accepts any
 * known_decoder_drivers entry. Within each pass, the chosen media device
 * must ALSO contain at least one MEDIA_ENT_F_PROC_VIDEO_DECODER entity —
 * this guards against encoder-only devices that happen to share the same
 * driver name (e.g. hantro-vpu encoder vs decoder inside one /dev/mediaN).
 */
 /*
 * iter38 (documents find_decoder_device_by_driver(), defined further
 * below): locate a /dev/mediaN whose driver name matches `want_driver`
 * AND exposes at least one MEDIA_ENT_F_PROC_VIDEO_DECODER entity (rules
 * out encoder-only devices sharing the same driver name). Resolves the
 * matching /dev/videoM via topology graph walk.
 *
 * `want_driver`:
 *   - non-NULL → match only that exact driver name
 *   - NULL     → match any name in known_decoder_drivers[]
 */
 /*
 * iter2 (ampere-kernel-decoders campaign) — runtime probe for the
 * V4L2 stateless HEVC EXT_SPS_{ST,LT}_RPS controls added in
 * Linux 7.0 (Casanova VDPU381/VDPU383 series). Returns true iff BOTH
 * controls are registered on the given fd. Stored per-fd on
 * driver_data so the multi-device-probe model (iter38) doesn't
 * silently misbehave when codec routing switches devices.
 *
 * The two CIDs together are the gate — neither alone is meaningful
 * without the other (st-RPS + lt-RPS arrays both need to be set to
 * match the SPS num_short_term_ref_pic_sets / num_long_term_ref_pics_sps
 * counts). Old kernels (RK3399 rkvdec on linux 6.x) register neither;
 * RK3588 rkvdec (VDPU381/383 path) registers both.
 *
 * Reference: phase4_plan_iter2.md §Step 3 in
 * ~/src/ampere-kernel-decoders/.
 */
 static bool probe_hevc_ext_sps_rps_controls(int video_fd)
 {
 	/* Both controls must answer VIDIOC_QUERYCTRL for the probe to pass. */
 	const uint32_t wanted_cids[2] = {
 		V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS,
 		V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS,
 	};
 	unsigned int n;
 	/* A device that was never opened cannot register anything. */
 	if (video_fd < 0)
 		return false;
 	for (n = 0; n < 2; n++) {
 		struct v4l2_queryctrl query;
 		memset(&query, 0, sizeof(query));
 		query.id = wanted_cids[n];
 		if (ioctl(video_fd, VIDIOC_QUERYCTRL, &query) < 0)
 			return false;
 	}
 	return true;
 }
 /*
 * Inspect a /dev/videoN's OUTPUT formats for `want_pixfmt`. Returns true
 * iff at least one OUTPUT/OUTPUT_MPLANE format matches.
 *
 * Used to discriminate between multiple devices sharing a driver name —
 * RK3588 has 3 hantro-vpu instances and only one of them is vpu981 (the
 * dedicated AV1 decoder advertising V4L2_PIX_FMT_AV1_FRAME).
 */
 static bool video_node_supports_output_fmt(int video_fd, uint32_t want_pixfmt)
 {
 	/* Queue types to enumerate, multi-planar first. */
 	const enum v4l2_buf_type queue_types[] = {
 		V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE,
 		V4L2_BUF_TYPE_VIDEO_OUTPUT,
 	};
 	const unsigned int queue_count =
 		sizeof(queue_types) / sizeof(queue_types[0]);
 	/* Upper bound on format indices tried per queue type. */
 	const unsigned int max_formats = 64;
 	unsigned int q;
 	for (q = 0; q < queue_count; q++) {
 		unsigned int idx = 0;
 		while (idx < max_formats) {
 			struct v4l2_fmtdesc fmt;
 			memset(&fmt, 0, sizeof fmt);
 			fmt.index = idx;
 			fmt.type = queue_types[q];
 			/* Enumeration failure ends this queue type only. */
 			if (ioctl(video_fd, VIDIOC_ENUM_FMT, &fmt) < 0)
 				break;
 			if (fmt.pixelformat == want_pixfmt)
 				return true;
 			idx++;
 		}
 	}
 	return false;
 }
 static int find_decoder_device_by_driver(const char *want_driver,
 					 char *video_out, size_t video_out_sz,
 					 char *media_out, size_t media_out_sz)
 {
 	int index;
 	/* Probe /dev/media0 .. /dev/media15 in order; first hit wins. */
 	for (index = 0; index < 16; index++) {
 		struct media_device_info devinfo;
 		char media_path[32];
 		bool accepted = false;
 		bool resolved = false;
 		int media_fd;
 		snprintf(media_path, sizeof media_path, "/dev/media%d", index);
 		media_fd = open(media_path, O_RDWR | O_NONBLOCK);
 		if (media_fd < 0)
 			continue;
 		memset(&devinfo, 0, sizeof devinfo);
 		if (ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &devinfo) == 0) {
 			if (want_driver != NULL) {
 				/* Exact driver requested. */
 				accepted = strcmp(devinfo.driver,
 						  want_driver) == 0;
 			} else {
 				/* Any entry of the known-driver table will do. */
 				const char * const *name = known_decoder_drivers;
 				while (*name != NULL && !accepted) {
 					accepted = strcmp(devinfo.driver,
 							  *name) == 0;
 					name++;
 				}
 			}
 		}
 		/* The driver name alone is not enough: the device must also
 		 * yield a decoder video node through the topology walk. */
 		if (accepted)
 			resolved = find_decoder_video_node_via_topology(
 					   media_fd, video_out,
 					   video_out_sz) == 0;
 		if (resolved)
 			snprintf(media_out, media_out_sz, "%s", media_path);
 		close(media_fd);
 		if (resolved)
 			return 0;
 	}
 	return -1;
 }
 /*
 * ampere-av1-enablement Phase 2 — like find_decoder_device_by_driver but
 * additionally verifies the resolved /dev/videoN advertises `want_pixfmt`
 * as an OUTPUT format. Required for RK3588 where 3 hantro-vpu instances
 * share the driver name but only one is vpu981 (AV1 decoder).
 *
 * Walks all /dev/media* with matching driver name; takes the first hit
 * whose OUTPUT formats include `want_pixfmt`. Non-matching candidates
 * (encoder-only nodes, legacy hantro for MPEG2/VP8) are skipped.
 */
 static int find_decoder_device_by_driver_with_fmt(const char *want_driver,
 						  uint32_t want_pixfmt,
 						  char *video_out,
 						  size_t video_out_sz,
 						  char *media_out,
 						  size_t media_out_sz)
 {
 	int index;
 	for (index = 0; index < 16; index++) {
 		struct media_device_info devinfo;
 		char media_path[32];
 		char node_path[32];
 		bool candidate;
 		bool has_fmt;
 		int media_fd, node_fd;
 		snprintf(media_path, sizeof media_path, "/dev/media%d", index);
 		media_fd = open(media_path, O_RDWR | O_NONBLOCK);
 		if (media_fd < 0)
 			continue;
 		memset(&devinfo, 0, sizeof devinfo);
 		/* A candidate answers DEVICE_INFO, carries the wanted driver
 		 * name and resolves to a decoder video node (checked in that
 		 * order, stopping at the first failure). */
 		candidate = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO,
 				  &devinfo) == 0 &&
 			    strcmp(devinfo.driver, want_driver) == 0 &&
 			    find_decoder_video_node_via_topology(
 				    media_fd, node_path,
 				    sizeof node_path) == 0;
 		/* The media fd is not needed for the format check. */
 		close(media_fd);
 		if (!candidate)
 			continue;
 		/* Capability check: does this /dev/videoN advertise the
 		 * codec-specific OUTPUT format? */
 		node_fd = open(node_path, O_RDWR | O_NONBLOCK);
 		if (node_fd < 0)
 			continue;
 		has_fmt = video_node_supports_output_fmt(node_fd, want_pixfmt);
 		close(node_fd);
 		if (!has_fmt)
 			continue;
 		snprintf(video_out, video_out_sz, "%s", node_path);
 		snprintf(media_out, media_out_sz, "%s", media_path);
 		return 0;
 	}
 	return -1;
 }
 static int find_codec_device(char *video_out, size_t video_out_sz,
 			     char *media_out, size_t media_out_sz)
 {
 	/* Pass order: "rkvdec" first, then NULL meaning "any known decoder
 	 * driver" as understood by find_decoder_device_by_driver(). */
 	const char * const pass_driver[2] = { "rkvdec", NULL };
 	unsigned int pass;
 	for (pass = 0; pass < 2; pass++) {
 		if (find_decoder_device_by_driver(pass_driver[pass],
 						  video_out, video_out_sz,
 						  media_out, media_out_sz) == 0)
 			return 0;
 	}
 	return -1;
 }
 /*
 * iter38: map a VA profile to the physical decoder device that should
 * serve it. Returns 'r' for rkvdec, 'h' for hantro, 'a' for the vpu981
 * AV1 decoder, '?' for any profile without a mapping.
 *
 * This is RK3399-shaped knowledge — a more general impl would interrogate
 * each open device's supported OUTPUT formats. For the campaign-scope
 * codecs, the mapping is stable and explicit.
 */
 char request_device_kind_for_profile(VAProfile profile);
 char request_device_kind_for_profile(VAProfile profile)
 {
 	char kind = '?';
 	switch (profile) {
 	case VAProfileH264Main:
 	case VAProfileH264High:
 	case VAProfileH264ConstrainedBaseline:
 	case VAProfileH264MultiviewHigh:
 	case VAProfileH264StereoHigh:
 	case VAProfileHEVCMain:
 	case VAProfileVP9Profile0:
 		kind = 'r';
 		break;
 	case VAProfileMPEG2Simple:
 	case VAProfileMPEG2Main:
 	case VAProfileVP8Version0_3:
 		kind = 'h';
 		break;
 	case VAProfileAV1Profile0:
 		/* ampere-av1-enablement: vpu981 dedicated AV1 */
 		kind = 'a';
 		break;
 	default:
 		break;
 	}
 	return kind;
 }
 /*
 * iter38: retarget driver_data->{video,media}_fd to the device kind
 * required by `profile`. If a switch is needed, tear down any per-device
 * pool state so the next RequestCreateContext rebuilds it against the
 * new device.
 *
 * Returns 0 when the required pair is already active, when the switch was
 * performed, or when the required pair was never probed (the active fds
 * are then left untouched). Returns -1 only when `profile` maps to no
 * device kind.
 *
 * Safe to call repeatedly with the same profile: the repeat call finds
 * the pair already active and changes nothing.
 */
 int request_switch_device_for_profile(struct request_data *driver_data,
 				      VAProfile profile);
 int request_switch_device_for_profile(struct request_data *driver_data,
 				      VAProfile profile)
 {
 	int want_video, want_media;
 	bool already_active;
 	switch (request_device_kind_for_profile(profile)) {
 	case 'r':
 		want_video = driver_data->video_fd_rkvdec;
 		want_media = driver_data->media_fd_rkvdec;
 		break;
 	case 'h':
 		want_video = driver_data->video_fd_hantro;
 		want_media = driver_data->media_fd_hantro;
 		break;
 	case 'a':
 		want_video = driver_data->video_fd_vpu981;
 		want_media = driver_data->media_fd_vpu981;
 		break;
 	default:
 		return -1;
 	}
 	/* Either side never probed (e.g. env-override single-device init,
 	 * or this kind isn't present on the running kernel) → tolerate by
 	 * staying on whatever's already active. RequestCreateConfig still
 	 * accepted the profile via the format probe, so the active fd
 	 * supports it. */
 	if (want_video < 0 || want_media < 0)
 		return 0;
 	already_active = driver_data->video_fd == want_video &&
 			 driver_data->media_fd == want_media;
 	if (already_active)
 		return 0;
 	/*
 	 * Tear down any per-device pool state. cap_pool needs capture_type,
 	 * which comes from video_format. Both rkvdec and hantro use
 	 * V4L2_PIX_FMT_NV12 MPLANE on RK3399 (verified Phase 0 inventory)
 	 * so the MPLANE form is always right here. The capture pool is
 	 * destroyed against the fd that is still active at this point.
 	 */
 	if (driver_data->capture_pool.initialized) {
 		cap_pool_destroy(&driver_data->capture_pool,
 				 driver_data->video_fd,
 				 V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
 	}
 	if (driver_data->output_pool.initialized) {
 		request_pool_destroy(&driver_data->output_pool);
 	}
 	/* video_format is a static-ref pointer; re-probe on next
 	 * CreateContext since the new device's format menu may differ. */
 	driver_data->video_format = NULL;
 	driver_data->fmt_valid = false;
 	/* Finally make the requested pair the active one. */
 	driver_data->video_fd = want_video;
 	driver_data->media_fd = want_media;
 	return 0;
 }
 /*
 * Forward declaration of the driver init entry point, tagged with default
 * symbol visibility. Set default visibility for the init function only.
 * NOTE(review): presumably the rest of the library is built with hidden
 * visibility so this is the sole exported symbol — confirm against the
 * build flags.
 */
 VAStatus __attribute__((visibility("default")))
 VA_DRIVER_INIT_FUNC(VADriverContextP context);
@@ -665,23 +147,9 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 	object_heap_init(&driver_data->image_heap, sizeof(struct object_image),
 			 IMAGE_ID_OFFSET);
 	static char auto_video[32], auto_media[32];
 	bool auto_media_set = false;
 	video_path = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
-	if (video_path == NULL) {
+	if (video_path == NULL)
-		if (getenv("LIBVA_V4L2_REQUEST_NO_AUTODETECT")) {
+		video_path = "/dev/video0";
 			video_path = "/dev/video0";
 		} else if (find_codec_device(auto_video, sizeof auto_video,
 					     auto_media, sizeof auto_media) == 0) {
 			video_path = auto_video;
 			auto_media_set = true;
 			request_log("auto-selected codec device: %s + %s\n",
 				    auto_video, auto_media);
 		} else {
 			video_path = "/dev/video0";
 		}
 	}
 	video_fd = open(video_path, O_RDWR | O_NONBLOCK);
 	if (video_fd < 0)
@@ -702,12 +170,8 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 	}
 	media_path = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
-	if (media_path == NULL) {
+	if (media_path == NULL)
-		if (auto_media_set)
+		media_path = "/dev/media0";
 			media_path = auto_media;
 		else
 			media_path = "/dev/media0";
 	}
 	media_fd = open(media_path, O_RDWR | O_NONBLOCK);
 	if (media_fd < 0)
@@ -715,138 +179,19 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 	driver_data->video_fd = video_fd;
 	driver_data->media_fd = media_fd;
 	driver_data->video_fd_rkvdec = -1;
 	driver_data->media_fd_rkvdec = -1;
 	driver_data->video_fd_hantro = -1;
 	driver_data->media_fd_hantro = -1;
 	driver_data->video_fd_vpu981 = -1;
 	driver_data->media_fd_vpu981 = -1;
-	/*
+	/* Probe the CAPTURE pixel format eagerly so vaCreateContext doesn't
-	 * iter38: probe BOTH rkvdec and hantro-vpu so a single libva session
+	 * race ahead of vaCreateSurfaces. Fine if this returns NULL — we
-	 * can serve all 5 codecs. Tag the primary fd (already opened above)
+	 * still let the driver init succeed and let surface creation try
-	 * by inspecting which driver the media_fd is on, then probe the OTHER
+	 * again later (preserves the original lazy behaviour for any caller
-	 * driver and open its fds if present. RequestCreateConfig retargets
+	 * that needs it). */
-	 * driver_data->{video,media}_fd to the right pair per profile.
+	driver_data->video_format = video_format_probe(video_fd);
-	 *
+	if (driver_data->video_format != NULL)
-	 * Skip the alt-probe when the user provided explicit
+		request_log("Init: detected CAPTURE format %s (mplane=%d)\n",
-	 * LIBVA_V4L2_REQUEST_VIDEO_PATH / MEDIA_PATH overrides — they signal
+			    driver_data->video_format->description,
-	 * a specific single device intent.
+			    driver_data->video_format->v4l2_mplane);
-	 */
+	else
-	if (!getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH") &&
+		request_log("Init: no CAPTURE format detected at probe time\n");
 	    !getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH")) {
 		struct media_device_info info;
 		const char *primary_driver = NULL;
 		const char *alt_driver = NULL;
 		memset(&info, 0, sizeof info);
 		if (ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &info) == 0) {
 			if (strcmp(info.driver, "rkvdec") == 0) {
 				primary_driver = "rkvdec";
 				alt_driver = "hantro-vpu";
 				driver_data->video_fd_rkvdec = video_fd;
 				driver_data->media_fd_rkvdec = media_fd;
 			} else if (strcmp(info.driver, "hantro-vpu") == 0) {
 				primary_driver = "hantro-vpu";
 				alt_driver = "rkvdec";
 				driver_data->video_fd_hantro = video_fd;
 				driver_data->media_fd_hantro = media_fd;
 			}
 		}
 		if (alt_driver != NULL) {
 			static char alt_video[32], alt_media[32];
 			if (find_decoder_device_by_driver(alt_driver,
 							  alt_video, sizeof alt_video,
 							  alt_media, sizeof alt_media) == 0) {
 				int alt_v = open(alt_video, O_RDWR | O_NONBLOCK);
 				int alt_m = (alt_v >= 0) ? open(alt_media, O_RDWR | O_NONBLOCK) : -1;
 				if (alt_v >= 0 && alt_m >= 0) {
 					if (strcmp(alt_driver, "rkvdec") == 0) {
 						driver_data->video_fd_rkvdec = alt_v;
 						driver_data->media_fd_rkvdec = alt_m;
 					} else {
 						driver_data->video_fd_hantro = alt_v;
 						driver_data->media_fd_hantro = alt_m;
 					}
 					request_log("iter38: also opened %s decoder at %s + %s\n",
 						    alt_driver, alt_video, alt_media);
 				} else {
 					if (alt_v >= 0) close(alt_v);
 					if (alt_m >= 0) close(alt_m);
 				}
 			}
 		}
 		(void)primary_driver;
 		/*
 		 * ampere-av1-enablement Phase 2 — additionally probe for
 		 * vpu981 (RK3588's dedicated AV1 decoder). Driver name
 		 * "hantro-vpu" alone is ambiguous on RK3588 (3 instances:
 		 * legacy MPEG2/VP8, encoder, vpu981 AV1). Discriminate by
 		 * V4L2_PIX_FMT_AV1_FRAME capability. If the primary or alt
 		 * hantro happens to BE vpu981 (unlikely but possible on
 		 * non-RK3588 boards), this probe finds it again and we just
 		 * dedupe via the fd value.
 		 */
 		{
 			static char av1_video[32], av1_media[32];
 			if (find_decoder_device_by_driver_with_fmt(
 				    "hantro-vpu", V4L2_PIX_FMT_AV1_FRAME,
 				    av1_video, sizeof av1_video,
 				    av1_media, sizeof av1_media) == 0) {
 				int av1_v = open(av1_video, O_RDWR | O_NONBLOCK);
 				int av1_m = (av1_v >= 0)
 					? open(av1_media, O_RDWR | O_NONBLOCK)
 					: -1;
 				if (av1_v >= 0 && av1_m >= 0) {
 					driver_data->video_fd_vpu981 = av1_v;
 					driver_data->media_fd_vpu981 = av1_m;
 					request_log(
 					    "ampere-av1: vpu981 AV1 decoder "
 					    "at %s + %s\n",
 					    av1_video, av1_media);
 				} else {
 					if (av1_v >= 0) close(av1_v);
 					if (av1_m >= 0) close(av1_m);
 				}
 			}
 		}
 	}
 	/*
 	 * iter2 (ampere-kernel-decoders): probe the new HEVC EXT_SPS_RPS
 	 * controls on each rkvdec/hantro fd. Result is consumed by
 	 * h265_set_controls per-codec gate. Per-fd storage matches the
 	 * iter38 multi-device-probe pattern (Phase 5 review item).
 	 */
 	driver_data->has_hevc_ext_sps_rps_rkvdec =
 		probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rkvdec);
 	driver_data->has_hevc_ext_sps_rps_hantro =
 		probe_hevc_ext_sps_rps_controls(driver_data->video_fd_hantro);
 	if (driver_data->has_hevc_ext_sps_rps_rkvdec) {
 		request_log("iter2: kernel registers HEVC EXT_SPS_{ST,LT}_RPS "
 			    "controls on rkvdec fd (will route through "
 			    "vendored GStreamer parser)\n");
 	}
 	/*
 	 * ampere-av1 Phase 2.1: probe V4L2_CID_STATELESS_AV1_FILM_GRAIN
 	 * on the vpu981 fd. Per Janet v3 amendment, this runs at backend
 	 * init (not lazily) so any race window with concurrent device
 	 * switching can't observe an inconsistent flag.
 	 */
 	driver_data->has_av1_film_grain = false;
 	if (driver_data->video_fd_vpu981 >= 0) {
 		struct v4l2_query_ext_ctrl qec;
 		if (v4l2_query_ext_ctrl(driver_data->video_fd_vpu981,
 					V4L2_CID_STATELESS_AV1_FILM_GRAIN,
 					&qec) == 0) {
 			driver_data->has_av1_film_grain = true;
 			request_log("ampere-av1: vpu981 advertises FILM_GRAIN "
 				    "control (will include in per-frame batch)\n");
 		}
 	}
 	status = VA_STATUS_SUCCESS;
 	goto complete;
@@ -874,40 +219,8 @@ VAStatus RequestTerminate(VADriverContextP context)
 	struct object_config *config_object;
 	int iterator;
-	/*
+	close(driver_data->video_fd);
-	 * Tear down the OUTPUT buffer pool before closing video_fd so
+	close(driver_data->media_fd);
 	 * the munmap calls in request_pool_destroy() can still touch the
 	 * mmap regions (which are tied to that fd's lifetime).
 	 */
 	request_pool_destroy(&driver_data->output_pool);
 	/*
 	 * iter38: close both probed device pairs. video_fd / media_fd above
 	 * are ACTIVE pointers into one of these pairs; close the underlying
 	 * fds explicitly. Each may be -1 if its device wasn't found.
 	 */
 	if (driver_data->video_fd_rkvdec >= 0)
 		close(driver_data->video_fd_rkvdec);
 	if (driver_data->media_fd_rkvdec >= 0)
 		close(driver_data->media_fd_rkvdec);
 	if (driver_data->video_fd_hantro >= 0)
 		close(driver_data->video_fd_hantro);
 	if (driver_data->media_fd_hantro >= 0)
 		close(driver_data->media_fd_hantro);
 	if (driver_data->video_fd_vpu981 >= 0)
 		close(driver_data->video_fd_vpu981);
 	if (driver_data->media_fd_vpu981 >= 0)
 		close(driver_data->media_fd_vpu981);
 	/* Fall back to direct close if neither alt fd captured the active
 	 * pair (env-override path). */
 	if (driver_data->video_fd_rkvdec < 0 &&
 	    driver_data->video_fd_hantro < 0 &&
 	    driver_data->video_fd_vpu981 < 0) {
 		if (driver_data->video_fd >= 0)
 			close(driver_data->video_fd);
 		if (driver_data->media_fd >= 0)
 			close(driver_data->media_fd);
 	}
 	/* Cleanup leftover buffers. */
@@ -31,15 +31,11 @@
 #include "context.h"
 #include "object_heap.h"
 #include "request_pool.h"
 #include "cap_pool.h"
 #include "video.h"
 #include <va/va.h>
 #include <linux/videodev2.h>
 #include "hevc-ctrls/v4l2-hevc-ext-controls.h"
 #define V4L2_REQUEST_STR_VENDOR			"v4l2-request"
 #define V4L2_REQUEST_MAX_PROFILES		11
@@ -58,163 +54,10 @@ struct request_data {
 	int video_fd;
 	int media_fd;
 	/*
 	 * iter38: multi-device probe. RK3399 has two V4L2 stateless decoders:
 	 *   - rkvdec → H264 / HEVC / VP9
 	 *   - hantro-vpu (rk3399-vpu-dec) → MPEG-2 / VP8
 	 * At VA_DRIVER_INIT we probe both, open their fds, and store them
 	 * here. driver_data->video_fd / media_fd above are the "active" fds
 	 * (point at one of the pairs below). RequestCreateConfig retargets
 	 * them based on the profile's required device. Pools and video_format
 	 * are torn down at retarget time so the next CreateContext rebuilds
 	 * them against the right device.
 	 *
 	 * -1 means that device kind isn't present on this kernel boot.
 	 * Honours LIBVA_V4L2_REQUEST_VIDEO_PATH / MEDIA_PATH explicit
 	 * overrides — when those are set, only the single requested device
 	 * is opened and the alt fds stay -1.
 	 */
 	int video_fd_rkvdec;
 	int media_fd_rkvdec;
 	int video_fd_hantro;
 	int media_fd_hantro;
 	/*
 	 * ampere-av1-enablement Phase 2 — vpu981 is a THIRD physical
 	 * hantro-vpu instance on RK3588 (separate from the legacy MPEG2/VP8
 	 * hantro at /dev/video2). It's the dedicated AV1 decoder at
 	 * /dev/video4 with card name "rockchip,rk3588-av1-vpu-dec".
 	 *
 	 * Driver-name alone ("hantro-vpu") is ambiguous on RK3588 — three
 	 * instances share the name. The probe discriminates by capability:
 	 * which OUTPUT format does the device advertise? Only vpu981
 	 * exposes V4L2_PIX_FMT_AV1_FRAME.
 	 */
 	int video_fd_vpu981;
 	int media_fd_vpu981;
 	/*
 	 * iter2 (ampere-kernel-decoders campaign) — per-fd probe result
 	 * for the V4L2_CID_STATELESS_HEVC_EXT_SPS_{ST,LT}_RPS controls
 	 * introduced in Linux 7.0 (Casanova VDPU381/VDPU383 series).
 	 * RK3399 rkvdec doesn't have them and the probe returns false;
 	 * RK3588 rkvdec (VDPU381/383) registers them and the probe is
 	 * true. h265_set_controls consults only the rkvdec entry because
 	 * HEVC routes through rkvdec only — hantro's entry stays false
 	 * naturally (it doesn't have rkvdec-specific controls).
 	 *
 	 * The pair-of-flags layout mirrors video_fd_rkvdec /
 	 * video_fd_hantro above (iter38 multi-device-probe pattern,
 	 * memory feedback_multi_device_probe_design). Phase 5 review
 	 * surfaced this as a correctness item: a single scalar on
 	 * driver_data would silently misbehave across device-switch
 	 * boundaries; per-fd storage is the safe shape.
 	 */
 	bool has_hevc_ext_sps_rps_rkvdec;
 	bool has_hevc_ext_sps_rps_hantro;
 	/*
 	 * ampere-av1 Phase 2.1: probe result for the optional
 	 * V4L2_CID_STATELESS_AV1_FILM_GRAIN control on the vpu981 fd.
 	 * Probed at VA_DRIVER_INIT (per Janet v3 amendment — init-time
 	 * not lazy). Consumed by av1_set_controls to conditionally include
 	 * the 4th control in the per-frame batch.
 	 *
 	 * True iff vpu981 advertises the control via VIDIOC_QUERY_EXT_CTRL.
 	 * False for non-RK3588 hosts (no vpu981 fd) or older kernels.
 	 */
 	bool has_av1_film_grain;
 	/*
 	 * iter2 — cached SPS-derived RPS arrays. SPS NALs only appear in
 	 * source_data on IDR frames; non-IDR frames' h265_set_controls
 	 * reuse the cached arrays so we don't submit zero-filled RPS to
 	 * the kernel (which would re-trigger the OOPS the iter2 fix is
 	 * designed to prevent). Single-slot cache (sps_id 0 only) —
 	 * adequate for the BBB / typical-stream case; multi-SPS streams
 	 * would need expanding to a [16] cache keyed by sps_id.
 	 *
 	 * The cache stores the post-mapped V4L2 control struct arrays
 	 * (not the intermediate GstH265SPS) so request.h doesn't need
 	 * to know about the vendored GStreamer parser types — only the
 	 * V4L2 UAPI structs from hevc-ctrls/v4l2-hevc-ext-controls.h
 	 * included above.
 	 *
 	 * Owned by h265.c; freed at RequestTerminate.
 	 */
 	struct v4l2_ctrl_hevc_ext_sps_st_rps *hevc_rps_cache_st;
 	unsigned int                          hevc_rps_cache_st_count;
 	struct v4l2_ctrl_hevc_ext_sps_lt_rps *hevc_rps_cache_lt;
 	unsigned int                          hevc_rps_cache_lt_count;
 	bool                                  hevc_rps_cache_valid;
 	struct video_format *video_format;
 	/*
 	 * OUTPUT (bitstream-input) buffer pool, decoupled from VA
 	 * surfaces. Sized by codec pipeline depth, populated on first
 	 * RequestCreateContext, torn down at driver Terminate.
 	 */
 	struct request_pool output_pool;
 	/*
 	 * CAPTURE (decoded-frame) buffer pool, decoupled from VA
 	 * surfaces (iter2 Fix 3). Each surface acquires a slot at
 	 * vaBeginPicture time and releases it on the next acquisition
 	 * or vaDestroySurfaces. Pool sized to max(surfaces_count,
 	 * MIN_CAP_POOL) at first vaCreateSurfaces2; torn down at
 	 * vaDestroyContext.
 	 *
 	 * Background: pre-iter2 each surface was 1:1 bound to one
 	 * CAPTURE buffer index; mpv re-using a surface for a new decode
 	 * caused V4L2 to re-QBUF the same physical buffer while a
 	 * compositor still held an EXPBUF'd dma_buf fd, producing
 	 * visible stutter on mpv vaapi --vo=gpu.
 	 */
 	struct cap_pool capture_pool;
 	/*
 	 * iter5b-β: the pre-β last_output_{width,height} cache fields
 	 * and surface_reset_format_cache() helper are deleted. They
 	 * existed because CreateSurfaces2 owned the OUTPUT-side V4L2
 	 * device-format lifecycle and needed to gate re-S_FMT on
 	 * resolution change. β moves that lifecycle to CreateContext,
 	 * which is naturally one-shot per context cycle; no caching is
 	 * required. DestroyContext + next CreateContext rebuild from
 	 * scratch.
 	 *
 	 * iter5b-β Commit D: cache the format-uniform CAPTURE-side
 	 * geometry from v4l2_get_format so CreateSurfaces2 can populate
 	 * a newly-created surface's destination_* fields without
 	 * re-querying the device. Set by CreateContext after the
 	 * v4l2_get_format(CAPTURE) call; consumed by both:
 	 *   1. CreateContext's surface_heap walk (fills surfaces that
 	 *      pre-exist when CreateContext fires);
 	 *   2. CreateSurfaces2's per-surface init (fills surfaces
 	 *      created AFTER CreateContext, e.g. ffmpeg vaapi-copy
 	 *      pool dynamics where the consumer passes surfaces_count=0
 	 *      to vaCreateContext and creates surfaces lazily).
 	 *
 	 * fmt_valid is true once CreateContext has populated the cache;
 	 * CreateSurfaces2 only lazy-fills when fmt_valid is true.
 	 */
 	bool fmt_valid;
 	unsigned int fmt_format_height;
 	unsigned int fmt_planes_count;
 	unsigned int fmt_buffers_count;
 	unsigned int fmt_sizes[VIDEO_MAX_PLANES];
 	unsigned int fmt_bytesperlines[VIDEO_MAX_PLANES];
 };
 VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context);
 VAStatus RequestTerminate(VADriverContextP context);
 /*
 * iter38: retarget driver_data->{video,media}_fd to the device required by
 * `profile`. Returns 0 on success, -1 on profile not mappable to any kind.
 * Defined in request.c.
 */
 int request_switch_device_for_profile(struct request_data *driver_data,
 				      VAProfile profile);
 #endif
@@ -1,226 +0,0 @@
 /*
 * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
 */
 #include "request_pool.h"
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include "media.h"
 #include "utils.h"
 #include "v4l2.h"
 int request_pool_init(struct request_pool *pool, int video_fd, int media_fd,
 		      unsigned int output_type, unsigned int count)
 {
 	unsigned int index_base;
 	unsigned int length;
 	unsigned int offset;
 	unsigned int i;
 	int rc;
 	if (pool == NULL || count == 0)
 		return -1;
 	if (pool->initialized)
 		return 0;
 	pool->slots = calloc(count, sizeof(*pool->slots));
 	if (pool->slots == NULL)
 		return -1;
 	pool->count = count;
 	pool->next = 0;
 	pool->media_fd = media_fd;	/* iter7: kept for force_release re-alloc */
 	for (i = 0; i < count; i++)
 		pool->slots[i].request_fd = -1;
 	rc = v4l2_create_buffers(video_fd, output_type, count, &index_base);
 	if (rc < 0)
 		goto error;
 	for (i = 0; i < count; i++) {
 		pool->slots[i].index = index_base + i;
 		pool->slots[i].busy = false;
 		rc = v4l2_query_buffer(video_fd, output_type,
 				       pool->slots[i].index,
 				       &length, &offset, 1);
 		if (rc < 0)
 			goto error;
 		pool->slots[i].data = mmap(NULL, length,
 					   PROT_READ | PROT_WRITE,
 					   MAP_SHARED, video_fd, offset);
 		if (pool->slots[i].data == MAP_FAILED) {
 			pool->slots[i].data = NULL;
 			goto error;
 		}
 		pool->slots[i].size = length;
 		/*
 		 * iter6: each pool slot owns a permanent media-request fd,
 		 * allocated once here and REINIT'd between uses in
 		 * RequestSyncSurface. Replaces the iter4 close+alloc-per-
 		 * frame model, whose lowest-free fd reuse was racing with
 		 * the kernel's per-buffer state-machine teardown when the
 		 * consumer rotated through multiple OUTPUT pool slots
 		 * faster than the kernel cleanup drained (Firefox's
 		 * MediaSource pattern). 1:1 slot-to-fd binding eliminates
 		 * cross-slot fd reuse.
 		 */
 		pool->slots[i].request_fd = media_request_alloc(media_fd);
 		if (pool->slots[i].request_fd < 0)
 			goto error;
 	}
 	pool->initialized = true;
 	return 0;
 error:
 	request_pool_destroy(pool);
 	return -1;
 }
 void request_pool_destroy(struct request_pool *pool)
 {
 	unsigned int i;
 	if (pool == NULL || pool->slots == NULL)
 		return;
 	for (i = 0; i < pool->count; i++) {
 		if (pool->slots[i].request_fd >= 0)
 			close(pool->slots[i].request_fd);
 		if (pool->slots[i].data != NULL && pool->slots[i].size > 0)
 			munmap(pool->slots[i].data, pool->slots[i].size);
 	}
 	free(pool->slots);
 	pool->slots = NULL;
 	pool->count = 0;
 	pool->next = 0;
 	pool->initialized = false;
 }
 int request_pool_acquire(struct request_pool *pool)
 {
 	unsigned int start;
 	unsigned int i;
 	if (pool == NULL || !pool->initialized || pool->count == 0)
 		return -1;
 	start = pool->next;
 	for (i = 0; i < pool->count; i++) {
 		unsigned int slot = (start + i) % pool->count;
 		if (!pool->slots[slot].busy) {
 			pool->slots[slot].busy = true;
 			pool->next = (slot + 1) % pool->count;
 			return (int)pool->slots[slot].index;
 		}
 	}
 	/* All slots busy; caller must wait for an in-flight DQBUF. */
 	return -1;
 }
 void request_pool_release(struct request_pool *pool, unsigned int index)
 {
 	unsigned int i;
 	if (pool == NULL || pool->slots == NULL)
 		return;
 	for (i = 0; i < pool->count; i++) {
 		if (pool->slots[i].index == index) {
 			pool->slots[i].busy = false;
 			return;
 		}
 	}
 }
 /*
 * Error-recovery release for the slot owning V4L2 buffer `index`.
 * Tries to REINIT the slot's request fd; if that fails, closes the fd and
 * allocates a new one. The slot is marked free only when it ends up with
 * a usable request fd; otherwise it stays busy.
 */
 void request_pool_force_release(struct request_pool *pool, unsigned int index)
 {
 	struct request_pool_slot *slot = NULL;
 	unsigned int n;
 	if (pool == NULL || pool->slots == NULL)
 		return;
 	for (n = 0; n < pool->count && slot == NULL; n++) {
 		if (pool->slots[n].index == index)
 			slot = &pool->slots[n];
 	}
 	if (slot == NULL)
 		return;
 	if (slot->request_fd >= 0) {
 		/*
 		 * Cheap path: REINIT resets the kernel-side request object
 		 * in place, the fd stays valid and the slot is reusable
 		 * right away.
 		 */
 		if (media_request_reinit(slot->request_fd) == 0) {
 			slot->busy = false;
 			return;
 		}
 		/* REINIT failed: drop this fd before replacing it. */
 		close(slot->request_fd);
 	}
 	/*
 	 * Either REINIT failed or the slot had no valid fd. Allocate a
 	 * fresh request fd so the slot stays usable, at the cost of an
 	 * extra ioctl pair compared with the REINIT path.
 	 *
 	 * NOTE: the new fd may carry the same number as the one just
 	 * closed. That is acceptable here: this is a rare recovery path
 	 * rather than the per-frame one, and the slot's V4L2 buffer has
 	 * already been DQBUF'd (or is in a state we cannot recover from
 	 * anyway), so the iter6 race (cross-slot fd reuse against a kernel
 	 * buffer in mid-cleanup) does not apply.
 	 */
 	slot->request_fd = media_request_alloc(pool->media_fd);
 	/*
 	 * If allocation failed the slot is dead: busy stays true so
 	 * acquire skips it, and usable capacity is one lower until the
 	 * pool is destroyed.
 	 */
 	if (slot->request_fd >= 0)
 		slot->busy = false;
 }
 /*
 * Find the slot whose V4L2 buffer index equals `index`. Returns a pointer
 * into the pool's slot array, or NULL when the pool is unusable or no
 * slot carries that index.
 */
 struct request_pool_slot *request_pool_slot(struct request_pool *pool,
 					    unsigned int index)
 {
 	struct request_pool_slot *found = NULL;
 	unsigned int n;
 	if (pool == NULL || pool->slots == NULL)
 		return NULL;
 	for (n = 0; n < pool->count; n++) {
 		if (pool->slots[n].index != index)
 			continue;
 		found = &pool->slots[n];
 		break;
 	}
 	return found;
 }
@@ -1,107 +0,0 @@
 /*
 * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
 */
 #ifndef _REQUEST_POOL_H_
 #define _REQUEST_POOL_H_
 #include <stdbool.h>
 /*
 * OUTPUT (bitstream-input) buffer pool, decoupled from caller-allocated
 * VA surfaces. Sizing is driven by codec pipeline depth (typically 4
 * for H.264), not by the consumer's surface count.
 *
 * The pool owns the V4L2 buffer indices and their mmap pointers. A
 * decode request "borrows" a slot at vaBeginPicture, fills it across
 * vaRenderPicture calls, queues it at vaEndPicture, and releases it
 * after VIDIOC_DQBUF returns.
 *
 * This replaces the per-surface OUTPUT-buffer ownership model in the
 * pre-refactor code, where object_surface.source_* fields permanently
 * held a single OUTPUT buffer per surface — incorrect because OUTPUT
 * buffers are request-time resources, not picture-time resources, and
 * because the per-surface loop in RequestCreateContext only ran when
 * surfaces_count > 0 (breaking ffmpeg's vaapi-copy num_render_targets=0
 * convention).
 */
 /*
 * One entry of the OUTPUT pool: a V4L2 buffer, its CPU mapping, a busy
 * flag and the media-request fd tied to it.
 */
 struct request_pool_slot {
 	unsigned int	index;		/* V4L2 buffer index in OUTPUT queue */
 	void		*data;		/* mmap pointer for this slot; NULL if unmapped */
 	unsigned int	size;		/* mmap size in bytes; 0 if unmapped */
 	bool		busy;		/* true while borrowed for a request */
 	int		request_fd;	/* per-slot media-request fd, allocated
 					 * once at pool init, REINIT'd between
 					 * uses; negative when not allocated.
 					 * iter6: replaces iter4 close+
 					 * alloc-per-frame to eliminate cross-
 					 * slot fd-reuse race that broke Firefox
 					 * MediaSource's multi-surface decode. */
 };
 /*
 * The OUTPUT buffer pool itself: a heap-allocated array of slots plus the
 * bookkeeping needed to hand them out round-robin.
 */
 struct request_pool {
 	struct request_pool_slot	*slots;	/* array of `count` slots; NULL when torn down */
 	unsigned int			 count;	/* number of entries in slots[] */
 	unsigned int			 next;	/* round-robin acquire cursor */
 	int				 media_fd;	/* iter7: kept for
 							 * force_release re-alloc */
 	bool				 initialized;	/* true once init has fully succeeded */
 };
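 /*
 * Allocate count OUTPUT buffers via VIDIOC_CREATE_BUFS, query and mmap
 * each, allocate one media-request fd per slot, and populate
 * pool->slots[]. Caller must have already done VIDIOC_S_FMT on the
 * OUTPUT queue. Returns 0 on success (including when the pool is already
 * initialized), -1 on failure or when pool is NULL or count is 0. On
 * failure, anything already acquired is released via
 * request_pool_destroy().
 */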
 int request_pool_init(struct request_pool *pool, int video_fd, int media_fd,
 		      unsigned int output_type, unsigned int count);
 /*
 * Close every slot's request fd, munmap all slots and free the slots
 * array. Idempotent.
 */
 void request_pool_destroy(struct request_pool *pool);
 /*
 * Claim the next free slot (round-robin). Returns the slot's V4L2
 * buffer index on success (use request_pool_slot() to map the index back
 * to its entry in pool->slots[]), or -1 if the pool is not initialized
 * or all slots are busy.
 */
 int request_pool_acquire(struct request_pool *pool);
 /*
 * Mark the slot that owns V4L2 buffer `index` free for reuse. Caller
 * must pass the V4L2 buffer index returned earlier from
 * request_pool_acquire(); an index that matches no slot is ignored.
 */
 void request_pool_release(struct request_pool *pool, unsigned int index);
 /*
 * iter7: error-recovery release. Called from RequestSyncSurface error
 * paths when media_request_reinit or VIDIOC_DQBUF failed mid-cycle and
 * the slot's request_fd is now in an undefined state. REINITs the fd;
 * if REINIT fails (kernel-side request object too far gone), closes
 * the fd and re-allocates a fresh one. If the re-allocation also fails,
 * the slot is left busy=true (effectively dead: pool->count is unchanged,
 * but usable capacity drops by 1) — the pool survives with reduced
 * capacity until it is destroyed. Other slots are unaffected.
 *
 * Caller passes the V4L2 buffer index from request_pool_acquire().
 */
 void request_pool_force_release(struct request_pool *pool,
 				unsigned int index);
 /*
 * Look up the pool slot owning a given V4L2 buffer index. Returns
 * pointer to the slot on success, NULL if no slot owns that index (or
 * the pool has no slots). The returned pointer is valid until pool
 * destruction; do not free.
 */
 struct request_pool_slot *request_pool_slot(struct request_pool *pool,
 					    unsigned int index);
 #endif
@@ -29,7 +29,6 @@
 #include <assert.h>
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -47,119 +46,6 @@
 #include "v4l2.h"
 #include "video.h"
 /*
 * iter5b-β: the OUTPUT-side V4L2 device-format lifecycle moved out
 * of this file. Pre-β CreateSurfaces2 owned the S_FMT(OUTPUT) +
 * CAPTURE-format probe + cap_pool_init + per-surface destination_*
 * fill; now that responsibility lives in context.c::RequestCreateContext
 * where the bound config (and therefore the active VAProfile) is
 * known via config_id. CreateSurfaces2 retains only surface object
 * ID allocation and per-surface bookkeeping. The previous
 * `surface_reset_format_cache` helper and `last_output_width/height`
 * fields are deleted (β doesn't gate re-S_FMT on
 * resolution — the lifecycle is CreateContext-centric and natural
 * setup/teardown happens at each context cycle).
 */
 /*
 * Iter2 Fix 3 helpers — bind / unbind a cap_pool_slot to an
 * object_surface. Called from BeginPicture (acquire+bind) and
 * DestroySurfaces (unbind). Populates surface_object->destination_*
 * fields from the slot so existing code paths (the QBUF in
 * picture.c::EndPicture, the EXPBUF in ExportSurfaceHandle, the
 * mmap-read in copy_surface_to_image) continue to work unchanged.
 *
 * surface_bind_slot is called only from BeginPicture; the surface's
 * format-uniform fields (destination_planes_count, destination_sizes,
 * destination_offsets, destination_bytesperlines) are already set
 * by surface_fill_format_uniform (from CreateContext, or from
 * CreateSurfaces2 for surfaces created after CreateContext) and stay
 * constant.
 */
 void surface_bind_slot(struct object_surface *surface_object,
 		       struct cap_pool_slot *slot)
 {
 	const unsigned int buffers = slot->buffers_count;
 	unsigned int n;
 	surface_object->current_slot = slot;
 	surface_object->destination_index = slot->v4l2_index;
 	surface_object->destination_buffers_count = buffers;
 	/* Mirror the slot's per-buffer mapping info onto the surface. */
 	for (n = 0; n < buffers; n++) {
 		surface_object->destination_map[n] = slot->map[n];
 		surface_object->destination_map_lengths[n] = slot->map_lengths[n];
 		surface_object->destination_map_offsets[n] = slot->map_offsets[n];
 	}
 	/*
 	 * destination_data[n] is the per-plane CPU pointer consumed by
 	 * copy_surface_to_image. With a single buffer (single-buffer
 	 * MPLANE NV12, our common case) every plane sits inside
 	 * slot->map[0] at the offset stored in destination_offsets[];
 	 * otherwise plane n is taken from slot->map[n].
 	 */
 	for (n = 0; n < surface_object->destination_planes_count; n++) {
 		if (buffers == 1)
 			surface_object->destination_data[n] =
 				(unsigned char *)slot->map[0] +
 				surface_object->destination_offsets[n];
 		else
 			surface_object->destination_data[n] = slot->map[n];
 	}
 }
 void surface_unbind_slot(struct request_data *driver_data,
 			 struct object_surface *surface_object)
 {
 	if (surface_object->current_slot == NULL)
 		return;
 	/* AV1 Phase 3 diag: log every unbind with surface id + slot idx
 	 * + status — confirms whether BeginPicture rebind is racing the
 	 * consumer's vaGetImage on the previous frame. */
 	request_log("surface_unbind_slot id=0x%x status=%d slot_idx=%u\n",
 		    surface_object->base.id,
 		    surface_object->status,
 		    surface_object->current_slot->v4l2_index);
 	cap_pool_release(&driver_data->capture_pool, surface_object->current_slot);
 	surface_object->current_slot = NULL;
 }
 /*
 * iter5b-β Commit D: fill the format-uniform destination_* fields of a
 * surface (planes count, buffers count, per-plane offsets, sizes and
 * bytesperlines) from driver_data's cached CAPTURE format.
 *
 * No-op when the cache is not populated (fmt_valid is false) or when the
 * surface was filled before (destination_planes_count is non-zero), so
 * calling it repeatedly is harmless.
 *
 * Layouts handled:
 *   - one buffer holding all planes: planes are laid out back to back,
 *     so plane j starts at the sum of the sizes of planes 0..j-1, and
 *     every plane uses the bytesperline of plane 0;
 *   - one buffer per plane: every plane starts at offset 0 of its own
 *     buffer and uses its own bytesperline.
 *
 * NOTE(review): for any other buffers/planes combination the counts are
 * still stored but offsets, sizes and bytesperlines are left untouched —
 * confirm that callers never reach that case.
 */
 void surface_fill_format_uniform(struct request_data *driver_data,
 				 struct object_surface *surface_object)
 {
 	unsigned int offset;
 	unsigned int j;
 	if (!driver_data->fmt_valid)
 		return;
 	if (surface_object->destination_planes_count != 0)
 		return;
 	surface_object->destination_planes_count = driver_data->fmt_planes_count;
 	surface_object->destination_buffers_count = driver_data->fmt_buffers_count;
 	if (driver_data->fmt_buffers_count == 1) {
 		/*
 		 * Accumulate the running offset: using only the previous
 		 * plane's size would place plane 2 (and later) inside an
 		 * earlier plane for formats with more than two planes.
 		 */
 		offset = 0;
 		for (j = 0; j < driver_data->fmt_planes_count; j++) {
 			surface_object->destination_offsets[j] = offset;
 			surface_object->destination_sizes[j] =
 				driver_data->fmt_sizes[j];
 			surface_object->destination_bytesperlines[j] =
 				driver_data->fmt_bytesperlines[0];
 			offset += driver_data->fmt_sizes[j];
 		}
 	} else if (driver_data->fmt_buffers_count == driver_data->fmt_planes_count) {
 		for (j = 0; j < driver_data->fmt_planes_count; j++) {
 			surface_object->destination_offsets[j] = 0;
 			surface_object->destination_sizes[j] =
 				driver_data->fmt_sizes[j];
 			surface_object->destination_bytesperlines[j] =
 				driver_data->fmt_bytesperlines[j];
 		}
 	}
 }
 VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
 				unsigned int width, unsigned int height,
 				VASurfaceID *surfaces_ids,
@@ -169,41 +55,146 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
 {
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;
-	unsigned int i;
+	struct video_format *video_format = NULL;
 	unsigned int destination_sizes[VIDEO_MAX_PLANES];
 	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
 	unsigned int destination_planes_count;
 	unsigned int format_width, format_height;
 	unsigned int capture_type;
 	unsigned int index_base;
 	unsigned int index;
 	unsigned int i, j;
 	VASurfaceID id;
 	int rc;
-	/*
+	request_log("CreateSurfaces2: format=0x%x %dx%d count=%d\n",
-	 * iter5b-β: only RT-format-level validation here. All V4L2
+		    format, width, height, surfaces_count);
-	 * device state (OUTPUT format, CAPTURE format probe,
+
-	 * cap_pool_init, per-surface destination_* fill) is deferred
+	if (format != VA_RT_FORMAT_YUV420) {
-	 * to RequestCreateContext where the bound VAConfigID
+		request_log("CreateSurfaces2: rejecting RT_FORMAT 0x%x\n", format);
 	 * (and therefore the active VAProfile) is known. CreateSurfaces2
 	 * has no config_id parameter; the VA-API contract is
 	 * CreateConfig → CreateSurfaces → CreateContext, and we
 	 * can't know the OUTPUT pixel format until CreateContext binds.
 	 *
 	 * Surface objects allocated here hold only the requested
 	 * width/height and per-surface lifecycle bookkeeping
 	 * (current_slot, status, params, etc). The format-uniform
 	 * destination_* fields are filled by CreateContext via
 	 * surface_fill_format_uniform(); the per-slot
 	 * destination_* fields fill at BeginPicture via surface_bind_slot.
 	 */
 	if (format != VA_RT_FORMAT_YUV420)
 		return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
 	}
 	if (!driver_data->video_format) {
 		/* Could happen if RequestInit's eager probe came back NULL —
 		 * try one more time here in case the device only became
 		 * format-enumerable after streaming setup. */
 		driver_data->video_format = video_format_probe(driver_data->video_fd);
 		if (driver_data->video_format == NULL) {
 			request_log("CreateSurfaces2: video_format probe still NULL\n");
 			return VA_STATUS_ERROR_OPERATION_FAILED;
 		}
 	}
 	video_format = driver_data->video_format;
 	capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);
 	/* Set the CAPTURE format on first surface creation so the size is
 	 * pinned to the picture dimensions the caller passed in. */
 	rc = v4l2_set_format(driver_data->video_fd, capture_type,
 			     video_format->v4l2_format, width, height);
 	if (rc < 0) {
 		request_log("CreateSurfaces2: S_FMT(CAPTURE) failed for fmt=0x%x %dx%d\n",
 			    video_format->v4l2_format, width, height);
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	}
 	rc = v4l2_get_format(driver_data->video_fd, capture_type, &format_width,
 			     &format_height, destination_bytesperlines,
 			     destination_sizes, NULL);
 	if (rc < 0) {
 		request_log("CreateSurfaces2: G_FMT(CAPTURE) failed\n");
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	}
 	request_log("CreateSurfaces2: G_FMT got %dx%d bpl[0]=%d size[0]=%d\n",
 		    format_width, format_height,
 		    destination_bytesperlines[0], destination_sizes[0]);
 	destination_planes_count = video_format->planes_count;
 	rc = v4l2_create_buffers(driver_data->video_fd, capture_type,
 				 surfaces_count, &index_base);
 	if (rc < 0) {
 		request_log("CreateSurfaces2: CREATE_BUFS(CAPTURE) failed\n");
 		return VA_STATUS_ERROR_ALLOCATION_FAILED;
 	}
 	request_log("CreateSurfaces2: CREATE_BUFS ok index_base=%u\n", index_base);
 	for (i = 0; i < surfaces_count; i++) {
 		index = index_base + i;
 		id = object_heap_allocate(&driver_data->surface_heap);
 		surface_object = SURFACE(driver_data, id);
 		if (surface_object == NULL)
 			return VA_STATUS_ERROR_ALLOCATION_FAILED;
-		surface_object->current_slot = NULL;	/* iter2 Fix 3 */
+		rc = v4l2_query_buffer(driver_data->video_fd, capture_type,
-		surface_object->linked_decode_surface_id = VA_INVALID_SURFACE;
+				       index,
-		surface_object->av1_order_hint = 0;
+				       surface_object->destination_map_lengths,
-		surface_object->destination_index = 0;	/* set on bind */
+				       surface_object->destination_map_offsets,
-		surface_object->destination_planes_count = 0;	/* set at CreateContext */
+				       video_format->v4l2_buffers_count);
-		surface_object->destination_buffers_count = 0;	/* set at CreateContext */
+		if (rc < 0) {
 			request_log("CreateSurfaces2: QUERYBUF idx=%u failed\n", index);
 			return VA_STATUS_ERROR_ALLOCATION_FAILED;
 		}
 		for (j = 0; j < video_format->v4l2_buffers_count; j++) {
 			surface_object->destination_map[j] =
 				mmap(NULL,
 				     surface_object->destination_map_lengths[j],
 				     PROT_READ | PROT_WRITE, MAP_SHARED,
 				     driver_data->video_fd,
 				     surface_object->destination_map_offsets[j]);
 			if (surface_object->destination_map[j] == MAP_FAILED) {
 				request_log("CreateSurfaces2: mmap idx=%u plane=%u len=%u offset=%u failed\n",
 					    index, j,
 					    surface_object->destination_map_lengths[j],
 					    surface_object->destination_map_offsets[j]);
 				return VA_STATUS_ERROR_ALLOCATION_FAILED;
 			}
 		}
 		/*
 		 * FIXME: Handle this per-pixelformat, trying to generalize it
 		 * is not a reasonable approach. The final description should be
 		 * in terms of (logical) planes.
 		 */
 		if (video_format->v4l2_buffers_count == 1) {
 			destination_sizes[0] = destination_bytesperlines[0] *
 					       format_height;
 			for (j = 1; j < destination_planes_count; j++)
 				destination_sizes[j] = destination_sizes[0] / 2;
 			for (j = 0; j < destination_planes_count; j++) {
 				surface_object->destination_offsets[j] =
 					j > 0 ? destination_sizes[j - 1] : 0;
 				surface_object->destination_data[j] =
 					((unsigned char *)surface_object->destination_map[0] +
 					 surface_object->destination_offsets[j]);
 				surface_object->destination_sizes[j] =
 					destination_sizes[j];
 				surface_object->destination_bytesperlines[j] =
 					destination_bytesperlines[0];
 			}
 		} else if (video_format->v4l2_buffers_count == destination_planes_count) {
 			for (j = 0; j < destination_planes_count; j++) {
 				surface_object->destination_offsets[j] = 0;
 				surface_object->destination_data[j] =
 					surface_object->destination_map[j];
 				surface_object->destination_sizes[j] =
 					destination_sizes[j];
 				surface_object->destination_bytesperlines[j] =
 					destination_bytesperlines[j];
 			}
 		} else {
 			request_log("CreateSurfaces2: buffers_count=%u planes_count=%u mismatch\n",
 				    video_format->v4l2_buffers_count,
 				    destination_planes_count);
 			return VA_STATUS_ERROR_ALLOCATION_FAILED;
 		}
 		surface_object->status = VASurfaceReady;
 		surface_object->width = width;
@@ -213,6 +204,13 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
 		surface_object->source_data = NULL;
 		surface_object->source_size = 0;
 		surface_object->destination_index = index;
 		surface_object->destination_planes_count =
 			destination_planes_count;
 		surface_object->destination_buffers_count =
 			video_format->v4l2_buffers_count;
 		memset(&surface_object->params, 0,
 		       sizeof(surface_object->params));
 		surface_object->slices_count = 0;
@@ -220,17 +218,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
 		surface_object->request_fd = -1;
 		/*
 		 * iter5b-β Commit D: if CreateContext has already populated
 		 * the format-uniform cache (driver_data->fmt_valid), fill
 		 * the new surface's destination_* immediately. This covers
 		 * the case where a consumer creates more surfaces AFTER
 		 * CreateContext. The first batch of surfaces (created before
 		 * CreateContext) gets filled by CreateContext's surface_heap
 		 * walk; this lazy-fill handles late arrivals.
 		 */
 		surface_fill_format_uniform(driver_data, surface_object);
 		surfaces_ids[i] = id;
 	}
@@ -250,32 +237,26 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,
 {
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;
-	unsigned int i;
+	unsigned int i, j;
 	for (i = 0; i < surfaces_count; i++) {
 		surface_object = SURFACE(driver_data, surfaces_ids[i]);
 		if (surface_object == NULL)
 			return VA_STATUS_ERROR_INVALID_SURFACE;
-		/*
+		if (surface_object->source_data != NULL &&
-		 * source_* are now transient borrows from request_pool, not
+		    surface_object->source_size > 0)
-		 * surface-owned mappings; the pool owns the underlying mmap.
+			munmap(surface_object->source_data,
-		 * Nothing to free here.
+			       surface_object->source_size);
 		 *
 		 * Iter2 Fix 3: destination_* mappings are owned by cap_pool;
 		 * surface_unbind_slot returns the slot to FREE (closing OUR
 		 * EXPBUF fd if any). Pool-owned mmaps are freed at
 		 * cap_pool_destroy time (RequestDestroyContext).
 		 */
 		surface_unbind_slot(driver_data, surface_object);
-		/*
+		for (j = 0; j < surface_object->destination_buffers_count; j++)
-		 * iter6: request_fd is owned by the OUTPUT pool slot, not by
+			if (surface_object->destination_map[j] != NULL &&
-		 * the surface. Do not close here. The pool closes all slot
+			    surface_object->destination_map_lengths[j] > 0)
-		 * fds at request_pool_destroy time, which fires from
+				munmap(surface_object->destination_map[j],
-		 * RequestTerminate (driver unload) — the pool is driver-wide
+				       surface_object->destination_map_lengths[j]);
-		 * and survives context destroy/recreate cycles.
+
-		 */
+		if (surface_object->request_fd > 0)
 			close(surface_object->request_fd);
 		object_heap_free(&driver_data->surface_heap,
 				 (struct object_base *)surface_object);
@@ -286,9 +267,8 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,
 VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
 {
 	struct request_data *driver_data = context->pDriverData;
-	struct object_surface *surface_object = NULL;
+	struct object_surface *surface_object;
 	VAStatus status;
 	struct video_format *video_format;
 	unsigned int output_type, capture_type;
@@ -333,51 +313,19 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
 		goto error;
 	}
 	/*
 	 * iter6: the request_fd belongs to the OUTPUT pool slot, not to the
 	 * surface. REINIT to reset its state in place — close+alloc would
 	 * reuse the lowest-free fd number against a kernel object whose
 	 * teardown hasn't fully drained, racing with QBUF on a slot that
 	 * was just released. The pool's 1:1 slot-to-fd binding eliminates
 	 * cross-slot fd reuse, and REINIT here resets the request object
 	 * for the next decode cycle on the same slot.
 	 *
 	 * Iter4's frame-11 EINVAL (which prompted the iter4 close+alloc
 	 * model) was a control-payload bug — DPB carry-over with FFmpeg's
 	 * V4L2_H264_FRAME_REF semantics not yet matched. That's been fixed
 	 * since iter4 (`74d8dd1`), so REINIT is no longer compromised by
 	 * the cluster-validation EINVAL pattern.
 	 */
 	rc = media_request_reinit(request_fd);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
 	surface_object->request_fd = -1;
 	rc = v4l2_dequeue_buffer(driver_data->video_fd, -1, output_type,
 				 surface_object->source_index, 1);
 	if (rc < 0) {
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
-		/*
+		goto error;
 		 * iter7: OUTPUT DQBUF failed. The V4L2 buffer is in an
 		 * indeterminate kernel state — it may still be QUEUED. Do
 		 * NOT return the slot to acquire-rotation: the next QBUF
 		 * on it would EINVAL. Leave source_data set so the error
 		 * handler skips force_release and the slot stays dead-busy.
 		 */
 		goto error_buffer_indeterminate;
 	}
 	/*
 	 * OUTPUT buffer is back from the kernel: return its pool slot
 	 * for reuse and clear the surface's transient borrow handle.
 	 */
 	request_pool_release(&driver_data->output_pool,
 			     surface_object->source_index);
 	surface_object->source_data = NULL;
 	surface_object->source_size = 0;
 	rc = v4l2_dequeue_buffer(driver_data->video_fd, -1, capture_type,
 				 surface_object->destination_index,
 				 surface_object->destination_buffers_count);
@@ -386,152 +334,14 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
 		goto error;
 	}
 	/*
 	 * Iter2 Fix 3: CAPTURE buffer is back from the kernel with valid
 	 * pixel content. Transition the slot IN_DECODE → DECODED. The slot
 	 * stays bound to this surface until either ExportSurfaceHandle
 	 * (→ EXPORTED), the next BeginPicture for this surface (slot is
 	 * released first), or DestroySurfaces (release).
 	 */
 	if (surface_object->current_slot != NULL) {
 		cap_pool_mark_decoded(&driver_data->capture_pool,
 				      surface_object->current_slot);
 		/*
 		 * iter8 Phase 6 (γ): env-gated diagnostic dump of the CAPTURE
 		 * buffer immediately after DQBUF + mark_decoded. Distinguishes
 		 * "kernel didn't write" from "libva mis-reads" for Bug 4
 		 * (H.264 partial-fill). Off by default; enable with
 		 * LIBVA_V4L2_DUMP_CAPTURE=1. destination_data[] is valid here
 		 * (surface_bind_slot populated it at BeginPicture).
 		 */
 		static const char *dump_env = NULL;
 		static bool dump_env_checked = false;
 		if (!dump_env_checked) {
 			dump_env = getenv("LIBVA_V4L2_DUMP_CAPTURE");
 			dump_env_checked = true;
 		}
 		if (dump_env != NULL && dump_env[0] == '1') {
 			unsigned int p;
 			char hexbuf[128];
 			request_log("γ-dump: surface_id=%u v4l2_index=%u planes=%u\n",
 				    (unsigned int)surface_id,
 				    surface_object->destination_index,
 				    surface_object->destination_planes_count);
 			for (p = 0; p < surface_object->destination_planes_count; p++) {
 				const unsigned char *d = surface_object->destination_data[p];
 				size_t sz = surface_object->destination_sizes[p];
 				size_t scan_lim;
 				unsigned int nz = 0;
 				size_t i;
 				int pos;
 				if (d == NULL) {
 					request_log("γ-dump:  plane[%u] NULL ptr (size=%zu)\n",
 						    p, sz);
 					continue;
 				}
 				/*
 				 * Phase 5 MIN-2: scan at least one Y-MB row
 				 * (16 lines * bytesperline) for plane 0, else
 				 * 1024 bytes for chroma plane.
 				 */
 				if (p == 0) {
 					size_t mbrow =
 					    surface_object->destination_bytesperlines[0] * 16;
 					scan_lim = sz < mbrow ? sz : mbrow;
 				} else {
 					scan_lim = sz < 1024 ? sz : 1024;
 				}
 				for (i = 0; i < scan_lim; i++)
 					if (d[i] != 0)
 						nz++;
 				request_log("γ-dump:  plane[%u] sz=%zu bpl=%u "
 					    "scan=%zu non_zero=%u\n",
 					    p, sz,
 					    surface_object->destination_bytesperlines[p],
 					    scan_lim, nz);
 				pos = 0;
 				for (i = 0; i < 32 && i < sz; i++)
 					pos += snprintf(hexbuf + pos,
 							sizeof(hexbuf) - pos,
 							"%02x ", d[i]);
 				request_log("γ-dump:  plane[%u] head[0..32]: %s\n",
 					    p, hexbuf);
 				if (sz >= 32) {
 					pos = 0;
 					for (i = 0; i < 32; i++)
 						pos += snprintf(hexbuf + pos,
 								sizeof(hexbuf) - pos,
 								"%02x ", d[sz - 32 + i]);
 					request_log("γ-dump:  plane[%u] tail[%zu..%zu]: %s\n",
 						    p, sz - 32, sz - 1, hexbuf);
 				}
 			}
 		}
 	}
 	surface_object->status = VASurfaceDisplaying;
 	status = VA_STATUS_SUCCESS;
 	goto complete;
 error:
-	/*
+	if (request_fd >= 0) {
-	 * iter7: error recovery for the OUTPUT pool slot. If the surface
+		close(request_fd);
 	 * acquired a slot in BeginPicture (source_data != NULL indicates
 	 * an active borrow), reset the slot's request_fd via
 	 * request_pool_force_release so the slot returns to the
 	 * acquire-rotation. force_release tries REINIT first; falls back
 	 * to close+alloc if REINIT fails; leaves the slot dead-busy if
 	 * even alloc fails (other slots unaffected). Replaces iter6's
 	 * accepted bounded leak.
 	 *
 	 * Reachable from: media_request_queue / wait_completion / REINIT
 	 * failures. NOT reachable for OUTPUT-DQBUF failure (separate label
 	 * `error_buffer_indeterminate` below) because in that case the
 	 * V4L2 buffer is in an indeterminate kernel state and reusing the
 	 * slot would EINVAL on the next QBUF.
 	 *
 	 * If the surface never acquired a slot (source_data == NULL),
 	 * there is no slot to release; nothing to do.
 	 */
 	if (surface_object != NULL) {
 		if (surface_object->source_data != NULL) {
 			request_pool_force_release(&driver_data->output_pool,
 						   surface_object->source_index);
 			surface_object->source_data = NULL;
 			surface_object->source_size = 0;
 		}
 		surface_object->request_fd = -1;
 	}
 	goto complete;
 error_buffer_indeterminate:
 	/*
 	 * iter7: OUTPUT DQBUF failed after a successful REINIT. The kernel
 	 * V4L2 buffer is in an unknown state (possibly still QUEUED with
 	 * pending decode result, possibly half-dequeued, possibly stuck
 	 * in driver internals). The slot's request_fd has already been
 	 * REINIT'd to a clean state, but reusing the slot for a new
 	 * decode would QBUF on a buffer the kernel may still hold —
 	 * triggering exactly the iter6 race we eliminated for the happy
 	 * path.
 	 *
 	 * Leave the slot dead-busy: don't release, don't force_release.
 	 * Other slots are unaffected. If this fires repeatedly, the pool
 	 * leaks slots until starvation, at which point acquire returns -1
 	 * and BeginPicture cleanly propagates ALLOCATION_FAILED. This is
 	 * a strictly safer failure mode than reusing an indeterminate
 	 * V4L2 buffer.
 	 */
 	if (surface_object != NULL) {
 		surface_object->source_data = NULL;
 		surface_object->source_size = 0;
 		surface_object->request_fd = -1;
 	}
@@ -544,7 +354,6 @@ VAStatus RequestQuerySurfaceAttributes(VADriverContextP context,
 				       VASurfaceAttrib *attributes,
 				       unsigned int *attributes_count)
 {
 	struct request_data *driver_data = context->pDriverData;
 	VASurfaceAttrib *attributes_list;
 	unsigned int attributes_list_size = V4L2_REQUEST_MAX_CONFIG_ATTRIBUTES *
@@ -552,6 +361,9 @@ VAStatus RequestQuerySurfaceAttributes(VADriverContextP context,
 	int memory_types;
 	unsigned int i = 0;
 	request_log("QuerySurfaceAttributes: config=%u attrs=%p count=%p\n",
 		    config, attributes, attributes_count);
 	attributes_list = malloc(attributes_list_size);
 	memset(attributes_list, 0, attributes_list_size);
@@ -623,7 +435,6 @@ VAStatus RequestQuerySurfaceStatus(VADriverContextP context,
 	struct request_data *driver_data = context->pDriverData;
 	struct object_surface *surface_object;
 	surface_object = SURFACE(driver_data, surface_id);
 	if (surface_object == NULL)
 		return VA_STATUS_ERROR_INVALID_SURFACE;
@@ -677,16 +488,26 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
 	VAStatus status;
 	int rc;
-	video_format = driver_data->video_format;
+	request_log("ExportSurfaceHandle: surface_id=%u mem_type=0x%x flags=0x%x\n",
-	if (video_format == NULL)
+		    surface_id, mem_type, flags);
 		return VA_STATUS_ERROR_OPERATION_FAILED;
-	if (mem_type != VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2)
+	video_format = driver_data->video_format;
 	if (video_format == NULL) {
 		request_log("ExportSurfaceHandle: video_format NULL\n");
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	}
 	if (mem_type != VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2) {
 		request_log("ExportSurfaceHandle: rejecting mem_type 0x%x (only DRM_PRIME_2)\n",
 			    mem_type);
 		return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
 	}
 	surface_object = SURFACE(driver_data, surface_id);
-	if (surface_object == NULL)
+	if (surface_object == NULL) {
 		request_log("ExportSurfaceHandle: invalid surface_id\n");
 		return VA_STATUS_ERROR_INVALID_SURFACE;
 	}
 	export_fds_count = surface_object->destination_buffers_count;
 	export_fds = malloc(export_fds_count * sizeof(*export_fds));
@@ -697,22 +518,12 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
 				surface_object->destination_index, O_RDONLY,
 				export_fds, export_fds_count);
 	if (rc < 0) {
 		request_log("ExportSurfaceHandle: EXPBUF idx=%u count=%u failed\n",
 			    surface_object->destination_index, export_fds_count);
 		status = VA_STATUS_ERROR_OPERATION_FAILED;
 		goto error;
 	}
 	/*
 	 * Iter2 Fix 3: pool now owns OUR copy of the EXPBUF'd fd. The
 	 * consumer receives a dup'd / equivalent fd via the descriptor.
 	 * Slot transitions DECODED → EXPORTED; it will be force-recyclable
 	 * by LRU when the pool is exhausted, but FREE slots are always
 	 * preferred.
 	 */
 	if (surface_object->current_slot != NULL && export_fds_count > 0)
 		cap_pool_mark_exported(&driver_data->capture_pool,
 				       surface_object->current_slot,
 				       export_fds[0]);
 	planes_count = surface_object->destination_planes_count;
 	surface_descriptor->fourcc = VA_FOURCC_NV12;
@@ -726,102 +537,27 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
 		for (i = 0; i < planes_count; i++)
 			size += surface_object->destination_sizes[i];
 	/*
 	 * Iteration 2 Fix 2: choose drm_format_modifier conditionally on
 	 * pitch alignment. Mesa's WSI / Panfrost compositor path rejects
 	 * DRM_FORMAT_MOD_NONE (= LINEAR explicit) buffers whose pitch isn't
 	 * GPU-aligned (typically 64+ bytes for Mali). For 1920-wide content
 	 * the pitch is 1920 (64-aligned, fine); for 864-wide content the
 	 * pitch is 864 (only 16-aligned), Mesa rejects with "WSI pitch not
 	 * properly aligned" and Firefox falls back to SW.
 	 *
 	 * Setting DRM_FORMAT_MOD_INVALID tells the importer "modifier
 	 * unknown, treat as implicit / texture-only" — Firefox's
 	 * DMABufSurface.cpp:1920 explicitly omits modifier attribs from
 	 * eglCreateImage when the value is MOD_INVALID, bypassing Mesa's
 	 * scanout-alignment check. The buffer is then texture-imported
 	 * (small perf cost) instead of WSI scanout-imported, which is
 	 * the correct behavior for a buffer that doesn't meet scanout
 	 * alignment requirements.
 	 *
 	 * We branch on pitch alignment to preserve LINEAR semantics for
 	 * already-aligned content (avoids unnecessary perf cost on the
 	 * common 1920-wide case).
 	 *
 	 * Sonnet Phase 5 review (iter2 question 4) endorsed this
 	 * conditional approach over a universal MOD_INVALID change.
 	 */
 	for (i = 0; i < export_fds_count; i++) {
-		uint64_t modifier = video_format->drm_modifier;
+		surface_descriptor->objects[i].drm_format_modifier =
-		unsigned int bytesperline =
+			video_format->drm_modifier;
 			surface_object->destination_bytesperlines[0];
 		if (bytesperline & 63) /* not 64-byte aligned */
 			modifier = DRM_FORMAT_MOD_INVALID;
 		surface_descriptor->objects[i].drm_format_modifier = modifier;
 		surface_descriptor->objects[i].fd = export_fds[i];
 		surface_descriptor->objects[i].size = export_fds_count == 1 ?
 						      size :
 						      surface_object->destination_sizes[i];
 	}
-	/*
+	surface_descriptor->num_layers = 1;
 	 * Layer construction depends on the consumer's request flags
 	 * (VA_EXPORT_SURFACE_*_LAYERS):
 	 *
 	 *   COMPOSED_LAYERS (default, mpv): one layer carrying both
 	 *   Y and UV planes (drm_format=NV12, num_planes=2). Mesa
 	 *   imports as a single NV12 EGLImage.
 	 *
 	 *   SEPARATE_LAYERS (Firefox 150 RDD): two layers, Y as a
 	 *   single-plane R8 layer, UV as a single-plane GR88 layer.
 	 *   Firefox's GetVAAPISurfaceDescriptor passes
 	 *   VA_EXPORT_SURFACE_SEPARATE_LAYERS so its DMABufSurfaceYUV
 	 *   import code can address Y and UV planes independently.
 	 *   Without this branch, Firefox parsed our COMPOSED layout
 	 *   as if it were SEPARATE, found bogus layer-1 data, and
 	 *   silently fell back to FFmpeg(FFVPX) software decode.
 	 *
 	 * The earlier path 0001 mplane port assumed a single COMPOSED
 	 * shape — fine for mpv but breaks any consumer requesting
 	 * SEPARATE. Honor the flag.
 	 */
 	if ((flags & VA_EXPORT_SURFACE_SEPARATE_LAYERS) && planes_count == 2) {
 		surface_descriptor->num_layers = 2;
-		/* Layer 0: Y plane as DRM_FORMAT_R8 (1 byte/pixel luma). */
+	surface_descriptor->layers[0].drm_format = video_format->drm_format;
-		surface_descriptor->layers[0].drm_format = DRM_FORMAT_R8;
+	surface_descriptor->layers[0].num_planes = planes_count;
 		surface_descriptor->layers[0].num_planes = 1;
 		surface_descriptor->layers[0].object_index[0] =
 			export_fds_count == 1 ? 0 : 0;
 		surface_descriptor->layers[0].offset[0] =
 			surface_object->destination_offsets[0];
 		surface_descriptor->layers[0].pitch[0] =
 			surface_object->destination_bytesperlines[0];
-		/* Layer 1: UV plane as DRM_FORMAT_GR88 (interleaved
+	for (i = 0; i < planes_count; i++) {
-		 * U+V, 2 bytes/pixel chroma at half resolution). */
+		surface_descriptor->layers[0].object_index[i] =
-		surface_descriptor->layers[1].drm_format = DRM_FORMAT_GR88;
+			export_fds_count == 1 ? 0 : i;
-		surface_descriptor->layers[1].num_planes = 1;
+		surface_descriptor->layers[0].offset[i] =
-		surface_descriptor->layers[1].object_index[0] =
+			surface_object->destination_offsets[i];
-			export_fds_count == 1 ? 0 : 1;
+		surface_descriptor->layers[0].pitch[i] =
-		surface_descriptor->layers[1].offset[0] =
+			surface_object->destination_bytesperlines[i];
 			surface_object->destination_offsets[1];
 		surface_descriptor->layers[1].pitch[0] =
 			surface_object->destination_bytesperlines[1];
 	} else {
 		/* COMPOSED_LAYERS / default: one layer with all planes. */
 		surface_descriptor->num_layers = 1;
 		surface_descriptor->layers[0].drm_format = video_format->drm_format;
 		surface_descriptor->layers[0].num_planes = planes_count;
 		for (i = 0; i < planes_count; i++) {
 			surface_descriptor->layers[0].object_index[i] =
 				export_fds_count == 1 ? 0 : i;
 			surface_descriptor->layers[0].offset[i] =
 				surface_object->destination_offsets[i];
 			surface_descriptor->layers[0].pitch[i] =
 				surface_object->destination_bytesperlines[i];
 		}
 	}
 	status = VA_STATUS_SUCCESS;
@@ -32,11 +32,6 @@
 #include <va/va_backend.h>
 #include "object_heap.h"
 #include "cap_pool.h"
 #include "h265.h"
 struct request_data;
 #define SURFACE(data, id)                                                      \
 	((struct object_surface *)object_heap_lookup(&(data)->surface_heap, id))
@@ -45,7 +40,7 @@ struct request_data;
 struct object_surface {
 	struct object_base base;
-	VASurfaceStatus status;
+	VAStatus status;
 	int width;
 	int height;
@@ -53,26 +48,6 @@ struct object_surface {
 	void *source_data;
 	unsigned int source_size;
 	/*
 	 * Iter2 Fix 3: destination_* fields below are now per-decode-cycle.
 	 * They are populated from current_slot in RequestBeginPicture and
 	 * remain valid through SyncSurface, ExportSurfaceHandle, and
 	 * DeriveImage/copy_surface_to_image (vaapi-copy path). Subsequent
 	 * BeginPicture for this surface releases the prior slot and
 	 * acquires a new one.
 	 *
 	 * destination_planes_count, destination_sizes, destination_offsets,
 	 * destination_bytesperlines are FORMAT-uniform across all CAPTURE
 	 * buffers, so they're set once at CreateSurfaces2 time and stay.
 	 *
 	 * destination_index, destination_map[], destination_map_lengths,
 	 * destination_map_offsets, destination_data[] are SLOT-specific
 	 * and re-populated each BeginPicture from current_slot.
 	 *
 	 * destination_buffers_count is also format-uniform (V4L2 planes
 	 * per buffer = 1 for single-plane MPLANE NV12).
 	 */
 	struct cap_pool_slot *current_slot;	/* iter2 Fix 3 */
 	unsigned int destination_index;
 	void *destination_map[VIDEO_MAX_PLANES];
 	unsigned int destination_map_lengths[VIDEO_MAX_PLANES];
@@ -89,33 +64,6 @@ struct object_surface {
 	struct timeval timestamp;
 	/*
 	 * AV1 Phase 3: for streams with apply_grain=1, VAAPI's
 	 * VADecPictureParameterBufferAV1 carries current_display_picture
 	 * (display-time surface) separate from current_frame (decode
 	 * target). vpu981 HW applies grain inline to the decode CAPTURE
 	 * buffer, so the decoded data lives in current_frame's slot — but
 	 * ffmpeg calls vaGetImage on current_display_picture which has no
 	 * slot bound. linked_decode_surface_id, set in av1_set_controls
 	 * on the display surface, points to the decode surface so
 	 * copy_surface_to_image can borrow its destination_data[].
 	 *
 	 * VA_INVALID_SURFACE = no link (the common case: 8-bit codecs,
 	 * AV1 with apply_grain=0, AV1 frames where cur_frame ==
 	 * cur_display).
 	 */
 	VASurfaceID linked_decode_surface_id;
 	/*
 	 * AV1 Phase 3: AV1 order_hint of the frame currently decoded into
 	 * this surface. VAAPI's VADecPictureParameterBufferAV1.order_hint
 	 * is per-frame; kernel's v4l2_ctrl_av1_frame.order_hints[8] is
 	 * per-reference. We track each decoded frame's order_hint here so
 	 * the next frame's av1_set_controls can populate order_hints[i]
 	 * from ref_frame_map[i] → SURFACE → av1_order_hint.
 	 */
 	uint8_t av1_order_hint;
 	union {
 		struct {
 			VAPictureParameterBufferMPEG2 picture;
@@ -125,43 +73,15 @@ struct object_surface {
 		} mpeg2;
 		struct {
 			VAIQMatrixBufferH264 matrix;
 			bool matrix_set;
 			VAPictureParameterBufferH264 picture;
 			VASliceParameterBufferH264 slice;
 		} h264;
 		struct {
 			VAPictureParameterBufferHEVC picture;
 			VASliceParameterBufferHEVC slice;
 			VASliceParameterBufferHEVC slices[HEVC_MAX_SLICES_PER_FRAME];
 			unsigned int num_slices;
 			VAIQMatrixBufferHEVC iqmatrix;
 			bool iqmatrix_set;
 		} h265;
 		struct {
 			VAPictureParameterBufferVP8 picture;
 			VASliceParameterBufferVP8 slice;
 			VAIQMatrixBufferVP8 iqmatrix;
 			bool iqmatrix_set;
 			VAProbabilityDataBufferVP8 probability;
 			bool probability_set;
 		} vp8;
 		struct {
 			VADecPictureParameterBufferVP9 picture;
 			VASliceParameterBufferVP9 slice;
 		} vp9;
 		/*
 		 * ampere-av1-enablement: AV1 needs picture-header +
 		 * variable number of slice/tile params (one per tile).
 		 * tile_group_entries[] holds parsed VASliceParameterBufferAV1
 		 * entries up to MAX_TILES; av1.c builds the matching
 		 * v4l2_ctrl_av1_tile_group_entry[] at set_controls time.
 		 */
 		struct {
 #define AV1_MAX_TILES 128
 			VADecPictureParameterBufferAV1 picture;
 			VASliceParameterBufferAV1 tile_group_entries[AV1_MAX_TILES];
 			unsigned int num_tile_group_entries;
 		} av1;
 	} params;
 	int request_fd;
@@ -205,37 +125,4 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
 				    VASurfaceID surface_id, uint32_t mem_type,
 				    uint32_t flags, void *descriptor);
 /*
 * iter5b-β Commit D: populate a surface's format-uniform destination_*
 * fields (planes_count, buffers_count, offsets, sizes, bytesperlines)
 * from driver_data's cached CAPTURE-side geometry. Idempotent: skip
 * if already filled (destination_planes_count != 0). Caller must
 * ensure driver_data->fmt_valid is true (CreateContext has run).
 *
 * Called by:
 *   - context.c::RequestCreateContext after v4l2_get_format(CAPTURE)
 *     populates the cache; walks the surface_heap and fills every
 *     existing surface (covers surfaces created before CreateContext,
 *     including the ffmpeg vaapi-copy case where surfaces_count=0 is
 *     passed but surfaces exist in the heap from earlier
 *     CreateSurfaces2 calls).
 *   - surface.c::RequestCreateSurfaces2 after surface allocation,
 *     covering the case where CreateContext fired before this
 *     CreateSurfaces2 call (fmt cache is valid, fill immediately).
 */
 void surface_fill_format_uniform(struct request_data *driver_data,
 				 struct object_surface *surface_object);
 /*
 * Iter2 Fix 3: bind / unbind a CAPTURE-pool slot to an object_surface.
 * Called from picture.c::RequestBeginPicture (acquire+bind) and
 * surface.c::RequestDestroySurfaces (unbind). Mirrors slot's V4L2 index
 * and mmap pointers into surface_object->destination_* so existing
 * QBUF/DQBUF/EXPBUF code paths see no behavioral change.
 */
 void surface_bind_slot(struct object_surface *surface_object,
 		       struct cap_pool_slot *slot);
 void surface_unbind_slot(struct request_data *driver_data,
 			 struct object_surface *surface_object);
 #endif
@@ -27,7 +27,7 @@
 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
 #endif
-#ifdef __arm__
+#ifndef __aarch64__
 .text
 .syntax unified
@@ -182,4 +182,19 @@ thumb_function tiled_deinterleave_to_planar
 	b	7b
 end_function tiled_deinterleave_to_planar
 #else /* __aarch64__ */
 /* Fourier-local: aarch64 stub. The body of this file is ARMv7 NEON Thumb
 * assembly used by sunxi-cedrus (Allwinner-tiled NV12 → linear NV12) and
 * is never reached on aarch64 hosts because video_format_is_linear()
 * returns true for our NV12 entries. Provide a stub so the shared object
 * still resolves the symbol; if ever actually called, return immediately. */
 .text
 /* Define tiled_to_planar as a global function symbol. */
 .global tiled_to_planar
 .type tiled_to_planar, %function
 /* Hidden visibility: the symbol is not exported from the shared object. */
 .hidden tiled_to_planar
 tiled_to_planar:
 	/* Stub body: return to the caller immediately without doing any work. */
 	ret
 /*
  * NOTE(review): the ARM branch above also defines
  * tiled_deinterleave_to_planar, for which no stub is provided here.
  * Confirm nothing references it on aarch64 builds.
  */
 .size tiled_to_planar, .-tiled_to_planar
 #endif
@@ -33,10 +33,24 @@
 void request_log(const char *format, ...)
 {
 	/* Lazily opened trace file; this function never closes it. */
 	static FILE *trace_fp;
 	va_list arguments;
 	/* Primary sink: vendor-prefixed message on stderr. */
 	fprintf(stderr, "%s: ", V4L2_REQUEST_STR_VENDOR);
 	va_start(arguments, format);
 	vfprintf(stderr, format, arguments);
 	va_end(arguments);
 	/*
 	 * Fourier-local secondary sink: mirror the message into
 	 * /tmp/libva-fourier.log so it remains visible from sandboxed GPU
 	 * processes (Chromium etc.) whose stderr is redirected. The file is
 	 * opened in append mode on first use; while the open keeps failing,
 	 * it is retried on every call and the mirror is silently skipped.
 	 *
 	 * NOTE(review): the stdio calls in this function may modify errno.
 	 * Callers that format strerror(errno) after an earlier request_log()
 	 * call should confirm they still see the value they expect.
 	 */
 	if (trace_fp == NULL) {
 		trace_fp = fopen("/tmp/libva-fourier.log", "a");
 		if (trace_fp == NULL)
 			return;
 	}
 	fprintf(trace_fp, "%s: ", V4L2_REQUEST_STR_VENDOR);
 	va_start(arguments, format);
 	vfprintf(trace_fp, format, arguments);
 	va_end(arguments);
 	/* Flush per message so the log is current even if the process dies. */
 	fflush(trace_fp);
 }
@@ -428,102 +428,37 @@ int v4l2_export_buffer(int video_fd, unsigned int type, unsigned int index,
 	return 0;
 }
-static int v4l2_ioctl_controls(int video_fd, int request_fd, unsigned long ioc,
+int v4l2_set_control(int video_fd, int request_fd, unsigned int id, void *data,
-			       struct v4l2_ext_control *control_array,
+		     unsigned int size)
 			       unsigned int num_controls)
 {
 	struct v4l2_ext_control control;
 	struct v4l2_ext_controls controls;
 	int rc;
 	memset(&control, 0, sizeof(control));
 	memset(&controls, 0, sizeof(controls));
-	controls.controls = control_array;
+	control.id = id;
-	controls.count = num_controls;
+	control.ptr = data;
 	control.size = size;
 	controls.controls = &control;
 	controls.count = 1;
 	if (request_fd >= 0) {
 		controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
 		controls.request_fd = request_fd;
 	}
-	rc = ioctl(video_fd, ioc, &controls);
+	rc = ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &controls);
 	if (rc < 0) {
-		/* ampere-av1 Phase 2.1 diag: surface error_idx so the caller's
+		request_log("Unable to set control: %s\n", strerror(errno));
 		 * error path knows which CID failed validation. error_idx >=
 		 * count means the failure was pre-validation (e.g., bad
 		 * request_fd). errno carries the syscall-level reason. */
 		const char *failed_cid_label = "<pre-validation>";
 		unsigned int failed_size = 0;
 		if (controls.error_idx < num_controls) {
 			failed_size = control_array[controls.error_idx].size;
 			(void)failed_cid_label;  /* keep symbol if logger truncates */
 		}
 		request_log("v4l2_ioctl_controls: rc=%d errno=%d (%s) "
 			    "ioc=0x%lx error_idx=%u count=%u "
 			    "failed_cid=0x%x failed_size=%u\n",
 			    rc, errno, strerror(errno), ioc,
 			    controls.error_idx, num_controls,
 			    controls.error_idx < num_controls
 			        ? control_array[controls.error_idx].id : 0,
 			    failed_size);
 	}
 	return rc;
 }
 int v4l2_get_controls(int video_fd, int request_fd,
 		      struct v4l2_ext_control *control_array,
 		      unsigned int num_controls)
 {
 	/* Read the whole control batch through the shared ioctl helper. */
 	if (v4l2_ioctl_controls(video_fd, request_fd, VIDIOC_G_EXT_CTRLS,
 				control_array, num_controls) >= 0)
 		return 0;
 	/*
 	 * EACCES on G_EXT_CTRLS for request fds is the normal case on
 	 * this hantro rig: the kernel doesn't allow readback through
 	 * the request_fd. The caller (h264.c) tracks this with a one-time
 	 * "V4L2 readback unavailable" announcement, so stay quiet here to
 	 * keep the log signal-to-noise high.
 	 *
 	 * NOTE(review): errno is read after v4l2_ioctl_controls() has
 	 * already logged the failure; confirm the logging path cannot
 	 * change errno before this check.
 	 */
 	if (errno == EACCES)
 		return -1;
 	request_log("Unable to get control(s): %s\n", strerror(errno));
 	return -1;
 }
 int v4l2_set_controls(int video_fd, int request_fd,
 		      struct v4l2_ext_control *control_array,
 		      unsigned int num_controls)
 {
 	/*
 	 * Apply the whole batch with a single VIDIOC_S_EXT_CTRLS call via
 	 * the shared helper. Any failure is logged and collapsed to -1;
 	 * success returns 0.
 	 */
 	if (v4l2_ioctl_controls(video_fd, request_fd, VIDIOC_S_EXT_CTRLS,
 				control_array, num_controls) < 0) {
 		request_log("Unable to set control(s): %s\n", strerror(errno));
 		return -1;
 	}
 	return 0;
 }
 int v4l2_set_control(int video_fd, int request_fd, unsigned int id, void *data,
 		     unsigned int size)
 {
 	struct v4l2_ext_control control;
 	memset(&control, 0, sizeof(control));
 	control.id = id;
 	control.ptr = data;
 	control.size = size;
 	return v4l2_set_controls(video_fd, request_fd, &control, 1);
 }
 int v4l2_set_stream(int video_fd, unsigned int type, bool enable)
 {
 	enum v4l2_buf_type buf_type = type;
@@ -539,63 +474,3 @@ int v4l2_set_stream(int video_fd, unsigned int type, bool enable)
 	return 0;
 }
 int v4l2_query_ext_ctrl(int video_fd, unsigned int id,
 			struct v4l2_query_ext_ctrl *qec)
 {
 	struct v4l2_query_ext_ctrl local;
 	struct v4l2_query_ext_ctrl *target = qec ? qec : &local;
 	int rc;
 	memset(target, 0, sizeof(*target));
 	target->id = id;
 	rc = ioctl(video_fd, VIDIOC_QUERY_EXT_CTRL, target);
 	if (rc < 0)
 		return -1;
 	return 0;
 }
 int v4l2_query_menu(int video_fd, unsigned int id, unsigned int index,
 		    struct v4l2_querymenu *qm)
 {
 	/* Unlike v4l2_query_ext_ctrl(), an output struct is mandatory here. */
 	if (qm == NULL)
 		return -1;
 	/* Zero the struct, then select the control and the menu slot. */
 	memset(qm, 0, sizeof(*qm));
 	qm->index = index;
 	qm->id = id;
 	/* Any ioctl failure is collapsed to -1; success is 0. */
 	if (ioctl(video_fd, VIDIOC_QUERYMENU, qm) < 0)
 		return -1;
 	return 0;
 }
 bool v4l2_ctrl_menu_has_value(int video_fd, unsigned int id,
 			      unsigned int value)
 {
 	struct v4l2_query_ext_ctrl qec;
 	struct v4l2_querymenu qm;
 	long long i;
 	if (v4l2_query_ext_ctrl(video_fd, id, &qec) < 0)
 		return false;
 	if (qec.type != V4L2_CTRL_TYPE_MENU &&
 	    qec.type != V4L2_CTRL_TYPE_INTEGER_MENU)
 		return false;
 	for (i = qec.minimum; i <= qec.maximum; i += qec.step ? qec.step : 1) {
 		if (v4l2_query_menu(video_fd, id, (unsigned int)i, &qm) < 0)
 			continue;
 		if ((unsigned int)i == value)
 			return true;
 	}
 	return false;
 }
@@ -54,47 +54,8 @@ int v4l2_dequeue_buffer(int video_fd, int request_fd, unsigned int type,
 int v4l2_export_buffer(int video_fd, unsigned int type, unsigned int index,
 		       unsigned int flags, int *export_fds,
 		       unsigned int export_fds_count);
 int v4l2_get_controls(int video_fd, int request_fd,
 		      struct v4l2_ext_control *controls,
 		      unsigned int num_controls);
 int v4l2_set_controls(int video_fd, int request_fd,
 		      struct v4l2_ext_control *controls,
 		      unsigned int num_controls);
 int v4l2_set_control(int video_fd, int request_fd, unsigned int id, void *data,
 		     unsigned int size);
 int v4l2_set_stream(int video_fd, unsigned int type, bool enable);
 /*
 * Capability-probe helpers. These let calling code discover what the
 * backing kernel driver supports rather than hardcoding assumptions
 * about specific decoder hardware.
 */
 /*
 * Query the metadata of an extended control by CID. Fills *qec on
 * success. Returns 0 if the control exists, -1 (errno=EINVAL) if the
 * driver does not expose this CID. Pass qec=NULL to test existence
 * only.
 */
 struct v4l2_query_ext_ctrl;
 int v4l2_query_ext_ctrl(int video_fd, unsigned int id,
 			struct v4l2_query_ext_ctrl *qec);
 /*
 * Query a single menu item of a menu/intmenu control at the given
 * index. Fills *qm on success. Returns 0 if the menu item exists at
 * this index, -1 otherwise.
 */
 struct v4l2_querymenu;
 int v4l2_query_menu(int video_fd, unsigned int id, unsigned int index,
 		    struct v4l2_querymenu *qm);
 /*
 * Convenience: for a menu-type control, return true iff `value` is a
 * valid menu entry (i.e. the driver accepts it). Walks all menu items
 * up to the control's maximum to check.
 */
 bool v4l2_ctrl_menu_has_value(int video_fd, unsigned int id,
 			      unsigned int value);
 #endif
@@ -32,6 +32,7 @@
 #include <linux/videodev2.h>
 #include "utils.h"
 #include "v4l2.h"
 #include "video.h"
 static struct video_format formats[] = {
@@ -39,14 +40,22 @@ static struct video_format formats[] = {
 		.description		= "NV12 YUV",
 		.v4l2_format		= V4L2_PIX_FMT_NV12,
 		.v4l2_buffers_count	= 1,
 		.v4l2_mplane		= false,
 		.drm_format		= DRM_FORMAT_NV12,
 		.drm_modifier		= DRM_FORMAT_MOD_NONE,
 		.planes_count		= 2,
 		.bpp			= 16,
 	},
 	{
 		.description		= "NV12 YUV (multi-plane)",
 		.v4l2_format		= V4L2_PIX_FMT_NV12,
 		.v4l2_buffers_count	= 1,
 		.v4l2_mplane		= true,
 		.drm_format		= DRM_FORMAT_NV12,
 		.drm_modifier		= DRM_FORMAT_MOD_NONE,
 		.planes_count		= 2,
 		.bpp			= 16,
 	},
 // Code to handle this DRM_FORMAT is __arm__ only
 #ifdef __arm__
 	{
 		.description		= "Sunxi tiled NV12 YUV",
 		.v4l2_format		= V4L2_PIX_FMT_SUNXI_TILED_NV12,
@@ -57,22 +66,56 @@ static struct video_format formats[] = {
 		.planes_count		= 2,
 		.bpp			= 16
 	},
 #endif
 };
 static unsigned int formats_count = sizeof(formats) / sizeof(formats[0]);
-struct video_format *video_format_find(unsigned int pixelformat)
+struct video_format *video_format_find(unsigned int pixelformat, bool mplane)
 {
 	unsigned int i;
 	for (i = 0; i < formats_count; i++)
-		if (formats[i].v4l2_format == pixelformat)
+		if (formats[i].v4l2_format == pixelformat &&
 		    formats[i].v4l2_mplane == mplane)
 			return &formats[i];
 	return NULL;
 }
 /*
 * Probe the V4L2 video device for a supported CAPTURE pixel format and
 * return the matching video_format entry. Tries single-plane CAPTURE first
 * (the original sunxi-cedrus path: SUNXI_TILED_NV12 or NV12), then falls
 * back to multi-plane CAPTURE (Rockchip hantro / RK3588 VDPU381). Returns
 * NULL if no match.
 *
 * Centralised here so RequestInit can populate driver_data->video_format
 * before any vaCreateContext call. The original library set it lazily in
 * RequestCreateSurfaces, but Chromium's vaapi_video_decoder may call
 * vaCreateContext first (with surfaces=NULL, surfaces_count=0) and only
 * create surfaces afterwards via vaCreateSurfaces2.
 */
 struct video_format *video_format_probe(int video_fd)
 {
 	/*
 	 * Candidates in priority order: single-plane CAPTURE with the sunxi
 	 * tiled format, single-plane CAPTURE NV12, then multi-plane CAPTURE
 	 * NV12. Later entries are only probed if no earlier one produced a
 	 * table match.
 	 */
 	static const struct {
 		unsigned int buf_type;
 		unsigned int pixelformat;
 		bool mplane;
 	} candidates[] = {
 		{ V4L2_BUF_TYPE_VIDEO_CAPTURE,
 		  V4L2_PIX_FMT_SUNXI_TILED_NV12, false },
 		{ V4L2_BUF_TYPE_VIDEO_CAPTURE,
 		  V4L2_PIX_FMT_NV12, false },
 		{ V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE,
 		  V4L2_PIX_FMT_NV12, true },
 	};
 	unsigned int k;
 	for (k = 0; k < sizeof(candidates) / sizeof(candidates[0]); k++) {
 		struct video_format *match;
 		/* Skip candidates the device does not advertise. */
 		if (!v4l2_find_format(video_fd, candidates[k].buf_type,
 				      candidates[k].pixelformat))
 			continue;
 		/*
 		 * The device offers it, but the local format table may
 		 * not contain it; in that case keep probing.
 		 */
 		match = video_format_find(candidates[k].pixelformat,
 					  candidates[k].mplane);
 		if (match != NULL)
 			return match;
 	}
 	return NULL;
 }
 bool video_format_is_linear(struct video_format *format)
 {
 	if (format == NULL)
@@ -38,7 +38,8 @@ struct video_format {
 	unsigned int bpp;
 };
-struct video_format *video_format_find(unsigned int pixelformat);
+struct video_format *video_format_find(unsigned int pixelformat, bool mplane);
 struct video_format *video_format_probe(int video_fd);
 bool video_format_is_linear(struct video_format *format);
 #endif
@@ -1,263 +0,0 @@
 /*
 * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
 *
 * fresnel-fourier iter3 Phase 6 commit B: VP8 codec dispatcher
 * implemented against V4L2_CID_STATELESS_VP8_FRAME (kernel UAPI
 * <linux/v4l2-controls.h>:1900-1958). Single batched control per
 * frame, no init-time device-wide menus (VP8 has no DECODE_MODE/
 * START_CODE — confirmed by Phase 0 V4L2 inventory + Phase 3
 * cross-validator strace).
 *
 * Reference: FFmpeg libavcodec/v4l2_request_vp8.c (kwiboo branch);
 *            FFmpeg libavcodec/vaapi_vp8.c (VAAPI source-side
 *            verification of the field semantics);
 *            kernel drivers/media/platform/verisilicon/
 *              hantro_g1_vp8_dec.c (RK3399 hardware reads
 *              first_part_header_bits + first_part_size to compute
 *              MB-data DMA offset).
 *
 * Phase 5 review amendments incorporated (see phase5_iter3_review.md):
 *   C1 first_part_header_bits = slice->macroblock_offset
 *      (NOT 0; kernel reads it unconditionally; same formula as
 *      v4l2_request_vp8.c uses internally)
 *   C2 first_part_size = slice->partition_size[0] +
 *                        ((macroblock_offset + 7) / 8)
 *      (recover total partition size from VAAPI's post-parse
 *      remainder)
 *   C3 VAProbabilityBufferType (not VAProbabilityDataBufferType)
 *   C4 (int8_t) cast (not (s8); kernel-internal typedef not in
 *      userspace UAPI)
 *   S3 assert(probability_set) runtime guard (kernel has NO
 *      coeff_probs default fallback; consumer MUST send
 *      VAProbabilityBufferType per frame)
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
 #include "vp8.h"
 #include "context.h"
 #include "request.h"
 #include "surface.h"
 #include <assert.h>
 #include <stdint.h>
 #include <string.h>
 #include <sys/ioctl.h>
 #include <linux/videodev2.h>
 #include <linux/v4l2-controls.h>
 #include "v4l2.h"
 int vp8_set_controls(struct request_data *driver_data,
 		     struct object_context *context_object,
 		     struct object_surface *surface_object)
 {
 	VAPictureParameterBufferVP8 *picture =
 		&surface_object->params.vp8.picture;
 	VASliceParameterBufferVP8 *slice =
 		&surface_object->params.vp8.slice;
 	VAIQMatrixBufferVP8 *iqmatrix =
 		&surface_object->params.vp8.iqmatrix;
 	VAProbabilityDataBufferVP8 *probability =
 		&surface_object->params.vp8.probability;
 	bool iqmatrix_set = surface_object->params.vp8.iqmatrix_set;
 	bool probability_set = surface_object->params.vp8.probability_set;
 	struct v4l2_ctrl_vp8_frame frame;
 	struct object_surface *last_ref;
 	struct object_surface *golden_ref;
 	struct object_surface *alt_ref;
 	int rc;
 	int i, j;
 	memset(&frame, 0, sizeof frame);
 	/* Phase 5 S3: kernel has no coeff_probs default fallback. The
 	 * VAAPI consumer chain (FFmpeg's vaapi_vp8.c:146-148, used by
 	 * mpv and ffmpeg-vaapi) always sends VAProbabilityBufferType
 	 * per frame. Surface immediately if a future consumer doesn't. */
 	assert(probability_set);
 	/* Clause 3: frame geometry + per-frame scalars */
 	frame.width = picture->frame_width;
 	frame.height = picture->frame_height;
 	frame.horizontal_scale = 0;  /* not exposed by VAAPI */
 	frame.vertical_scale = 0;
 	frame.version = picture->pic_fields.bits.version;
 	frame.prob_skip_false = picture->prob_skip_false;
 	frame.prob_intra = picture->prob_intra;
 	frame.prob_last = picture->prob_last;
 	frame.prob_gf = picture->prob_gf;
 	/* Phase 3 Q2: VAAPI counts include control partition;
 	 * kernel counts DCT only — off-by-one. */
 	frame.num_dct_parts = slice->num_of_partitions - 1;
 	/* Clause 4: DPB timestamp resolution (mirrors mpeg2.c pattern;
 	 * NULL surface → timestamp stays 0 from memset). */
 	last_ref = SURFACE(driver_data, picture->last_ref_frame);
 	golden_ref = SURFACE(driver_data, picture->golden_ref_frame);
 	alt_ref = SURFACE(driver_data, picture->alt_ref_frame);
 	if (last_ref != NULL)
 		frame.last_frame_ts =
 			v4l2_timeval_to_ns(&last_ref->timestamp);
 	if (golden_ref != NULL)
 		frame.golden_frame_ts =
 			v4l2_timeval_to_ns(&golden_ref->timestamp);
 	if (alt_ref != NULL)
 		frame.alt_frame_ts =
 			v4l2_timeval_to_ns(&alt_ref->timestamp);
 	/* Clause 5: loop filter mapping */
 	for (i = 0; i < 4; i++) {
 		frame.lf.ref_frm_delta[i] =
 			picture->loop_filter_deltas_ref_frame[i];
 		frame.lf.mb_mode_delta[i] =
 			picture->loop_filter_deltas_mode[i];
 	}
 	frame.lf.sharpness_level = picture->pic_fields.bits.sharpness_level;
 	frame.lf.level = picture->loop_filter_level[0];
 	if (picture->pic_fields.bits.loop_filter_adj_enable)
 		frame.lf.flags |= V4L2_VP8_LF_ADJ_ENABLE;
 	if (picture->pic_fields.bits.mode_ref_lf_delta_update)
 		frame.lf.flags |= V4L2_VP8_LF_DELTA_UPDATE;
 	if (picture->pic_fields.bits.filter_type)
 		frame.lf.flags |= V4L2_VP8_LF_FILTER_TYPE_SIMPLE;
 	/* Clause 6: quantization base + delta derivation */
 	if (iqmatrix_set) {
 		frame.quant.y_ac_qi =
 			iqmatrix->quantization_index[0][0];
 		frame.quant.y_dc_delta = (int8_t)
 			(iqmatrix->quantization_index[0][1] -
 			 iqmatrix->quantization_index[0][0]);
 		frame.quant.y2_dc_delta = (int8_t)
 			(iqmatrix->quantization_index[0][2] -
 			 iqmatrix->quantization_index[0][0]);
 		frame.quant.y2_ac_delta = (int8_t)
 			(iqmatrix->quantization_index[0][3] -
 			 iqmatrix->quantization_index[0][0]);
 		frame.quant.uv_dc_delta = (int8_t)
 			(iqmatrix->quantization_index[0][4] -
 			 iqmatrix->quantization_index[0][0]);
 		frame.quant.uv_ac_delta = (int8_t)
 			(iqmatrix->quantization_index[0][5] -
 			 iqmatrix->quantization_index[0][0]);
 	}
 	if (picture->pic_fields.bits.segmentation_enabled && iqmatrix_set) {
 		for (i = 1; i < 4; i++)
 			frame.segment.quant_update[i] = (int8_t)
 				(iqmatrix->quantization_index[i][0] -
 				 iqmatrix->quantization_index[0][0]);
 	}
 	/* Clause 7: segment fields */
 	for (i = 0; i < 3; i++)
 		frame.segment.segment_probs[i] =
 			picture->mb_segment_tree_probs[i];
 	if (picture->pic_fields.bits.segmentation_enabled)
 		frame.segment.flags |= V4L2_VP8_SEGMENT_FLAG_ENABLED;
 	if (picture->pic_fields.bits.update_mb_segmentation_map)
 		frame.segment.flags |= V4L2_VP8_SEGMENT_FLAG_UPDATE_MAP;
 	if (picture->pic_fields.bits.update_segment_feature_data)
 		frame.segment.flags |=
 			V4L2_VP8_SEGMENT_FLAG_UPDATE_FEATURE_DATA;
 	/* DELTA_VALUE_MODE: VAAPI doesn't expose abs_delta. FFmpeg sets
 	 * unconditionally per !s->segmentation.absolute_vals (default).
 	 * Kernel ignores when ENABLED bit clear (BBB case). */
 	frame.segment.flags |= V4L2_VP8_SEGMENT_FLAG_DELTA_VALUE_MODE;
 	if (picture->pic_fields.bits.segmentation_enabled) {
 		for (i = 0; i < 4; i++)
 			frame.segment.lf_update[i] = (int8_t)
 				(picture->loop_filter_level[i] -
 				 picture->loop_filter_level[0]);
 	}
 	/* Clause 8: entropy table mapping (3 VAAPI sources merged) */
 	for (i = 0; i < 4; i++)
 		frame.entropy.y_mode_probs[i] = picture->y_mode_probs[i];
 	for (i = 0; i < 3; i++)
 		frame.entropy.uv_mode_probs[i] = picture->uv_mode_probs[i];
 	for (i = 0; i < 2; i++)
 		for (j = 0; j < 19; j++)
 			frame.entropy.mv_probs[i][j] =
 				picture->mv_probs[i][j];
 	/* coeff_probs[4][8][3][11]: VAAPI layout matches kernel exactly;
 	 * direct memcpy. Both vaapi_vp8.c:133-143 and v4l2_request_vp8.c:
 	 * 141-153 apply identical coeff_bands_inverse reordering before
 	 * writing — VAAPI consumer has done the reordering for us. */
 	memcpy(frame.entropy.coeff_probs,
 	       probability->dct_coeff_probs,
 	       sizeof frame.entropy.coeff_probs);
 	/* Clause 9: coder state + first-partition fields */
 	frame.coder_state.range = picture->bool_coder_ctx.range;
 	frame.coder_state.value = picture->bool_coder_ctx.value;
 	frame.coder_state.bit_count = picture->bool_coder_ctx.count;
 	/* Phase 5 C1+C2: macroblock_offset IS first_part_header_bits by
 	 * source identity; kernel hantro_g1_vp8_dec.c:260 reads it
 	 * unconditionally to compute MB-data DMA offset. partition_size[0]
 	 * is the post-parse REMAINDER; recover total via
 	 * + ceil(macroblock_offset/8). */
 	frame.first_part_header_bits = slice->macroblock_offset;
 	frame.first_part_size =
 		slice->partition_size[0] +
 		((uint32_t)slice->macroblock_offset + 7) / 8;
 	for (i = 0; i < 8; i++)
 		frame.dct_part_sizes[i] = slice->partition_size[i + 1];
 	/* Clause 9: flags assembly (6 mainline-documented bits only;
 	 * EXPERIMENTAL + bit 0x40 NOT replicated despite ffmpeg-v4l2-
 	 * request-git setting them — kernel hantro_vp8.c only inspects
 	 * KEY_FRAME bit). VAAPI inverts: key_frame=0 means it IS a
 	 * keyframe per VP8 spec. */
 	if (!picture->pic_fields.bits.key_frame)
 		frame.flags |= V4L2_VP8_FRAME_FLAG_KEY_FRAME;
 	frame.flags |= V4L2_VP8_FRAME_FLAG_SHOW_FRAME;
 	if (picture->pic_fields.bits.mb_no_coeff_skip)
 		frame.flags |= V4L2_VP8_FRAME_FLAG_MB_NO_SKIP_COEFF;
 	if (picture->pic_fields.bits.sign_bias_golden)
 		frame.flags |= V4L2_VP8_FRAME_FLAG_SIGN_BIAS_GOLDEN;
 	if (picture->pic_fields.bits.sign_bias_alternate)
 		frame.flags |= V4L2_VP8_FRAME_FLAG_SIGN_BIAS_ALT;
 	/* Clause 1+10: single-control batched submission */
 	struct v4l2_ext_control ctrls[1] = {
 		{
 			.id = V4L2_CID_STATELESS_VP8_FRAME,
 			.ptr = &frame,
 			.size = sizeof frame,
 		},
 	};
 	rc = v4l2_set_controls(driver_data->video_fd,
 			       surface_object->request_fd,
 			       ctrls, 1);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	return 0;
 }
@@ -1,38 +0,0 @@
 /*
 * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
 *
 * fresnel-fourier iter3: VP8 codec dispatcher header.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
 #ifndef _VP8_H_
 #define _VP8_H_
 /* Forward declarations only: this header handles these types purely
 * through pointers, so their full definitions are not required here. */
 struct object_context;
 struct object_surface;
 struct request_data;
 /*
 * Fill and submit the VP8 frame control for the request attached to
 * surface_object. The definition ends by returning 0 after a successful
 * v4l2_set_controls() call and VA_STATUS_ERROR_OPERATION_FAILED when
 * that call fails.
 * NOTE(review): any earlier error returns live in the part of the
 * definition not covered here — confirm before relying on this list.
 */
 int vp8_set_controls(struct request_data *driver_data,
 		     struct object_context *context,
 		     struct object_surface *surface_object);
 #endif
@@ -1,754 +0,0 @@
 /*
 * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
 *
 * fresnel-fourier iter4 Phase 6 commit B: VP9 codec dispatcher
 * implemented against V4L2_CID_STATELESS_VP9_FRAME (0xa40a2c) +
 * V4L2_CID_STATELESS_VP9_COMPRESSED_HDR (0xa40a2d). rkvdec on
 * RK3399 mandatorily requires both controls per
 * drivers/staging/media/rkvdec/rkvdec-vp9.c::rkvdec_vp9_run_preamble:752.
 *
 * Reference: FFmpeg libavcodec/v4l2_request_vp9.c (kwiboo branch);
 *            FFmpeg libavcodec/vaapi_vp9.c (VAAPI source-side
 *            verification of field semantics);
 *            kernel drivers/media/v4l2-core/v4l2-vp9.c +
 *              drivers/staging/media/rkvdec/rkvdec-vp9.c.
 *
 * Phase 5 review amendments incorporated (see phase5_iter4_review.md):
 *   C1 frame.interpolation_filter = picture->mcomp_filter_type
 *      (NO XOR; vaapi_vp9.c:62 already applied the XOR before storing
 *      into VAAPI's mcomp_filter_type; double-XOR would swap
 *      EIGHTTAP and EIGHTTAP_SMOOTH for inter frames)
 *   C2 LF deltas persisted across frames in object_context.vp9_lf,
 *      init to VP9 spec defaults {1,0,-1,-1,0,0} on
 *      keyframe/intra_only/error_resilient, updated only when parsed
 *      lf_delta.update=1, ALWAYS copied to kernel control
 *   C3 vp9_fill_compressed_hdr takes out_reference_mode pointer
 *      (reference_mode lives in v4l2_ctrl_vp9_frame, NOT in
 *      _compressed_hdr; threaded via parameter)
 *
 * Suggested findings incorporated:
 *   S4 uv_mode memcpy from FFmpeg's fill_compressed_hdr is omitted —
 *      rkvdec reads uv_mode from kernel's persistent
 *      probability_tables, NOT from prob_updates ctrl
 *   S5 lossless_flag semantics align with FFmpeg's s->s.h.lossless
 *      (LosslessFlag = base_qindex==0 && y_dc_delta_q==0 &&
 *      uv_dc_delta_q==0 && uv_ac_delta_q==0)
 */
 #include "vp9.h"
 #include "v4l2.h"
 #include "utils.h"
 #include <assert.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
 #include <linux/v4l2-controls.h>
 #include <linux/videodev2.h>
 /* Clause 3: compile-time size assertions. UAPI shifts must fail loudly. */
 /* Both structs are handed to the kernel with .size = sizeof(struct), so a
 * layout that differs from the sizes expected here (168 and 2040 bytes)
 * stops the build instead of producing a mis-sized control payload. */
 _Static_assert(sizeof(struct v4l2_ctrl_vp9_frame) == 168,
 	       "v4l2_ctrl_vp9_frame size mismatch — kernel UAPI changed");
 _Static_assert(sizeof(struct v4l2_ctrl_vp9_compressed_hdr) == 2040,
 	       "v4l2_ctrl_vp9_compressed_hdr size mismatch — kernel UAPI changed");
 /*
 * VPX range coder — minimal port of FFmpeg vpx_rac.[ch] + vp89_rac.h.
 * Stateless static helpers; bitstream-only readers. ~80 LOC.
 */
 /* Renormalisation shift table, indexed by the range coder's current
 * `high` value. vp9_rac_renorm() left-shifts both `high` and the code
 * word by the looked-up amount; entries for 128..255 are 0, i.e. no
 * shift is applied once `high` already has its top bit set. */
 static const uint8_t vpx_norm_shift[256] = {
 	8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,
 	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };
 /*
 * Range-decoder state.
 *   high      - current range; starts at 255.
 *   bits      - refill counter; starts at -16 and a 16-bit refill is
 *               attempted once it reaches 0 or more.
 *   buffer    - next unread input byte.
 *   end       - one past the last input byte.
 *   code_word - bits currently being decoded, seeded with the first
 *               three input bytes (big-endian).
 */
 struct vp9_rac {
 	int high;
 	int bits;
 	const uint8_t *buffer;
 	const uint8_t *end;
 	unsigned int code_word;
 };
 /*
 * Initialise the range decoder over buf[0..size).
 *
 * The code word is seeded from the first three bytes. Inputs shorter
 * than three bytes are zero-padded instead of being read past their end,
 * and the read cursor never moves beyond `end`.
 *
 * Returns 0 on success, -1 when buf is NULL or size < 1 (in which case
 * *c is left untouched).
 */
 static int vp9_rac_init(struct vp9_rac *c, const uint8_t *buf, int size)
 {
 	unsigned int word = 0;
 	int avail;
 	int i;
 	if (buf == NULL || size < 1)
 		return -1;
 	/* Only touch bytes that actually exist; pad the rest with zero. */
 	avail = size < 3 ? size : 3;
 	for (i = 0; i < 3; i++)
 		word = (word << 8) | (i < avail ? (unsigned int)buf[i] : 0u);
 	c->high = 255;
 	c->bits = -16;
 	c->end = buf + size;
 	c->buffer = buf + avail;
 	c->code_word = word;
 	return 0;
 }
 /*
 * Renormalise the decoder: scale `high` and the code word by the
 * table-driven shift and, when the refill counter has reached zero or
 * more and at least two input bytes remain, merge the next 16 input bits
 * into the code word. Updates c->high, c->bits and c->buffer; the
 * shifted code word is returned to the caller and NOT stored in *c.
 * NOTE(review): a single trailing input byte is never consumed by the
 * two-byte refill condition — confirm that is intended.
 */
 static unsigned vp9_rac_renorm(struct vp9_rac *c)
 {
 	const int norm = vpx_norm_shift[c->high];
 	unsigned word = c->code_word << norm;
 	int pending = c->bits + norm;
 	c->high <<= norm;
 	if (pending >= 0 && c->buffer + 1 < c->end) {
 		const unsigned refill =
 			((unsigned)c->buffer[0] << 8) | c->buffer[1];
 		word |= refill << pending;
 		c->buffer += 2;
 		pending -= 16;
 	}
 	c->bits = pending;
 	return word;
 }
 /*
 * Decode one binary symbol. `prob` (out of 256) positions the split
 * point inside the current range: a code word at or above the split
 * decodes as 1 and the range/code word are reduced by the split;
 * otherwise the symbol is 0 and the range becomes the split itself.
 */
 static int vp9_rac_get_prob(struct vp9_rac *c, uint8_t prob)
 {
 	const unsigned word = vp9_rac_renorm(c);
 	const unsigned split = 1 + (((c->high - 1) * prob) >> 8);
 	const unsigned split_hi = split << 16;
 	if (word >= split_hi) {
 		c->high = c->high - split;
 		c->code_word = word - split_hi;
 		return 1;
 	}
 	c->high = split;
 	c->code_word = word;
 	return 0;
 }
 /* Same as vp9_rac_get_prob(), but takes the probability as an int and
 * narrows it to 8 bits before decoding. */
 static int vp9_rac_get_branchy(struct vp9_rac *c, int prob)
 {
 	const uint8_t narrowed = (uint8_t)prob;
 	return vp9_rac_get_prob(c, narrowed);
 }
 /* Decode one bit coded with probability 128 (out of 256). */
 static int vp9_rac_bit(struct vp9_rac *c)
 {
 	const uint8_t half = 128;
 	return vp9_rac_get_prob(c, half);
 }
 /* Decode `bits` bits, most significant first, and return them as an
 * integer. */
 static int vp9_rac_uint(struct vp9_rac *c, int bits)
 {
 	int acc = 0;
 	for (; bits != 0; bits--)
 		acc = (acc << 1) | vp9_rac_bit(c);
 	return acc;
 }
 /* inv_map_table: VP9 differential probability update table.
 * Verbatim copy from FFmpeg v4l2_request_vp9.c:44-64.
 * read_prob_delta() indexes it with the decoded delta index; the table
 * has 255 entries, so valid indices are 0..254. */
 static const uint8_t inv_map_table[255] = {
 	  7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
 	189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
 	 10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
 	 25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
 	 40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
 	 55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
 	 70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
 	 86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
 	101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
 	116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
 	131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
 	146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 	161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 	177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
 	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
 	207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
 	222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
 	237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
 	252, 253, 253,
 };
 /*
 * Decode one probability-update value. A prefix of up to three bits
 * selects how many literal bits follow and the offset added to them:
 * 4 bits (+0), 4 bits (+16), 5 bits (+32), or 7 bits (+64, with one
 * extra low bit when the 7-bit value is 65 or more). The resulting
 * index is translated through inv_map_table.
 */
 static int read_prob_delta(struct vp9_rac *c)
 {
 	int idx;
 	if (!vp9_rac_bit(c))
 		return inv_map_table[vp9_rac_uint(c, 4)];
 	if (!vp9_rac_bit(c))
 		return inv_map_table[vp9_rac_uint(c, 4) + 16];
 	if (!vp9_rac_bit(c))
 		return inv_map_table[vp9_rac_uint(c, 5) + 32];
 	idx = vp9_rac_uint(c, 7);
 	if (idx >= 65)
 		idx = (idx << 1) - 65 + vp9_rac_bit(c);
 	return inv_map_table[idx + 64];
 }
 /*
 * Clause 6: minimal big-endian bit reader over the uncompressed header
 * for the fields VAAPI doesn't expose: lf_delta_enabled / lf_delta_update /
 * lf_ref_deltas / lf_mode_deltas / base_q_idx / delta_q_y_dc / delta_q_uv_dc /
 * delta_q_uv_ac.
 *
 * Walks: frame_marker(2) + profile(2 or 3) + show_existing_frame(1) +
 *        frame_type(1) + show_frame(1) + error_resilient(1) +
 *        if keyframe: sync_code(24) + color_config + frame_size + render_size
 *        else: intra_only(1 if !show_frame) + reset(2) +
 *              if intra_only: sync_code(24) + (if profile>0: color_config) +
 *                             refresh_flags(8) + frame_size + render_size
 *              else: refresh_flags(8) + 3*(ref_idx(3)+sign_bias(1)) +
 *                    frame_size_with_refs + allow_hpmv(1) + interp_filter(2 or 3)
 *        loop_filter_params + quantization_params
 *
 * Only profile-0 paths are exercised for BBB; non-profile-0 fields read
 * their bits but do not write them back. Keep targeted, not general.
 */
 /* Cursor over a byte buffer that is consumed bit by bit, MSB first. */
 struct uh_reader {
 	const uint8_t *buf;
 	size_t size;
 	size_t bit_pos;
 };
 /*
 * Read n bits, most significant first, and advance the cursor.
 * When the buffer runs out part-way through, the bits consumed so far
 * stay consumed, the partial value is discarded and 0 is returned.
 */
 static unsigned uh_read_bits(struct uh_reader *r, int n)
 {
 	unsigned acc = 0;
 	while (n-- > 0) {
 		const size_t pos = r->bit_pos;
 		const size_t idx = pos >> 3;
 		const int shift = 7 - (pos & 7);
 		if (idx >= r->size)
 			return 0;
 		acc = (acc << 1) | ((r->buf[idx] >> shift) & 1);
 		r->bit_pos = pos + 1;
 	}
 	return acc;
 }
 /* Phase 7 fix: VP9 spec s(N) is N magnitude bits + 1 sign bit (total N+1).
 * Previous uh_read_signed_6 read 4+1=5 bits instead of 6+1=7; bit drift of
 * 2 bits per ref_delta accumulated across the lf_delta updates and shifted
 * base_q_idx by 8 bits, producing 0x41 (frame 1 keyframe) instead of 0x2e.
 * Phase 3 anchor cross-check confirmed the corrected 7-bit read places
 * base_q_idx at bit 111 with value 0x2e=46. */
 /* Read an n-bit magnitude followed by a single sign bit (n + 1 bits in
 * total); a set sign bit negates the magnitude. */
 static int uh_read_sbits(struct uh_reader *r, int n)
 {
 	const int magnitude = (int)uh_read_bits(r, n);
 	if (uh_read_bits(r, 1) != 0)
 		return -magnitude;
 	return magnitude;
 }
 /* read_delta_q(): one delta_coded flag; when set, a 4-bit magnitude
 * plus sign bit follows, otherwise the delta is 0. */
 static int uh_read_delta_q(struct uh_reader *r)
 {
 	return uh_read_bits(r, 1) ? uh_read_sbits(r, 4) : 0;
 }
 /*
 * Skip color_config(). color_space 7 (CS_RGB) carries neither a
 * color_range bit nor subsampling bits; profiles 1 and 3 always end
 * with one reserved bit.
 */
 static void uh_skip_color_config(struct uh_reader *r, int profile)
 {
 	unsigned color_space;
 	if (profile >= 2)
 		uh_read_bits(r, 1);          /* ten_or_twelve_bit */
 	color_space = uh_read_bits(r, 3);
 	if (color_space != 7)
 		uh_read_bits(r, 1);          /* color_range */
 	if (profile == 1 || profile == 3) {
 		if (color_space != 7)
 			uh_read_bits(r, 2);  /* subsampling_x, subsampling_y */
 		uh_read_bits(r, 1);          /* reserved_zero */
 	}
 }
 /*
 * Skip an optional explicit frame size (two 16-bit fields) followed by
 * render_size(): one flag bit and, when set, two more 16-bit fields.
 */
 static void uh_skip_frame_and_render_size(struct uh_reader *r,
 					  bool explicit_size)
 {
 	if (explicit_size) {
 		uh_read_bits(r, 16);         /* frame_width_minus_1 */
 		uh_read_bits(r, 16);         /* frame_height_minus_1 */
 	}
 	if (uh_read_bits(r, 1)) {            /* render_and_frame_size_different */
 		uh_read_bits(r, 16);         /* render_width_minus_1 */
 		uh_read_bits(r, 16);         /* render_height_minus_1 */
 	}
 }
 /*
 * Walk the VP9 uncompressed header far enough to extract the loop-filter
 * delta and quantizer fields.
 *
 *   data/size                  - start and length of the frame data.
 *   frame                      - receives lf.flags (DELTA_ENABLED /
 *                                DELTA_UPDATE bits are OR-ed in) and the
 *                                four quant fields.
 *   persistent_ref_deltas[4],
 *   persistent_mode_deltas[2]  - caller-owned state; only entries whose
 *                                per-entry update bit is set are
 *                                overwritten.
 *   out_keyframe_or_intraonly  - set to keyframe || intra_only as parsed;
 *                                false for show_existing_frame.
 *   out_lf_delta_updated       - set when the header carries an LF delta
 *                                update.
 *
 * For a show_existing_frame header nothing beyond the two out flags is
 * written. Reads past the end of data yield zero bits (see
 * uh_read_bits), so a truncated header produces zeroed fields rather
 * than an out-of-bounds access.
 */
 static void vp9_parse_uncompressed_header_lf_quant(
 	const uint8_t *data, uint32_t size,
 	struct v4l2_ctrl_vp9_frame *frame,
 	int8_t persistent_ref_deltas[4],
 	int8_t persistent_mode_deltas[2],
 	bool *out_keyframe_or_intraonly,
 	bool *out_lf_delta_updated)
 {
 	struct uh_reader r = { .buf = data, .size = size, .bit_pos = 0 };
 	bool keyframe, intra_only = false, show_frame, error_resilient;
 	int profile;
 	int i;
 	*out_lf_delta_updated = false;
 	*out_keyframe_or_intraonly = false;
 	uh_read_bits(&r, 2);                 /* frame_marker */
 	{
 		int p_lo = uh_read_bits(&r, 1);
 		int p_hi = uh_read_bits(&r, 1);
 		profile = p_lo + (p_hi << 1);
 		if (profile == 3)
 			uh_read_bits(&r, 1); /* reserved_zero */
 	}
 	if (uh_read_bits(&r, 1))             /* show_existing_frame */
 		return;                      /* no LF/quant in the bitstream */
 	keyframe = !uh_read_bits(&r, 1);     /* frame_type: 0 == key frame */
 	show_frame = uh_read_bits(&r, 1);
 	error_resilient = uh_read_bits(&r, 1);
 	if (keyframe) {
 		uh_read_bits(&r, 24);        /* sync_code */
 		uh_skip_color_config(&r, profile);
 		uh_skip_frame_and_render_size(&r, true);
 	} else {
 		intra_only = show_frame ? false : uh_read_bits(&r, 1);
 		if (!error_resilient)
 			uh_read_bits(&r, 2); /* reset_frame_context */
 		if (intra_only) {
 			uh_read_bits(&r, 24); /* sync_code */
 			if (profile > 0)
 				uh_skip_color_config(&r, profile);
 			uh_read_bits(&r, 8); /* refresh_frame_flags */
 			uh_skip_frame_and_render_size(&r, true);
 		} else {
 			bool found = false;
 			uh_read_bits(&r, 8); /* refresh_frame_flags */
 			for (i = 0; i < 3; i++) {
 				uh_read_bits(&r, 3); /* ref_frame_idx */
 				uh_read_bits(&r, 1); /* ref_frame_sign_bias */
 			}
 			/* frame_size_with_refs: found_ref bits stop at the
 			 * first one that is set; reading all three would
 			 * shift every later field. An explicit size follows
 			 * only when no reference was picked. */
 			for (i = 0; i < 3; i++) {
 				if (uh_read_bits(&r, 1)) {
 					found = true;
 					break;
 				}
 			}
 			uh_skip_frame_and_render_size(&r, !found);
 			uh_read_bits(&r, 1); /* allow_hpmv */
 			if (!uh_read_bits(&r, 1)) /* is_filter_switchable */
 				uh_read_bits(&r, 2); /* interp_filter literal */
 		}
 	}
 	*out_keyframe_or_intraonly = keyframe || intra_only;
 	/* refresh_frame_context and frame_parallel_decoding_mode are only
 	 * coded when error_resilient_mode is off; frame_context_idx is
 	 * always coded. */
 	if (!error_resilient) {
 		uh_read_bits(&r, 1);         /* refresh_frame_context */
 		uh_read_bits(&r, 1);         /* frame_parallel_decoding_mode */
 	}
 	uh_read_bits(&r, 2);                 /* frame_context_idx */
 	/* loop_filter_params */
 	uh_read_bits(&r, 6);                 /* filter_level (already in VAAPI) */
 	uh_read_bits(&r, 3);                 /* sharpness (already in VAAPI) */
 	if (uh_read_bits(&r, 1)) {           /* lf_delta.enabled */
 		frame->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED;
 		if (uh_read_bits(&r, 1)) {   /* lf_delta.updated */
 			frame->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE;
 			*out_lf_delta_updated = true;
 			for (i = 0; i < 4; i++) {
 				if (uh_read_bits(&r, 1))
 					persistent_ref_deltas[i] =
 						(int8_t)uh_read_sbits(&r, 6);
 			}
 			for (i = 0; i < 2; i++) {
 				if (uh_read_bits(&r, 1))
 					persistent_mode_deltas[i] =
 						(int8_t)uh_read_sbits(&r, 6);
 			}
 		}
 	}
 	/* quantization_params */
 	frame->quant.base_q_idx     = (uint8_t)uh_read_bits(&r, 8);
 	frame->quant.delta_q_y_dc   = (int8_t)uh_read_delta_q(&r);
 	frame->quant.delta_q_uv_dc  = (int8_t)uh_read_delta_q(&r);
 	frame->quant.delta_q_uv_ac  = (int8_t)uh_read_delta_q(&r);
 }
 /*
 * Clause 9: compressed-header parser — port of FFmpeg
 * v4l2_request_vp9.c:99-261::fill_compressed_hdr.
 *
 * Phase 5 C3: out_reference_mode threaded via out-param. Callers
 * derive `allowcompinter` from VAAPI sign-bias bits and pass it.
 */
 #define V4L2_VP9_TX_MODE_ONLY_4X4_LOCAL    0
 #define V4L2_VP9_TX_MODE_ALLOW_32X32_LOCAL 3
 #define V4L2_VP9_TX_MODE_SELECT_LOCAL      4
 static void vp9_fill_compressed_hdr(
 	struct v4l2_ctrl_vp9_compressed_hdr *ctrl,
 	const uint8_t *buffer, uint32_t size,
 	uint8_t lossless_flag,
 	bool keyframe_or_intraonly,
 	bool allowcompinter,
 	bool highprecision_mvs,
 	int interp_filter_switchable,
 	uint8_t *out_reference_mode)
 {
 	struct vp9_rac c;
 	int comppredmode = 0;
 	int i, j, k, l, m, n;
 	*out_reference_mode = 0;
 	if (vp9_rac_init(&c, buffer, size) < 0)
 		return;
 	if (vp9_rac_get_branchy(&c, 128))    /* marker bit */
 		return;
 	if (lossless_flag) {
 		ctrl->tx_mode = V4L2_VP9_TX_MODE_ONLY_4X4_LOCAL;
 	} else {
 		ctrl->tx_mode = (uint8_t)vp9_rac_uint(&c, 2);
 		if (ctrl->tx_mode == V4L2_VP9_TX_MODE_ALLOW_32X32_LOCAL)
 			ctrl->tx_mode = (uint8_t)(ctrl->tx_mode + vp9_rac_bit(&c));
 		if (ctrl->tx_mode == V4L2_VP9_TX_MODE_SELECT_LOCAL) {
 			for (i = 0; i < 2; i++)
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->tx8[i][0] = (uint8_t)read_prob_delta(&c);
 			for (i = 0; i < 2; i++)
 				for (j = 0; j < 2; j++)
 					if (vp9_rac_get_branchy(&c, 252))
 						ctrl->tx16[i][j] = (uint8_t)read_prob_delta(&c);
 			for (i = 0; i < 2; i++)
 				for (j = 0; j < 3; j++)
 					if (vp9_rac_get_branchy(&c, 252))
 						ctrl->tx32[i][j] = (uint8_t)read_prob_delta(&c);
 		}
 	}
 	for (i = 0; i < 4; i++) {
 		if (vp9_rac_bit(&c)) {
 			for (j = 0; j < 2; j++)
 				for (k = 0; k < 2; k++)
 					for (l = 0; l < 6; l++)
 						for (m = 0; m < 6; m++) {
 							if (m >= 3 && l == 0)
 								break;
 							for (n = 0; n < 3; n++)
 								if (vp9_rac_get_branchy(&c, 252))
 									ctrl->coef[i][j][k][l][m][n] =
 										(uint8_t)read_prob_delta(&c);
 						}
 		}
 		if (ctrl->tx_mode == i)
 			break;
 	}
 	for (i = 0; i < 3; i++)
 		if (vp9_rac_get_branchy(&c, 252))
 			ctrl->skip[i] = (uint8_t)read_prob_delta(&c);
 	if (!keyframe_or_intraonly) {
 		for (i = 0; i < 7; i++)
 			for (j = 0; j < 3; j++)
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->inter_mode[i][j] = (uint8_t)read_prob_delta(&c);
 		if (interp_filter_switchable)
 			for (i = 0; i < 4; i++)
 				for (j = 0; j < 2; j++)
 					if (vp9_rac_get_branchy(&c, 252))
 						ctrl->interp_filter[i][j] =
 							(uint8_t)read_prob_delta(&c);
 		for (i = 0; i < 4; i++)
 			if (vp9_rac_get_branchy(&c, 252))
 				ctrl->is_inter[i] = (uint8_t)read_prob_delta(&c);
 		if (allowcompinter) {
 			comppredmode = vp9_rac_bit(&c);
 			if (comppredmode)
 				comppredmode += vp9_rac_bit(&c);
 			if (comppredmode == 2)   /* PRED_SWITCHABLE */
 				for (i = 0; i < 5; i++)
 					if (vp9_rac_get_branchy(&c, 252))
 						ctrl->comp_mode[i] = (uint8_t)read_prob_delta(&c);
 		} else {
 			comppredmode = 0;        /* PRED_SINGLEREF */
 		}
 		if (comppredmode != 1) {        /* != PRED_COMPREF */
 			for (i = 0; i < 5; i++) {
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->single_ref[i][0] = (uint8_t)read_prob_delta(&c);
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->single_ref[i][1] = (uint8_t)read_prob_delta(&c);
 			}
 		}
 		if (comppredmode != 0) {        /* != PRED_SINGLEREF */
 			for (i = 0; i < 5; i++)
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->comp_ref[i] = (uint8_t)read_prob_delta(&c);
 		}
 		for (i = 0; i < 4; i++)
 			for (j = 0; j < 9; j++)
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->y_mode[i][j] = (uint8_t)read_prob_delta(&c);
 		for (i = 0; i < 4; i++)
 			for (j = 0; j < 4; j++)
 				for (k = 0; k < 3; k++)
 					if (vp9_rac_get_branchy(&c, 252))
 						ctrl->partition[(i * 4) + j][k] =
 							(uint8_t)read_prob_delta(&c);
 		for (i = 0; i < 3; i++)
 			if (vp9_rac_get_branchy(&c, 252))
 				ctrl->mv.joint[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 		for (i = 0; i < 2; i++) {
 			if (vp9_rac_get_branchy(&c, 252))
 				ctrl->mv.sign[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 			for (j = 0; j < 10; j++)
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->mv.classes[i][j] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 			if (vp9_rac_get_branchy(&c, 252))
 				ctrl->mv.class0_bit[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 			for (j = 0; j < 10; j++)
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->mv.bits[i][j] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 		}
 		for (i = 0; i < 2; i++) {
 			for (j = 0; j < 2; j++)
 				for (k = 0; k < 3; k++)
 					if (vp9_rac_get_branchy(&c, 252))
 						ctrl->mv.class0_fr[i][j][k] =
 							(uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 			for (j = 0; j < 3; j++)
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->mv.fr[i][j] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 		}
 		if (highprecision_mvs) {
 			for (i = 0; i < 2; i++) {
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->mv.class0_hp[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 				if (vp9_rac_get_branchy(&c, 252))
 					ctrl->mv.hp[i] = (uint8_t)((vp9_rac_uint(&c, 7) << 1) | 1);
 			}
 		}
 	}
 	*out_reference_mode = (uint8_t)comppredmode;
 }
 /*
 * Clause 1+2+4+5+7+10+11+12: orchestrate VP9 control submission.
 * 2 batched controls per frame: VP9_FRAME + VP9_COMPRESSED_HDR.
 */
 int vp9_set_controls(struct request_data *driver_data,
 		     struct object_context *context,
 		     struct object_surface *surface_object)
 {
 	VADecPictureParameterBufferVP9 *picture =
 		&surface_object->params.vp9.picture;
 	VASliceParameterBufferVP9 *slice =
 		&surface_object->params.vp9.slice;
 	struct v4l2_ctrl_vp9_frame frame;
 	struct v4l2_ctrl_vp9_compressed_hdr compressed_hdr;
 	struct v4l2_ext_control ctrls[2];
 	int rc, i;
 	bool keyframe = !picture->pic_fields.bits.frame_type;
 	bool intra_only = picture->pic_fields.bits.intra_only;
 	bool error_resilient = picture->pic_fields.bits.error_resilient_mode;
 	bool allowcompinter;
 	bool keyframe_or_intraonly_parsed = false;
 	bool lf_delta_updated = false;
 	uint8_t parsed_reference_mode = 0;
 	memset(&frame, 0, sizeof frame);
 	memset(&compressed_hdr, 0, sizeof compressed_hdr);
 	/* Clause 4: frame geometry + per-frame scalars */
 	frame.frame_width_minus_1   = (uint16_t)(picture->frame_width - 1);
 	frame.frame_height_minus_1  = (uint16_t)(picture->frame_height - 1);
 	frame.render_width_minus_1  = frame.frame_width_minus_1;
 	frame.render_height_minus_1 = frame.frame_height_minus_1;
 	frame.profile           = picture->profile;
 	frame.bit_depth         = picture->bit_depth;
 	frame.tile_cols_log2    = picture->log2_tile_columns;
 	frame.tile_rows_log2    = picture->log2_tile_rows;
 	frame.frame_context_idx = picture->pic_fields.bits.frame_context_idx;
 	frame.lf.level     = picture->filter_level;
 	frame.lf.sharpness = picture->sharpness_level;
 	frame.uncompressed_header_size = picture->frame_header_length_in_bytes;
 	frame.compressed_header_size   = picture->first_partition_size;
 	/* Clause 5: DPB timestamp resolution */
 	{
 		VASurfaceID last_id = picture->reference_frames[picture->pic_fields.bits.last_ref_frame];
 		VASurfaceID gold_id = picture->reference_frames[picture->pic_fields.bits.golden_ref_frame];
 		VASurfaceID alt_id  = picture->reference_frames[picture->pic_fields.bits.alt_ref_frame];
 		struct object_surface *last_ref =
 			(last_id != VA_INVALID_SURFACE) ? SURFACE(driver_data, last_id) : NULL;
 		struct object_surface *gold_ref =
 			(gold_id != VA_INVALID_SURFACE) ? SURFACE(driver_data, gold_id) : NULL;
 		struct object_surface *alt_ref =
 			(alt_id  != VA_INVALID_SURFACE) ? SURFACE(driver_data, alt_id)  : NULL;
 		if (last_ref) frame.last_frame_ts   = v4l2_timeval_to_ns(&last_ref->timestamp);
 		if (gold_ref) frame.golden_frame_ts = v4l2_timeval_to_ns(&gold_ref->timestamp);
 		if (alt_ref)  frame.alt_frame_ts    = v4l2_timeval_to_ns(&alt_ref->timestamp);
 	}
 	if (picture->pic_fields.bits.last_ref_frame_sign_bias)
 		frame.ref_frame_sign_bias |= V4L2_VP9_SIGN_BIAS_LAST;
 	if (picture->pic_fields.bits.golden_ref_frame_sign_bias)
 		frame.ref_frame_sign_bias |= V4L2_VP9_SIGN_BIAS_GOLDEN;
 	if (picture->pic_fields.bits.alt_ref_frame_sign_bias)
 		frame.ref_frame_sign_bias |= V4L2_VP9_SIGN_BIAS_ALT;
 	allowcompinter = !(
 		picture->pic_fields.bits.last_ref_frame_sign_bias ==
 		    picture->pic_fields.bits.golden_ref_frame_sign_bias &&
 		picture->pic_fields.bits.golden_ref_frame_sign_bias ==
 		    picture->pic_fields.bits.alt_ref_frame_sign_bias);
 	/* Clause 6: persistent LF delta state — Phase 5 C2 */
 	if (!context->vp9_lf.initialized || keyframe || intra_only || error_resilient) {
 		context->vp9_lf.ref_deltas[0] = 1;
 		context->vp9_lf.ref_deltas[1] = 0;
 		context->vp9_lf.ref_deltas[2] = -1;
 		context->vp9_lf.ref_deltas[3] = -1;
 		context->vp9_lf.mode_deltas[0] = 0;
 		context->vp9_lf.mode_deltas[1] = 0;
 		context->vp9_lf.initialized = true;
 	}
 	vp9_parse_uncompressed_header_lf_quant(
 		surface_object->source_data,
 		surface_object->source_size,
 		&frame,
 		context->vp9_lf.ref_deltas,
 		context->vp9_lf.mode_deltas,
 		&keyframe_or_intraonly_parsed,
 		&lf_delta_updated);
 	(void)lf_delta_updated;
 	for (i = 0; i < 4; i++)
 		frame.lf.ref_deltas[i] = context->vp9_lf.ref_deltas[i];
 	for (i = 0; i < 2; i++)
 		frame.lf.mode_deltas[i] = context->vp9_lf.mode_deltas[i];
 	/* Clause 7: segmentation mapping */
 	for (i = 0; i < 7; i++)
 		frame.seg.tree_probs[i] = picture->mb_segment_tree_probs[i];
 	for (i = 0; i < 3; i++)
 		frame.seg.pred_probs[i] = picture->segment_pred_probs[i];
 	if (picture->pic_fields.bits.segmentation_enabled)
 		frame.seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ENABLED;
 	if (picture->pic_fields.bits.segmentation_update_map)
 		frame.seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP;
 	if (picture->pic_fields.bits.segmentation_temporal_update)
 		frame.seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE;
 	for (i = 0; i < 8; i++) {
 		if (slice->seg_param[i].segment_flags.fields.segment_reference_enabled) {
 			frame.seg.feature_enabled[i] |= 1 << V4L2_VP9_SEG_LVL_REF_FRAME;
 			frame.seg.feature_data[i][V4L2_VP9_SEG_LVL_REF_FRAME] =
 				(int16_t)slice->seg_param[i].segment_flags.fields.segment_reference;
 		}
 		if (slice->seg_param[i].segment_flags.fields.segment_reference_skipped)
 			frame.seg.feature_enabled[i] |= 1 << V4L2_VP9_SEG_LVL_SKIP;
 	}
 	/* Clause 10: frame flags + reference_mode + interpolation_filter */
 	if (keyframe)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_KEY_FRAME;
 	if (picture->pic_fields.bits.show_frame)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_SHOW_FRAME;
 	if (error_resilient)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT;
 	if (intra_only)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_INTRA_ONLY;
 	if (picture->pic_fields.bits.allow_high_precision_mv)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV;
 	if (picture->pic_fields.bits.refresh_frame_context)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX;
 	if (picture->pic_fields.bits.frame_parallel_decoding_mode)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE;
 	if (picture->pic_fields.bits.subsampling_x)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING;
 	if (picture->pic_fields.bits.subsampling_y)
 		frame.flags |= V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING;
 	/* Phase 5 C1: NO XOR. VAAPI's mcomp_filter_type is already post-XOR. */
 	frame.interpolation_filter = picture->pic_fields.bits.mcomp_filter_type;
 	/* reset_frame_context: FFmpeg's (resetctx > 0 ? resetctx - 1 : 0) */
 	frame.reset_frame_context =
 		picture->pic_fields.bits.reset_frame_context > 0
 		? (uint8_t)(picture->pic_fields.bits.reset_frame_context - 1)
 		: 0;
 	/* Clause 9: compressed-header parser fills both compressed_hdr and
 	 * out_reference_mode. allowcompinter derived from sign biases above. */
 	{
 		int interp_switchable = (frame.interpolation_filter == V4L2_VP9_INTERP_FILTER_SWITCHABLE);
 		vp9_fill_compressed_hdr(
 			&compressed_hdr,
 			surface_object->source_data + frame.uncompressed_header_size,
 			frame.compressed_header_size,
 			picture->pic_fields.bits.lossless_flag,
 			keyframe || intra_only,
 			allowcompinter,
 			picture->pic_fields.bits.allow_high_precision_mv,
 			interp_switchable,
 			&parsed_reference_mode);
 	}
 	frame.reference_mode = parsed_reference_mode;
 	/* Clause 11: 2-control batched submission */
 	memset(ctrls, 0, sizeof ctrls);
 	ctrls[0].id   = V4L2_CID_STATELESS_VP9_FRAME;
 	ctrls[0].ptr  = &frame;
 	ctrls[0].size = sizeof frame;
 	ctrls[1].id   = V4L2_CID_STATELESS_VP9_COMPRESSED_HDR;
 	ctrls[1].ptr  = &compressed_hdr;
 	ctrls[1].size = sizeof compressed_hdr;
 	rc = v4l2_set_controls(driver_data->video_fd,
 			       surface_object->request_fd,
 			       ctrls, 2);
 	if (rc < 0)
 		return VA_STATUS_ERROR_OPERATION_FAILED;
 	return VA_STATUS_SUCCESS;
 }
@@ -1,38 +0,0 @@
 /*
 * Copyright (C) 2026 Markus Fritsche <fritsche.markus@gmail.com>
 *
 * fresnel-fourier iter4 Phase 6 commit B: VP9 codec dispatcher header.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef _VP9_H_
 #define _VP9_H_
 #include "context.h"
 #include "request.h"
 #include "surface.h"
 /*
 * Set the VP9 controls for `surface` within `context`, using the driver
 * state in `driver_data`.
 *
 * NOTE(review): only the prototype is visible in this header. The int
 * return is presumably a VAStatus code (VA_STATUS_SUCCESS on success) --
 * confirm against the definition in vp9.c.
 */
 int vp9_set_controls(struct request_data *driver_data,
 		     struct object_context *context,
 		     struct object_surface *surface);
 #endif /* _VP9_H_ */
@@ -1,167 +0,0 @@
 /*
 * cap_pool_probe_pattern.c — synthetic regression test for the
 * iter5 sonnet C4 / iter6 candidate A "cap_pool resolution-change race."
 *
 * Exercises the surface-allocation pattern that originally tripped
 * REQBUFS-EBUSY on the iter5-end driver: vaCreateSurfaces at one
 * resolution, then vaDestroySurfaces, then vaCreateSurfaces at a
 * different resolution. iter6's REINIT discipline + cap_pool's
 * REQBUFS(0)-on-CAPTURE-and-OUTPUT during S_FMT-on-resolution-change
 * (CreateSurfaces2 in surface.c) closes this race; this test anchors
 * that fact with a deterministic repro.
 *
 * Build:
 *   gcc -O2 -Wall -Wextra -o cap_pool_probe_pattern \
 *       cap_pool_probe_pattern.c \
 *       $(pkg-config --cflags --libs libva libva-drm)
 *
 * Run:
 *   LIBVA_DRIVER_NAME=v4l2_request \
 *   LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
 *   LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
 *     ./cap_pool_probe_pattern
 *
 * Pass criterion (on iter6 driver and later):
 *   - Exit code 0
 *   - No "REQBUFS" / "EBUSY" / "Unable to request buffers" /
 *     "Unable to set format" lines on the v4l2-request driver's stderr
 *   - vainfo or visual inspection confirms the test program reached
 *     the "PASS" line on stdout
 *
 * Fail behavior pre-iter5: vaCreateSurfaces at the second resolution
 * would emit REQBUFS-EBUSY because OUTPUT/CAPTURE buffers from the
 * first allocation hadn't been torn down before S_FMT was attempted
 * on the new resolution. iter5's CreateSurfaces2 added the dual
 * REQBUFS(0) drain; iter6's REINIT keeps the OUTPUT pool's request_fd
 * lifecycle clean across the destroy-recreate cycle.
 */
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <va/va.h>
 #include <va/va_drm.h>
 #define DRM_RENDER_NODE "/dev/dri/renderD128"
 /* Translate a VAStatus code into libva's human-readable error text. */
 static const char *va_status_str(VAStatus s)
 {
 	const char *description = vaErrorStr(s);
 	return description;
 }
 /*
 * Evaluate a libva call. On any status other than VA_STATUS_SUCCESS, report
 * the failing step on stderr and jump to the enclosing function's `fail`
 * label, so that everything acquired so far is released before exiting.
 * The enclosing function must therefore provide a `fail:` label.
 */
 #define VA_OK_OR_FAIL(call, msg) do { \
 	VAStatus _vs = (call); \
 	if (_vs != VA_STATUS_SUCCESS) { \
 		fprintf(stderr, "FAIL: %s: %s (0x%x)\n", \
 			(msg), va_status_str(_vs), (unsigned int)_vs); \
 		goto fail; \
 	} \
 } while (0)
 /*
 * Allocate surfaces at a small resolution, destroy them, then allocate at a
 * much larger resolution, and tear everything down again.
 *
 * Returns 0 on success, 1 if the render node cannot be opened, 2 if no VA
 * display can be obtained, and 10 if any libva call fails. On the libva
 * failure path every resource still held (surfaces, config, display, DRM
 * fd) is released best-effort before returning.
 */
 int main(void)
 {
 	const unsigned int small_w = 128, small_h = 128;
 	const unsigned int big_w = 1920, big_h = 1080;
 	VASurfaceID small_surfaces[4];
 	VASurfaceID big_surfaces[4];
 	VAConfigID config = VA_INVALID_ID;
 	VADisplay dpy;
 	int va_major = 0, va_minor = 0;
 	int drm_fd;
 	/* Ownership flags: set once a resource exists, cleared just before
 	 * its release is attempted, so the fail path never releases twice. */
 	int va_ready = 0, have_config = 0, have_small = 0, have_big = 0;
 	/* Open render node + libva display. */
 	drm_fd = open(DRM_RENDER_NODE, O_RDWR);
 	if (drm_fd < 0) {
 		fprintf(stderr, "FAIL: open(%s): %s\n",
 			DRM_RENDER_NODE, strerror(errno));
 		return 1;
 	}
 	dpy = vaGetDisplayDRM(drm_fd);
 	if (dpy == NULL) {
 		fprintf(stderr, "FAIL: vaGetDisplayDRM returned NULL\n");
 		close(drm_fd);
 		return 2;
 	}
 	VA_OK_OR_FAIL(vaInitialize(dpy, &va_major, &va_minor),
 		      "vaInitialize");
 	va_ready = 1;
 	printf("libva %d.%d initialized via %s\n", va_major, va_minor,
 	       DRM_RENDER_NODE);
 	/*
 	 * vaCreateConfig with H.264 Main + VLD entrypoint forces our
 	 * driver's RequestCreateConfig to set up the H.264 decode path,
 	 * which is the path that reaches CreateSurfaces2 (and the
 	 * resolution-change handling there).
 	 */
 	VA_OK_OR_FAIL(vaCreateConfig(dpy, VAProfileH264Main, VAEntrypointVLD,
 				     NULL, 0, &config),
 		      "vaCreateConfig(H264Main, VLD)");
 	have_config = 1;
 	/* Phase 1: allocate small probe-pattern surfaces.
 	 *
 	 * iter5 sonnet C4 specified the race as vaCreateSurfaces(small)
 	 * then vaCreateSurfaces(big), allocation-only — matching mpv's
 	 * libplacebo probe pattern that surfaced the original failure.
 	 * No context creation needed for the C4 race; the cap_pool's
 	 * resolution-change handling lives in CreateSurfaces2 itself
 	 * (REQBUFS(0)+S_FMT pair on the OUTPUT queue, cap_pool_destroy
 	 * + cap_pool_init on the CAPTURE queue).
 	 *
 	 * (vaCreateContext + recreate at a new resolution surfaced an
 	 * additional STREAMON-on-recreate failure during iter7 Phase 7
 	 * verification. That's iter8 candidate; out of scope for the C4
 	 * regression test.)
 	 */
 	printf("Phase 1: vaCreateSurfaces %ux%u, count=4\n", small_w, small_h);
 	VA_OK_OR_FAIL(vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420,
 				       small_w, small_h, small_surfaces, 4,
 				       NULL, 0),
 		      "vaCreateSurfaces (small)");
 	have_small = 1;
 	/* Phase 2: dispose small surfaces. Our driver's CreateSurfaces2
 	 * keeps the cap_pool initialized at the small resolution; the
 	 * pool will be torn down + rebuilt by Phase 3's resolution-change
 	 * branch in CreateSurfaces2.
 	 */
 	printf("Phase 2: vaDestroySurfaces (small)\n");
 	have_small = 0;
 	VA_OK_OR_FAIL(vaDestroySurfaces(dpy, small_surfaces, 4),
 		      "vaDestroySurfaces (small)");
 	/* Phase 3: allocate at the new (much larger) resolution. This is
 	 * the C4 race-hitting path: pre-iter5 hit REQBUFS-EBUSY because
 	 * CAPTURE/OUTPUT buffers from the small allocation hadn't been
 	 * torn down before S_FMT on the new size. iter5's CreateSurfaces2
 	 * added the dual REQBUFS(0) drain; iter7 also adds OUTPUT pool
 	 * teardown for the case where a context-bound resolution change
 	 * leaves the request_pool stale (defensive — not exercised in
 	 * this no-context test path).
 	 */
 	printf("Phase 3: vaCreateSurfaces %ux%u, count=4 (resolution change)\n",
 	       big_w, big_h);
 	VA_OK_OR_FAIL(vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420,
 				       big_w, big_h, big_surfaces, 4,
 				       NULL, 0),
 		      "vaCreateSurfaces (big)");
 	have_big = 1;
 	/* Phase 4: clean up. Each flag is cleared before the call so that a
 	 * failing teardown step is reported once and not retried below. */
 	printf("Phase 4: cleanup\n");
 	have_big = 0;
 	VA_OK_OR_FAIL(vaDestroySurfaces(dpy, big_surfaces, 4),
 		      "vaDestroySurfaces (big)");
 	have_config = 0;
 	VA_OK_OR_FAIL(vaDestroyConfig(dpy, config),
 		      "vaDestroyConfig");
 	va_ready = 0;
 	VA_OK_OR_FAIL(vaTerminate(dpy),
 		      "vaTerminate");
 	close(drm_fd);
 	printf("PASS: cap_pool probe-pattern resolution-change handled cleanly.\n");
 	printf("Inspect driver stderr for absence of REQBUFS/EBUSY/Unable lines.\n");
 	return 0;
 fail:
 	/* Best-effort release in reverse acquisition order; the test has
 	 * already failed, so secondary errors are deliberately ignored. */
 	if (have_big)
 		(void)vaDestroySurfaces(dpy, big_surfaces, 4);
 	if (have_small)
 		(void)vaDestroySurfaces(dpy, small_surfaces, 4);
 	if (have_config)
 		(void)vaDestroyConfig(dpy, config);
 	if (va_ready)
 		(void)vaTerminate(dpy);
 	close(drm_fd);
 	return 10;
 }
@@ -1,53 +0,0 @@
 #!/bin/bash
 # run_cap_pool_probe.sh — orchestrate the cap_pool probe-pattern regression test.
 #
 # Runs the cap_pool_probe_pattern test program with the v4l2_request driver
 # and grep-checks driver stderr for race indicators. Exits 0 on PASS, 1 on FAIL.
 #
 # Usage: ./run_cap_pool_probe.sh [path_to_test_binary]
 # If no argument, looks for ./cap_pool_probe_pattern in the same directory.
 set -eu
 BIN="${1:-$(dirname "$0")/cap_pool_probe_pattern}"
 if [[ ! -x "$BIN" ]]; then
 	echo "FAIL: test binary not found or not executable: $BIN" >&2
 	echo "Build it first:" >&2
 	echo "  gcc -O2 -Wall -Wextra -o $BIN $(dirname "$0")/cap_pool_probe_pattern.c \\" >&2
 	echo "      \$(pkg-config --cflags --libs libva libva-drm)" >&2
 	exit 2
 fi
 LOG=$(mktemp -t cap_pool_probe.XXXXXX.log)
 trap 'rm -f "$LOG"' EXIT
 # Capture the exit status via `|| rc=$?`: with `set -e` active, a bare
 # failing command would abort the script right here, before the captured
 # output is printed and before the documented "exit 1 on FAIL" is produced.
 rc=0
 env LIBVA_DRIVER_NAME=v4l2_request \
    LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
    LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
    "$BIN" >"$LOG" 2>&1 || rc=$?
 echo "--- test program output ---"
 cat "$LOG"
 echo "--- end output ---"
 if [[ "$rc" -ne 0 ]]; then
 	echo "FAIL: test binary exited with rc=$rc" >&2
 	exit 1
 fi
 # Race indicators on driver-prefixed lines only (avoids matching the
 # test program's own informational output). Driver log lines start with
 # "v4l2-request:".
 race_lines=$(grep -E '^v4l2-request:' "$LOG" \
 	| grep -iE 'REQBUFS|EBUSY|Unable to request buffers|Unable to set format' \
 	|| true)
 if [[ -n "$race_lines" ]]; then
 	echo "FAIL: driver stderr contains race indicators:" >&2
 	echo "$race_lines" >&2
 	exit 1
 fi
 echo "PASS: cap_pool probe-pattern test clean (no race indicators)."
 exit 0
@@ -1,139 +0,0 @@
 #!/bin/bash
 # run_msync_pixel_verify.sh — verify decoded pixel correctness post-msync-removal.
 #
 # iter5 sweep commit d3a299b removed msync(MS_SYNC|MS_INVALIDATE) from the
 # CAPTURE buffer DQBUF path alongside the iter1 patch-0010 hex-dump diagnostic.
 # iter5 Phase 5 sonnet caveat C3 flagged: no formal pixel-correctness check
 # was done. This script is that check.
 #
 # Approach:
 #   1. SW reference: ffmpeg libavcodec H.264 decode of bbb_1080p30_h264.mp4,
 #      first 100 frames, NV12 raw output -> sw_ref.yuv.
 #   2. HW subject: same input through our v4l2_request driver via
 #      ffmpeg -hwaccel vaapi -hwaccel_output_format vaapi
 #             -i ... -vf hwdownload,format=nv12 -f rawvideo -pix_fmt nv12
 #      Captures the post-DQBUF buffer through libva readback, exercising
 #      the same code path we removed msync from.
 #   3. Compare: byte-for-byte cmp + per-frame sha256.
 #
 # Pass: byte-for-byte identical (or per-frame sha matches) -> msync
 # verifiably unnecessary on this hardware/kernel; iter5 sonnet C3 closes.
 # Fail: divergence; restore msync in surface.c, re-run, document outcome.
 #
 # Usage: ./run_msync_pixel_verify.sh [fixture_path]
 # If no argument, defaults to /home/mfritsche/fourier-test/bbb_1080p30_h264.mp4
 set -eu
 FIXTURE="${1:-/home/mfritsche/fourier-test/bbb_1080p30_h264.mp4}"
 N_FRAMES=100
 WORKDIR=$(mktemp -d -t msync_verify.XXXXXX)
 trap 'rm -rf "$WORKDIR"' EXIT
 if [[ ! -f "$FIXTURE" ]]; then
 	echo "FAIL: fixture not found: $FIXTURE" >&2
 	exit 2
 fi
 # Probe fixture dimensions for crop alignment of the HW path.
 # Hantro pads height to MB boundaries (16-line align); FFmpeg SW decode
 # returns crop-aligned (visible) frame size. Without explicit cropping
 # on the HW side, hwdownload + format=nv12 emits MB-padded frames, which
 # would diverge in size from SW even if pixels are correct.
 #
 # `read` returns non-zero when ffprobe emits nothing; `|| true` keeps
 # `set -e` from aborting silently so the explicit diagnostic below is
 # actually reachable.
 FIXTURE_W=""
 FIXTURE_H=""
 read -r FIXTURE_W FIXTURE_H < <(ffprobe -v error -select_streams v:0 \
 	-show_entries stream=width,height -of csv=p=0 "$FIXTURE" \
 	| tr ',' ' ') || true
 # Both values feed shell arithmetic and the crop filter below, so insist
 # on plain decimal integers rather than merely non-empty strings.
 if ! [[ "$FIXTURE_W" =~ ^[0-9]+$ && "$FIXTURE_H" =~ ^[0-9]+$ ]]; then
 	echo "FAIL: ffprobe could not read width/height from $FIXTURE" >&2
 	exit 2
 fi
 echo "Fixture: $FIXTURE ($FIXTURE_W x $FIXTURE_H)"
 echo "Frames:  $N_FRAMES"
 echo "Workdir: $WORKDIR"
 echo
 # 1. SW reference
 echo "[1/3] FFmpeg SW decode -> sw_ref.yuv"
 ffmpeg -hide_banner -loglevel error -y \
 	-i "$FIXTURE" \
 	-frames:v "$N_FRAMES" \
 	-f rawvideo -pix_fmt nv12 \
 	"$WORKDIR/sw_ref.yuv"
 SW_BYTES=$(stat -c %s "$WORKDIR/sw_ref.yuv")
 SW_SHA=$(sha256sum "$WORKDIR/sw_ref.yuv" | cut -d' ' -f1)
 echo "    sw_ref.yuv: $SW_BYTES bytes, sha256=$SW_SHA"
 # 2. HW subject via libva v4l2_request
 # Explicit crop=$FIXTURE_W:$FIXTURE_H after hwdownload normalizes any
 # MB-padding the HW driver applies (hantro pads height to multiples of
 # 16). Without this crop, an iter6+ correct decode could falsely
 # diverge in total byte count from the SW reference.
 echo "[2/3] FFmpeg HW decode via v4l2_request driver -> hw_capture.yuv"
 env LIBVA_DRIVER_NAME=v4l2_request \
    LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
    LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
    ffmpeg -hide_banner -loglevel error -y \
 	-hwaccel vaapi -hwaccel_output_format vaapi \
 	-i "$FIXTURE" \
 	-vf "hwdownload,format=nv12,crop=$FIXTURE_W:$FIXTURE_H:0:0" \
 	-frames:v "$N_FRAMES" \
 	-f rawvideo -pix_fmt nv12 \
 	"$WORKDIR/hw_capture.yuv"
 HW_BYTES=$(stat -c %s "$WORKDIR/hw_capture.yuv")
 HW_SHA=$(sha256sum "$WORKDIR/hw_capture.yuv" | cut -d' ' -f1)
 echo "    hw_capture.yuv: $HW_BYTES bytes, sha256=$HW_SHA"
 echo
 # 3. Compare
 echo "[3/3] Compare"
 if [[ "$SW_BYTES" -ne "$HW_BYTES" ]]; then
 	# Diagnose stride/padding artifacts before declaring pixel
 	# corruption. With explicit crop in step 2 this should not
 	# happen, but if a future kernel change shifts the alignment
 	# we want a clear diagnostic, not a false pixel-corruption
 	# accusation.
 	EXPECTED_SW=$(( FIXTURE_W * FIXTURE_H * 3 / 2 * N_FRAMES ))
 	for PAD in 16 32; do
 		PADDED_H=$(( (FIXTURE_H + PAD - 1) / PAD * PAD ))
 		EXPECTED_PADDED=$(( FIXTURE_W * PADDED_H * 3 / 2 * N_FRAMES ))
 		if [[ "$HW_BYTES" -eq "$EXPECTED_PADDED" ]]; then
 			echo "DIAGNOSTIC: HW size $HW_BYTES matches MB-padded layout" >&2
 			echo "  ($FIXTURE_W x $PADDED_H, $PAD-line align). The crop=$FIXTURE_W:$FIXTURE_H" >&2
 			echo "  filter step did not normalize. Check FFmpeg version / hwdownload behavior." >&2
 			echo "  This is a stride artifact, not pixel corruption." >&2
 			exit 3
 		fi
 	done
 	echo "FAIL: size mismatch (SW=$SW_BYTES vs HW=$HW_BYTES, expected $EXPECTED_SW)" >&2
 	echo "      Different frame count or NV12 packing — investigate." >&2
 	exit 1
 fi
 if [[ "$SW_SHA" == "$HW_SHA" ]]; then
 	echo "PASS: byte-for-byte identical."
 	echo "      msync removal verified safe on this hardware/kernel."
 	exit 0
 fi
 # Per-frame divergence analysis on full-buffer mismatch.
 echo "Buffer-level sha differs. Computing per-frame divergence..."
 FRAME_SIZE=$(( SW_BYTES / N_FRAMES ))
 DIVERGENT=0
 for ((i = 0; i < N_FRAMES; i++)); do
 	# dd with bs=<frame size> and skip=<frame index> extracts frame i.
 	SW_FRAME_SHA=$(dd if="$WORKDIR/sw_ref.yuv" bs="$FRAME_SIZE" \
 		count=1 skip="$i" 2>/dev/null | sha256sum | cut -d' ' -f1)
 	HW_FRAME_SHA=$(dd if="$WORKDIR/hw_capture.yuv" bs="$FRAME_SIZE" \
 		count=1 skip="$i" 2>/dev/null | sha256sum | cut -d' ' -f1)
 	if [[ "$SW_FRAME_SHA" != "$HW_FRAME_SHA" ]]; then
 		DIVERGENT=$(( DIVERGENT + 1 ))
 		# Only the first five divergent frames are listed in detail.
 		if [[ "$DIVERGENT" -le 5 ]]; then
 			echo "    frame $i: SW=$SW_FRAME_SHA HW=$HW_FRAME_SHA"
 		fi
 	fi
 done
 echo "FAIL: $DIVERGENT / $N_FRAMES frames diverge from SW reference."
 echo "      Action: restore msync(MS_SYNC|MS_INVALIDATE) in surface.c"
 echo "      RequestSyncSurface DQBUF path; re-run this script."
 exit 1
@@ -1,299 +0,0 @@
 #!/bin/bash
 # run_perf_binding_cell.sh — iter8 perf binding cell.
 #
 # Anchors campaign-wide claims with measured numbers. Runs four consumer
 # configurations for $DURATION seconds each on $FIXTURE and emits a
 # markdown table comparing:
 #   1. mpv --hwdec=vaapi          (DMA-BUF zero-copy through libva)
 #   2. mpv --hwdec=vaapi-copy     (HW decode + VAImage readback)
 #   3. firefox (iter5-amend, sandbox enabled, file:// URL)
 #   4. mpv --hwdec=no             (SW decode baseline / control)
 #
 # For each consumer: CPU% (median + p90), GPU freq (median MHz), drops in
 # measurement window, p50 frame interval (ms), VmRSS delta (MiB).
 #
 # Usage:
 #   ./run_perf_binding_cell.sh [fixture_path]
 #
 # If no argument, defaults to /home/mfritsche/fourier-test/bbb_1080p30_h264.mp4
 # Override DURATION via env: DURATION=60 ./run_perf_binding_cell.sh
 #
 # Reproducibility: results depend on (a) the iter7-end driver being installed
 # at /usr/lib/dri/v4l2_request_drv_video.so, (b) ohm idle (no other compute
 # load), (c) fixture present at the expected path. Run on a stable thermal
 # state (after a few minutes of cool-down).
 set -eu
 # Tunables: fixture is positional, the rest may be overridden via env.
 FIXTURE="${1:-/home/mfritsche/fourier-test/bbb_1080p30_h264.mp4}"
 DURATION="${DURATION:-30}"
 WORKDIR="${WORKDIR:-$(mktemp -d -t perf_binding.XXXXXX)}"
 GPU_DEVFREQ_PATH="${GPU_DEVFREQ_PATH:-/sys/class/devfreq/fde60000.gpu/cur_freq}"
 # Session environment (Wayland/X11) for the operator's desktop; Firefox
 # needs it when launched under sudo. Existing values win over defaults.
 : "${XDG_RUNTIME_DIR:=/run/user/1001}"
 : "${WAYLAND_DISPLAY:=wayland-0}"
 : "${DISPLAY:=:0}"
 : "${XAUTHORITY:=/run/user/1001/xauth_pxiMur}"
 export XDG_RUNTIME_DIR WAYLAND_DISPLAY DISPLAY XAUTHORITY
 # Point libva at the v4l2_request driver and its device nodes.
 export LIBVA_DRIVER_NAME=v4l2_request
 export LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1
 export LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0
 [[ -f "$FIXTURE" ]] || {
 	echo "FAIL: fixture not found: $FIXTURE" >&2
 	exit 2
 }
 mkdir -p "$WORKDIR"
 # Banner describing this run.
 printf 'Fixture:   %s (%s bytes)\n' "$FIXTURE" "$(stat -c %s "$FIXTURE")"
 printf 'Duration:  %ss per configuration\n' "$DURATION"
 printf 'Workdir:   %s\n' "$WORKDIR"
 printf 'GPU freq:  %s\n' "$GPU_DEVFREQ_PATH"
 printf 'Driver sha: %s\n' "$(sha256sum /usr/lib/dri/v4l2_request_drv_video.so | cut -d' ' -f1)"
 printf '\n'
 # percentile_from_stream FILE PCT
 # Print the PCT-th percentile (PCT is an integer such as 50 or 90) of the
 # numbers in FILE, which holds one value per line with no header. Values may
 # be floats. An empty file yields "0". The rank is round(PCT/100 * count),
 # clamped to the range 1..count.
 percentile_from_stream() {
 	local samples="$1" rank_pct="$2"
 	awk -v pct="$rank_pct" '
 	{ val[NR] = $1 }
 	END {
 		if (NR == 0) {
 			print "0"
 			exit
 		}
 		# Ascending exchange sort; sample counts here are small.
 		for (lo = 1; lo <= NR; lo++) {
 			for (hi = lo + 1; hi <= NR; hi++) {
 				if (val[lo] > val[hi]) {
 					swap = val[lo]
 					val[lo] = val[hi]
 					val[hi] = swap
 				}
 			}
 		}
 		pos = int((pct / 100.0) * NR + 0.5)
 		if (pos < 1) pos = 1
 		if (pos > NR) pos = NR
 		print val[pos]
 	}' "$samples"
 }
 # poll_gpu_freq OUTFILE
 # Truncate OUTFILE, then append the contents of $GPU_DEVFREQ_PATH every
 # 0.1 s for as long as /proc/$BG_PARENT_PID exists. Meant to be run in the
 # background; the caller must set BG_PARENT_PID beforehand. Read errors and
 # an unreadable sysfs node are tolerated (that sample is simply skipped).
 poll_gpu_freq() {
 	local sink="$1"
 	: >"$sink"
 	until [[ ! -e "/proc/$BG_PARENT_PID" ]]; do
 		[[ ! -r "$GPU_DEVFREQ_PATH" ]] \
 			|| cat "$GPU_DEVFREQ_PATH" 2>/dev/null >>"$sink" \
 			|| true
 		sleep 0.1
 	done
 }
 # Run a single consumer configuration and append one row to
 # $WORKDIR/results.tsv. Args:
 #   $1 label (used for filename, no spaces)
 #   $2 launcher cmd (will be exec'd as mfritsche; should be a single line)
 #   $3 'mpv' or 'firefox' — affects how we find the PID to track
 # Returns 1 for an unknown kind; returns 0 without emitting a row when the
 # process to track cannot be located.
 run_consumer() {
 	local label="$1"
 	local launcher="$2"
 	local kind="$3"
 	local logdir="$WORKDIR/$label"
 	mkdir -p "$logdir"
 	echo "=== Running: $label ==="
 	# Kill any running firefox/mpv first to clean state.
 	pkill -f firefox 2>/dev/null || true
 	pkill -x mpv     2>/dev/null || true
 	sleep 1
 	# VmRSS at start (we'll read again at end) — captured per-PID after launch.
 	# Launch consumer in background, capture stdout+stderr to a log.
 	(
 		eval "$launcher" >"$logdir/consumer.log" 2>&1
 	) &
 	local launcher_pid=$!
 	# Wait briefly for the process tree to spawn the actual decode worker.
 	sleep 4
 	local target_pid
 	case "$kind" in
 		mpv)
 			target_pid=$(pgrep -x mpv | head -1)
 			;;
 		firefox)
 			# Firefox's RDD process holds /dev/video1; that's the one with
 			# the libva decoder context. Wait an extra few seconds for it
 			# to spawn and bind the device.
 			sleep 6
 			target_pid=$(pgrep -af 'contentproc.*\brdd\b' | awk '{print $1}' | head -1)
 			if [[ -z "${target_pid:-}" ]]; then
 				# Fallback: find whichever firefox process holds /dev/video1.
 				target_pid=$(sudo lsof -t /dev/video1 2>/dev/null | head -1 || true)
 			fi
 			;;
 		*)
 			echo "  bad kind: $kind" >&2
 			return 1
 			;;
 	esac
 	if [[ -z "${target_pid:-}" ]]; then
 		echo "  WARN: could not locate $kind process; skipping pidstat" >&2
 		# Let the consumer run for the duration anyway so the log gets data.
 		sleep "$DURATION"
 		kill -TERM "$launcher_pid" 2>/dev/null || true
 		pkill -f firefox 2>/dev/null || true
 		pkill -x mpv     2>/dev/null || true
 		return 0
 	fi
 	echo "  Tracking PID $target_pid"
 	# VmRSS at start.
 	local rss_start
 	rss_start=$(awk '/^VmRSS:/{print $2}' "/proc/$target_pid/status" 2>/dev/null || echo 0)
 	echo "  VmRSS start: ${rss_start} kB"
 	# Poll GPU freq in background (keyed off launcher_pid).
 	BG_PARENT_PID=$launcher_pid
 	poll_gpu_freq "$logdir/gpu_freq.log" &
 	local poll_pid=$!
 	# Run pidstat for $DURATION seconds. LC_ALL=C pins the report format:
 	# 24-hour timestamps (one field), '.' as decimal separator and the
 	# untranslated "Average:" label, all of which the parser below relies on.
 	LC_ALL=C pidstat -u -p "$target_pid" 1 "$DURATION" >"$logdir/pidstat.log" 2>&1 || true
 	# VmRSS at end (before killing).
 	local rss_end
 	rss_end=$(awk '/^VmRSS:/{print $2}' "/proc/$target_pid/status" 2>/dev/null || echo "$rss_start")
 	# Stop everything; reap the poller so it cannot outlive this call.
 	kill "$poll_pid" 2>/dev/null || true
 	wait "$poll_pid" 2>/dev/null || true
 	kill -TERM "$launcher_pid" 2>/dev/null || true
 	pkill -f firefox 2>/dev/null || true
 	pkill -x mpv     2>/dev/null || true
 	sleep 1
 	# Parse pidstat by header: locate the %CPU column index from the
 	# column-name row (where any field equals "%CPU"), then apply it
 	# to data rows. Robust across sysstat 12.x point releases.
 	# pidstat default output has no '#' header marker — the header is
 	# the first row containing "%CPU" as a field.
 	awk '
 		# Trailing summary: with a one-field timestamp the "Average:"
 		# row lines up with the data columns and carries a numeric
 		# %CPU, so it must be dropped explicitly or it would be
 		# counted as an extra sample.
 		$1 == "Average:" { next }
 		# Header row: any line where some field equals "%CPU".
 		!col {
 			for (i = 1; i <= NF; i++) if ($i == "%CPU") { col = i; next }
 		}
 		# Data row: lines whose value at $col is numeric.
 		col && NF >= col && $col ~ /^[0-9]+(\.[0-9]+)?$/ {
 			print $col
 		}
 	' "$logdir/pidstat.log" >"$logdir/cpu_pct.log" || true
 	local cpu_p50 cpu_p90
 	if [[ -s "$logdir/cpu_pct.log" ]]; then
 		cpu_p50=$(percentile_from_stream "$logdir/cpu_pct.log" 50)
 		cpu_p90=$(percentile_from_stream "$logdir/cpu_pct.log" 90)
 	else
 		cpu_p50="ERR"
 		cpu_p90="ERR"
 	fi
 	# GPU freq median. Values are Hz; convert to MHz via temp file (avoids
 	# unreliable /dev/stdin in a nested subshell-over-pipe).
 	local gpu_med_mhz
 	if [[ -s "$logdir/gpu_freq.log" ]]; then
 		awk '{print $1/1000000}' "$logdir/gpu_freq.log" >"$logdir/gpu_freq_mhz.log"
 		gpu_med_mhz=$(percentile_from_stream "$logdir/gpu_freq_mhz.log" 50)
 	else
 		gpu_med_mhz="—"
 	fi
 	# RSS delta MiB.
 	local rss_delta_mib
 	rss_delta_mib=$(awk -v s="$rss_start" -v e="$rss_end" 'BEGIN{printf "%.1f", (e-s)/1024.0}')
 	# Drops + p50 frame interval — only available for mpv.
 	local drops="—"
 	local p50_frame_ms="—"
 	if [[ "$kind" == "mpv" ]]; then
 		# Takes the last "frame-drop-count...=N" occurrence in the log;
 		# falls back to 0 when the log contains no such text.
 		drops=$(grep -oE 'frame-drop-count[^\t ]*\s*=\s*[0-9]+' "$logdir/consumer.log" \
 			| awk -F= '{print $2}' | tr -d ' ' | tail -1)
 		drops="${drops:-0}"
 		# p50 frame interval from mpv vsync-jitter or frame timing — leave
 		# as "—" unless mpv emitted detailed timing.
 	fi
 	# Emit row. printf (rather than a tab-stripped here-document) keeps
 	# the TSV layout independent of how this function is indented.
 	printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
 		"$label" "$cpu_p50" "$cpu_p90" "$drops" \
 		"$p50_frame_ms" "$gpu_med_mhz" "$rss_delta_mib" \
 		>>"$WORKDIR/results.tsv"
 	echo "  CPU% p50=$cpu_p50  p90=$cpu_p90  drops=$drops  gpu_med=$gpu_med_mhz MHz  rss_delta=$rss_delta_mib MiB"
 	echo
 }
 # Header for results (tab-separated; truncates any previous results file).
 printf 'consumer\tcpu_p50\tcpu_p90\tdrops_%ss\tp50_frame_ms\tgpu_med_mhz\trss_delta_mib\n' \
 	"$DURATION" >"$WORKDIR/results.tsv"
 # === Configurations ===
 # The mpv status message is "frame-drop-count=<n>" rather than the bare
 # property value: run_consumer extracts the drop count by grepping the
 # consumer log for "frame-drop-count...=<digits>", which a bare number
 # can never match (every run would then be reported as 0 drops).
 # 1. mpv DMA-BUF zero-copy
 run_consumer "mpv-vaapi-dmabuf" \
 	"sudo -u mfritsche env LIBVA_DRIVER_NAME=v4l2_request \
 		LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
 		LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
 		mpv --no-config --hwdec=vaapi --vo=null --no-audio \
 		    --term-status-msg='frame-drop-count=\${frame-drop-count}' \
 		    --length=$DURATION '$FIXTURE'" \
 	mpv
 # 2. mpv vaapi-copy
 run_consumer "mpv-vaapi-copy" \
 	"sudo -u mfritsche env LIBVA_DRIVER_NAME=v4l2_request \
 		LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
 		LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
 		mpv --no-config --hwdec=vaapi-copy --vo=null --no-audio \
 		    --term-status-msg='frame-drop-count=\${frame-drop-count}' \
 		    --length=$DURATION '$FIXTURE'" \
 	mpv
 # 3. Firefox-fourier (iter5-amend, sandbox enabled)
 run_consumer "firefox-fourier-hw" \
 	"sudo -u mfritsche env XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \
 		WAYLAND_DISPLAY=$WAYLAND_DISPLAY DISPLAY=$DISPLAY \
 		XAUTHORITY=$XAUTHORITY \
 		LIBVA_DRIVER_NAME=v4l2_request \
 		LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video1 \
 		LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media0 \
 		firefox --new-window 'file://$FIXTURE'" \
 	firefox
 # 4. SW baseline
 run_consumer "mpv-sw-baseline" \
 	"sudo -u mfritsche mpv --no-config --hwdec=no --vo=null --no-audio \
 		--term-status-msg='frame-drop-count=\${frame-drop-count}' \
 		--length=$DURATION '$FIXTURE'" \
 	mpv
 # === Generate markdown table ===
 # Writes run metadata followed by one table row per results.tsv data line
 # (the TSV header line is skipped), then prints the report to stdout.
 {
 	printf '%s\n\n' "# Performance binding cell — iter8 (libva-multiplanar campaign)"
 	printf 'Run date: %s\n' "$(date -Iseconds)"
 	printf 'Host: %s (%s)\n' "$(uname -n)" "$(uname -m)"
 	printf 'Kernel: %s\n' "$(uname -r)"
 	printf 'Driver sha256: `%s`\n' "$(sha256sum /usr/lib/dri/v4l2_request_drv_video.so | cut -d' ' -f1)"
 	printf 'Fixture: `%s` (%s bytes)\n' "$FIXTURE" "$(stat -c %s "$FIXTURE")"
 	printf 'Duration per consumer: %ss\n\n' "$DURATION"
 	printf '%s\n' "| Consumer | CPU% p50 | CPU% p90 | Drops in window | p50 frame ms | GPU MHz median | VmRSS Δ MiB |"
 	printf '%s\n' "|---|---|---|---|---|---|---|"
 	awk 'BEGIN { FS = "\t" }
 	NR > 1 {
 		printf "| %s | %s | %s | %s | %s | %s | %s |\n",
 			$1, $2, $3, $4, $5, $6, $7
 	}' "$WORKDIR/results.tsv"
 } >"$WORKDIR/perf_binding_cell.md"
 printf '%s\n' "=== Done ==="
 printf 'Results: %s\n' "$WORKDIR/perf_binding_cell.md"
 printf 'Per-consumer logs: %s\n' "$WORKDIR/{mpv-vaapi-dmabuf,mpv-vaapi-copy,firefox-fourier-hw,mpv-sw-baseline}/"
 printf '\n'
 cat "$WORKDIR/perf_binding_cell.md"
Author	SHA1	Message	Date
test0r	e8c3937435	STUDY.md: replace with pointer to libva-multiplanar campaign Phase 0 The Phase 0 / Phase 2 substrate that lived here has been transformed into ../phase0_findings.md as the campaign-level Phase 0 document. This file is reduced to a pointer + a git-show recipe to recover the prior content from commit `e0acc33`.	2026-05-04 08:08:32 +00:00
test0r	e0acc33455	STUDY.md: phase 2 finding — libva surface stack works; Brave wall is chromeos pipeline mpv --hwdec=vaapi successfully probes our driver end-to-end: RequestQueryImageFormats, QueryConfigEntrypoints, CreateConfig, QuerySurfaceAttributes, CreateSurfaces2, DeriveImage, CreateImage, CreateBuffer, ExportSurfaceHandle all run clean across all seven enumerated profiles. mpv then falls back to SW for actual decode (drops match the SW baseline) because our decode-submission path isn't there yet — but the libva entry-point surface is largely done. Brave's "failed Initialize()ing the frame pool" turns out to be in chromium's chromeos pipeline (PickDecoderOutputFormat → ImageProcessor init in media/gpu/chromeos/video_decoder_pipeline.cc), not in our driver. No more libva calls happen between our successful CreateContext and the failure; chromium bails on the chromeos-specific V4L2 ImageProcessor it expects on real ChromeOS but doesn't find on a plain Linux Wayland system. Fix is on the Chromium build side, not here. Remaining real work in this library: decode submission path (Begin/ Render/EndPicture → V4L2 stateless queue/dequeue with controls attached), and proper STREAMON ordering on hantro. STUDY.md now documents both. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 22:41:54 +00:00
test0r	283df22748	WIP: log surfaces_count + flags + SUCCESS marker in CreateContext	2026-04-25 22:34:22 +00:00
test0r	4671f64479	WIP: ENTER traces in image.c + buffer.c entry points	2026-04-25 22:30:44 +00:00
test0r	07fd527114	WIP: more entry-point tracing (CreateConfig, GetConfigAttributes, QuerySurfaceAttributes, QueryConfigEntrypoints) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 22:25:36 +00:00
test0r	2737ff921d	WIP: instrument surface.c (CreateSurfaces2, ExportSurfaceHandle) for tracing Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 22:21:44 +00:00
test0r	f2c3a4c32f	STUDY.md: checkpoint after first-day port work Update with current state: library builds clean, vainfo enumerates profiles, vaCreateContext succeeds on Brave (with STREAMON deferred as WIP unblocker), next failure is frame pool initialization in vaCreateSurfaces2. Documents the 12-step diff stack vs bootlin upstream and what still needs to happen to actually decode a frame. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 22:10:11 +00:00
test0r	44a73271ae	WIP: defer STREAMON in CreateContext (probe how far vaCreateContext gets) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 22:08:28 +00:00
test0r	0a3432ad64	Eager CAPTURE format probe in RequestInit Chromium's vaapi_video_decoder may call vaCreateContext with surfaces=NULL, surfaces_count=0 and then create surfaces afterwards via vaCreateSurfaces2. In that order driver_data->video_format is still NULL when CreateContext runs and our early `video_format == NULL` guard returns OPERATION_FAILED. Confirmed via temporary request_log() — Brave hits exactly that path on ohm. Move the probe out of RequestCreateSurfaces into a new video_format_probe() helper in video.c, and call it eagerly from RequestInit. RequestCreateSurfaces still re-probes if init came up NULL, which preserves the original lazy behaviour for any caller that needs it. Also small clean-up of surface.c since the probe block moved out: drop the now-dead `bool found` local. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 22:03:38 +00:00
test0r	ac674b84ec	src/utils.c: tee request_log() to /tmp/libva-fourier.log Sandboxed processes (Chromium GPU process etc.) redirect stderr, so request_log() output never reaches our test harness. Add a lazy-open append to /tmp/libva-fourier.log so we can capture diagnostic logging regardless of process sandboxing. Will revert before final. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:59:57 +00:00
test0r	1458622729	src/context.c: temporary diagnostic logging in CreateContext Added request_log() calls at every failure path inside RequestCreateContext to identify exactly which guard fires when Brave's vaCreateContext fails on ohm. Will revert these before final. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:57:33 +00:00
test0r	4d14ffb801	src/tiled_yuv.S: stub tiled_to_planar on aarch64 The whole body of tiled_yuv.S is wrapped in #ifndef __aarch64__ — the ARMv7 NEON Thumb code doesn't assemble on aarch64. On 64-bit ARM the .S object is therefore empty and tiled_to_planar comes out as an undefined symbol in the .so, which dlopen rejects at vainfo time: libva error: dlopen ... failed: undefined symbol: tiled_to_planar Runtime-wise, tiled_to_planar is only called when video_format_is_linear() returns false, which only happens for the sunxi-cedrus DRM_FORMAT_MOD_ALLWINNER_TILED NV12 entry — never on hantro/rkvdec. So the right fix is a no-op stub on aarch64 just to satisfy the linker. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:54:19 +00:00
test0r	13e9b64bcf	src/h264.c: drop num_slices field; kernel infers from queued controls decode->num_slices was the last field on v4l2_ctrl_h264_decode_params that didn't survive the upstream cleanup. The kernel now infers the slice count from how many slice_params controls were queued via the request API for the given OUTPUT buffer; no explicit count is needed. This was the only remaining build error; library should now compile clean against current linux-api-headers. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:52:17 +00:00
test0r	fc4bb1063f	src/h264.c: track upstream UAPI shape for H.264 controls The H.264 stateless control structs and CIDs were upstreamed and reshaped during the journey from libva-v4l2-request's bundled headers to the kernel's <linux/v4l2-controls.h>. Three breaking changes need matching code-side updates: 1. struct v4l2_ctrl_h264_slice_params lost its .size field. The full slice byte length is now communicated via the OUTPUT buffer's m.planes[0].bytesused at QBUF time, which v4l2_queue_buffer already sets. Drop the assignment. 2. struct v4l2_h264_reference replaces the bare integer DPB index in ref_pic_list0[] / ref_pic_list1[]. The new struct is { __u8 fields, __u8 index } where `fields` is V4L2_H264_TOP_FIELD_REF (0x1) / BOTTOM_FIELD_REF (0x2) / FRAME_REF (0x3). Hantro G1 only does frame-based H.264, so use V4L2_H264_FRAME_REF unconditionally. 3. The pred_weight_table block moved out of slice_params into its own V4L2_CID_STATELESS_H264_PRED_WEIGHTS control with struct v4l2_ctrl_h264_pred_weights. Add it as a separate v4l2_set_control() call alongside SPS/PPS/SLICE_PARAMS/DECODE_PARAMS/SCALING_MATRIX, and thread the pred_weights pointer through h264_va_slice_to_v4l2(). Also: switch the v4l2_set_control() CID arguments from V4L2_CID_MPEG_VIDEO_H264_* to V4L2_CID_STATELESS_H264_* directly. h264-ctrls.h still defines the old names as aliases so external callers keep working, but using the canonical names internally matches what the kernel actually expects and avoids confusion when reading h264.c side-by-side with the kernel source. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:51:43 +00:00
test0r	da9f2a55d2	include/h264-ctrls.h: passthrough to system header + CID aliases H.264 stateless controls were upstreamed to the kernel some time after this library went dormant. The struct names match exactly (the kernel adopted v4l2_ctrl_h264_{sps,pps,scaling_matrix,pred_weights,slice_params,decode_params} and v4l2_h264_{weight_factors,dpb_entry,reference} verbatim) so the duplicated copies in include/h264-ctrls.h trigger redefinition errors as soon as any source file pulls in <linux/videodev2.h> (which transitively pulls in <linux/v4l2-controls.h>). Same fix as hevc-ctrls.h in commit `4ccbfe9`: replace the bundled struct definitions with a passthrough to <linux/v4l2-controls.h>. The CID prefix changed during upstreaming from V4L2_CID_MPEG_VIDEO_H264_* to V4L2_CID_STATELESS_H264_*; provide compatibility aliases so h264.c keeps compiling. V4L2_PIX_FMT_H264_SLICE is already in <linux/videodev2.h> (same fourcc value, same name we renamed to in commit `c1f5108`) so no need to redefine. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:49:26 +00:00
test0r	53999cd154	The real multiplanar wedge: probe MPLANE in surface.c, add NV12 mplane This is what was actually broken — and it's much smaller than the STUDY guessed. The library is in fact multiplanar-aware throughout the v4l2.c helpers (v4l2_setup_format, v4l2_get_format, v4l2_query_buffer, v4l2_queue_buffer, v4l2_dequeue_buffer all branch on v4l2_type_is_mplane). The whole "context.c / picture.c are single-plane" hypothesis from STUDY.md was wrong — they derive output_type and capture_type from video_format->v4l2_mplane and pass them through. The bug is upstream of all that: driver_data->video_format is set in RequestCreateSurfaces only, and the probe there only tries V4L2_BUF_TYPE_VIDEO_CAPTURE (single-plane). On hantro the single-plane ENUM_FMT returns nothing, video_format stays NULL, and every subsequent operation hits the `if (video_format == NULL) return OPERATION_FAILED` guard at the top of context.c / buffer.c / image.c. That's exactly what Brave's vaCreateContext failure was — not a downstream multiplanar fault, just the format-detection short-circuit firing. Three small changes: - src/video.c: add an NV12 multi-plane entry next to the existing single-plane NV12 / Sunxi entries. Same pixelformat fourcc (S264 vs NV12 has nothing to do with mplane), distinguished by the v4l2_mplane bit. - src/video.h + src/video.c: video_format_find() takes a `bool mplane` parameter and matches both fields. Without this the single-plane and multi-plane NV12 entries collide on pixelformat. - src/surface.c: the probe block now tries V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE as a fallback after the single-plane probes return nothing. On a single-plane decoder the mplane probe is a no-op; on hantro it picks the new mplane NV12 entry, video_format gets set, and the rest of the library — already mplane-aware — does the right thing for OUTPUT S_FMT, REQBUFS, EXPBUF, QBUF, DQBUF. 
Also: src/context.c: the H264 case still referenced V4L2_PIX_FMT_H264_SLICE_RAW that we renamed in commit `c1f5108`. Caught now. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:47:30 +00:00
test0r	294bdc24f6	STUDY.md: port plan + reference implementations + test fixtures Cold-start dossier for the multiplanar port: goal, why-this-fork-exists, state-today, port plan (v4l2.c / context.c / picture.c), reference impls to read side-by-side (FFmpeg libavcodec/v4l2_request*, GStreamer gst-plugins-bad/sys/v4l2codecs, Chromium media/gpu/v4l2), test fixtures (ohm + bbb_1080p30_h264.mp4 + GStreamer ceiling at 6% CPU), out-of-scope (HEVC/VP9/AV1). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:30:54 +00:00
test0r	2f54a8d9e8	src/config.c: probe both single- and multi-plane V4L2 buffer types Bootlin's original probe assumed single-plane decoders (the sunxi-cedrus target the library was originally written for). Rockchip's hantro VPU on RK3566/RK3588 reports V4L2_CAP_VIDEO_M2M_MPLANE only — the single-plane VIDIOC_ENUM_FMT path returns nothing, so vainfo prints an empty "Supported profile and entrypoints" list and downstream apps fall back to software. Add an OUTPUT_MPLANE fallback to all three format probes (MPEG-2, H.264, HEVC). On a single-plane decoder the second probe is a no-op; on a multi-plane decoder it's what makes the profile list non-empty. This is the probe fix only — it gets vainfo to enumerate profiles and gets Brave's GPU process to attempt vaCreateContext. The rest of the multiplanar port (S_FMT / REQBUFS / EXPBUF / QBUF in context.c, picture.c, v4l2.c) is the next phase; vaCreateContext currently fails because those paths are still single-plane only. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:27:46 +00:00
test0r	4ccbfe923f	Strip HEVC build path Three changes that together make the build compile cleanly against a current linux-api-headers (>= 5.x post-HEVC-UAPI-rename): - src/meson.build: comment h265.c + h265.h out of sources/headers. - include/hevc-ctrls.h: replace bundled HEVC structs with a single #include <linux/v4l2-controls.h>. The bundled definitions were identical to what later landed in mainline as V4L2_CID_STATELESS_HEVC_* (renamed) and v4l2_ctrl_hevc_* (kept the field names but moved into the kernel public header). Keeping a duplicate copy now triggers redefinition errors. The header is kept as a passthrough rather than deleted so any downstream patch that says #include <hevc-ctrls.h> still compiles. - src/picture.c: drop the four HEVC case blocks. Three of them were in switches that already had `default: break`, so removing them is functionally a no-op. The fourth was the only external reference to h265_set_controls — removing it lets the library link cleanly with h265.c excluded. Why this is OK rather than the more ambitious "fix HEVC properly": RK3566 has no HW HEVC at all (the only decoder block is the Hantro G1 which speaks H.264 / MPEG-2 / VP8). HEVC can come back as a separate effort once we're on RK3588 silicon AND the library is updated to the renamed kernel CIDs. For Fourier's first port milestone (H.264 multi-plane on RK3566 hantro) HEVC is dead weight. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:26:28 +00:00
test0r	1b02c9b476	src/h264.c: include utils.h for request_log() prototype GCC's -Wimplicit-function-declaration is fatal in current toolchains (GCC 14+) but was a warning when this code was written. h264.c uses request_log() in two places without including utils.h; the build was quietly relying on an implicit declaration that recent compilers reject. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:16:35 +00:00
test0r	c1f5108ac3	include/h264-ctrls.h, src/config.c: rename to V4L2_PIX_FMT_H264_SLICE Kernel mainline renamed V4L2_PIX_FMT_H264_SLICE_RAW → V4L2_PIX_FMT_H264_SLICE some time after this library went dormant (the 'S264' fourcc value is unchanged; only the C identifier moved). The two definitions have the same value so this is purely cosmetic at runtime, but keeping the name aligned with linux/videodev2.h matches downstream patches (e.g. bootlin PR #38) and avoids confusion when reading kernel + library side-by-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-04-25 21:13:11 +00:00