Files
libva-v4l2-request-fourier/src/h264.c
T
claude-noether 02266841c6 iter8 Phase 6c α-2: pass H.264 POC values through unchanged for rkvdec
Bug 4 root cause per Phase 7 γ + Phase 4c strace re-decode:
libva strips FFmpeg's bit-16 POC sentinel; kdirect (ffmpeg-v4l2request)
does NOT strip. rkvdec writes top/bottom_field_order_cnt directly to
MMIO via writel_relaxed; with libva sending 0 instead of kdirect's
65536, hardware POC comparisons mismatch and motion compensation
silently corrupts (16x32 patch + nothing else).

The original h264_strip_ffmpeg_poc_sentinel was hantro-specific
(hantro_h264.c prepare_table fed unmasked tbl->poc[]). Hantro+H.264
is not exercised on RK3399; deferring per-driver gating to iter9 if
it surfaces.

Preserve VA_PICTURE_H264_INVALID → return 0 (correct zero-init for
empty DPB slots per Phase 5c amendment).

4 call sites unchanged (h264.c:309, 312, 462, 465 — for ref and current
frame TopFieldOrderCnt / BottomFieldOrderCnt). Both reference and
current-frame POCs now pass through unchanged so hardware compares
agree.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 12:57:51 +00:00

1005 lines
36 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Copyright (C) 2007 Intel Corporation
* Copyright (C) 2016 Florent Revest <florent.revest@free-electrons.com>
* Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
* Copyright (C) 2018 Bootlin
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <assert.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/videodev2.h>
#include "request.h"
#include "utils.h"
#include "surface.h"
#include "v4l2.h"
#include "h264_slice_header.h"
/*
 * Slice kinds obtained by reducing a slice_type value modulo 5.
 * Only the two inter-predicted kinds are named; they are the only ones
 * this file compares against.
 */
enum h264_slice_type {
	H264_SLICE_P = 0,	/* P slice */
	H264_SLICE_B = 1,	/* B slice */
};
/*
 * Tell whether a VA picture slot is empty, i.e. its picture_id is
 * VA_INVALID_SURFACE.
 */
static bool is_picture_null(VAPictureH264 *pic)
{
	const bool has_surface = (pic->picture_id != VA_INVALID_SURFACE);

	return !has_surface;
}
/*
 * Return the first DPB slot that is neither valid nor reserved, or NULL
 * when no such slot exists.
 */
static struct h264_dpb_entry *
dpb_find_invalid_entry(struct object_context *context)
{
	unsigned int slot;

	for (slot = 0; slot < H264_DPB_SIZE; slot++) {
		struct h264_dpb_entry *candidate = &context->dpb.entries[slot];

		if (candidate->valid || candidate->reserved)
			continue;

		return candidate;
	}

	return NULL;
}
/*
 * Return the DPB slot with the smallest age among those not marked as
 * used. On equal ages the lowest-indexed slot wins. Returns NULL when
 * every slot is used (or no unused slot has an age below UINT_MAX).
 */
static struct h264_dpb_entry *
dpb_find_oldest_unused_entry(struct object_context *context)
{
	struct h264_dpb_entry *oldest = NULL;
	unsigned int oldest_age = UINT_MAX;
	unsigned int slot;

	for (slot = 0; slot < H264_DPB_SIZE; slot++) {
		struct h264_dpb_entry *candidate = &context->dpb.entries[slot];

		if (candidate->used)
			continue;
		if (candidate->age >= oldest_age)
			continue;

		oldest_age = candidate->age;
		oldest = candidate;
	}

	return oldest;
}
/*
 * Pick a DPB slot to (re)use: a free (invalid, unreserved) slot when one
 * exists, otherwise the oldest slot not currently used as a reference.
 * May return NULL when neither search finds a slot.
 */
static struct h264_dpb_entry *dpb_find_entry(struct object_context *context)
{
	struct h264_dpb_entry *slot = dpb_find_invalid_entry(context);

	return slot ? slot : dpb_find_oldest_unused_entry(context);
}
/*
 * Find the valid DPB entry holding the same picture_id as @pic.
 *
 * @idx:    optional; receives the index of the matching slot.
 * @fields: optional; receives the reference field mask for the match.
 *
 * Returns the matching entry, or NULL when the picture is not in the DPB
 * (in which case @idx and @fields are left untouched).
 */
static struct h264_dpb_entry *dpb_lookup(struct object_context *context,
					 VAPictureH264 *pic, unsigned int *idx,
					 unsigned char *fields)
{
	unsigned int slot;

	for (slot = 0; slot < H264_DPB_SIZE; slot++) {
		struct h264_dpb_entry *candidate = &context->dpb.entries[slot];

		if (!candidate->valid ||
		    candidate->pic.picture_id != pic->picture_id)
			continue;

		if (idx)
			*idx = slot;

		/*
		 * Every match is reported as a full-frame reference.
		 * Choosing top/bottom field from a comparison of the
		 * entry's TopFieldOrderCnt and BottomFieldOrderCnt is
		 * currently disabled.
		 */
		if (fields)
			*fields = V4L2_H264_FRAME_REF;

		return candidate;
	}

	return NULL;
}
/*
 * Wipe a DPB entry back to all-zero, then optionally flag it as reserved
 * so that the free-slot search skips it.
 */
static void dpb_clear_entry(struct h264_dpb_entry *entry, bool reserved)
{
	memset(entry, 0, sizeof(*entry));

	if (!reserved)
		return;

	entry->reserved = true;
}
/*
 * Record @pic in the DPB.
 *
 * @entry: slot to store the picture in, or NULL to let dpb_find_entry()
 *         choose one.
 *
 * Nothing happens when @pic is an empty slot, when the picture is already
 * present in the DPB, or when no slot can be found for it. The stored
 * entry becomes valid and unreserved, takes the current DPB age, and is
 * marked as used unless the picture carries VA_PICTURE_H264_INVALID.
 */
static void dpb_insert(struct object_context *context, VAPictureH264 *pic,
		       struct h264_dpb_entry *entry)
{
	if (is_picture_null(pic))
		return;

	if (dpb_lookup(context, pic, NULL, NULL))
		return;

	if (!entry)
		entry = dpb_find_entry(context);

	/*
	 * dpb_find_entry() returns NULL when every slot is in use; drop the
	 * picture rather than writing through a NULL pointer.
	 */
	if (!entry)
		return;

	memcpy(&entry->pic, pic, sizeof(entry->pic));
	entry->age = context->dpb.age;
	entry->valid = true;
	entry->reserved = false;

	if (!(pic->flags & VA_PICTURE_H264_INVALID))
		entry->used = true;
}
/*
 * Start a new DPB generation from the picture parameters of the frame
 * about to be decoded.
 *
 * The DPB age counter is bumped, every entry's `used` flag is cleared,
 * and then each non-empty picture in parameters->ReferenceFrames[] is
 * either refreshed (age + used) when already known or inserted otherwise.
 */
static void dpb_update(struct object_context *context,
		       VAPictureParameterBufferH264 *parameters)
{
	const unsigned int max_refs = sizeof(parameters->ReferenceFrames) /
				      sizeof(parameters->ReferenceFrames[0]);
	unsigned int num_refs = parameters->num_ref_frames;
	unsigned int i;

	/*
	 * num_ref_frames comes from the client; never walk past the end of
	 * ReferenceFrames[] if it is larger than the array.
	 */
	if (num_refs > max_refs)
		num_refs = max_refs;

	context->dpb.age++;

	for (i = 0; i < H264_DPB_SIZE; i++)
		context->dpb.entries[i].used = false;

	for (i = 0; i < num_refs; i++) {
		VAPictureH264 *pic = &parameters->ReferenceFrames[i];
		struct h264_dpb_entry *entry;

		if (is_picture_null(pic))
			continue;

		entry = dpb_lookup(context, pic, NULL, NULL);
		if (entry) {
			entry->age = context->dpb.age;
			entry->used = true;
		} else {
			dpb_insert(context, pic, NULL);
		}
	}
}
/*
 * Map a VAAPI picture-order-count value to the one handed to V4L2.
 *
 * Despite its name this helper no longer strips anything: a POC is
 * passed through unchanged, and only pictures flagged
 * VA_PICTURE_H264_INVALID (empty DPB slots) are forced to 0.
 *
 * Background on the name: ffmpeg's H264POCContext initialises
 * prev_poc_msb to (1 << 16) and resets it to that value after an IDR, so
 * the POCs ffmpeg-vaapi places in VAPictureH264.TopFieldOrderCnt /
 * BottomFieldOrderCnt typically carry bit 16. An earlier revision
 * removed that sentinel for hantro, whose prepare_table feeds the value
 * into tbl->poc[] unmasked.
 *
 * Why it is a pass-through now (iter8 alpha-2): rkvdec writes the POC
 * to MMIO as a 32-bit value, and ffmpeg-v4l2request ("kdirect") delivers
 * the sentinel-encoded value directly and decodes correctly. Stripping
 * here caused the 16x32 partial-fill "Bug 4" symptom. Hantro + H.264 is
 * not exercised on RK3399; restoring a per-driver strip is deferred
 * until it is needed.
 *
 * @poc:   TopFieldOrderCnt or BottomFieldOrderCnt of a VAPictureH264.
 * @flags: flags of that same VAPictureH264.
 *
 * Returns 0 for invalid pictures, @poc otherwise.
 */
static inline int32_t h264_strip_ffmpeg_poc_sentinel(int32_t poc, uint32_t flags)
{
	return (flags & VA_PICTURE_H264_INVALID) ? 0 : poc;
}
/*
 * Fill decode->dpb[] from the driver-side DPB bookkeeping.
 *
 * Slot i of the V4L2 array mirrors slot i of context->dpb.entries[], so
 * the indices returned by dpb_lookup() stay meaningful. Slots that are
 * not both valid and used are left untouched.
 */
static void h264_fill_dpb(struct request_data *data,
			  struct object_context *context,
			  VAPictureParameterBufferH264 *VAPicture,
			  struct v4l2_ctrl_h264_decode_params *decode)
{
	const int cur_frame_num = (int)VAPicture->frame_num;
	const int max_frame_num =
		1 << (VAPicture->seq_fields.bits.log2_max_frame_num_minus4 + 4);
	int slot;

	for (slot = 0; slot < H264_DPB_SIZE; slot++) {
		struct h264_dpb_entry *src = &context->dpb.entries[slot];
		struct v4l2_h264_dpb_entry *dst = &decode->dpb[slot];
		struct object_surface *ref_surface =
			SURFACE(data, src->pic.picture_id);
		bool long_term;

		/*
		 * Only report entries the consumer still lists in
		 * ReferenceFrames[]: dpb_update() clears `used` everywhere
		 * and re-marks just those. Entries that are valid but
		 * unused are stale. Reporting them made dpb[] grow past
		 * SPS.max_num_ref_frames and the kernel rejected the
		 * request with EINVAL (the "frame-11 EINVAL" issue). This
		 * matches FFmpeg's v4l2_request_h264.c fill_dpb, which only
		 * walks the currently referenced short/long-term lists.
		 */
		if (!(src->valid && src->used))
			continue;

		if (ref_surface)
			dst->reference_ts =
				v4l2_timeval_to_ns(&ref_surface->timestamp);

		dst->frame_num = src->pic.frame_idx;

		/*
		 * dpb[].pic_num must be the spec's PicNum (8-28) for
		 * short-term references or LongTermPicNum (8-29) for
		 * long-term ones (ext-ctrls-codec-stateless.rst).
		 *
		 * For frame-coded pictures PicNum == FrameNumWrap, i.e.
		 * frame_num minus max_frame_num when frame_num is above
		 * the current frame_num (wraparound, spec 8.2.4.1).
		 *
		 * VAAPI stores long_term_frame_idx in frame_idx for
		 * long-term references and frame_num for short-term ones
		 * (libavcodec/vaapi_h264.c fill_vaapi_pic), so long-term
		 * entries copy frame_idx straight through.
		 *
		 * Earlier code used the VA surface id here, which is
		 * unrelated to PicNum; hantro ignores pic_num (it resolves
		 * references through reference_ts) but e.g. mediatek's
		 * decoder does consume it.
		 */
		long_term = (src->pic.flags &
			     VA_PICTURE_H264_LONG_TERM_REFERENCE) != 0;
		if (long_term) {
			dst->pic_num = src->pic.frame_idx;
		} else {
			int pic_num = (int)src->pic.frame_idx;

			if (pic_num > cur_frame_num)
				pic_num -= max_frame_num;
			dst->pic_num = pic_num;
		}

		dst->top_field_order_cnt = h264_strip_ffmpeg_poc_sentinel(
			src->pic.TopFieldOrderCnt, src->pic.flags);
		dst->bottom_field_order_cnt = h264_strip_ffmpeg_poc_sentinel(
			src->pic.BottomFieldOrderCnt, src->pic.flags);

		dst->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID;
		if (src->used)
			dst->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE;
		if (long_term)
			dst->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM;

		/*
		 * Flag the entry as a frame reference (top + bottom). The
		 * kernel's v4l2_h264_init_reflist_builder skips entries
		 * whose `fields` is zero, and the uAPI documentation
		 * requires V4L2_H264_FRAME_REF for frames. With `fields`
		 * left at zero, the first P slice that needs a reference
		 * list finds no valid references and S_EXT_CTRLS fails the
		 * whole request with EINVAL (error_idx == count). FFmpeg's
		 * fill_dpb_entry derives the same value from
		 * pic->reference.
		 */
		dst->fields = V4L2_H264_FRAME_REF;
	}
}
/*
 * Translate VAAPI picture parameters into the V4L2 stateless H.264
 * DECODE_PARAMS, PPS and SPS controls.
 *
 * @surface supplies the slice bitstream (source_data / slices_size) and
 * the most recent slice parameters; the first NAL header byte and the
 * slice_header() are parsed from it to recover values VAAPI does not
 * forward. @decode, @pps and @sps are only OR-ed / assigned into, so the
 * caller is expected to pass them zero-initialised.
 */
static void h264_va_picture_to_v4l2(struct request_data *driver_data,
				    struct object_context *context,
				    struct object_surface *surface,
				    VAPictureParameterBufferH264 *VAPicture,
				    struct v4l2_ctrl_h264_decode_params *decode,
				    struct v4l2_ctrl_h264_pps *pps,
				    struct v4l2_ctrl_h264_sps *sps)
{
	/* A 3-byte start code precedes the NAL header in Annex-B mode. */
	const size_t nal_header_offset = context->h264_start_code ? 3 : 0;
	const size_t slices_size = (size_t)surface->slices_size;
	unsigned char nal_ref_idc = 0;
	unsigned char nal_unit_type = 0;

	/*
	 * Only touch the bitstream when it is long enough to contain the
	 * NAL header byte. Otherwise the payload length computed below
	 * would wrap around as an unsigned value and the slice-header
	 * parser would be handed a bogus, enormous length.
	 */
	if (slices_size > nal_header_offset) {
		unsigned char *b =
			(unsigned char *)surface->source_data + nal_header_offset;

		/* Extract missing nal_ref_idc and nal_unit_type */
		nal_ref_idc = (b[0] >> 5) & 0x3;
		nal_unit_type = b[0] & 0x1f;

		/*
		 * Bit-parse the slice_header() to recover fields VAAPI
		 * doesn't forward and that hantro G1 hardware reads out of
		 * DECODE_PARAMS:
		 *
		 *  - dec_ref_pic_marking_bit_size -> G1_REG_DEC_CTRL5_REFPIC_MK_LEN
		 *  - idr_pic_id                   -> G1_REG_DEC_CTRL5_IDR_PIC_ID
		 *  - pic_order_cnt_bit_size       -> G1_REG_DEC_CTRL6_POC_LENGTH
		 *  - pic_order_cnt_lsb / delta_pic_order_cnt_* (used by the
		 *    hantro reference-list builder for poc_type 0/1)
		 *
		 * Without these, hantro's bitstream parser walks past zero
		 * bits and decodes zero pixels (all-zero CAPTURE output).
		 *
		 * Spec: ITU-T H.264 7.3.3 slice_header. FFmpeg's
		 * libavcodec/h264_slice.c derives ref_pic_marking_bit_size
		 * and pic_order_cnt_bit_size from the same bit-precise
		 * parse.
		 */
		const struct h264_slice_header_context sh_ctx = {
			.separate_colour_plane_flag =
				(VAPicture->seq_fields.bits.residual_colour_transform_flag != 0),
			.log2_max_frame_num_minus4 =
				VAPicture->seq_fields.bits.log2_max_frame_num_minus4,
			.frame_mbs_only_flag =
				(VAPicture->seq_fields.bits.frame_mbs_only_flag != 0),
			.pic_order_cnt_type =
				VAPicture->seq_fields.bits.pic_order_cnt_type,
			.log2_max_pic_order_cnt_lsb_minus4 =
				VAPicture->seq_fields.bits.log2_max_pic_order_cnt_lsb_minus4,
			.delta_pic_order_always_zero_flag =
				(VAPicture->seq_fields.bits.delta_pic_order_always_zero_flag != 0),
			.bottom_field_pic_order_in_frame_present_flag =
				(VAPicture->pic_fields.bits.pic_order_present_flag != 0),
			.redundant_pic_cnt_present_flag =
				(VAPicture->pic_fields.bits.redundant_pic_cnt_present_flag != 0),
			.weighted_pred_flag =
				(VAPicture->pic_fields.bits.weighted_pred_flag != 0),
			.weighted_bipred_idc =
				VAPicture->pic_fields.bits.weighted_bipred_idc,
			.num_ref_idx_l0_default_active_minus1 =
				surface->params.h264.slice.num_ref_idx_l0_active_minus1,
			.num_ref_idx_l1_default_active_minus1 =
				surface->params.h264.slice.num_ref_idx_l1_active_minus1,
			.chroma_format_idc =
				VAPicture->seq_fields.bits.chroma_format_idc,
			.bit_depth_luma_minus8 =
				VAPicture->bit_depth_luma_minus8,
			.bit_depth_chroma_minus8 =
				VAPicture->bit_depth_chroma_minus8,
			.nal_unit_type = nal_unit_type,
			.nal_ref_idc = nal_ref_idc,
		};
		struct h264_slice_header_info sh = { 0 };
		unsigned char *nal_payload = b + 1; /* past NAL header byte */
		size_t nal_payload_len = slices_size - nal_header_offset - 1;
		int sh_rc = h264_parse_slice_header(nal_payload, nal_payload_len,
						    &sh_ctx, &sh);

		if (sh_rc == 0) {
			decode->idr_pic_id = sh.idr_pic_id;
			decode->pic_order_cnt_lsb = sh.pic_order_cnt_lsb;
			decode->delta_pic_order_cnt_bottom = sh.delta_pic_order_cnt_bottom;
			decode->delta_pic_order_cnt0 = sh.delta_pic_order_cnt0;
			decode->delta_pic_order_cnt1 = sh.delta_pic_order_cnt1;
			decode->pic_order_cnt_bit_size = sh.pic_order_cnt_bit_size;
			decode->dec_ref_pic_marking_bit_size = sh.dec_ref_pic_marking_bit_size;
		} else {
			request_log("slice_header parse FAILED rc=%d "
				    "(payload_len=%zu) — DECODE_PARAMS bit_size "
				    "fields left zero, hantro will likely produce zeros\n",
				    sh_rc, nal_payload_len);
		}
	} else {
		request_log("slice data too short (%zu bytes) to hold a NAL "
			    "header, slice_header parse skipped\n",
			    slices_size);
	}

	h264_fill_dpb(driver_data, context, VAPicture, decode);

	/*
	 * Populate the remaining V4L2_CID_STATELESS_H264_DECODE_PARAMS
	 * fields that can be derived from VAAPI's pre-parsed
	 * VAPictureParameterBuffer plus the NAL header byte.
	 * Cross-reference: GStreamer gstv4l2codech264dec.c
	 * gst_v4l2_codec_h264_dec_fill_decoder_params.
	 *
	 * The fields VAAPI does not carry (idr_pic_id, pic_order_cnt_lsb,
	 * delta_pic_order_cnt_*, dec_ref_pic_marking_bit_size,
	 * pic_order_cnt_bit_size) come from the slice_header() parse above
	 * and stay at zero when that parse fails or is skipped.
	 */
	decode->nal_ref_idc = nal_ref_idc;
	decode->frame_num = VAPicture->frame_num;
	decode->top_field_order_cnt =
		h264_strip_ffmpeg_poc_sentinel(VAPicture->CurrPic.TopFieldOrderCnt,
					       VAPicture->CurrPic.flags);
	decode->bottom_field_order_cnt =
		h264_strip_ffmpeg_poc_sentinel(VAPicture->CurrPic.BottomFieldOrderCnt,
					       VAPicture->CurrPic.flags);

	/* nal_unit_type 5 is a coded slice of an IDR picture. */
	if (nal_unit_type == 5)
		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC;
	if (VAPicture->pic_fields.bits.field_pic_flag)
		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC;
	if (VAPicture->CurrPic.flags & VA_PICTURE_H264_BOTTOM_FIELD)
		decode->flags |= V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD;

	/* PPS */
	pps->weighted_bipred_idc =
		VAPicture->pic_fields.bits.weighted_bipred_idc;
	pps->pic_init_qs_minus26 = VAPicture->pic_init_qs_minus26;
	pps->pic_init_qp_minus26 = VAPicture->pic_init_qp_minus26;
	pps->chroma_qp_index_offset = VAPicture->chroma_qp_index_offset;
	pps->second_chroma_qp_index_offset =
		VAPicture->second_chroma_qp_index_offset;

	if (VAPicture->pic_fields.bits.entropy_coding_mode_flag)
		pps->flags |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE;
	if (VAPicture->pic_fields.bits.weighted_pred_flag)
		pps->flags |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED;
	if (VAPicture->pic_fields.bits.transform_8x8_mode_flag)
		pps->flags |= V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE;
	if (VAPicture->pic_fields.bits.constrained_intra_pred_flag)
		pps->flags |= V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED;
	if (VAPicture->pic_fields.bits.pic_order_present_flag)
		pps->flags |=
			V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT;
	if (VAPicture->pic_fields.bits.deblocking_filter_control_present_flag)
		pps->flags |=
			V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT;
	if (VAPicture->pic_fields.bits.redundant_pic_cnt_present_flag)
		pps->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT;

	/* SPS */
	sps->max_num_ref_frames = VAPicture->num_ref_frames;
	sps->chroma_format_idc = VAPicture->seq_fields.bits.chroma_format_idc;
	sps->bit_depth_luma_minus8 = VAPicture->bit_depth_luma_minus8;
	sps->bit_depth_chroma_minus8 = VAPicture->bit_depth_chroma_minus8;
	sps->log2_max_frame_num_minus4 =
		VAPicture->seq_fields.bits.log2_max_frame_num_minus4;
	sps->log2_max_pic_order_cnt_lsb_minus4 =
		VAPicture->seq_fields.bits.log2_max_pic_order_cnt_lsb_minus4;
	sps->pic_order_cnt_type = VAPicture->seq_fields.bits.pic_order_cnt_type;
	sps->pic_width_in_mbs_minus1 = VAPicture->picture_width_in_mbs_minus1;
	/*
	 * NOTE(review): the VA height in macroblocks is copied as the
	 * height in map units; confirm this is right for streams where
	 * frame_mbs_only_flag is 0.
	 */
	sps->pic_height_in_map_units_minus1 =
		VAPicture->picture_height_in_mbs_minus1;

	if (VAPicture->seq_fields.bits.residual_colour_transform_flag)
		sps->flags |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE;
	if (VAPicture->seq_fields.bits.gaps_in_frame_num_value_allowed_flag)
		sps->flags |=
			V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED;
	if (VAPicture->seq_fields.bits.frame_mbs_only_flag)
		sps->flags |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY;
	if (VAPicture->seq_fields.bits.mb_adaptive_frame_field_flag)
		sps->flags |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD;
	if (VAPicture->seq_fields.bits.direct_8x8_inference_flag)
		sps->flags |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE;
	if (VAPicture->seq_fields.bits.delta_pic_order_always_zero_flag)
		sps->flags |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO;
}
/*
 * Copy the scaling lists of a VAIQMatrixBufferH264 into the V4L2
 * scaling-matrix control.
 *
 * The six 4x4 lists are copied as one block. VAAPI carries two 8x8
 * lists, filled by ffmpeg-vaapi as Intra Y then Inter Y. The V4L2
 * stateless uAPI orders its 8x8 lists Intra Y, Inter Y, Intra Cb,
 * Inter Cb, Intra Cr, Inter Cr, so the two VA lists belong at indices 0
 * and 1. (Index 3 is Inter Cb; storing Inter Y there left the real
 * Inter Y list all-zero.) The remaining 8x8 lists are not written here.
 *
 * @driver_data and @context are unused.
 */
static void h264_va_matrix_to_v4l2(struct request_data *driver_data,
				   struct object_context *context,
				   VAIQMatrixBufferH264 *VAMatrix,
				   struct v4l2_ctrl_h264_scaling_matrix *v4l2_matrix)
{
	memcpy(v4l2_matrix->scaling_list_4x4, &VAMatrix->ScalingList4x4,
	       sizeof(VAMatrix->ScalingList4x4));

	/* Intra Y */
	memcpy(v4l2_matrix->scaling_list_8x8[0], &VAMatrix->ScalingList8x8[0],
	       sizeof(v4l2_matrix->scaling_list_8x8[0]));
	/* Inter Y */
	memcpy(v4l2_matrix->scaling_list_8x8[1], &VAMatrix->ScalingList8x8[1],
	       sizeof(v4l2_matrix->scaling_list_8x8[1]));
}
/*
 * Fill every 4x4 and 8x8 scaling list with the H.264 flat default
 * (Flat_4x4_16 / Flat_8x8_16: each coefficient is 16). These are the
 * lists a decoder uses when neither the SPS nor the PPS signals
 * explicit scaling lists (ITU-T H.264 7.4.2.1.1.1 and 7.4.2.2).
 *
 * The matrix is always provided because hantro G1's set_params keys the
 * G1_REG_DEC_CTRL2_TYPE1_QUANT_E bit off
 * V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT, and FFmpeg's
 * v4l2_request_h264.c likewise always submits SCALING_MATRIX (spec
 * default when the bitstream has none) with that flag set ("FFmpeg
 * always provide a scaling matrix"). Mirroring that keeps the control
 * set consistent across drivers.
 */
static void h264_default_flat_scaling_matrix(
	struct v4l2_ctrl_h264_scaling_matrix *v4l2_matrix)
{
	enum { FLAT_SCALING_VALUE = 16 };

	memset(v4l2_matrix->scaling_list_4x4, FLAT_SCALING_VALUE,
	       sizeof(v4l2_matrix->scaling_list_4x4));
	memset(v4l2_matrix->scaling_list_8x8, FLAT_SCALING_VALUE,
	       sizeof(v4l2_matrix->scaling_list_8x8));
}
/*
 * Copy the first @num_refs luma and chroma weighted-prediction entries
 * from VAAPI's per-list tables into one V4L2 weight-factor table.
 * Each reference has one luma weight/offset and two chroma
 * weight/offset pairs.
 */
static void h264_copy_pred_table(struct v4l2_h264_weight_factors *factors,
				 unsigned int num_refs,
				 int16_t luma_weight[32],
				 int16_t luma_offset[32],
				 int16_t chroma_weight[32][2],
				 int16_t chroma_offset[32][2])
{
	unsigned int ref;

	for (ref = 0; ref < num_refs; ref++) {
		factors->luma_weight[ref] = luma_weight[ref];
		factors->luma_offset[ref] = luma_offset[ref];

		factors->chroma_weight[ref][0] = chroma_weight[ref][0];
		factors->chroma_offset[ref][0] = chroma_offset[ref][0];
		factors->chroma_weight[ref][1] = chroma_weight[ref][1];
		factors->chroma_offset[ref][1] = chroma_offset[ref][1];
	}
}
/* Number of elements in a true array (never pass a pointer). */
#define H264_ARRAY_LEN(a) (sizeof(a) / sizeof((a)[0]))

/* Limit a reference count to the capacity of the array it will index. */
static unsigned int h264_clamp_ref_count(unsigned int count, size_t capacity)
{
	return count > capacity ? (unsigned int)capacity : count;
}

/*
 * Translate VAAPI slice parameters into the V4L2 SLICE_PARAMS and
 * PRED_WEIGHTS controls.
 *
 * Reference list 0 is filled for P and B slices, list 1 for B slices
 * only; the slice kind is slice_type modulo 5. Each VA reference picture
 * is resolved to its DPB slot through dpb_lookup(); pictures not present
 * in the DPB leave the corresponding V4L2 entry untouched.
 *
 * The num_ref_idx_l*_active_minus1 values are forwarded unchanged, but
 * the number of entries actually copied is clamped to the size of the
 * VA and V4L2 arrays: the counts originate from the client, and an
 * oversized value must not index past those arrays.
 *
 * @driver_data and @VAPicture are unused.
 */
static void h264_va_slice_to_v4l2(struct request_data *driver_data,
				  struct object_context *context,
				  VASliceParameterBufferH264 *VASlice,
				  VAPictureParameterBufferH264 *VAPicture,
				  struct v4l2_ctrl_h264_slice_params *slice,
				  struct v4l2_ctrl_h264_pred_weights *weights)
{
	const unsigned int slice_kind = VASlice->slice_type % 5;
	const bool uses_l1 = (slice_kind == H264_SLICE_B);
	const bool uses_l0 = (slice_kind == H264_SLICE_P) || uses_l1;
	unsigned int l0_refs = 0;
	unsigned int l1_refs = 0;
	unsigned int i;

	/*
	 * The Annex-B start code length is not added to header_bit_size
	 * (that adjustment is disabled).
	 */
	slice->header_bit_size = VASlice->slice_data_bit_offset;

	slice->first_mb_in_slice = VASlice->first_mb_in_slice;
	slice->slice_type = VASlice->slice_type;
	slice->cabac_init_idc = VASlice->cabac_init_idc;
	slice->slice_qp_delta = VASlice->slice_qp_delta;
	slice->disable_deblocking_filter_idc =
		VASlice->disable_deblocking_filter_idc;
	slice->slice_alpha_c0_offset_div2 = VASlice->slice_alpha_c0_offset_div2;
	slice->slice_beta_offset_div2 = VASlice->slice_beta_offset_div2;

	if (uses_l0) {
		slice->num_ref_idx_l0_active_minus1 =
			VASlice->num_ref_idx_l0_active_minus1;

		l0_refs = VASlice->num_ref_idx_l0_active_minus1 + 1u;
		l0_refs = h264_clamp_ref_count(l0_refs,
			H264_ARRAY_LEN(VASlice->RefPicList0));
		l0_refs = h264_clamp_ref_count(l0_refs,
			H264_ARRAY_LEN(slice->ref_pic_list0));
		l0_refs = h264_clamp_ref_count(l0_refs,
			H264_ARRAY_LEN(VASlice->luma_weight_l0));
		l0_refs = h264_clamp_ref_count(l0_refs,
			H264_ARRAY_LEN(weights->weight_factors[0].luma_weight));

		for (i = 0; i < l0_refs; i++) {
			unsigned int idx;
			unsigned char fields;

			if (!dpb_lookup(context, &VASlice->RefPicList0[i],
					&idx, &fields))
				continue;

			slice->ref_pic_list0[i].index = idx;
			slice->ref_pic_list0[i].fields = fields;
		}
	}

	if (uses_l1) {
		slice->num_ref_idx_l1_active_minus1 =
			VASlice->num_ref_idx_l1_active_minus1;

		l1_refs = VASlice->num_ref_idx_l1_active_minus1 + 1u;
		l1_refs = h264_clamp_ref_count(l1_refs,
			H264_ARRAY_LEN(VASlice->RefPicList1));
		l1_refs = h264_clamp_ref_count(l1_refs,
			H264_ARRAY_LEN(slice->ref_pic_list1));
		l1_refs = h264_clamp_ref_count(l1_refs,
			H264_ARRAY_LEN(VASlice->luma_weight_l1));
		l1_refs = h264_clamp_ref_count(l1_refs,
			H264_ARRAY_LEN(weights->weight_factors[1].luma_weight));

		for (i = 0; i < l1_refs; i++) {
			unsigned int idx;
			unsigned char fields;

			if (!dpb_lookup(context, &VASlice->RefPicList1[i],
					&idx, &fields))
				continue;

			slice->ref_pic_list1[i].index = idx;
			slice->ref_pic_list1[i].fields = fields;
		}
	}

	if (VASlice->direct_spatial_mv_pred_flag)
		slice->flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED;

	weights->chroma_log2_weight_denom =
		VASlice->chroma_log2_weight_denom;
	weights->luma_log2_weight_denom =
		VASlice->luma_log2_weight_denom;

	if (uses_l0)
		h264_copy_pred_table(&weights->weight_factors[0],
				     l0_refs,
				     VASlice->luma_weight_l0,
				     VASlice->luma_offset_l0,
				     VASlice->chroma_weight_l0,
				     VASlice->chroma_offset_l0);

	if (uses_l1)
		h264_copy_pred_table(&weights->weight_factors[1],
				     l1_refs,
				     VASlice->luma_weight_l1,
				     VASlice->luma_offset_l1,
				     VASlice->chroma_weight_l1,
				     VASlice->chroma_offset_l1);
}
/*
 * Query the device's H.264 decode mode and start-code mode.
 *
 * Both slice-based and frame-based decode modes are accepted; the value
 * itself is not stored. The start-code mode is recorded in
 * context->h264_start_code (true for Annex B, false for none).
 *
 * Returns VA_STATUS_SUCCESS, or VA_STATUS_ERROR_OPERATION_FAILED when
 * the controls cannot be read or report an unknown value.
 */
int h264_get_controls(struct request_data *driver_data,
		      struct object_context *context)
{
	enum { CTRL_DECODE_MODE, CTRL_START_CODE, CTRL_COUNT };
	struct v4l2_ext_control controls[CTRL_COUNT] = {
		[CTRL_DECODE_MODE] = {
			.id = V4L2_CID_STATELESS_H264_DECODE_MODE,
		},
		[CTRL_START_CODE] = {
			.id = V4L2_CID_STATELESS_H264_START_CODE,
		},
	};

	if (v4l2_get_controls(driver_data->video_fd, -1, controls,
			      CTRL_COUNT) < 0)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	if (controls[CTRL_DECODE_MODE].value !=
		    V4L2_STATELESS_H264_DECODE_MODE_SLICE_BASED &&
	    controls[CTRL_DECODE_MODE].value !=
		    V4L2_STATELESS_H264_DECODE_MODE_FRAME_BASED) {
		request_log("Unsupported decode mode\n");
		return VA_STATUS_ERROR_OPERATION_FAILED;
	}

	if (controls[CTRL_START_CODE].value ==
	    V4L2_STATELESS_H264_START_CODE_NONE) {
		context->h264_start_code = false;
	} else if (controls[CTRL_START_CODE].value ==
		   V4L2_STATELESS_H264_START_CODE_ANNEX_B) {
		context->h264_start_code = true;
	} else {
		request_log("Unsupported start code\n");
		return VA_STATUS_ERROR_OPERATION_FAILED;
	}

	return VA_STATUS_SUCCESS;
}
/*
 * Map a VAAPI H.264 profile to the profile_idc value placed in the SPS
 * control. Profiles not listed here map to 0.
 */
static inline __u8 h264_profile_to_idc(VAProfile profile)
{
	__u8 profile_idc = 0;

	switch (profile) {
	case VAProfileH264ConstrainedBaseline:
		profile_idc = 66;
		break;
	case VAProfileH264Main:
		profile_idc = 77;
		break;
	case VAProfileH264High:
		profile_idc = 100;
		break;
	case VAProfileH264MultiviewHigh:
		profile_idc = 118;
		break;
	case VAProfileH264StereoHigh:
		profile_idc = 128;
		break;
	default:
		break;
	}

	return profile_idc;
}
/*
 * Derive sps.level_idc from the coded frame size in macroblocks, using
 * the MaxFS limits of H.264 Annex A.3 (Table A-1): the smallest level
 * whose MaxFS can hold the frame is chosen.
 *
 * level_idc is the level number times ten (Level 1.0 -> 10,
 * Level 4.1 -> 41, Level 5.1 -> 51, Level 6.0 -> 60).
 *
 * Why derive it at all: VAPictureParameterBufferH264 has no level_idc
 * field, and the SPS NAL is parsed by the VAAPI client (ffmpeg-vaapi /
 * mpv), which only forwards slice data, so it cannot be read from the
 * bitstream at this layer.
 *
 * MaxMBPS / MaxBR / MaxCPB cannot be checked without a frame rate. That
 * gap is accepted: picking by frame size may over-state the level on
 * the temporal axis but does not under-state what is needed for
 * decode-resource sizing.
 *
 * Typical results:
 *   1080p (8160 MBs)   -> 41
 *   4K    (32400 MBs)  -> 51
 *   8K    (138240 MBs) -> 60
 *
 * This replaces the level_idc = 51 hardcode from patch 0013.
 *
 * @width_in_mbs / @height_in_mbs: picture size in macroblocks.
 * Returns the level_idc, or 62 when the frame exceeds every limit.
 */
static inline __u8 h264_derive_level_idc(unsigned int width_in_mbs,
					 unsigned int height_in_mbs)
{
	/* Ascending MaxFS limits and the level_idc reported for each. */
	static const struct {
		unsigned int max_frame_size_mbs;
		__u8 level_idc;
	} level_limits[] = {
		{     99, 10 },	/* Level 1.0 */
		{    396, 11 },	/* Level 1.1 - 2.0 */
		{    792, 21 },	/* Level 2.1 */
		{   1620, 22 },	/* Level 2.2 - 3.0 */
		{   3600, 31 },	/* Level 3.1 */
		{   5120, 32 },	/* Level 3.2 */
		{   8192, 41 },	/* Level 4.0 - 4.1 */
		{   8704, 42 },	/* Level 4.2 */
		{  22080, 50 },	/* Level 5.0 */
		{  36864, 51 },	/* Level 5.1 - 5.2 */
		{ 139264, 60 },	/* Level 6.0 - 6.2 */
	};
	const unsigned int frame_size_mbs = width_in_mbs * height_in_mbs;
	size_t k;

	for (k = 0; k < sizeof(level_limits) / sizeof(level_limits[0]); k++) {
		if (frame_size_mbs <= level_limits[k].max_frame_size_mbs)
			return level_limits[k].level_idc;
	}

	return 62; /* > Level 6 ceiling */
}
/*
 * Build and submit the V4L2 stateless H.264 controls for one picture.
 *
 * A DPB slot is reserved for the picture being decoded, the DPB is
 * refreshed from the picture's reference list, the VAAPI parameters
 * stored on @surface are translated into SPS / PPS / DECODE_PARAMS /
 * SCALING_MATRIX controls, and these are attached to the surface's
 * request. On success the decoded picture is inserted into the DPB.
 *
 * @profile: VAAPI profile, used for sps.profile_idc.
 *
 * Returns VA_STATUS_SUCCESS, or VA_STATUS_ERROR_OPERATION_FAILED when no
 * DPB slot is available or the controls cannot be set.
 */
int h264_set_controls(struct request_data *driver_data,
		      struct object_context *context,
		      VAProfile profile,
		      struct object_surface *surface)
{
	struct v4l2_ctrl_h264_scaling_matrix matrix = { 0 };
	struct v4l2_ctrl_h264_decode_params decode = { 0 };
	struct v4l2_ctrl_h264_slice_params slice = { 0 };
	struct v4l2_ctrl_h264_pred_weights weights = { 0 };
	struct v4l2_ctrl_h264_pps pps = { 0 };
	struct v4l2_ctrl_h264_sps sps = { 0 };
	struct v4l2_ext_control controls[6] = { 0 };
	unsigned int num_controls = 0;
	/*
	 * Patch 0002 unconditionally sets the device to FRAME_BASED, so
	 * slice_based is hardcoded false. When the planned probe-then-set
	 * commit lands, this becomes
	 * context->decode_mode == V4L2_STATELESS_H264_DECODE_MODE_SLICE_BASED.
	 */
	const bool slice_based = false; /* TODO: probe via context->decode_mode */
	struct h264_dpb_entry *output;
	int rc;

	/* Reuse the slot already holding this surface, else pick one. */
	output = dpb_lookup(context, &surface->params.h264.picture.CurrPic,
			    NULL, NULL);
	if (!output)
		output = dpb_find_entry(context);

	/*
	 * dpb_find_entry() returns NULL when every slot is still in use as
	 * a reference; fail the request instead of clearing through a NULL
	 * pointer.
	 */
	if (!output)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	dpb_clear_entry(output, true);

	dpb_update(context, &surface->params.h264.picture);

	h264_va_picture_to_v4l2(driver_data, context, surface,
				&surface->params.h264.picture,
				&decode, &pps, &sps);

	/*
	 * Populate the scaling matrix unconditionally: from VAAPI's
	 * VAIQMatrixBufferH264 when the consumer sent one for this frame
	 * (matrix_set), otherwise from the H.264 flat defaults. It is
	 * submitted as V4L2_CID_STATELESS_H264_SCALING_MATRIX on every
	 * request, for parity with what FFmpeg submits and hantro expects.
	 */
	if (surface->params.h264.matrix_set)
		h264_va_matrix_to_v4l2(driver_data, context,
				       &surface->params.h264.matrix, &matrix);
	else
		h264_default_flat_scaling_matrix(&matrix);

	h264_va_slice_to_v4l2(driver_data, context,
			      &surface->params.h264.slice,
			      &surface->params.h264.picture, &slice, &weights);

	/*
	 * Mirror SCALING_MATRIX_PRESENT in the PPS flags. Hantro G1
	 * set_params gates G1_REG_DEC_CTRL2_TYPE1_QUANT_E on it, and FFmpeg
	 * sets it unconditionally. Since the matrix is always submitted,
	 * the flag is always set.
	 */
	pps.flags |= V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT;

	/*
	 * pps.num_ref_idx_l0/l1_default_active_minus1: hantro G1 writes
	 * "(field) + 1" into G1_REG_DEC_CTRL6_REFIDX0_ACTIVE /
	 * REFIDX1_ACTIVE, so they must not be left at zero blindly.
	 *
	 * VAPictureParameterBufferH264 does not carry the PPS defaults
	 * (clients parse the PPS NAL themselves and do not forward it).
	 * The closest available source is the slice's
	 * num_ref_idx_l*_active_minus1, which equals the PPS default
	 * unless the slice sets num_ref_idx_active_override_flag
	 * (H.264 7.4.3). For IDR frames the values are not used by
	 * hantro's reference-list builder; for inter frames the
	 * slice-derived value is the best available without a PPS parser.
	 */
	pps.num_ref_idx_l0_default_active_minus1 =
		surface->params.h264.slice.num_ref_idx_l0_active_minus1;
	pps.num_ref_idx_l1_default_active_minus1 =
		surface->params.h264.slice.num_ref_idx_l1_active_minus1;

	/*
	 * Derive the PFRAME / BFRAME decode flags from the slice type.
	 * slice_type follows the spec: 0=P, 1=B, 2=I, 3=SP, 4=SI, and
	 * 5..9 mean "all slices of the picture share this type" (modulo 5
	 * gives the underlying type). In FRAME_BASED mode only the most
	 * recent slice's parameters are visible here; its type is taken
	 * as representative of the picture.
	 *
	 * tegra-vde consumes these flags to select its inter-frame decode
	 * path; hantro, rkvdec, cedrus and mediatek do not. Setting them
	 * is harmless there and keeps this backend driver-agnostic.
	 * See ext-ctrls-codec-stateless.rst, Decode Parameters Flags.
	 */
	switch (surface->params.h264.slice.slice_type % 5) {
	case H264_SLICE_P:
		decode.flags |= V4L2_H264_DECODE_PARAM_FLAG_PFRAME;
		break;
	case H264_SLICE_B:
		decode.flags |= V4L2_H264_DECODE_PARAM_FLAG_BFRAME;
		break;
	default:
		/* I / SP / SI: no extra flag. */
		break;
	}

	sps.profile_idc = h264_profile_to_idc(profile);

	/*
	 * VAAPI does not expose level_idc on the decode side, so it is
	 * derived from the coded frame size (smallest level whose MaxFS
	 * holds the picture, H.264 Annex A.3). Replaces patch 0013's
	 * intermediate hardcode of 51.
	 */
	sps.level_idc = h264_derive_level_idc(
		(unsigned int)surface->params.h264.picture.picture_width_in_mbs_minus1 + 1u,
		(unsigned int)surface->params.h264.picture.picture_height_in_mbs_minus1 + 1u);

	/*
	 * Build the per-request control list incrementally:
	 *  - SPS, PPS, DECODE_PARAMS, SCALING_MATRIX: always submitted.
	 *    An earlier patch (0012) made SCALING_MATRIX conditional on a
	 *    VAIQMatrixBuffer arriving; that was replaced because it was
	 *    inconsistent with the hantro contract.
	 *  - SLICE_PARAMS: SLICE_BASED only. The uAPI documentation states
	 *    that in FRAME_BASED mode "the
	 *    V4L2_CID_STATELESS_H264_SLICE_PARAMS control shall not be
	 *    set"; submitting it triggers cluster-validation EINVAL with
	 *    error_idx == count.
	 *  - PRED_WEIGHTS: SLICE_BASED and only when
	 *    V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED says so.
	 */
	controls[num_controls].id = V4L2_CID_STATELESS_H264_SPS;
	controls[num_controls].p_h264_sps = &sps;
	controls[num_controls].size = sizeof(sps);
	num_controls++;

	controls[num_controls].id = V4L2_CID_STATELESS_H264_PPS;
	controls[num_controls].p_h264_pps = &pps;
	controls[num_controls].size = sizeof(pps);
	num_controls++;

	controls[num_controls].id = V4L2_CID_STATELESS_H264_DECODE_PARAMS;
	controls[num_controls].p_h264_decode_params = &decode;
	controls[num_controls].size = sizeof(decode);
	num_controls++;

	controls[num_controls].id = V4L2_CID_STATELESS_H264_SCALING_MATRIX;
	controls[num_controls].p_h264_scaling_matrix = &matrix;
	controls[num_controls].size = sizeof(matrix);
	num_controls++;

	if (slice_based) {
		controls[num_controls].id = V4L2_CID_STATELESS_H264_SLICE_PARAMS;
		controls[num_controls].p_h264_slice_params = &slice;
		controls[num_controls].size = sizeof(slice);
		num_controls++;

		if (V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED(&pps, &slice)) {
			controls[num_controls].id = V4L2_CID_STATELESS_H264_PRED_WEIGHTS;
			controls[num_controls].ptr = &weights;
			controls[num_controls].size = sizeof(weights);
			num_controls++;
		}
	}

	rc = v4l2_set_controls(driver_data->video_fd, surface->request_fd,
			       controls, num_controls);
	if (rc < 0)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	/* The picture becomes a DPB resident only once the request is armed. */
	dpb_insert(context, &surface->params.h264.picture.CurrPic, output);

	return VA_STATUS_SUCCESS;
}