h264: bit-parse slice_header to populate DECODE_PARAMS bit-size fields

The load-bearing fix from diff_against_ffmpeg.md (campaign repo).

Adds src/h264_slice_header.{c,h} — a minimal H.264 slice_header()
bit-parser per ITU-T H.264 (08/2024) §7.3.3. Parses just enough of
the slice header to populate the V4L2 DECODE_PARAMS fields VAAPI
doesn't carry and that hantro G1 hardware reads directly out of
DECODE_PARAMS into MMIO registers:

  dec_param->dec_ref_pic_marking_bit_size  -> G1_REG_DEC_CTRL5_REFPIC_MK_LEN
  dec_param->idr_pic_id                    -> G1_REG_DEC_CTRL5_IDR_PIC_ID
  dec_param->pic_order_cnt_bit_size        -> G1_REG_DEC_CTRL6_POC_LENGTH
  dec_param->pic_order_cnt_lsb             -> hantro reflist builder (poc_type=0)
  dec_param->delta_pic_order_cnt_bottom    -> same
  dec_param->delta_pic_order_cnt0/1        -> hantro reflist builder (poc_type=1)

Without these set correctly, hantro's hardware bitstream parser
advances by zero bits over these slice-header elements, lands on
garbage, and decodes zero pixels — the all-zero CAPTURE output observed
across both mpv and Firefox during 2026-05-04 Phase 0 (see
libva-multiplanar campaign
phase0_evidence/2026-05-04-kernel-trace/findings.md).

Implementation:
- Minimal RBSP bit reader (br_read_u/_ue/_se), MSB-first, fault-flag
  on overrun.
- Emulation-prevention unescape (strips 0x03 after 0x00 0x00) on
  the first 64 bytes of the slice — slice headers fit comfortably.
- Walks slice_header() up to and including dec_ref_pic_marking(),
  measuring bit positions for the *_bit_size fields.
- Skips ref_pic_list_modification() and pred_weight_table() —
  needed only to advance the bit position to dec_ref_pic_marking().
- Returns a struct with the V4L2 fields plus diagnostics
  (first_mb_in_slice, slice_type, pps_id, frame_num).

Wired into h264_va_picture_to_v4l2 (src/h264.c) right after the
nal_ref_idc/nal_unit_type extraction. SPS/PPS context is built from
VAPicture's seq_fields and pic_fields; num_ref_idx_l0/l1_active
defaults come from VASlice (best available substitute for the
parsed PPS values). On parse success, populates decode_params with
the recovered values + emits a request_log with the decoded fields
for cross-validation against VAAPI's pre-parsed values.

src/meson.build: adds h264_slice_header.{c,h} to sources.

Cross-references:
- FFmpeg libavcodec/h264_slice.c (Kwiboo v4l2-request-n8.1) — populates
  H264SliceContext::ref_pic_marking_bit_size / pic_order_cnt_bit_size
  by the same bit-precise parse, then v4l2_request_h264.c forwards
  to V4L2.
- Linux drivers/media/platform/verisilicon/hantro_g1_h264_dec.c
  set_params() — the register-write code that reads these fields.

MVC nal_unit_type 20/21 unhandled (this fork strips MVC alongside
HEVC). Multi-slice non-IDR streams parse the first slice's header
only; for FRAME_BASED mode that's fine — kernel sees the whole
bitstream and parses subsequent slices itself.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 12:34:47 +00:00
parent d41a4b96b3
commit 9de1be34ef
4 changed files with 543 additions and 0 deletions
+361
View File
@@ -0,0 +1,361 @@
/*
* H.264 slice header bit-parser implementation.
*
* Implements just enough of ITU-T Rec. H.264 (08/2024) §7.3.3
* slice_header to populate the V4L2 DECODE_PARAMS bit-position
* fields (idr_pic_id, pic_order_cnt_lsb, delta_pic_order_cnt_*,
* pic_order_cnt_bit_size, dec_ref_pic_marking_bit_size).
*
* Skips through ref_pic_list_modification() and pred_weight_table()
* because dec_ref_pic_marking() (whose bit length we need) comes
* after them. MVC extensions (nal_unit_type 20/21) are not handled
* — this fork strips MVC alongside HEVC.
*/
#include "h264_slice_header.h"
#include <errno.h>
#include <string.h>
/*
 * Minimal RBSP bit reader. Reads bits MSB-first. Tracks bit_pos for
 * caller use (e.g. computing the size of a syntax element by
 * pre/post bit_pos delta).
 *
 * The read helpers in this file only ever set the error flag; they never
 * clear it, so a caller can issue several reads and test it once afterwards.
 */
struct br {
/* Buffer being parsed; the reader only reads from it. */
const uint8_t *data;
/* Size of data in bytes; reads are valid for bit_pos < length * 8. */
size_t length; /* bytes */
/* Index of the next bit to read; bit 0 is the MSB of data[0]. */
size_t bit_pos;
/* Set when a read was attempted at or past the end of the buffer. */
bool error;
};
/*
 * Read n bits, most significant bit first, and return them right-aligned.
 *
 * If the buffer is exhausted before all n bits were consumed, the error
 * flag is raised and 0 is returned; bit_pos is left on the first bit that
 * could not be read.
 */
static uint32_t br_read_u(struct br *b, unsigned n)
{
	const size_t total_bits = b->length * 8;
	uint32_t acc = 0;
	unsigned remaining;

	for (remaining = n; remaining > 0; remaining--) {
		const size_t pos = b->bit_pos;
		unsigned shift;

		if (pos >= total_bits) {
			b->error = true;
			return 0;
		}

		/* Bit 0 of a byte position is the byte's MSB. */
		shift = 7u - (unsigned)(pos % 8);
		acc <<= 1;
		acc |= (uint32_t)(b->data[pos / 8] >> shift) & 1u;
		b->bit_pos = pos + 1;
	}

	return acc;
}
static uint32_t br_read_ue(struct br *b)
{
unsigned zeros = 0;
while (br_read_u(b, 1) == 0) {
if (b->error || ++zeros >= 32)
return 0;
}
if (zeros == 0)
return 0;
return (1u << zeros) - 1u + br_read_u(b, zeros);
}
/*
 * Read one signed Exp-Golomb code, se(v).
 *
 * The unsigned code k maps to 0, +1, -1, +2, -2, ...: odd k is positive,
 * even k is negative, and the magnitude is k / 2 rounded up.
 */
static int32_t br_read_se(struct br *b)
{
	const uint32_t code = br_read_ue(b);
	/* (code + 1) / 2 rounds up for odd codes and equals code / 2 for even. */
	const int32_t magnitude = (int32_t)((code + 1u) >> 1);

	return (code & 1u) ? magnitude : -magnitude;
}
/*
 * Upper bound on how many input bytes rbsp_unescape() examines. Only the
 * slice header is bit-parsed, so the rest of the slice is never copied.
 */
#define H264_SLICE_HEADER_SCAN_BYTES 64
/*
 * RBSP unescape: copy the leading bytes of `in` to `out`, dropping each
 * emulation-prevention byte (a 0x03 that follows two or more 0x00 bytes).
 *
 * At most H264_SLICE_HEADER_SCAN_BYTES input bytes are examined, so `out`
 * needs room for min(in_len, H264_SLICE_HEADER_SCAN_BYTES) bytes.
 *
 * Returns the number of bytes written to `out`.
 */
static size_t rbsp_unescape(uint8_t *out, const uint8_t *in,
			    size_t in_len)
{
	const size_t scan_len = in_len > H264_SLICE_HEADER_SCAN_BYTES ?
		H264_SLICE_HEADER_SCAN_BYTES : in_len;
	size_t written = 0;
	size_t pos = 0;
	unsigned zeros_seen = 0;

	while (pos < scan_len) {
		const uint8_t byte = in[pos++];

		if (byte == 0x03 && zeros_seen >= 2) {
			/* Emulation-prevention byte: drop it, restart the run. */
			zeros_seen = 0;
			continue;
		}

		out[written++] = byte;
		if (byte == 0x00)
			zeros_seen++;
		else
			zeros_seen = 0;
	}

	return written;
}
/*
 * Consume one reference list's modification syntax: the presence flag
 * and, when it is set, the run of modification_of_pic_nums_idc entries
 * up to and including the terminating value 3.
 *
 * Returns false if the reader faulted inside the entry loop, true
 * otherwise.
 */
static bool skip_modification_list(struct br *b)
{
	if (!br_read_u(b, 1)) /* ref_pic_list_modification_flag_lX */
		return true;

	for (;;) {
		const uint32_t idc = br_read_ue(b);

		switch (idc) {
		case 0:
		case 1: /* abs_diff_pic_num_minus1 */
		case 2: /* long_term_pic_num */
			br_read_ue(b);
			break;
		default:
			break;
		}

		if (b->error)
			return false;
		if (idc == 3)
			return true;
	}
}

/*
 * §7.3.3.1 ref_pic_list_modification() — advance past it, discarding
 * every value. List 0 syntax is present for every slice type except I
 * and SI (slice_type % 5 of 2 and 4); list 1 syntax only for B slices
 * (slice_type % 5 == 1).
 */
static void skip_ref_pic_list_modification(struct br *b,
					   uint32_t slice_type)
{
	const uint32_t kind = slice_type % 5;
	const bool has_l0 = !(kind == 2 || kind == 4);

	if (has_l0 && !skip_modification_list(b))
		return;

	if (kind == 1)
		(void)skip_modification_list(b);
}
/*
* §7.3.3.2 pred_weight_table() — skip past it. Length depends on
* the active reference counts and chroma_format_idc.
*/
static void skip_pred_weight_table(struct br *b,
uint32_t slice_type,
uint8_t chroma_format_idc,
uint8_t bit_depth_luma_minus8,
uint8_t bit_depth_chroma_minus8,
uint32_t num_ref_idx_l0_active_minus1,
uint32_t num_ref_idx_l1_active_minus1)
{
uint32_t i, j;
uint32_t st_mod5 = slice_type % 5;
(void)bit_depth_luma_minus8;
(void)bit_depth_chroma_minus8;
br_read_ue(b); /* luma_log2_weight_denom */
if (chroma_format_idc != 0)
br_read_ue(b); /* chroma_log2_weight_denom */
for (i = 0; i <= num_ref_idx_l0_active_minus1 && !b->error; i++) {
uint32_t luma_weight_l0_flag = br_read_u(b, 1);
if (luma_weight_l0_flag) {
br_read_se(b); /* luma_weight_l0 */
br_read_se(b); /* luma_offset_l0 */
}
if (chroma_format_idc != 0) {
uint32_t chroma_weight_l0_flag = br_read_u(b, 1);
if (chroma_weight_l0_flag) {
for (j = 0; j < 2; j++) {
br_read_se(b);
br_read_se(b);
}
}
}
}
if (st_mod5 == 1) {
for (i = 0; i <= num_ref_idx_l1_active_minus1 && !b->error; i++) {
uint32_t luma_weight_l1_flag = br_read_u(b, 1);
if (luma_weight_l1_flag) {
br_read_se(b);
br_read_se(b);
}
if (chroma_format_idc != 0) {
uint32_t chroma_weight_l1_flag = br_read_u(b, 1);
if (chroma_weight_l1_flag) {
for (j = 0; j < 2; j++) {
br_read_se(b);
br_read_se(b);
}
}
}
}
}
}
/*
 * Parse an H.264 slice_header() (§7.3.3) up to and including
 * dec_ref_pic_marking(), recovering idr_pic_id, the picture order count
 * fields, and the bit lengths of the pic_order_cnt and
 * dec_ref_pic_marking syntax.
 *
 * nal_payload / nal_payload_length:
 *	Escaped slice bytes. Parsing starts at bit 0 with
 *	first_mb_in_slice, so the buffer is expected to begin right after
 *	the NAL unit header byte (NOTE(review): confirm against the
 *	caller). Only the first H264_SLICE_HEADER_SCAN_BYTES bytes are
 *	examined.
 * ctx:
 *	NAL, SPS and PPS values the slice header syntax depends on.
 * out:
 *	Zeroed, then filled in as parsing progresses. On failure it may be
 *	partially filled.
 *
 * Returns 0 on success, -EINVAL when an argument is NULL or the payload
 * is empty or shorter than two bytes after unescaping, and -EIO when the
 * bit reader reports a fault (for example the header extends past the
 * scanned bytes).
 */
int h264_parse_slice_header(const uint8_t *nal_payload,
			    size_t nal_payload_length,
			    const struct h264_slice_header_context *ctx,
			    struct h264_slice_header_info *out)
{
	uint8_t unescaped[H264_SLICE_HEADER_SCAN_BYTES];
	size_t unescaped_len;
	struct br b = { 0 };
	bool idr_pic_flag;
	bool field_pic_flag = false;
	uint32_t slice_type;
	uint32_t st;
	uint32_t num_ref_idx_l0_active_minus1;
	uint32_t num_ref_idx_l1_active_minus1;
	uint8_t chroma_array_type;
	size_t pic_order_cnt_start;
	size_t dec_ref_pic_marking_start;

	/* Validate before touching either structure. */
	if (!out)
		return -EINVAL;
	memset(out, 0, sizeof(*out));
	if (!ctx || !nal_payload || nal_payload_length == 0)
		return -EINVAL;

	idr_pic_flag = (ctx->nal_unit_type == 5);

	/*
	 * The chroma syntax in pred_weight_table() is gated on
	 * ChromaArrayType, not chroma_format_idc: with separately coded
	 * colour planes each plane is coded like monochrome, so no chroma
	 * weights are present.
	 */
	chroma_array_type = ctx->separate_colour_plane_flag ?
		0 : (uint8_t)ctx->chroma_format_idc;

	unescaped_len = rbsp_unescape(unescaped, nal_payload,
				      nal_payload_length);
	if (unescaped_len < 2)
		return -EINVAL;

	b.data = unescaped;
	b.length = unescaped_len;
	b.bit_pos = 0;
	b.error = false;

	/* slice_header() per §7.3.3 */
	out->first_mb_in_slice = br_read_ue(&b);
	slice_type = br_read_ue(&b);
	st = slice_type % 5; /* 0 = P, 1 = B, 2 = I, 3 = SP, 4 = SI */
	out->slice_type = slice_type;
	out->pic_parameter_set_id = br_read_ue(&b);

	if (ctx->separate_colour_plane_flag)
		(void)br_read_u(&b, 2); /* colour_plane_id */

	out->frame_num = br_read_u(&b, ctx->log2_max_frame_num_minus4 + 4u);

	if (!ctx->frame_mbs_only_flag) {
		field_pic_flag = (br_read_u(&b, 1) != 0);
		if (field_pic_flag)
			(void)br_read_u(&b, 1); /* bottom_field_flag */
	}

	if (idr_pic_flag)
		out->idr_pic_id = (uint16_t)br_read_ue(&b);

	/*
	 * pic_order_cnt syntax — measure bit length from the start of
	 * pic_order_cnt_lsb / delta_pic_order_cnt[0] to the end of
	 * delta_pic_order_cnt_bottom / delta_pic_order_cnt[1]. This is
	 * what V4L2 calls pic_order_cnt_bit_size and what hantro G1
	 * writes into G1_REG_DEC_CTRL6_POC_LENGTH.
	 */
	pic_order_cnt_start = b.bit_pos;
	if (ctx->pic_order_cnt_type == 0) {
		out->pic_order_cnt_lsb = (uint16_t)br_read_u(
			&b, ctx->log2_max_pic_order_cnt_lsb_minus4 + 4u);
		if (ctx->bottom_field_pic_order_in_frame_present_flag &&
		    !field_pic_flag)
			out->delta_pic_order_cnt_bottom = br_read_se(&b);
	} else if (ctx->pic_order_cnt_type == 1 &&
		   !ctx->delta_pic_order_always_zero_flag) {
		out->delta_pic_order_cnt0 = br_read_se(&b);
		if (ctx->bottom_field_pic_order_in_frame_present_flag &&
		    !field_pic_flag)
			out->delta_pic_order_cnt1 = br_read_se(&b);
	}
	out->pic_order_cnt_bit_size =
		(uint32_t)(b.bit_pos - pic_order_cnt_start);

	if (ctx->redundant_pic_cnt_present_flag)
		(void)br_read_ue(&b); /* redundant_pic_cnt */

	if (st == 1) /* B */
		(void)br_read_u(&b, 1); /* direct_spatial_mv_pred_flag */

	/* Active reference counts: PPS defaults unless the slice overrides. */
	num_ref_idx_l0_active_minus1 = ctx->num_ref_idx_l0_default_active_minus1;
	num_ref_idx_l1_active_minus1 = ctx->num_ref_idx_l1_default_active_minus1;
	if (st == 0 || st == 3 || st == 1) {
		/* P, SP, B */
		uint32_t override = br_read_u(&b, 1);

		if (override) {
			num_ref_idx_l0_active_minus1 = br_read_ue(&b);
			if (st == 1)
				num_ref_idx_l1_active_minus1 = br_read_ue(&b);
		}
	}

	skip_ref_pic_list_modification(&b, slice_type);
	if (b.error)
		return -EIO;

	if ((ctx->weighted_pred_flag && (st == 0 || st == 3)) ||
	    (ctx->weighted_bipred_idc == 1 && st == 1)) {
		skip_pred_weight_table(&b, slice_type,
				       chroma_array_type,
				       ctx->bit_depth_luma_minus8,
				       ctx->bit_depth_chroma_minus8,
				       num_ref_idx_l0_active_minus1,
				       num_ref_idx_l1_active_minus1);
		if (b.error)
			return -EIO;
	}

	/*
	 * dec_ref_pic_marking() per §7.3.3.3 — measure bit length;
	 * hantro G1 writes this into G1_REG_DEC_CTRL5_REFPIC_MK_LEN.
	 */
	dec_ref_pic_marking_start = b.bit_pos;
	if (ctx->nal_ref_idc != 0) {
		if (idr_pic_flag) {
			(void)br_read_u(&b, 1); /* no_output_of_prior_pics_flag */
			(void)br_read_u(&b, 1); /* long_term_reference_flag */
		} else {
			uint32_t adaptive = br_read_u(&b, 1);

			if (adaptive) {
				uint32_t mmco;

				do {
					mmco = br_read_ue(&b);
					if (mmco == 1 || mmco == 3)
						br_read_ue(&b); /* difference_of_pic_nums_minus1 */
					if (mmco == 2)
						br_read_ue(&b); /* long_term_pic_num */
					if (mmco == 3 || mmco == 6)
						br_read_ue(&b); /* long_term_frame_idx */
					if (mmco == 4)
						br_read_ue(&b); /* max_long_term_frame_idx_plus1 */
					if (b.error)
						return -EIO;
				} while (mmco != 0);
			}
		}
	}
	out->dec_ref_pic_marking_bit_size =
		(uint32_t)(b.bit_pos - dec_ref_pic_marking_start);

	if (b.error)
		return -EIO;
	return 0;
}