/* * Copyright (C) 2007 Intel Corporation * Copyright (C) 2016 Florent Revest * Copyright (C) 2018 Paul Kocialkowski * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* * fresnel-fourier iter2 Phase 6 commit B: rewrite h265.c against new * V4L2_CID_STATELESS_HEVC_{SPS,PPS,SLICE_PARAMS,SCALING_MATRIX, * DECODE_PARAMS,DECODE_MODE,START_CODE} stateless controls (mainline * kernel :2090-2300). * * Replaces the staging-era V4L2_CID_MPEG_VIDEO_HEVC_{SPS,PPS, * SLICE_PARAMS} CIDs that don't exist on modern kernels (verified via * test-compile in iter2 Phase 2). * * Per-frame submission: one batched VIDIOC_S_EXT_CTRLS, count=5, * ctrl_class=V4L2_CTRL_CLASS_CODEC_STATELESS: * 0xa40a90 SPS (40 bytes) * 0xa40a91 PPS (64 bytes) * 0xa40a92 SLICE_PARAMS (variable; dynamic-array; one entry per slice) * 0xa40a93 SCALING_MATRIX (1296 bytes; conditional on kernel availability) * 0xa40a94 DECODE_PARAMS (328 bytes; per-frame DPB info) * * Plus device-wide menus set once at context init: * 0xa40a95 DECODE_MODE (FRAME_BASED on rkvdec) * 0xa40a96 START_CODE (ANNEX_B on rkvdec) * * Reference: FFmpeg libavcodec/v4l2_request_hevc.c:505-565 * (v4l2_request_hevc_queue_decode batched submission shape). * * Key Phase 5 review amendments incorporated: * C1: data_byte_offset (NOT data_bit_offset); old bit-search dropped. * C2: dpb_entry.flags only LONG_TERM_REFERENCE bit; pic_order_cnt_val * (singular); poc_st_curr_*[] arrays are u8 DPB INDICES, not POC * values (per FFmpeg get_ref_pic_index pattern). * S1: PPS flags 19+20 (DEBLOCKING_FILTER_CONTROL_PRESENT, UNIFORM_SPACING) * included. * S2: PPS scalars pic_parameter_set_id, num_ref_idx_l0/l1_default_active_ * minus1 populated. * Q2: slice_segment_addr populated from VAAPI slice->slice_segment_address. * S3: SCALING_MATRIX content matches FFmpeg pattern — memset zero when * iqmatrix_set==false (BBB has no scaling list in SPS flags). */ #include "h265.h" #include "context.h" #include "object_heap.h" #include "request.h" #include "surface.h" #include #include #include #include #include #include #include #include #include "utils.h" #include "v4l2.h" /* * NAL unit header bit positions per ISO/IEC 23008-2 / H.265 spec. * Used for nal_unit_type + nuh_temporal_id_plus1 extraction from * the slice bitstream's first 2 bytes (after any ANNEX_B start code). */ #define H265_NAL_UNIT_TYPE_SHIFT 1 #define H265_NAL_UNIT_TYPE_MASK ((1 << 6) - 1) #define H265_NUH_TEMPORAL_ID_PLUS1_SHIFT 0 #define H265_NUH_TEMPORAL_ID_PLUS1_MASK ((1 << 3) - 1) /* ===== Clause 2: SPS (40 bytes) ===== */ static void h265_fill_sps(VAPictureParameterBufferHEVC *picture, struct v4l2_ctrl_hevc_sps *sps) { memset(sps, 0, sizeof(*sps)); sps->video_parameter_set_id = 0; /* not exposed by VAAPI */ sps->seq_parameter_set_id = 0; /* not exposed by VAAPI */ sps->pic_width_in_luma_samples = picture->pic_width_in_luma_samples; sps->pic_height_in_luma_samples = picture->pic_height_in_luma_samples; sps->bit_depth_luma_minus8 = picture->bit_depth_luma_minus8; sps->bit_depth_chroma_minus8 = picture->bit_depth_chroma_minus8; sps->log2_max_pic_order_cnt_lsb_minus4 = picture->log2_max_pic_order_cnt_lsb_minus4; sps->sps_max_dec_pic_buffering_minus1 = picture->sps_max_dec_pic_buffering_minus1; /* * iter11 α-13: VAAPI doesn't forward sps_max_num_reorder_pics or * sps_max_latency_increase_plus1. kdirect parses the SPS NAL and * submits the bitstream's true values; libva used to hardcode 0 * (a structurally wrong "no reordering" hint, even though Phase 5b * empirically confirmed rkvdec ignores both fields on RK3399, so * this is wire-hygiene only — matches kdirect's payload more * closely without behavior change). sps_max_dec_pic_buffering_minus1 * is a safe upper bound per H.265 §A.4.2 (sps_max_num_reorder_pics ≤ * sps_max_dec_pic_buffering_minus1 always holds). latency_increase_plus1 * stays at 0 = spec "unconstrained". */ sps->sps_max_num_reorder_pics = picture->sps_max_dec_pic_buffering_minus1; sps->sps_max_latency_increase_plus1 = 0; sps->log2_min_luma_coding_block_size_minus3 = picture->log2_min_luma_coding_block_size_minus3; sps->log2_diff_max_min_luma_coding_block_size = picture->log2_diff_max_min_luma_coding_block_size; sps->log2_min_luma_transform_block_size_minus2 = picture->log2_min_transform_block_size_minus2; sps->log2_diff_max_min_luma_transform_block_size = picture->log2_diff_max_min_transform_block_size; sps->max_transform_hierarchy_depth_inter = picture->max_transform_hierarchy_depth_inter; sps->max_transform_hierarchy_depth_intra = picture->max_transform_hierarchy_depth_intra; sps->pcm_sample_bit_depth_luma_minus1 = picture->pcm_sample_bit_depth_luma_minus1; sps->pcm_sample_bit_depth_chroma_minus1 = picture->pcm_sample_bit_depth_chroma_minus1; sps->log2_min_pcm_luma_coding_block_size_minus3 = picture->log2_min_pcm_luma_coding_block_size_minus3; sps->log2_diff_max_min_pcm_luma_coding_block_size = picture->log2_diff_max_min_pcm_luma_coding_block_size; sps->num_short_term_ref_pic_sets = picture->num_short_term_ref_pic_sets; sps->num_long_term_ref_pics_sps = picture->num_long_term_ref_pic_sps; sps->chroma_format_idc = picture->pic_fields.bits.chroma_format_idc; sps->sps_max_sub_layers_minus1 = 0; /* not exposed */ /* reserved[6] zeroed by memset */ /* 9 boolean flags collapsed to u64 */ if (picture->pic_fields.bits.separate_colour_plane_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; if (picture->pic_fields.bits.scaling_list_enabled_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; if (picture->pic_fields.bits.amp_enabled_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; if (picture->slice_parsing_fields.bits.sample_adaptive_offset_enabled_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; if (picture->pic_fields.bits.pcm_enabled_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; if (picture->pic_fields.bits.pcm_loop_filter_disabled_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; if (picture->slice_parsing_fields.bits.long_term_ref_pics_present_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; if (picture->slice_parsing_fields.bits.sps_temporal_mvp_enabled_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; if (picture->pic_fields.bits.strong_intra_smoothing_enabled_flag) sps->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; } /* ===== Clause 3: PPS (64 bytes; 21 flags + 3 newly-mapped scalars per S1+S2) ===== */ static void h265_fill_pps(VAPictureParameterBufferHEVC *picture, VASliceParameterBufferHEVC *slice, struct v4l2_ctrl_hevc_pps *pps) { memset(pps, 0, sizeof(*pps)); pps->pic_parameter_set_id = 0; /* S2: not exposed by VAAPI; default 0 */ pps->num_extra_slice_header_bits = picture->num_extra_slice_header_bits; pps->num_ref_idx_l0_default_active_minus1 = picture->num_ref_idx_l0_default_active_minus1; /* S2 */ pps->num_ref_idx_l1_default_active_minus1 = picture->num_ref_idx_l1_default_active_minus1; /* S2 */ pps->init_qp_minus26 = picture->init_qp_minus26; pps->diff_cu_qp_delta_depth = picture->diff_cu_qp_delta_depth; pps->pps_cb_qp_offset = picture->pps_cb_qp_offset; pps->pps_cr_qp_offset = picture->pps_cr_qp_offset; pps->num_tile_columns_minus1 = picture->num_tile_columns_minus1; pps->num_tile_rows_minus1 = picture->num_tile_rows_minus1; /* column_width_minus1[20] + row_height_minus1[22] left zero — BBB single-tile */ pps->pps_beta_offset_div2 = picture->pps_beta_offset_div2; pps->pps_tc_offset_div2 = picture->pps_tc_offset_div2; pps->log2_parallel_merge_level_minus2 = picture->log2_parallel_merge_level_minus2; /* reserved zeroed by memset */ /* 21 boolean flags (bits 0-20) collapsed to u64 */ if (slice && slice->LongSliceFlags.fields.dependent_slice_segment_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED; if (picture->slice_parsing_fields.bits.output_flag_present_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; if (picture->pic_fields.bits.sign_data_hiding_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; if (picture->slice_parsing_fields.bits.cabac_init_present_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; if (picture->pic_fields.bits.constrained_intra_pred_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; if (picture->pic_fields.bits.transform_skip_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; if (picture->pic_fields.bits.cu_qp_delta_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; if (picture->slice_parsing_fields.bits.pps_slice_chroma_qp_offsets_present_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; if (picture->pic_fields.bits.weighted_pred_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; if (picture->pic_fields.bits.weighted_bipred_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; if (picture->pic_fields.bits.transquant_bypass_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; if (picture->pic_fields.bits.tiles_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; if (picture->pic_fields.bits.entropy_coding_sync_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; if (picture->pic_fields.bits.loop_filter_across_tiles_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; if (picture->pic_fields.bits.pps_loop_filter_across_slices_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; if (picture->slice_parsing_fields.bits.deblocking_filter_override_enabled_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; if (picture->slice_parsing_fields.bits.pps_disable_deblocking_filter_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; if (picture->slice_parsing_fields.bits.lists_modification_present_flag) pps->flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; /* SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (bit 18) — not exposed; skip */ /* DEBLOCKING_FILTER_CONTROL_PRESENT (bit 19, S1) — not exposed by VAAPI */ /* DEBLOCKING_FILTER_CONTROL_PRESENT (bit 19) and UNIFORM_SPACING (bit 20): * VAAPI does not expose either flag in VAPictureParameterBufferHEVC. * BBB-720p10s_hevc.mp4 uses neither tiles nor explicit deblocking- * control parameters; leaving these bits zero is correct for the * iter2 binding-cell fixture. */ } /* ===== Clause 6: DECODE_PARAMS (328 bytes) ===== * * NEW in modern API. Houses DPB info that was inside slice_params in * the staging-era. Per Phase 5 C2: dpb[].flags has only LONG_TERM_REFERENCE * bit; dpb[].pic_order_cnt_val (singular); poc_st_curr_*[] arrays hold * u8 DPB INDICES (not POC values). * * Pattern: classify each VAAPI ReferenceFrames[i] into ST_CURR_BEFORE / * ST_CURR_AFTER / LT_CURR; populate dpb[] sequentially; record the DPB * index in the matching classification array. */ static void h265_fill_decode_params(struct request_data *driver_data, VAPictureParameterBufferHEVC *picture, struct v4l2_ctrl_hevc_decode_params *decode_params) { struct object_surface *surface_object; VAPictureHEVC *hevc_picture; unsigned int i; uint8_t n_active = 0; uint8_t n_st_before = 0, n_st_after = 0, n_lt = 0; memset(decode_params, 0, sizeof(*decode_params)); decode_params->pic_order_cnt_val = picture->CurrPic.pic_order_cnt; for (i = 0; i < 15; i++) { hevc_picture = &picture->ReferenceFrames[i]; if (hevc_picture->picture_id == VA_INVALID_SURFACE || (hevc_picture->flags & VA_PICTURE_HEVC_INVALID)) continue; surface_object = (struct object_surface *) object_heap_lookup(&driver_data->surface_heap, hevc_picture->picture_id); if (surface_object == NULL) continue; if (n_active >= V4L2_HEVC_DPB_ENTRIES_NUM_MAX) break; decode_params->dpb[n_active].timestamp = v4l2_timeval_to_ns(&surface_object->timestamp); decode_params->dpb[n_active].pic_order_cnt_val = hevc_picture->pic_order_cnt; decode_params->dpb[n_active].field_pic = !!(hevc_picture->flags & VA_PICTURE_HEVC_FIELD_PIC); decode_params->dpb[n_active].flags = (hevc_picture->flags & VA_PICTURE_HEVC_RPS_LT_CURR) ? V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE : 0; /* dpb[n_active].reserved zeroed by memset */ /* Classify into one of the three "current" lists. * Each list holds the DPB INDEX (u8), not the POC value. */ if (hevc_picture->flags & VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE) { if (n_st_before < V4L2_HEVC_DPB_ENTRIES_NUM_MAX) decode_params->poc_st_curr_before[n_st_before++] = n_active; } else if (hevc_picture->flags & VA_PICTURE_HEVC_RPS_ST_CURR_AFTER) { if (n_st_after < V4L2_HEVC_DPB_ENTRIES_NUM_MAX) decode_params->poc_st_curr_after[n_st_after++] = n_active; } else if (hevc_picture->flags & VA_PICTURE_HEVC_RPS_LT_CURR) { if (n_lt < V4L2_HEVC_DPB_ENTRIES_NUM_MAX) decode_params->poc_lt_curr[n_lt++] = n_active; } n_active++; } decode_params->num_active_dpb_entries = n_active; decode_params->num_poc_st_curr_before = n_st_before; decode_params->num_poc_st_curr_after = n_st_after; decode_params->num_poc_lt_curr = n_lt; /* * iter26 α-26: VAAPI DOES expose short_term_ref_pic_set bit-count * via picture->st_rps_bits. Without populating this, rkvdec's * DPB reference resolution for P/B frames uses the wrong slice- * header skip and reads the wrong reference; frame 1 (IDR) decodes * correctly but frames 2+ diverge (iter25 evidence: cmp differs at * byte 1382401 = frame 2 boundary, kdirect bytes 4-5 = 0x0a 0x00, * libva = 0x00 0x00). * * long_term_ref_pic_set_size and num_delta_pocs_of_ref_rps_idx still * left zero (VAAPI doesn't expose either). */ decode_params->short_term_ref_pic_set_size = picture->st_rps_bits; /* * iter11 α-14: IRAP/IDR/NO_OUTPUT_OF_PRIOR flags. VAAPI doesn't * expose these in VAPictureParameterBufferHEVC. The iter2 binding * cell hardcoded them to 0 with the comment "BBB B/P-frames don't * need these set" — but IDR keyframes DO need IDR_PIC|IRAP_PIC. * Without them rkvdec doesn't recognise the keyframe boundary, * treats the IDR as inter without references, and produces all-zero * CAPTURE output (Bug 5). * * The flags are derived at h265_set_controls level after slice_params * have been parsed (slice_params[0].nal_unit_type carries the NAL * type extracted from the bitstream). Initialise to 0 here; the caller * patches the IRAP/IDR bits. */ decode_params->flags = 0; } /* ===== Clause 4: SLICE_PARAMS per slice ===== * * Called per slice in a loop in h265_set_controls. Output is one entry * in the dynamic-array of slice_params submitted to the kernel. * * source_offset is the byte offset within the surface_object->source_data * buffer where this slice's bitstream begins (after any ANNEX_B start * code prefix). data_byte_offset is the offset within the buffer to the * first byte of slice header data. * * Per Phase 5 C1: data_byte_offset is a BYTE offset (not a bit offset). * The old bit-search at h265.c:184-209 has been DROPPED. */ static void h265_fill_slice_params(VAPictureParameterBufferHEVC *picture, VASliceParameterBufferHEVC *slice, void *source_data, unsigned int source_offset, struct v4l2_ctrl_hevc_slice_params *slice_params) { uint8_t *b; uint8_t nal_unit_type, nuh_temporal_id_plus1; uint8_t pic_struct; uint8_t slice_type; unsigned int i, j; memset(slice_params, 0, sizeof(*slice_params)); /* NAL header parse from slice bitstream (after ANNEX_B start code). * source_offset points at the byte AFTER the start code (start code * was prepended by codec_store_buffer:68-75 if context->h264_start_code * is set). The first 2 bytes are the NAL unit header. */ b = (uint8_t *)source_data + source_offset; nal_unit_type = (b[0] >> H265_NAL_UNIT_TYPE_SHIFT) & H265_NAL_UNIT_TYPE_MASK; nuh_temporal_id_plus1 = (b[1] >> H265_NUH_TEMPORAL_ID_PLUS1_SHIFT) & H265_NUH_TEMPORAL_ID_PLUS1_MASK; /* * iter28 α-28: bit_size formula. * * VAAPI's slice_data_size is the size of the slice's source-data * buffer INCLUDING the NAL header and slice header. rkvdec_hevc * expects bit_size to cover the slice_data area starting at * data_byte_offset (the slice payload). Setting bit_size = * slice_data_size * 8 makes rkvdec read past the slice payload * into trailing bytes → wrong entropy state → frame 2+ visual * garbage. * * Empirical match with ffmpeg-v4l2request (which uses * (size+extra_size)*8 for the data it actually appended): * bit_size = (slice_data_size - slice_data_byte_offset) * 8 * yields 44096 bits for BBB frame 2 (= 5512 bytes), matching * kdirect exactly per iter27 dmesg printk. */ slice_params->bit_size = (slice->slice_data_size - slice->slice_data_byte_offset) * 8; /* C1: data_byte_offset, NOT data_bit_offset. Plain byte offset to * the first byte of slice segment header data within the OUTPUT * buffer. FFmpeg pattern at v4l2_request_hevc.c:190. */ slice_params->data_byte_offset = source_offset + slice->slice_data_byte_offset; /* * iter27 α-27: populate num_entry_point_offsets from VAAPI. * * BBB HEVC uses WPP (entropy_coding_sync_enabled_flag); each CTU row * after the first creates an entry point. For 720p with 32-pixel * CTUs that's 22 entry points per slice. Hardcoding 0 made rkvdec * miscount the slice header skip distance → wrong slice data * boundary → frame 2+ decoded with garbage reference data. * * Comment "iter2 doesn't do tiles" was inaccurate: WPP isn't tiles * but uses the same entry_point_offsets mechanism. */ /* * iter27 diagnostic: VAAPI ffmpeg-vaapi front-end reports * slice->num_entry_point_offsets = 0 for all slices even on WPP * streams (ffmpeg-vaapi doesn't parse this). Kernel rkvdec_hevc * doesn't reference num_entry_point_offsets either, so this field * is harmless either way. Leaving the VAAPI propagation in place * for future when ffmpeg-vaapi may populate it. */ slice_params->num_entry_point_offsets = slice->num_entry_point_offsets; slice_params->nal_unit_type = nal_unit_type; slice_params->nuh_temporal_id_plus1 = nuh_temporal_id_plus1; slice_type = slice->LongSliceFlags.fields.slice_type; slice_params->slice_type = slice_type; slice_params->colour_plane_id = slice->LongSliceFlags.fields.color_plane_id; slice_params->slice_pic_order_cnt = picture->CurrPic.pic_order_cnt; slice_params->num_ref_idx_l0_active_minus1 = slice->num_ref_idx_l0_active_minus1; slice_params->num_ref_idx_l1_active_minus1 = slice->num_ref_idx_l1_active_minus1; slice_params->collocated_ref_idx = slice->collocated_ref_idx; slice_params->five_minus_max_num_merge_cand = slice->five_minus_max_num_merge_cand; slice_params->slice_qp_delta = slice->slice_qp_delta; slice_params->slice_cb_qp_offset = slice->slice_cb_qp_offset; slice_params->slice_cr_qp_offset = slice->slice_cr_qp_offset; slice_params->slice_act_y_qp_offset = 0; /* VAAPI doesn't expose */ slice_params->slice_act_cb_qp_offset = 0; slice_params->slice_act_cr_qp_offset = 0; slice_params->slice_beta_offset_div2 = slice->slice_beta_offset_div2; slice_params->slice_tc_offset_div2 = slice->slice_tc_offset_div2; if (picture->CurrPic.flags & VA_PICTURE_HEVC_FIELD_PIC) { if (picture->CurrPic.flags & VA_PICTURE_HEVC_BOTTOM_FIELD) pic_struct = 2; else pic_struct = 1; } else { pic_struct = 0; } slice_params->pic_struct = pic_struct; /* reserved0[3] zeroed by memset */ /* Q2: slice_segment_addr from VAAPI (was missing in old h265.c). */ slice_params->slice_segment_addr = slice->slice_segment_address; /* Ref index arrays (DPB indices). For I-slices both are unused. */ for (i = 0; i < V4L2_HEVC_DPB_ENTRIES_NUM_MAX && slice_type != V4L2_HEVC_SLICE_TYPE_I; i++) { if (i < (slice->num_ref_idx_l0_active_minus1 + 1U)) slice_params->ref_idx_l0[i] = slice->RefPicList[0][i]; } for (i = 0; i < V4L2_HEVC_DPB_ENTRIES_NUM_MAX && slice_type == V4L2_HEVC_SLICE_TYPE_B; i++) { if (i < (slice->num_ref_idx_l1_active_minus1 + 1U)) slice_params->ref_idx_l1[i] = slice->RefPicList[1][i]; } /* * iter31 α-29: VAAPI's picture->st_rps_bits IS the bit-count of * short_term_ref_pic_set() in the slice header (per va_dec_hevc.h * doc-comment for st_rps_bits). This field is required by rkvdec * (assemble_sw_rps: line 386 in kernel rkvdec-hevc.c). When zero, * rkvdec falls back to fls(sps->num_short_term_ref_pic_sets - 1), * which is wrong when num_short_term_ref_pic_sets == 1 (BBB case). * * α-26 mis-targeted this onto decode_params->short_term_ref_pic_set_size * which rkvdec doesn't use. The actual consumer is slice_params. * * Note: VAAPI defines st_rps_bits as 0 when short_term_ref_pic_set_sps_flag=1 * (i.e. when slice uses an SPS-defined RPS rather than inline). For BBB, * st_rps_bits is non-zero for non-IDR slices. * * long_term_ref_pic_set_size still 0 — VAAPI doesn't expose this. */ slice_params->short_term_ref_pic_set_size = picture->st_rps_bits; slice_params->long_term_ref_pic_set_size = 0; /* Pred weight table */ slice_params->pred_weight_table.luma_log2_weight_denom = slice->luma_log2_weight_denom; slice_params->pred_weight_table.delta_chroma_log2_weight_denom = slice->delta_chroma_log2_weight_denom; for (i = 0; i < V4L2_HEVC_DPB_ENTRIES_NUM_MAX && slice_type != V4L2_HEVC_SLICE_TYPE_I; i++) { slice_params->pred_weight_table.delta_luma_weight_l0[i] = slice->delta_luma_weight_l0[i]; slice_params->pred_weight_table.luma_offset_l0[i] = slice->luma_offset_l0[i]; for (j = 0; j < 2; j++) { slice_params->pred_weight_table.delta_chroma_weight_l0[i][j] = slice->delta_chroma_weight_l0[i][j]; slice_params->pred_weight_table.chroma_offset_l0[i][j] = slice->ChromaOffsetL0[i][j]; } } for (i = 0; i < V4L2_HEVC_DPB_ENTRIES_NUM_MAX && slice_type == V4L2_HEVC_SLICE_TYPE_B; i++) { slice_params->pred_weight_table.delta_luma_weight_l1[i] = slice->delta_luma_weight_l1[i]; slice_params->pred_weight_table.luma_offset_l1[i] = slice->luma_offset_l1[i]; for (j = 0; j < 2; j++) { slice_params->pred_weight_table.delta_chroma_weight_l1[i][j] = slice->delta_chroma_weight_l1[i][j]; slice_params->pred_weight_table.chroma_offset_l1[i][j] = slice->ChromaOffsetL1[i][j]; } } /* reserved1[2] zeroed by memset */ /* 10 SLICE_PARAMS flag bits */ if (slice->LongSliceFlags.fields.slice_sao_luma_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; if (slice->LongSliceFlags.fields.slice_sao_chroma_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; if (slice->LongSliceFlags.fields.slice_temporal_mvp_enabled_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; if (slice->LongSliceFlags.fields.mvd_l1_zero_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; if (slice->LongSliceFlags.fields.cabac_init_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; if (slice->LongSliceFlags.fields.collocated_from_l0_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; /* USE_INTEGER_MV — VAAPI doesn't expose; leave 0 */ if (slice->LongSliceFlags.fields.slice_deblocking_filter_disabled_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; if (slice->LongSliceFlags.fields.slice_loop_filter_across_slices_enabled_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; if (slice->LongSliceFlags.fields.dependent_slice_segment_flag) slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; } /* ===== Clause 5: SCALING_MATRIX (1296 bytes; conditional fill) ===== * * Per Phase 5 S3: when iqmatrix_set==false (BBB has no scaling list * in SPS flags), send memset-zero. Matches FFmpeg's pattern when the * stream has no scaling list. When iqmatrix_set==true, copy from VAAPI * VAIQMatrixBufferHEVC. */ static void h265_fill_scaling_matrix(VAIQMatrixBufferHEVC *iqmatrix, bool iqmatrix_set, struct v4l2_ctrl_hevc_scaling_matrix *scaling_matrix) { memset(scaling_matrix, 0, sizeof(*scaling_matrix)); if (!iqmatrix_set) return; /* memset zero matches FFmpeg sl=NULL path */ memcpy(scaling_matrix->scaling_list_4x4, iqmatrix->ScalingList4x4, sizeof(iqmatrix->ScalingList4x4)); memcpy(scaling_matrix->scaling_list_8x8, iqmatrix->ScalingList8x8, sizeof(iqmatrix->ScalingList8x8)); memcpy(scaling_matrix->scaling_list_16x16, iqmatrix->ScalingList16x16, sizeof(iqmatrix->ScalingList16x16)); memcpy(scaling_matrix->scaling_list_32x32, iqmatrix->ScalingList32x32, sizeof(iqmatrix->ScalingList32x32)); memcpy(scaling_matrix->scaling_list_dc_coef_16x16, iqmatrix->ScalingListDC16x16, sizeof(iqmatrix->ScalingListDC16x16)); memcpy(scaling_matrix->scaling_list_dc_coef_32x32, iqmatrix->ScalingListDC32x32, sizeof(iqmatrix->ScalingListDC32x32)); } /* ===== Clause 1: orchestrator — batched 5-control submission ===== */ int h265_set_controls(struct request_data *driver_data, struct object_context *context_object, struct object_surface *surface_object) { VAPictureParameterBufferHEVC *picture = &surface_object->params.h265.picture; VAIQMatrixBufferHEVC *iqmatrix = &surface_object->params.h265.iqmatrix; bool iqmatrix_set = surface_object->params.h265.iqmatrix_set; unsigned int num_slices = surface_object->params.h265.num_slices; struct v4l2_ctrl_hevc_sps sps; struct v4l2_ctrl_hevc_pps pps; struct v4l2_ctrl_hevc_decode_params decode_params; struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; struct v4l2_ctrl_hevc_slice_params *slice_params_array = NULL; struct v4l2_ext_control controls[5]; unsigned int n = 0; unsigned int i; unsigned int prefix_bytes; unsigned int cumulative_offset = 0; int rc; if (num_slices == 0) return VA_STATUS_ERROR_OPERATION_FAILED; slice_params_array = calloc(num_slices, sizeof(struct v4l2_ctrl_hevc_slice_params)); if (slice_params_array == NULL) return VA_STATUS_ERROR_ALLOCATION_FAILED; /* Per-slice fill. ANNEX_B start code (3 bytes 0x00 0x00 0x01) is * prepended per slice by codec_store_buffer:68-75 when * context->h264_start_code is true. Track cumulative offset * accordingly. */ prefix_bytes = context_object->h264_start_code ? 3 : 0; for (i = 0; i < num_slices; i++) { VASliceParameterBufferHEVC *slice = &surface_object->params.h265.slices[i]; cumulative_offset += prefix_bytes; /* skip start code prefix for this slice */ h265_fill_slice_params(picture, slice, surface_object->source_data, cumulative_offset, &slice_params_array[i]); /* iter29 DIAG: dump trailing 80 bytes of each HEVC slice. * Set LIBVA_HEVC_DUMP_SLICE_TAIL=1 to enable. Goal: characterise * the 40-byte inflation in ffmpeg-vaapi vs ffmpeg-v4l2request for * frame 2+ slices (see iter27/28 close). */ if (getenv("LIBVA_HEVC_DUMP_SLICE_TAIL")) { uint32_t sz = slice->slice_data_size; uint32_t boff = slice->slice_data_byte_offset; uint8_t *p = (uint8_t *)surface_object->source_data + cumulative_offset; uint32_t dump_n = sz < 80 ? sz : 80; uint32_t start = sz - dump_n; uint32_t k; fprintf(stderr, "iter29 slice[%u] nut=%u size=%u boff=%u start_in_slice=%u tail80:", i, slice_params_array[i].nal_unit_type, sz, boff, start); for (k = 0; k < dump_n; k++) { if ((k & 0xf) == 0) fprintf(stderr, "\n +%04x:", start + k); fprintf(stderr, " %02x", p[start + k]); } fprintf(stderr, "\n"); } cumulative_offset += slice->slice_data_size; } h265_fill_sps(picture, &sps); h265_fill_pps(picture, &surface_object->params.h265.slices[0], &pps); h265_fill_decode_params(driver_data, picture, &decode_params); h265_fill_scaling_matrix(iqmatrix, iqmatrix_set, &scaling_matrix); /* * iter11 α-14: derive IRAP_PIC / IDR_PIC flags from the first * slice's nal_unit_type (already parsed by h265_fill_slice_params * from the bitstream into slice_params_array[0].nal_unit_type). * * H.265 §7.4.2.2: * nal_unit_type 16..23 are IRAP (random access). * nal_unit_type 19 (IDR_W_RADL) and 20 (IDR_N_LP) are IDR. * * Without setting these, rkvdec doesn't recognise the keyframe * boundary, treats the IDR as inter without references, and * produces all-zero CAPTURE output. Phase 3 confirmed kdirect * (ffmpeg-v4l2request) sets flags=0x03 (IRAP|IDR) on frame 1 * and decodes correctly through the same kernel. */ if (num_slices > 0) { uint8_t nut = slice_params_array[0].nal_unit_type; if (nut >= 16 && nut <= 23) decode_params.flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC; if (nut == 19 || nut == 20) decode_params.flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC; } controls[n++] = (struct v4l2_ext_control){ .id = V4L2_CID_STATELESS_HEVC_SPS, .ptr = &sps, .size = sizeof(sps), }; controls[n++] = (struct v4l2_ext_control){ .id = V4L2_CID_STATELESS_HEVC_PPS, .ptr = &pps, .size = sizeof(pps), }; controls[n++] = (struct v4l2_ext_control){ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, .ptr = slice_params_array, .size = sizeof(struct v4l2_ctrl_hevc_slice_params) * num_slices, }; controls[n++] = (struct v4l2_ext_control){ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, .ptr = &scaling_matrix, .size = sizeof(scaling_matrix), }; controls[n++] = (struct v4l2_ext_control){ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, .ptr = &decode_params, .size = sizeof(decode_params), }; rc = v4l2_set_controls(driver_data->video_fd, surface_object->request_fd, controls, n); free(slice_params_array); if (rc < 0) return VA_STATUS_ERROR_OPERATION_FAILED; return 0; }