NOTE(review): recovered from a copy of this patch whose line breaks were
collapsed and whose #include targets (the <...> parts) were stripped.
Line structure is restored below. The four bare "#include" lines in the
second src/av1.c hunk must get their header names back from the working
tree before this can apply. Context-line indentation is assumed to be
tabs. The "index" blob ids are those of the original patch.
Code changes relative to the original patch, all in av1_fill_frame():
tile_cols/tile_rows are clamped before they index the starts arrays, a
64th tile gets its start offset, and the CDEF strength count no longer
depends on an unchecked shift.

diff --git a/src/av1.c b/src/av1.c
index 6d10061..5b2970c 100644
--- a/src/av1.c
+++ b/src/av1.c
@@ -1,23 +1,33 @@
 /*
  * Copyright (C) 2026 claude-noether
  *
- * ampere-av1-enablement Phase 2: AV1 codec dispatcher for libva-v4l2-
+ * ampere-av1-enablement Phase 2.1: AV1 codec dispatcher for libva-v4l2-
  * request-fourier. Translates VAAPI AV1 picture/slice parameter buffers
  * into V4L2 stateless AV1 controls (V4L2_CID_STATELESS_AV1_*) for the
  * Rockchip vpu981 hardware on RK3588.
  *
- * Reference implementations (field semantics):
- *   - Kwiboo/FFmpeg v4l2-request-n8.1:libavcodec/v4l2_request_av1.c
- *     (636 LoC, reads from FFmpeg's AV1RawSequenceHeader; the v4l2_ctrl
- *     output is identical to what we need)
- *   - ~/src/libva-v4l2-request-fourier/src/vp9.c (architectural pattern:
- *     set_controls / multi-control batch / request_fd dispatch)
+ * Reference: Kwiboo/FFmpeg v4l2-request-n8.1:libavcodec/v4l2_request_av1.c
+ * (636 LoC; reads from FFmpeg's AV1RawSequenceHeader + AV1RawFrameHeader).
+ * VAAPI exposes the same AV1 spec semantics through different struct
+ * shapes: sequence-level fields are folded into VADecPictureParameterBufferAV1
+ * (no separate sequence buffer); per-frame fields live in the same struct.
+ *
+ * F1/F2/F3 risk mitigations per phase1_plan_v2 §"General fill_frame
+ * implementation risks":
+ *   F1 tile_info.mi_col/row_starts sentinel = 2 * ((frame_width + 7) >> 3)
+ *      mirrors Kwiboo lines 238/244 exactly.
+ *   F2 superres_denom: VAAPI exposes superres_scale_denominator directly
+ *      and per spec it's already 8 when use_superres=0. No offset math
+ *      needed (Kwiboo does it because FFmpeg stores raw coded_denom).
+ *   F3 loop_restoration_size[] gated on USES_LR flag mirrors Kwiboo
+ *      lines 281-287 exactly.
  *
  * V4L2 controls (4 per frame, batched in one VIDIOC_S_EXT_CTRLS):
- *   1. V4L2_CID_STATELESS_AV1_SEQUENCE (small, set per stream-ish)
- *   2. V4L2_CID_STATELESS_AV1_FRAME (the heavy one — 8 sub-structs)
+ *   1. V4L2_CID_STATELESS_AV1_SEQUENCE
+ *   2. V4L2_CID_STATELESS_AV1_FRAME
  *   3. V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY[] (DYNAMIC_ARRAY)
- *   4. V4L2_CID_STATELESS_AV1_FILM_GRAIN (conditional on probe)
+ *   4. V4L2_CID_STATELESS_AV1_FILM_GRAIN (conditional on driver_data->
+ *      has_av1_film_grain probe)
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -53,39 +63,561 @@
 
 #include
 #include
+#include
 #include
 
 /* Sanity asserts to catch kernel uAPI drift. If these fire, the kernel
  * headers on the build machine are out of sync with what the running
- * driver expects — silent register-misalignment bugs result. */
+ * driver expects — silent register-misalignment bugs result. Cross-compile
+ * hazard per Janet v3 amendment: native-arm64 builds only (boltzmann +
+ * ampere); no cross from x86 against ARM kernel headers. */
 _Static_assert(sizeof(struct v4l2_ctrl_av1_tile_group_entry) == 16,
 	       "v4l2_ctrl_av1_tile_group_entry size drift — recheck uAPI");
 
-/*
- * Phase 2 step 4 — stub set_controls. Compiles and links; returns -1
- * with a clear log message so the test infrastructure sees AV1 dispatch
- * fail cleanly (not crash) until Phase 2.1 implements the actual field
- * mappings.
- *
- * The full implementation follows Kwiboo's fill_sequence / fill_frame /
- * fill_film_grain functions, mapping VAAPI AV1 picture parameters
- * (VADecPictureParameterBufferAV1) to V4L2 control structs. Per Janet
- * review v2, three implementation-time risks must be specifically
- * handled:
- *   F1 tile_info.mi_col/row_starts sentinel for multi-tile streams
- *   F2 superres_denom = AV1_SUPERRES_NUM (8) when use_superres=0
- *   F3 loop_restoration_size[] gated on USES_LR flag direction
- */
+/* Per AV1 spec, when use_superres=0 the superres denominator is 8.
+ * VAAPI's superres_scale_denominator already encodes this directly
+ * (per va_dec_av1.h: "When use_superres=0, superres_scale_denominator
+ * must be 8"). Kwiboo's AV1_SUPERRES_DENOM_MIN+coded_denom math is
+ * not needed when reading from VAAPI. */
+#define AV1_SUPERRES_NUM 8
+
+/* AV1 spec maxima used for V4L2 array sizing. */
+#define BACKEND_AV1_MAX_SEGMENTS 8
+#define BACKEND_AV1_SEG_LVL_MAX 8
+#define BACKEND_AV1_SEG_LVL_REF_FRAME 5
+#define BACKEND_AV1_NUM_REF_FRAMES 8
+#define BACKEND_AV1_TOTAL_REFS_PER_FRAME 8
+#define BACKEND_AV1_REFS_PER_FRAME 7
+
+/*
+ * ===== fill_sequence =====
+ * Zero *ctrl, then populate the V4L2 AV1 sequence control from the
+ * sequence-level fields that VAAPI folds into the picture parameters.
+ * Flags with no VAAPI counterpart (warped motion, ref-frame MVs,
+ * superres, restoration) are set unconditionally.
+ */
+static void av1_fill_sequence(VADecPictureParameterBufferAV1 *picture,
+			      struct v4l2_ctrl_av1_sequence *ctrl)
+{
+	uint8_t bit_depth;
+
+	memset(ctrl, 0, sizeof(*ctrl));
+
+	switch (picture->bit_depth_idx) {
+	case 0: bit_depth = 8; break;
+	case 1: bit_depth = 10; break;
+	case 2: bit_depth = 12; break;
+	default: bit_depth = 8; break;
+	}
+
+	ctrl->seq_profile = picture->profile;
+	ctrl->order_hint_bits = picture->seq_info_fields.fields.enable_order_hint ?
+		(picture->order_hint_bits_minus_1 + 1) : 0;
+	ctrl->bit_depth = bit_depth;
+	/* VAAPI does NOT separately expose max_frame_{width,height}_minus_1
+	 * (sequence-level). Use the current frame size as a proxy. Correct
+	 * for fixed-size sequences (the 208/352/1080p test vectors). */
+	ctrl->max_frame_width_minus_1 = picture->frame_width_minus1;
+	ctrl->max_frame_height_minus_1 = picture->frame_height_minus1;
+
+	if (picture->seq_info_fields.fields.still_picture)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE;
+	if (picture->seq_info_fields.fields.use_128x128_superblock)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK;
+	if (picture->seq_info_fields.fields.enable_filter_intra)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA;
+	if (picture->seq_info_fields.fields.enable_intra_edge_filter)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER;
+	if (picture->seq_info_fields.fields.enable_interintra_compound)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND;
+	if (picture->seq_info_fields.fields.enable_masked_compound)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND;
+	/* VAAPI doesn't expose enable_warped_motion as a sequence flag;
+	 * per-frame allow_warped_motion gates it. Conservative: set true so
+	 * per-frame flag is honored. */
+	ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION;
+	if (picture->seq_info_fields.fields.enable_dual_filter)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER;
+	if (picture->seq_info_fields.fields.enable_order_hint)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT;
+	if (picture->seq_info_fields.fields.enable_jnt_comp)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP;
+	/* enable_ref_frame_mvs / enable_superres / enable_restoration not
+	 * exposed at sequence level — conservative set-true. */
+	ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS;
+	ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES;
+	if (picture->seq_info_fields.fields.enable_cdef)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF;
+	ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION;
+	if (picture->seq_info_fields.fields.mono_chrome)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME;
+	if (picture->seq_info_fields.fields.color_range)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE;
+	if (picture->seq_info_fields.fields.subsampling_x)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X;
+	if (picture->seq_info_fields.fields.subsampling_y)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y;
+	if (picture->seq_info_fields.fields.film_grain_params_present)
+		ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT;
+}
+
+/*
+ * ===== fill_frame =====
+ * Zero *ctrl, then populate the V4L2 AV1 frame control (tile info,
+ * quantization, segmentation, loop filter, CDEF, loop restoration,
+ * global motion, reference indices and frame flags) from *picture.
+ * order_hints[], reference_frame_ts[], skip_mode_frame[],
+ * current_frame_id and refresh_frame_flags are left at zero here.
+ */
+static void av1_fill_frame(VADecPictureParameterBufferAV1 *picture,
+			   struct v4l2_ctrl_av1_frame *ctrl)
+{
+	unsigned int i, j;
+	unsigned int cdef_count;
+
+	memset(ctrl, 0, sizeof(*ctrl));
+
+	/* ---- tile_info ---- */
+	ctrl->tile_info.context_update_tile_id = picture->context_update_tile_id;
+	ctrl->tile_info.tile_cols = picture->tile_cols;
+	ctrl->tile_info.tile_rows = picture->tile_rows;
+	if (picture->tile_cols > 1 || picture->tile_rows > 1)
+		ctrl->tile_info.tile_size_bytes = 4;
+	else
+		ctrl->tile_info.tile_size_bytes = 0;
+
+	if (picture->pic_info_fields.bits.uniform_tile_spacing_flag)
+		ctrl->tile_info.flags |= V4L2_AV1_TILE_INFO_FLAG_UNIFORM_TILE_SPACING;
+
+	/* F1: mi_col/row_starts[]: prefix-sum from width_in_sbs_minus_1[]+1
+	 * (Kwiboo reads tile_start_col_sb[] directly; VAAPI doesn't expose
+	 * starts, only widths — reconstruct via accumulation). Plus the
+	 * sentinel at index tile_cols/tile_rows.
+	 *
+	 * tile_cols/tile_rows come straight from the client, so the counts
+	 * are clamped to V4L2_AV1_MAX_TILE_{COLS,ROWS} before being used as
+	 * an index: the starts arrays only have MAX + 1 slots. The size
+	 * loops stop at 63 entries (the bound the original code applied to
+	 * the VAAPI arrays); a 64th tile still gets its start offset. */
+	{
+		unsigned int cols = picture->tile_cols;
+		uint16_t cum = 0;
+
+		if (cols > V4L2_AV1_MAX_TILE_COLS)
+			cols = V4L2_AV1_MAX_TILE_COLS;
+		for (i = 0; i < cols && i < 63; i++) {
+			ctrl->tile_info.mi_col_starts[i] = cum;
+			ctrl->tile_info.width_in_sbs_minus_1[i] =
+				picture->width_in_sbs_minus_1[i];
+			cum = (uint16_t)(cum + picture->width_in_sbs_minus_1[i] + 1);
+		}
+		if (i < cols)
+			ctrl->tile_info.mi_col_starts[i] = cum;
+		ctrl->tile_info.mi_col_starts[cols] =
+			2 * ((picture->frame_width_minus1 + 1 + 7) >> 3);
+	}
+	{
+		unsigned int rows = picture->tile_rows;
+		uint16_t cum = 0;
+
+		if (rows > V4L2_AV1_MAX_TILE_ROWS)
+			rows = V4L2_AV1_MAX_TILE_ROWS;
+		for (i = 0; i < rows && i < 63; i++) {
+			ctrl->tile_info.mi_row_starts[i] = cum;
+			ctrl->tile_info.height_in_sbs_minus_1[i] =
+				picture->height_in_sbs_minus_1[i];
+			cum = (uint16_t)(cum + picture->height_in_sbs_minus_1[i] + 1);
+		}
+		if (i < rows)
+			ctrl->tile_info.mi_row_starts[i] = cum;
+		ctrl->tile_info.mi_row_starts[rows] =
+			2 * ((picture->frame_height_minus1 + 1 + 7) >> 3);
+	}
+
+	/* ---- quantization ---- */
+	ctrl->quantization.base_q_idx = picture->base_qindex;
+	ctrl->quantization.delta_q_y_dc = picture->y_dc_delta_q;
+	ctrl->quantization.delta_q_u_dc = picture->u_dc_delta_q;
+	ctrl->quantization.delta_q_u_ac = picture->u_ac_delta_q;
+	ctrl->quantization.delta_q_v_dc = picture->v_dc_delta_q;
+	ctrl->quantization.delta_q_v_ac = picture->v_ac_delta_q;
+	ctrl->quantization.qm_y = picture->qmatrix_fields.bits.qm_y;
+	ctrl->quantization.qm_u = picture->qmatrix_fields.bits.qm_u;
+	ctrl->quantization.qm_v = picture->qmatrix_fields.bits.qm_v;
+	ctrl->quantization.delta_q_res =
+		picture->mode_control_fields.bits.log2_delta_q_res;
+
+	if (picture->u_dc_delta_q != picture->v_dc_delta_q ||
+	    picture->u_ac_delta_q != picture->v_ac_delta_q)
+		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_DIFF_UV_DELTA;
+	if (picture->qmatrix_fields.bits.using_qmatrix)
+		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_USING_QMATRIX;
+	if (picture->mode_control_fields.bits.delta_q_present_flag)
+		ctrl->quantization.flags |= V4L2_AV1_QUANTIZATION_FLAG_DELTA_Q_PRESENT;
+
+	/* ---- segmentation ---- */
+	if (picture->seg_info.segment_info_fields.bits.enabled)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_ENABLED;
+	if (picture->seg_info.segment_info_fields.bits.update_map)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_UPDATE_MAP;
+	if (picture->seg_info.segment_info_fields.bits.temporal_update)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_TEMPORAL_UPDATE;
+	if (picture->seg_info.segment_info_fields.bits.update_data)
+		ctrl->segmentation.flags |= V4L2_AV1_SEGMENTATION_FLAG_UPDATE_DATA;
+
+	for (i = 0; i < BACKEND_AV1_MAX_SEGMENTS; i++) {
+		for (j = 0; j < BACKEND_AV1_SEG_LVL_MAX; j++) {
+			if (picture->seg_info.feature_mask[i] & (1 << j)) {
+				ctrl->segmentation.feature_enabled[i] |=
+					V4L2_AV1_SEGMENT_FEATURE_ENABLED(j);
+				ctrl->segmentation.last_active_seg_id = i;
+				if (j >= BACKEND_AV1_SEG_LVL_REF_FRAME)
+					ctrl->segmentation.flags |=
+						V4L2_AV1_SEGMENTATION_FLAG_SEG_ID_PRE_SKIP;
+			}
+			ctrl->segmentation.feature_data[i][j] =
+				picture->seg_info.feature_data[i][j];
+		}
+	}
+
+	/* ---- loop_filter ---- */
+	ctrl->loop_filter.level[0] = picture->filter_level[0];
+	ctrl->loop_filter.level[1] = picture->filter_level[1];
+	ctrl->loop_filter.level[2] = picture->filter_level_u;
+	ctrl->loop_filter.level[3] = picture->filter_level_v;
+	ctrl->loop_filter.sharpness =
+		picture->loop_filter_info_fields.bits.sharpness_level;
+	ctrl->loop_filter.mode_deltas[0] = picture->mode_deltas[0];
+	ctrl->loop_filter.mode_deltas[1] = picture->mode_deltas[1];
+	ctrl->loop_filter.delta_lf_res =
+		picture->mode_control_fields.bits.log2_delta_lf_res;
+	for (i = 0; i < BACKEND_AV1_NUM_REF_FRAMES; i++)
+		ctrl->loop_filter.ref_deltas[i] = picture->ref_deltas[i];
+
+	if (picture->loop_filter_info_fields.bits.mode_ref_delta_enabled)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_ENABLED;
+	if (picture->loop_filter_info_fields.bits.mode_ref_delta_update)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_UPDATE;
+	if (picture->mode_control_fields.bits.delta_lf_present_flag)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_PRESENT;
+	if (picture->mode_control_fields.bits.delta_lf_multi)
+		ctrl->loop_filter.flags |= V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_MULTI;
+
+	/* ---- cdef ---- */
+	ctrl->cdef.damping_minus_3 = picture->cdef_damping_minus_3;
+	ctrl->cdef.bits = picture->cdef_bits;
+	/* cdef_bits is client-supplied: a shift count of 31 or more would be
+	 * undefined behaviour, so anything above 3 goes straight to the
+	 * 8-entry cap that the loop bound applied anyway. */
+	cdef_count = picture->cdef_bits > 3 ? 8 : 1u << picture->cdef_bits;
+	for (i = 0; i < cdef_count; i++) {
+		uint8_t y = picture->cdef_y_strengths[i];
+		uint8_t uv = picture->cdef_uv_strengths[i];
+		ctrl->cdef.y_pri_strength[i] = (y >> 2) & 0x0F;
+		ctrl->cdef.y_sec_strength[i] = y & 0x03;
+		ctrl->cdef.uv_pri_strength[i] = (uv >> 2) & 0x0F;
+		ctrl->cdef.uv_sec_strength[i] = uv & 0x03;
+	}
+
+	/* ---- loop_restoration ---- (F3) */
+	{
+		uint8_t remap[4] = {
+			V4L2_AV1_FRAME_RESTORE_NONE,
+			V4L2_AV1_FRAME_RESTORE_WIENER,
+			V4L2_AV1_FRAME_RESTORE_SGRPROJ,
+			V4L2_AV1_FRAME_RESTORE_SWITCHABLE,
+		};
+		uint8_t y_t = picture->loop_restoration_fields.bits.yframe_restoration_type & 3;
+		uint8_t cb_t = picture->loop_restoration_fields.bits.cbframe_restoration_type & 3;
+		uint8_t cr_t = picture->loop_restoration_fields.bits.crframe_restoration_type & 3;
+		bool uses_lr = false;
+
+		ctrl->loop_restoration.frame_restoration_type[0] = remap[y_t];
+		ctrl->loop_restoration.frame_restoration_type[1] = remap[cb_t];
+		ctrl->loop_restoration.frame_restoration_type[2] = remap[cr_t];
+		if (y_t != 0)
+			uses_lr = true;
+		if (cb_t != 0 || cr_t != 0) {
+			uses_lr = true;
+			ctrl->loop_restoration.flags |=
+				V4L2_AV1_LOOP_RESTORATION_FLAG_USES_CHROMA_LR;
+		}
+
+		ctrl->loop_restoration.lr_unit_shift =
+			picture->loop_restoration_fields.bits.lr_unit_shift;
+		ctrl->loop_restoration.lr_uv_shift =
+			picture->loop_restoration_fields.bits.lr_uv_shift;
+
+		if (uses_lr) {
+			uint8_t shift = picture->loop_restoration_fields.bits.lr_unit_shift;
+			uint8_t uv_shift = picture->loop_restoration_fields.bits.lr_uv_shift;
+			ctrl->loop_restoration.flags |=
+				V4L2_AV1_LOOP_RESTORATION_FLAG_USES_LR;
+			ctrl->loop_restoration.loop_restoration_size[0] =
+				1 << (6 + shift);
+			ctrl->loop_restoration.loop_restoration_size[1] =
+				1 << (6 + shift - uv_shift);
+			ctrl->loop_restoration.loop_restoration_size[2] =
+				1 << (6 + shift - uv_shift);
+		}
+	}
+
+	/* ---- global_motion ---- */
+	for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++) {
+		if (i == 0)
+			continue; /* INTRA_FRAME slot — no warp */
+		ctrl->global_motion.type[i] = picture->wm[i - 1].wmtype;
+		for (j = 0; j < 6; j++)
+			ctrl->global_motion.params[i][j] = picture->wm[i - 1].wmmat[j];
+		if (picture->wm[i - 1].invalid)
+			ctrl->global_motion.invalid |=
+				V4L2_AV1_GLOBAL_MOTION_IS_INVALID(i);
+		switch (picture->wm[i - 1].wmtype) {
+		case 1:
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_TRANSLATION;
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
+			break;
+		case 2:
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_ROT_ZOOM;
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
+			break;
+		case 3:
+			ctrl->global_motion.flags[i] |=
+				V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* ---- reference frames + order hints ---- */
+	for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++) {
+		/* VAAPI doesn't expose order_hints[]; leave zero. */
+		ctrl->order_hints[i] = 0;
+		ctrl->reference_frame_ts[i] = 0;
+	}
+	for (i = 0; i < BACKEND_AV1_REFS_PER_FRAME; i++)
+		ctrl->ref_frame_idx[i] = picture->ref_frame_idx[i];
+
+	/* F2: superres_denom direct from VAAPI; fallback to AV1_SUPERRES_NUM
+	 * if zero (spec violation but defensive). */
+	ctrl->superres_denom = picture->superres_scale_denominator
+		? picture->superres_scale_denominator : AV1_SUPERRES_NUM;
+
+	ctrl->skip_mode_frame[0] = 0;
+	ctrl->skip_mode_frame[1] = 0;
+	ctrl->primary_ref_frame = picture->primary_ref_frame;
+	ctrl->frame_type = picture->pic_info_fields.bits.frame_type;
+	ctrl->order_hint = picture->order_hint;
+	ctrl->upscaled_width = picture->frame_width_minus1 + 1;
+	ctrl->interpolation_filter = picture->interp_filter;
+	ctrl->tx_mode = picture->mode_control_fields.bits.tx_mode;
+	ctrl->frame_width_minus_1 = picture->frame_width_minus1;
+	ctrl->frame_height_minus_1 = picture->frame_height_minus1;
+	ctrl->render_width_minus_1 = picture->frame_width_minus1;
+	ctrl->render_height_minus_1 = picture->frame_height_minus1;
+	ctrl->current_frame_id = 0;
+	ctrl->refresh_frame_flags = 0;
+
+	/* ---- frame flags ---- */
+	if (picture->pic_info_fields.bits.show_frame)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SHOW_FRAME;
+	if (picture->pic_info_fields.bits.showable_frame)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME;
+	if (picture->pic_info_fields.bits.error_resilient_mode)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE;
+	if (picture->pic_info_fields.bits.disable_cdf_update)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE;
+	if (picture->pic_info_fields.bits.allow_screen_content_tools)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS;
+	if (picture->pic_info_fields.bits.force_integer_mv)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV;
+	if (picture->pic_info_fields.bits.allow_intrabc)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC;
+	if (picture->pic_info_fields.bits.use_superres)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_USE_SUPERRES;
+	if (picture->pic_info_fields.bits.allow_high_precision_mv)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV;
+	if (picture->pic_info_fields.bits.is_motion_mode_switchable)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE;
+	if (picture->pic_info_fields.bits.use_ref_frame_mvs)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS;
+	if (picture->pic_info_fields.bits.disable_frame_end_update_cdf)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF;
+	if (picture->pic_info_fields.bits.allow_warped_motion)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION;
+	if (picture->mode_control_fields.bits.reference_select)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT;
+	if (picture->mode_control_fields.bits.reduced_tx_set_used)
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET;
+	if (picture->mode_control_fields.bits.skip_mode_present) {
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED;
+		ctrl->flags |= V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT;
+	}
+}
+
+/*
+ * ===== fill_film_grain =====
+ * Zero *ctrl, then copy the VAAPI film grain parameters into the V4L2
+ * film grain control. Scaling points and AR coefficients are only
+ * copied when apply_grain is set; otherwise they stay zero.
+ */
+static void av1_fill_film_grain(VADecPictureParameterBufferAV1 *picture,
+				struct v4l2_ctrl_av1_film_grain *ctrl)
+{
+	VAFilmGrainStructAV1 *fg = &picture->film_grain_info;
+	unsigned int i;
+
+	memset(ctrl, 0, sizeof(*ctrl));
+
+	ctrl->cr_mult = fg->cr_mult;
+	ctrl->grain_seed = fg->grain_seed;
+	/* VAAPI doesn't expose film_grain_params_ref_idx (the reuse-from-
+	 * previous-frame index). Leave zero — only consulted when
+	 * update_grain=0, which VAAPI also doesn't expose. */
+	ctrl->film_grain_params_ref_idx = 0;
+	ctrl->num_y_points = fg->num_y_points;
+	ctrl->num_cb_points = fg->num_cb_points;
+	ctrl->num_cr_points = fg->num_cr_points;
+	ctrl->grain_scaling_minus_8 =
+		fg->film_grain_info_fields.bits.grain_scaling_minus_8;
+	ctrl->ar_coeff_lag = fg->film_grain_info_fields.bits.ar_coeff_lag;
+	ctrl->ar_coeff_shift_minus_6 =
+		fg->film_grain_info_fields.bits.ar_coeff_shift_minus_6;
+	ctrl->grain_scale_shift =
+		fg->film_grain_info_fields.bits.grain_scale_shift;
+	ctrl->cb_mult = fg->cb_mult;
+	ctrl->cb_luma_mult = fg->cb_luma_mult;
+	ctrl->cr_luma_mult = fg->cr_luma_mult;
+	ctrl->cb_offset = fg->cb_offset;
+	ctrl->cr_offset = fg->cr_offset;
+
+	if (fg->film_grain_info_fields.bits.apply_grain)
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN;
+	/* VAAPI doesn't expose update_grain; not setting the flag means
+	 * "reuse params from film_grain_params_ref_idx" — defaulting to
+	 * "update with submitted params" (which is what apply_grain implies
+	 * when set). The flag's omission is safe for vpu981 which derives
+	 * grain state from the submitted control payload, not from a
+	 * separate reuse signal. */
+	if (fg->film_grain_info_fields.bits.chroma_scaling_from_luma)
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_CHROMA_SCALING_FROM_LUMA;
+	if (fg->film_grain_info_fields.bits.overlap_flag)
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_OVERLAP;
+	if (fg->film_grain_info_fields.bits.clip_to_restricted_range)
+		ctrl->flags |= V4L2_AV1_FILM_GRAIN_FLAG_CLIP_TO_RESTRICTED_RANGE;
+
+	if (!fg->film_grain_info_fields.bits.apply_grain)
+		return;
+
+	for (i = 0; i < fg->num_y_points && i < 14; i++) {
+		ctrl->point_y_value[i] = fg->point_y_value[i];
+		ctrl->point_y_scaling[i] = fg->point_y_scaling[i];
+	}
+	for (i = 0; i < fg->num_cb_points && i < 10; i++) {
+		ctrl->point_cb_value[i] = fg->point_cb_value[i];
+		ctrl->point_cb_scaling[i] = fg->point_cb_scaling[i];
+	}
+	for (i = 0; i < fg->num_cr_points && i < 10; i++) {
+		ctrl->point_cr_value[i] = fg->point_cr_value[i];
+		ctrl->point_cr_scaling[i] = fg->point_cr_scaling[i];
+	}
+	for (i = 0; i < 24; i++)
+		ctrl->ar_coeffs_y_plus_128[i] = (uint8_t)(fg->ar_coeffs_y[i] + 128);
+	for (i = 0; i < 25; i++) {
+		ctrl->ar_coeffs_cb_plus_128[i] = (uint8_t)(fg->ar_coeffs_cb[i] + 128);
+		ctrl->ar_coeffs_cr_plus_128[i] = (uint8_t)(fg->ar_coeffs_cr[i] + 128);
+	}
+}
+
+/*
+ * ===== orchestrator =====
+ * Build the per-frame AV1 control batch (sequence, frame, tile group
+ * entries and, when the probe found it, film grain) for
+ * surface_object and submit it on the surface's request_fd.
+ * Returns 0 on success, -1 if the tile array cannot be allocated or
+ * v4l2_set_controls() reports failure.
+ */
 int av1_set_controls(struct request_data *driver_data,
 		     struct object_context *context,
 		     struct object_surface *surface_object)
 {
-	(void)driver_data;
-	(void)context;
-	(void)surface_object;
+	VADecPictureParameterBufferAV1 *picture =
+		&surface_object->params.av1.picture;
+	unsigned int num_tiles = surface_object->params.av1.num_tile_group_entries;
+	struct v4l2_ctrl_av1_sequence sequence;
+	struct v4l2_ctrl_av1_frame frame;
+	struct v4l2_ctrl_av1_film_grain film_grain;
+	struct v4l2_ctrl_av1_tile_group_entry *tile_entries = NULL;
+	struct v4l2_ext_control controls[4];
+	unsigned int n = 0;
+	unsigned int i;
+	unsigned int alloc_tiles;
+	int rc;
 
-	request_log("ampere-av1: av1_set_controls stub — Phase 2.1 will "
-		    "implement fill_sequence/fill_frame/fill_film_grain/"
-		    "fill_tile_group_entries\n");
-	return -1;
+	(void)context;
+
+	if (num_tiles > AV1_MAX_TILES)
+		num_tiles = AV1_MAX_TILES;
+
+	/* DYNAMIC_ARRAY size = MAX(num_tiles, 1) per Janet v2 Q1
+	 * amendment — kernel UB on size=0. */
+	alloc_tiles = num_tiles > 0 ? num_tiles : 1;
+	tile_entries = calloc(alloc_tiles, sizeof(*tile_entries));
+	if (tile_entries == NULL)
+		return -1;
+
+	for (i = 0; i < num_tiles; i++) {
+		VASliceParameterBufferAV1 *slice =
+			&surface_object->params.av1.tile_group_entries[i];
+		tile_entries[i].tile_offset = slice->slice_data_offset;
+		tile_entries[i].tile_size = slice->slice_data_size;
+		tile_entries[i].tile_row = (uint8_t)slice->tile_row;
+		tile_entries[i].tile_col = (uint8_t)slice->tile_column;
+	}
+
+	av1_fill_sequence(picture, &sequence);
+	av1_fill_frame(picture, &frame);
+	if (driver_data->has_av1_film_grain)
+		av1_fill_film_grain(picture, &film_grain);
+
+	controls[n++] = (struct v4l2_ext_control){
+		.id = V4L2_CID_STATELESS_AV1_SEQUENCE,
+		.ptr = &sequence,
+		.size = sizeof(sequence),
+	};
+	controls[n++] = (struct v4l2_ext_control){
+		.id = V4L2_CID_STATELESS_AV1_FRAME,
+		.ptr = &frame,
+		.size = sizeof(frame),
+	};
+	controls[n++] = (struct v4l2_ext_control){
+		.id = V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY,
+		.ptr = tile_entries,
+		.size = sizeof(*tile_entries) * alloc_tiles,
+	};
+	if (driver_data->has_av1_film_grain) {
+		controls[n++] = (struct v4l2_ext_control){
+			.id = V4L2_CID_STATELESS_AV1_FILM_GRAIN,
+			.ptr = &film_grain,
+			.size = sizeof(film_grain),
+		};
+	}
+
+	rc = v4l2_set_controls(driver_data->video_fd,
+			       surface_object->request_fd,
+			       controls, n);
+
+	free(tile_entries);
+
+	if (rc < 0) {
+		request_log("ampere-av1: VIDIOC_S_EXT_CTRLS failed rc=%d\n", rc);
+		return -1;
+	}
+
+	return 0;
 }
diff --git a/src/request.c b/src/request.c
index 3769e15..0d8e7f3 100644
--- a/src/request.c
+++ b/src/request.c
@@ -830,6 +830,24 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 			    "vendored GStreamer parser)\n");
 	}
 
+	/*
+	 * ampere-av1 Phase 2.1: probe V4L2_CID_STATELESS_AV1_FILM_GRAIN
+	 * on the vpu981 fd. Per Janet v3 amendment, this runs at backend
+	 * init (not lazily) so any race window with concurrent device
+	 * switching can't observe an inconsistent flag.
+	 */
+	driver_data->has_av1_film_grain = false;
+	if (driver_data->video_fd_vpu981 >= 0) {
+		struct v4l2_query_ext_ctrl qec;
+		if (v4l2_query_ext_ctrl(driver_data->video_fd_vpu981,
+					V4L2_CID_STATELESS_AV1_FILM_GRAIN,
+					&qec) == 0) {
+			driver_data->has_av1_film_grain = true;
+			request_log("ampere-av1: vpu981 advertises FILM_GRAIN "
+				    "control (will include in per-frame batch)\n");
+		}
+	}
+
 	status = VA_STATUS_SUCCESS;
 	goto complete;
 
diff --git a/src/request.h b/src/request.h
index a58a44d..d2d0725 100644
--- a/src/request.h
+++ b/src/request.h
@@ -113,6 +113,18 @@ struct request_data {
 	bool has_hevc_ext_sps_rps_rkvdec;
 	bool has_hevc_ext_sps_rps_hantro;
 
+	/*
+	 * ampere-av1 Phase 2.1: probe result for the optional
+	 * V4L2_CID_STATELESS_AV1_FILM_GRAIN control on the vpu981 fd.
+	 * Probed at VA_DRIVER_INIT (per Janet v3 amendment — init-time
+	 * not lazy). Consumed by av1_set_controls to conditionally include
+	 * the 4th control in the per-frame batch.
+	 *
+	 * True iff vpu981 advertises the control via VIDIOC_QUERY_EXT_CTRL.
+	 * False for non-RK3588 hosts (no vpu981 fd) or older kernels.
+	 */
+	bool has_av1_film_grain;
+
 	/*
 	 * iter2 — cached SPS-derived RPS arrays. SPS NALs only appear in
 	 * source_data on IDR frames; non-IDR frames' h265_set_controls