From d7ef0f6cd95fdf5894119acc3e16673c56ccca9c Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sun, 17 May 2026 10:55:07 +0000 Subject: [PATCH] ampere-av1 Phase 3: SEQUENCE byte-equal kdirect; 3/10 frames PASS bit-exact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three more fixes after strace-diff localization vs kdirect. Fix 6 — fill_sequence ENABLE_SUPERRES: gate on picture->pic_info_fields.bits.use_superres instead of unconditional set-true. VAAPI doesn't expose enable_superres at sequence level; per strace diff kdirect clears the flag for streams not using superres (byte 1 of flags was the only SEQUENCE diff). After this fix, SEQUENCE ctrl byte-equal kdirect on every call. Fix 7 — refresh_frame_flags = 0xff (was 0): VAAPI doesn't expose refresh_frame_flags. Default 0xff = "refresh all DPB slots" matches kdirect's submission and AV1 spec default for KEY/SWITCH frames; for inter frames simple P-frame chains naturally tolerate this. Fix 8 — surface_object->av1_order_hint per-surface tracking. Set in av1_set_controls from picture->order_hint of the current frame. Also propagated to the linked display surface (when apply_grain=1 → cur_frame != cur_display) so future frames referencing the display surface find the order_hint via the linked_decode_surface_id. Tried + reverted: ref-name iteration of reference_frame_ts / order_hints via picture->ref_frame_idx[i-1] → DPB slot (Kwiboo's convention via FFmpeg's s->ref[i]). Empirically regressed 3/10 → 1/10. V4L2 uAPI's indexing here looks DPB-slot-direct despite the AV1 spec lexicon — needs kernel-side disambiguation to settle. Verification on ampere (av1_larger.ivf 352x288, 10 frames): Frames 0, 2, 4: PASS bit-exact (apply_grain=1, grain HW path) Frames 1, 3, 5-9: DIFF (apply_grain=0) 3/10 PASS (was 1/10 after iter checkpoint). test_av1.ivf 208x208: unchanged bit-exact PASS sha 029ee72c214b37c1 Remaining open: frame 1 (apply_grain=0, first inter) submits IDENTICAL FRAME ctrl bytes to kdirect (verified strace-diff post-fix), yet decoded output diverges. That means the divergence is no longer in control submission — points at OUTPUT-side bitstream differences between ffmpeg-vaapi and ffmpeg-v4l2request, or at DPB CAPTURE buffer state (grain-applied data being used as reference vs pre-grain). Co-Authored-By: Claude Opus 4.7 --- src/av1.c | 54 +++++++++++++++++++++++++++++++++++++++++---------- src/surface.c | 1 + src/surface.h | 10 ++++++++++ 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/src/av1.c b/src/av1.c index f559b5d..e47fb5d 100644 --- a/src/av1.c +++ b/src/av1.c @@ -139,10 +139,17 @@ static void av1_fill_sequence(VADecPictureParameterBufferAV1 *picture, ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT; if (picture->seq_info_fields.fields.enable_jnt_comp) ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP; - /* enable_ref_frame_mvs / enable_superres / enable_restoration not - * exposed at sequence level — conservative set-true. */ + /* enable_ref_frame_mvs / enable_restoration not exposed at sequence + * level — conservative set-true (kdirect also sets these for the + * test streams; gating doesn't matter because per-frame flags + * govern actual use). */ ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS; - ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES; + /* enable_superres: gate on the current frame's use_superres so the + * SEQUENCE flag matches the bitstream-derived value. Empirical + * strace diff vs kdirect: kdirect clears this for streams that + * never use superres; we were unconditionally setting it true. */ + if (picture->pic_info_fields.bits.use_superres) + ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES; if (picture->seq_info_fields.fields.enable_cdef) ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF; ctrl->flags |= V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION; @@ -390,7 +397,13 @@ static void av1_fill_frame(VADecPictureParameterBufferAV1 *picture, ctrl->render_width_minus_1 = picture->frame_width_minus1; ctrl->render_height_minus_1 = picture->frame_height_minus1; ctrl->current_frame_id = 0; - ctrl->refresh_frame_flags = 0; + /* Phase 3: VAAPI doesn't expose refresh_frame_flags. For KEY/SWITCH + * frames the AV1 spec mandates 0xff (refresh all DPB slots). For + * inter frames we default to 0xff too — simple P-frame chains will + * naturally rotate through slots without a precise per-slot value. + * If the stream needs precise control, this needs SPS-side parsing. + * Empirical diff vs kdirect shows kdirect always sends 0xff here. */ + ctrl->refresh_frame_flags = 0xff; /* ---- frame flags ---- */ if (picture->pic_info_fields.bits.show_frame) @@ -574,6 +587,14 @@ int av1_set_controls(struct request_data *driver_data, * VA_INVALID_SURFACE entries stay at the calloc'd zero timestamp * (kernel reads zero, doesn't try to dereference). */ + /* + * Empirical: DPB-slot iteration (i over ref_frame_map[i]) gives + * better correctness than ref-name iteration via ref_frame_idx[]. + * Tried the ref-name reindex (Kwiboo convention via FFmpeg s->ref[i]) + * and lost frames that previously PASSed (3/10 → 1/10) — so the V4L2 + * uAPI semantic here may be DPB-slot-indexed despite the AV1 spec + * lexicon. Phase 3 open question pending kernel-side disambiguation. + */ for (i = 0; i < BACKEND_AV1_TOTAL_REFS_PER_FRAME; i++) { VASurfaceID ref_id = picture->ref_frame_map[i]; struct object_surface *ref_surface; @@ -584,22 +605,35 @@ int av1_set_controls(struct request_data *driver_data, if (ref_surface == NULL) continue; ts = v4l2_timeval_to_ns(&ref_surface->timestamp); - /* AV1 film_grain: if the referenced surface is a display - * surface (linked to a decode surface), its own timestamp - * may be zero (never bound for OUTPUT QBUF). Follow the - * link to find the decode surface that carries the real - * timestamp. */ if (ts == 0 && ref_surface->linked_decode_surface_id != VA_INVALID_SURFACE) { struct object_surface *dec = SURFACE(driver_data, ref_surface->linked_decode_surface_id); - if (dec != NULL) + if (dec != NULL) { ts = v4l2_timeval_to_ns(&dec->timestamp); + frame.order_hints[i] = dec->av1_order_hint; + } + } else { + frame.order_hints[i] = ref_surface->av1_order_hint; } frame.reference_frame_ts[i] = ts; } + /* Phase 3: record this frame's order_hint on the surface so the + * NEXT frame's ref-loop can populate order_hints[] for slots that + * reference us. */ + surface_object->av1_order_hint = picture->order_hint; + /* Also propagate to the linked display surface (if any), since + * future frames' ref_frame_map[] may point at either. */ + if (picture->current_display_picture != VA_INVALID_SURFACE && + picture->current_display_picture != picture->current_frame) { + struct object_surface *disp = + SURFACE(driver_data, picture->current_display_picture); + if (disp != NULL) + disp->av1_order_hint = picture->order_hint; + } + if (driver_data->has_av1_film_grain) av1_fill_film_grain(picture, &film_grain); diff --git a/src/surface.c b/src/surface.c index 09298bf..bf78804 100644 --- a/src/surface.c +++ b/src/surface.c @@ -200,6 +200,7 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format, surface_object->current_slot = NULL; /* iter2 Fix 3 */ surface_object->linked_decode_surface_id = VA_INVALID_SURFACE; + surface_object->av1_order_hint = 0; surface_object->destination_index = 0; /* set on bind */ surface_object->destination_planes_count = 0; /* set at CreateContext */ surface_object->destination_buffers_count = 0; /* set at CreateContext */ diff --git a/src/surface.h b/src/surface.h index d47caa7..6d5b65e 100644 --- a/src/surface.h +++ b/src/surface.h @@ -106,6 +106,16 @@ struct object_surface { */ VASurfaceID linked_decode_surface_id; + /* + * AV1 Phase 3: AV1 order_hint of the frame currently decoded into + * this surface. VAAPI's VADecPictureParameterBufferAV1.order_hint + * is per-frame; kernel's v4l2_ctrl_av1_frame.order_hints[8] is + * per-reference. We track each decoded frame's order_hint here so + * the next frame's av1_set_controls can populate order_hints[i] + * from ref_frame_map[i] → SURFACE → av1_order_hint. + */ + uint8_t av1_order_hint; + union { struct { VAPictureParameterBufferMPEG2 picture;