diff --git a/src/picture.c b/src/picture.c
index 29d9d4e..fe44d35 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -76,6 +76,39 @@ static VAStatus codec_store_buffer(struct request_data *driver_data,
 			       start_code, sizeof(start_code));
 			surface_object->slices_size += sizeof(start_code);
 		}
+		/*
+		 * iter33 α-30: VP8 OUTPUT buffer needs the uncompressed
+		 * frame header that ffmpeg-vaapi stripped before submitting
+		 * VASliceData. Hantro's vp8_dec_run reads OUTPUT[0..N] with
+		 * an assumed offset of 10 bytes (keyframe) or 3 bytes
+		 * (interframe) before the first_partition data — see
+		 * rockchip_vpu2_hw_vp8_dec.c:349.
+		 *
+		 * ffmpeg-vaapi (vaapi_vp8.c:191-192) strips
+		 *   header_size = 3 + 7 * s->keyframe
+		 * before submitting the slice data, so libva needs to
+		 * pre-pad the OUTPUT with that many bytes. Hantro only
+		 * uses these bytes for offset arithmetic, not parsing,
+		 * so a zero-filled placeholder is sufficient.
+		 *
+		 * ffmpeg-v4l2request (kdirect path) does NOT strip the
+		 * header, hence its OUTPUT is byte-equal to SW reference
+		 * and decode works correctly. This is the only material
+		 * difference between the two front-ends for VP8.
+		 *
+		 * key_frame in VAAPI's pic_fields.bits is INVERTED:
+		 *   0 → keyframe, 1 → interframe.
+		 */
+		if (profile == VAProfileVP8Version0_3 &&
+		    surface_object->params.vp8.iqmatrix_set /* picture parsed by now */) {
+			unsigned int header_size =
+				surface_object->params.vp8.picture.pic_fields.bits.key_frame == 0 ?
+					10 : 3;
+			memset(surface_object->source_data +
+			       surface_object->slices_size,
+			       0, header_size);
+			surface_object->slices_size += header_size;
+		}
 		memcpy(surface_object->source_data +
 			       surface_object->slices_size,
 		       buffer_object->data,