diff -urN a/src/panfrost/vulkan/jm/panvk_cmd_buffer.h b/src/panfrost/vulkan/jm/panvk_cmd_buffer.h --- a/src/panfrost/vulkan/jm/panvk_cmd_buffer.h 2026-05-21 22:46:57.477785029 +0200 +++ b/src/panfrost/vulkan/jm/panvk_cmd_buffer.h 2026-05-22 10:17:41.214043265 +0200 @@ -88,8 +88,18 @@ struct panvk_cmd_compute_state compute; struct panvk_push_constant_state push_constants; } state; + + /* iter1: panvk-bifrost-video — current bound video session + params + * scoped by vkCmdBeginVideoCodingKHR..vkCmdEndVideoCodingKHR. */ + struct { + struct panvk_video_session *vs; + struct vk_video_session_parameters *params; + } video; }; +struct panvk_video_session; +struct vk_video_session_parameters; + VK_DEFINE_HANDLE_CASTS(panvk_cmd_buffer, vk.base, VkCommandBuffer, VK_OBJECT_TYPE_COMMAND_BUFFER) diff -urN a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build --- a/src/panfrost/vulkan/meson.build 2026-05-21 22:46:59.277811484 +0200 +++ b/src/panfrost/vulkan/meson.build 2026-05-22 10:17:41.214043265 +0200 @@ -41,6 +41,10 @@ 'panvk_device_memory.c', 'panvk_host_copy.c', 'panvk_image.c', + 'panvk_video_decode.c', + 'panvk_v4l2.c', + 'panvk_v4l2_h264.c', + 'panvk_v4l2_h264_slice_header.c', 'panvk_instance.c', 'panvk_mempool.c', 'panvk_physical_device.c', diff -urN a/src/panfrost/vulkan/panvk_buffer.c b/src/panfrost/vulkan/panvk_buffer.c --- a/src/panfrost/vulkan/panvk_buffer.c 2026-05-21 22:46:57.485785147 +0200 +++ b/src/panfrost/vulkan/panvk_buffer.c 2026-05-22 10:17:41.214043265 +0200 @@ -88,6 +88,8 @@ *bind_status->pResult = VK_SUCCESS; buffer->vk.device_address = mem->addr.dev + pBindInfos[i].memoryOffset; + buffer->mem = mem; + buffer->mem_offset = pBindInfos[i].memoryOffset; } return VK_SUCCESS; } diff -urN a/src/panfrost/vulkan/panvk_buffer.h b/src/panfrost/vulkan/panvk_buffer.h --- a/src/panfrost/vulkan/panvk_buffer.h 2026-05-21 22:46:57.485785147 +0200 +++ b/src/panfrost/vulkan/panvk_buffer.h 2026-05-22 10:17:41.214043265 +0200 @@ -14,8 +14,14 @@ struct 
panvk_priv_bo; +struct panvk_device_memory; struct panvk_buffer { struct vk_buffer vk; + + /* iter1: panvk-bifrost-video — bound memory tracking for dmabuf export. + * Set in panvk_BindBufferMemory. NULL until bound. */ + struct panvk_device_memory *mem; + uint64_t mem_offset; }; VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_buffer, vk.base, VkBuffer, diff -urN a/src/panfrost/vulkan/panvk_device.h b/src/panfrost/vulkan/panvk_device.h --- a/src/panfrost/vulkan/panvk_device.h 2026-05-21 22:46:57.489785206 +0200 +++ b/src/panfrost/vulkan/panvk_device.h 2026-05-22 10:17:41.214043265 +0200 @@ -45,6 +45,8 @@ enum panvk_queue_family { PANVK_QUEUE_FAMILY_GPU, PANVK_QUEUE_FAMILY_BIND, + /* iter1: video decode via V4L2-stateless hantro (PAN_ARCH < 9, runtime-gated). */ + PANVK_QUEUE_FAMILY_VIDEO_DECODE, PANVK_QUEUE_FAMILY_COUNT, }; @@ -97,6 +99,23 @@ struct panvk_device_queue_family queue_families[PANVK_QUEUE_FAMILY_COUNT]; + /* iter1: Phase 1 simplification — device-level active video session + + * params, set by Cmd{Begin,End}VideoCodingKHR. Single-session Phase 1 + * scope; per-cmdbuf state-tracking lives in Phase >>1 once per-arch + * cmd_buffer access from arch-agnostic source is wired. */ + struct { + simple_mtx_t lock; + struct panvk_video_session *vs; + struct vk_video_session_parameters *params; + } active_video; + /* iter1: Vulkan-visible queue family index ↔ panvk_qfi enum mapping. + * Needed because hideable families create gaps between the enum slot + * and the position the Vulkan loader sees. Populated at vkCreateDevice + * from pCreateInfo->pQueueCreateInfos[].queueFamilyIndex by querying + * physical-device queue family properties (which is what was advertised). 
*/ + uint8_t vulkan_to_panvk_qfi[PANVK_QUEUE_FAMILY_COUNT]; + uint8_t num_vulkan_qfi; + struct panvk_precomp_cache *precomp_cache; struct { diff -urN a/src/panfrost/vulkan/panvk_physical_device.c b/src/panfrost/vulkan/panvk_physical_device.c --- a/src/panfrost/vulkan/panvk_physical_device.c 2026-05-21 22:46:57.497785323 +0200 +++ b/src/panfrost/vulkan/panvk_physical_device.c 2026-05-22 10:17:41.214043265 +0200 @@ -577,12 +577,22 @@ .queueFlags = VK_QUEUE_SPARSE_BINDING_BIT, .queueCount = 1, }, + [PANVK_QUEUE_FAMILY_VIDEO_DECODE] = { + /* iter1: video decode + transfer (Vulkan spec requires VIDEO families + * to also advertise TRANSFER for bitstream-buffer copies). */ + .queueFlags = VK_QUEUE_VIDEO_DECODE_BIT_KHR | VK_QUEUE_TRANSFER_BIT, + .queueCount = 1, + .minImageTransferGranularity = {1, 1, 1}, + }, }; for (uint32_t family = 0; family < ARRAY_SIZE(qfamily_props); family++) { if (family == PANVK_QUEUE_FAMILY_BIND && !physical_device->vk.supported_features.sparseBinding) - break; + continue; /* iter1: was break, but a later family (VIDEO_DECODE) must be reachable */ + if (family == PANVK_QUEUE_FAMILY_VIDEO_DECODE && + !physical_device->vk.supported_extensions.KHR_video_queue) + continue; vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { p->queueFamilyProperties = qfamily_props[family]; @@ -591,6 +601,16 @@ vk_find_struct(p->pNext, QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR); if (prio) panvk_fill_global_priority(physical_device, family, prio); + + /* iter1: VK_KHR_video_queue advertises codec ops per family. 
*/ + VkQueueFamilyVideoPropertiesKHR *vid = + vk_find_struct(p->pNext, QUEUE_FAMILY_VIDEO_PROPERTIES_KHR); + if (vid) { + vid->videoCodecOperations = 0; + if (family == PANVK_QUEUE_FAMILY_VIDEO_DECODE) + vid->videoCodecOperations |= + VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR; + } } } } @@ -1558,3 +1578,87 @@ .compatibleHandleTypes = handle_types, }; } + +/* panvk-bifrost-video Phase 4 commit 2: + * Per-physical-device video capability + format reporting. */ + +#include "vk_video.h" + +VKAPI_ATTR VkResult VKAPI_CALL +panvk_GetPhysicalDeviceVideoCapabilitiesKHR( + VkPhysicalDevice physicalDevice, + const VkVideoProfileInfoKHR *pVideoProfile, + VkVideoCapabilitiesKHR *pCapabilities) +{ + /* iter1: H.264 only; degrade to UNSUPPORTED for anything else. */ + if (pVideoProfile->videoCodecOperation != + VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR) + return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR; + + pCapabilities->flags = 0; + pCapabilities->minBitstreamBufferOffsetAlignment = 16; + pCapabilities->minBitstreamBufferSizeAlignment = 16; + pCapabilities->pictureAccessGranularity.width = 16; + pCapabilities->pictureAccessGranularity.height = 16; + pCapabilities->minCodedExtent.width = 16; + pCapabilities->minCodedExtent.height = 16; + /* RK3566 hantro max H.264 4Kp30 in spec; constrain to 1080p safe baseline + * until Commit 6 queries real V4L2 format-size limits. 
*/ + pCapabilities->maxCodedExtent.width = 1920; + pCapabilities->maxCodedExtent.height = 1088; + pCapabilities->maxDpbSlots = 16; + pCapabilities->maxActiveReferencePictures = 16; + pCapabilities->stdHeaderVersion.extensionName[0] = 0; + strcpy(pCapabilities->stdHeaderVersion.extensionName, + VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_EXTENSION_NAME); + pCapabilities->stdHeaderVersion.specVersion = + VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_SPEC_VERSION; + + VkVideoDecodeCapabilitiesKHR *dec_caps = + vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_CAPABILITIES_KHR); + if (dec_caps) { + /* Hantro outputs to a CAPTURE buffer separate from the DPB; expose + * DISTINCT mode. (COINCIDE would be the GPU-engine-DPB-as-output mode.) */ + dec_caps->flags = + VK_VIDEO_DECODE_CAPABILITY_DPB_AND_OUTPUT_DISTINCT_BIT_KHR; + } + + VkVideoDecodeH264CapabilitiesKHR *h264_caps = + vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H264_CAPABILITIES_KHR); + if (h264_caps) { + h264_caps->maxLevelIdc = STD_VIDEO_H264_LEVEL_IDC_4_2; + h264_caps->fieldOffsetGranularity.x = 0; + h264_caps->fieldOffsetGranularity.y = 0; + } + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +panvk_GetPhysicalDeviceVideoFormatPropertiesKHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceVideoFormatInfoKHR *pVideoFormatInfo, + uint32_t *pVideoFormatPropertyCount, + VkVideoFormatPropertiesKHR *pVideoFormatProperties) +{ + /* iter1: NV12 (8:8:0 4:2:0 2-plane) is the only format hantro emits. 
*/ + VK_OUTARRAY_MAKE_TYPED(VkVideoFormatPropertiesKHR, out, + pVideoFormatProperties, pVideoFormatPropertyCount); + + vk_outarray_append_typed(VkVideoFormatPropertiesKHR, &out, p) { + p->format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + p->imageType = VK_IMAGE_TYPE_2D; + p->imageTiling = VK_IMAGE_TILING_OPTIMAL; + p->imageUsageFlags = pVideoFormatInfo->imageUsage & + (VK_IMAGE_USAGE_VIDEO_DECODE_DST_BIT_KHR | + VK_IMAGE_USAGE_VIDEO_DECODE_DPB_BIT_KHR | + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_TRANSFER_SRC_BIT); + p->componentMapping.r = VK_COMPONENT_SWIZZLE_IDENTITY; + p->componentMapping.g = VK_COMPONENT_SWIZZLE_IDENTITY; + p->componentMapping.b = VK_COMPONENT_SWIZZLE_IDENTITY; + p->componentMapping.a = VK_COMPONENT_SWIZZLE_IDENTITY; + } + + return vk_outarray_status(&out); +} diff -urN a/src/panfrost/vulkan/panvk_v4l2.c b/src/panfrost/vulkan/panvk_v4l2.c --- a/src/panfrost/vulkan/panvk_v4l2.c 1970-01-01 01:00:00.000000000 +0100 +++ b/src/panfrost/vulkan/panvk_v4l2.c 2026-05-22 10:17:41.214043265 +0200 @@ -0,0 +1,615 @@ +/* + * panvk-bifrost-video Phase 4 commit 3: + * + * V4L2-stateless hantro VPU bridge for panvk video decode sessions. + * + * Mirrors the libva-v4l2-request-fourier probe + per-session-init + * pattern (proven on RK3566 hantro at 1.16x realtime). + * + * SPDX-License-Identifier: MIT + */ + +#include "panvk_video_decode.h" +#include "panvk_device.h" + +#include "util/macros.h" +#include "vk_alloc.h" +#include "vk_log.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Phase 2 D9: hard-coded paths first; topology-based enumeration is a + * later iter (libva-v4l2-request-fourier has the full version). */ +#define PANVK_V4L2_VIDEO_NODE "/dev/video1" +#define PANVK_V4L2_MEDIA_NODE "/dev/media0" + +/* Phase 1 max bitstream buffer: BBB peak ~2.4 MB/frame, 4MB is comfortable. 
*/
+#define PANVK_V4L2_SOURCE_SIZE_MAX (4 * 1024 * 1024)
+
+/* Phase 2 D3: request_fd pool size = max_dpb_slots + 2 = 18.
+ * 16 DPB slots + current frame + safety margin. */
+#define PANVK_V4L2_REQUEST_FD_COUNT 18
+
+/* Probe: try VIDIOC_QUERYCAP on /dev/video1 and check the driver/card strings.
+ *
+ * Returns true only when the node opens, QUERYCAP succeeds, and either
+ * cap.driver or cap.card starts with "hantro". The probe fd is always closed
+ * before returning. */
+bool
+panvk_v4l2_probe_hantro(void)
+{
+   int fd = open(PANVK_V4L2_VIDEO_NODE, O_RDWR | O_NONBLOCK);
+   if (fd < 0)
+      return false;
+
+   struct v4l2_capability cap;
+   memset(&cap, 0, sizeof(cap));
+   int rc = ioctl(fd, VIDIOC_QUERYCAP, &cap);
+   close(fd);
+   if (rc < 0)
+      return false;
+
+   /* Hantro VPU on RK3566/RK3568 reports card = "hantro-vpu" or
+    * driver = "hantro-vpu". Accept either field matching. */
+   bool is_hantro = (strncmp((const char *) cap.driver, "hantro", 6) == 0) ||
+                    (strncmp((const char *) cap.card, "hantro", 6) == 0);
+   return is_hantro;
+}
+
+/* Detect whether device requires multi-planar buffer types.
+ * Hantro on rk3568 advertises V4L2_CAP_VIDEO_M2M_MPLANE — multi-planar only.
+ *
+ * Uses device_caps when V4L2_CAP_DEVICE_CAPS is set, the global capabilities
+ * otherwise. A failing QUERYCAP is reported as "not multi-planar". */
+static bool
+v4l2_device_is_mplane(int video_fd)
+{
+   struct v4l2_capability cap;
+   memset(&cap, 0, sizeof(cap));
+   if (ioctl(video_fd, VIDIOC_QUERYCAP, &cap) < 0)
+      return false;
+   uint32_t caps = (cap.capabilities & V4L2_CAP_DEVICE_CAPS)
+                   ? cap.device_caps : cap.capabilities;
+   return (caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0;
+}
+
+/* Open V4L2 fds for one video session. /dev/media0 grants request_fds.
+ *
+ * Returns 0 with vs->video_fd and vs->media_fd both open, or a negative
+ * errno value with both left at -1.
+ *
+ * errno is captured right after the failing open(): the logging call and
+ * the close() that follow are allowed to overwrite it, and a clobbered value
+ * of 0 would turn `return -errno` into a bogus success. */
+static int
+v4l2_open_fds(struct panvk_video_session *vs)
+{
+   /* Make the "not open" state explicit even when the first open() fails. */
+   vs->media_fd = -1;
+
+   vs->video_fd = open(PANVK_V4L2_VIDEO_NODE, O_RDWR | O_NONBLOCK);
+   if (vs->video_fd < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: open video failed: %s", strerror(err));
+      return -err;
+   }
+   vs->media_fd = open(PANVK_V4L2_MEDIA_NODE, O_RDWR | O_NONBLOCK);
+   if (vs->media_fd < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: open media failed: %s", strerror(err));
+      close(vs->video_fd);
+      vs->video_fd = -1;
+      return -err;
+   }
+   return 0;
+}
+
+/* Set OUTPUT (input bitstream) format to H264_SLICE; CAPTURE (output picture)
+ * format to NV12. 
Width/height come from VkVideoSessionCreateInfo.
+ * Hantro on RK3568 is multi-planar; rkvdec on RK3399 is single-planar.
+ * Detect at runtime via V4L2_CAP_VIDEO_M2M_MPLANE.
+ *
+ * The detected planar mode is stored in vs->mplane, and the v4l2_format
+ * structs as left by VIDIOC_S_FMT are copied to vs->fmt_output and
+ * vs->fmt_capture.
+ *
+ * Returns 0 on success or a negative errno value when either S_FMT fails.
+ * errno is saved before logging so the returned code cannot be clobbered
+ * (a clobbered 0 would be reported as success). */
+static int
+v4l2_negotiate_formats(struct panvk_video_session *vs,
+                       uint32_t width, uint32_t height)
+{
+   const bool mplane = v4l2_device_is_mplane(vs->video_fd);
+   vs->mplane = mplane;
+
+   struct v4l2_format f;
+
+   /* OUTPUT — H.264 stateless bitstream */
+   memset(&f, 0, sizeof(f));
+   if (mplane) {
+      f.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+      f.fmt.pix_mp.width = width;
+      f.fmt.pix_mp.height = height;
+      f.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_H264_SLICE;
+      f.fmt.pix_mp.plane_fmt[0].sizeimage = PANVK_V4L2_SOURCE_SIZE_MAX;
+      f.fmt.pix_mp.num_planes = 1;
+   } else {
+      f.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+      f.fmt.pix.width = width;
+      f.fmt.pix.height = height;
+      f.fmt.pix.pixelformat = V4L2_PIX_FMT_H264_SLICE;
+      f.fmt.pix.sizeimage = PANVK_V4L2_SOURCE_SIZE_MAX;
+   }
+   if (ioctl(vs->video_fd, VIDIOC_S_FMT, &f) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: S_FMT OUTPUT (H264_SLICE, mplane=%d) failed: %s",
+                mplane, strerror(err));
+      return -err;
+   }
+   memcpy(&vs->fmt_output, &f, sizeof(f));
+
+   /* CAPTURE — NV12 decoded frames */
+   memset(&f, 0, sizeof(f));
+   if (mplane) {
+      f.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+      f.fmt.pix_mp.width = width;
+      f.fmt.pix_mp.height = height;
+      f.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_NV12;
+      f.fmt.pix_mp.num_planes = 1;
+   } else {
+      f.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+      f.fmt.pix.width = width;
+      f.fmt.pix.height = height;
+      f.fmt.pix.pixelformat = V4L2_PIX_FMT_NV12;
+   }
+   if (ioctl(vs->video_fd, VIDIOC_S_FMT, &f) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: S_FMT CAPTURE (NV12, mplane=%d) failed: %s",
+                mplane, strerror(err));
+      return -err;
+   }
+   memcpy(&vs->fmt_capture, &f, sizeof(f));
+
+   return 0;
+}
+
+/* REQBUFS to register N buffers on each queue. Phase 1: minimal counts to
+ * exercise the path; full pipelining is a later iter. 
+ *
+ * Commit 7c: BOTH OUTPUT + CAPTURE use MMAP (mirrors libva-v4l2-request-fourier
+ * working pattern exactly). Bitstream copied in CPU-side from VkBuffer host
+ * map. Decoded frame copied out CPU-side to VkImage (7d). Validates the
+ * IOC_QUEUE path without dma_buf-side variables. */
+#define PANVK_V4L2_CAPTURE_COUNT 18
+
+/* Request MMAP buffers on the OUTPUT and CAPTURE queues and reset both
+ * round-robin cursors (vs->output_next, vs->capture_next).
+ *
+ * The buffer counts granted by the driver are clamped to 18 before being
+ * stored in vs->num_output_buffers / vs->num_capture_buffers.
+ *
+ * Returns 0 on success or a negative errno value. errno is saved before
+ * logging: a value clobbered to 0 by the log call would otherwise make
+ * `return -errno` look like success. */
+static int
+v4l2_reqbufs(struct panvk_video_session *vs)
+{
+   struct v4l2_requestbuffers rb;
+
+   /* OUTPUT: MMAP (kernel-allocated, mmap'd to CPU for bitstream copy-in). */
+   memset(&rb, 0, sizeof(rb));
+   rb.count = PANVK_V4L2_REQUEST_FD_COUNT;
+   rb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE
+                        : V4L2_BUF_TYPE_VIDEO_OUTPUT;
+   rb.memory = V4L2_MEMORY_MMAP;
+   if (ioctl(vs->video_fd, VIDIOC_REQBUFS, &rb) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: REQBUFS OUTPUT failed: %s", strerror(err));
+      return -err;
+   }
+   /* REQBUFS may round up the count above the request — clamp to our
+    * fixed-size mmap arrays (Phase 5 review: prevents output_map OOB). */
+   vs->num_output_buffers = MIN2(rb.count, 18);
+   vs->output_next = 0;
+
+   /* CAPTURE: MMAP — kernel-allocated, mmap to CPU for copy-out path. */
+   memset(&rb, 0, sizeof(rb));
+   rb.count = PANVK_V4L2_CAPTURE_COUNT;
+   rb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE
+                        : V4L2_BUF_TYPE_VIDEO_CAPTURE;
+   rb.memory = V4L2_MEMORY_MMAP;
+   if (ioctl(vs->video_fd, VIDIOC_REQBUFS, &rb) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: REQBUFS CAPTURE failed: %s", strerror(err));
+      return -err;
+   }
+   vs->num_capture_buffers = MIN2(rb.count, 18);
+   vs->capture_next = 0;
+
+   return 0;
+}
+
+/* Allocate the request_fd pool via MEDIA_IOC_REQUEST_ALLOC. 
*/
+/* Fills vs->request_fds with PANVK_V4L2_REQUEST_FD_COUNT request fds.
+ *
+ * vs->num_request_fds is published as soon as the array exists (every slot
+ * pre-set to -1) rather than after the allocation loop. The cleanup in
+ * panvk_v4l2_session_finish() only walks `num_request_fds` entries, so
+ * publishing it late would leak every fd obtained before a mid-loop failure.
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated, or the
+ * negative errno of the failing MEDIA_IOC_REQUEST_ALLOC (saved before
+ * logging so it cannot be clobbered). */
+static int
+v4l2_alloc_request_pool(struct panvk_video_session *vs,
+                        const VkAllocationCallbacks *alloc,
+                        struct vk_device *dev)
+{
+   vs->request_fds = vk_alloc(&dev->alloc,
+                              sizeof(int) * PANVK_V4L2_REQUEST_FD_COUNT, 8,
+                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!vs->request_fds)
+      return -ENOMEM;
+   for (unsigned i = 0; i < PANVK_V4L2_REQUEST_FD_COUNT; i++)
+      vs->request_fds[i] = -1;
+
+   /* From here on the teardown path may safely scan the whole array:
+    * unallocated slots are -1 and are skipped by its `>= 0` check. */
+   vs->num_request_fds = PANVK_V4L2_REQUEST_FD_COUNT;
+   vs->request_fd_next = 0;
+
+   for (unsigned i = 0; i < PANVK_V4L2_REQUEST_FD_COUNT; i++) {
+      int rfd = -1;
+      if (ioctl(vs->media_fd, MEDIA_IOC_REQUEST_ALLOC, &rfd) < 0) {
+         const int err = errno;
+         mesa_loge("panvk_v4l2: MEDIA_IOC_REQUEST_ALLOC [%u] failed: %s",
+                   i, strerror(err));
+         return -err;
+      }
+      vs->request_fds[i] = rfd;
+   }
+   return 0;
+}
+
+/* QUERYBUF + mmap CAPTURE buffers — NV12-decoded frame source.
+ *
+ * Maps plane 0 of each CAPTURE buffer read-only and records pointer and
+ * length in vs->capture_map[] / vs->capture_map_size[]. Returns 0 on success
+ * or a negative errno value (saved before logging); mappings made before a
+ * failure are left in place for the caller's teardown to release. */
+static int
+v4l2_mmap_capture_buffers(struct panvk_video_session *vs)
+{
+   for (unsigned i = 0; i < vs->num_capture_buffers && i < 18; i++) {
+      struct v4l2_buffer qb = { 0 };
+      struct v4l2_plane planes[VIDEO_MAX_PLANES] = { 0 };
+      qb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE
+                           : V4L2_BUF_TYPE_VIDEO_CAPTURE;
+      qb.memory = V4L2_MEMORY_MMAP;
+      qb.index = i;
+      if (vs->mplane) { qb.length = 1; qb.m.planes = planes; }
+      if (ioctl(vs->video_fd, VIDIOC_QUERYBUF, &qb) < 0) {
+         const int err = errno;
+         mesa_loge("panvk_v4l2: QUERYBUF CAPTURE[%u] failed: %s",
+                   i, strerror(err));
+         return -err;
+      }
+      uint32_t length = vs->mplane ? planes[0].length : qb.length;
+      uint32_t offset = vs->mplane ? planes[0].m.mem_offset : qb.m.offset;
+      void *p = mmap(NULL, length, PROT_READ, MAP_SHARED,
+                     vs->video_fd, offset);
+      if (p == MAP_FAILED) {
+         const int err = errno;
+         mesa_loge("panvk_v4l2: mmap CAPTURE[%u] failed: %s",
+                   i, strerror(err));
+         return -err;
+      }
+      vs->capture_map[i] = p;
+      vs->capture_map_size[i] = length;
+   }
+   return 0;
+}
+
+/* QUERYBUF + mmap each OUTPUT buffer — bitstream-copy-in destination. 
*/
+/* Maps plane 0 of each OUTPUT buffer read/write and records pointer and
+ * length in vs->output_map[] / vs->output_map_size[]. Returns 0 on success
+ * or a negative errno value; errno is saved before logging so the log call
+ * cannot turn the return value into 0. */
+static int
+v4l2_mmap_output_buffers(struct panvk_video_session *vs)
+{
+   for (unsigned i = 0; i < vs->num_output_buffers; i++) {
+      struct v4l2_buffer qb = { 0 };
+      struct v4l2_plane planes[VIDEO_MAX_PLANES] = { 0 };
+      qb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE
+                           : V4L2_BUF_TYPE_VIDEO_OUTPUT;
+      qb.memory = V4L2_MEMORY_MMAP;
+      qb.index = i;
+      if (vs->mplane) { qb.length = 1; qb.m.planes = planes; }
+      if (ioctl(vs->video_fd, VIDIOC_QUERYBUF, &qb) < 0) {
+         const int err = errno;
+         mesa_loge("panvk_v4l2: QUERYBUF OUTPUT[%u] failed: %s",
+                   i, strerror(err));
+         return -err;
+      }
+      uint32_t length = vs->mplane ? planes[0].length : qb.length;
+      uint32_t offset = vs->mplane ? planes[0].m.mem_offset : qb.m.offset;
+      void *p = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED,
+                     vs->video_fd, offset);
+      if (p == MAP_FAILED) {
+         const int err = errno;
+         mesa_loge("panvk_v4l2: mmap OUTPUT[%u] failed: %s",
+                   i, strerror(err));
+         return -err;
+      }
+      vs->output_map[i] = p;
+      vs->output_map_size[i] = length;
+   }
+   return 0;
+}
+
+/* STREAMON both queues. Must happen after REQBUFS, before first QBUF.
+ *
+ * OUTPUT is started first, then CAPTURE. Returns 0 on success or the
+ * negative errno of the failing ioctl (saved before logging). */
+static int
+v4l2_streamon(struct panvk_video_session *vs)
+{
+   enum v4l2_buf_type t;
+   t = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE
+                  : V4L2_BUF_TYPE_VIDEO_OUTPUT;
+   if (ioctl(vs->video_fd, VIDIOC_STREAMON, &t) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: STREAMON OUTPUT failed: %s", strerror(err));
+      return -err;
+   }
+   t = vs->mplane ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE
+                  : V4L2_BUF_TYPE_VIDEO_CAPTURE;
+   if (ioctl(vs->video_fd, VIDIOC_STREAMON, &t) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: STREAMON CAPTURE failed: %s", strerror(err));
+      return -err;
+   }
+   return 0;
+}
+
+/* Set device-level H.264 controls (DECODE_MODE + START_CODE) before STREAMON.
+ * libva-v4l2-request-fourier calls these with request_fd=-1; we mirror.
+ * Errors are non-fatal — some backing drivers may default-only. 
*/
+static void
+v4l2_set_device_h264_controls(struct panvk_video_session *vs)
+{
+   struct v4l2_ext_control mode_ctrls[2];
+   struct v4l2_ext_controls set;
+
+   memset(mode_ctrls, 0, sizeof(mode_ctrls));
+   memset(&set, 0, sizeof(set));
+
+   /* Frame-based decoding of Annex-B (start-code prefixed) bitstreams. */
+   mode_ctrls[0].id = V4L2_CID_STATELESS_H264_DECODE_MODE;
+   mode_ctrls[0].value = V4L2_STATELESS_H264_DECODE_MODE_FRAME_BASED;
+   mode_ctrls[1].id = V4L2_CID_STATELESS_H264_START_CODE;
+   mode_ctrls[1].value = V4L2_STATELESS_H264_START_CODE_ANNEX_B;
+
+   set.count = 2;
+   set.controls = mode_ctrls;
+
+   /* Best effort: the result is deliberately discarded because some
+    * drivers reject these device-level controls. */
+   (void) ioctl(vs->video_fd, VIDIOC_S_EXT_CTRLS, &set);
+}
+
+/* Bring up the V4L2 side of a video session: open the device nodes,
+ * negotiate formats, request buffers, allocate the request pool, apply the
+ * device-level H.264 controls, mmap both queues and start streaming.
+ *
+ * Returns 0 on success. If opening the nodes fails, that error is returned
+ * directly; any later failure tears the session down through
+ * panvk_v4l2_session_finish() before the error is returned. */
+int
+panvk_v4l2_session_init(struct panvk_video_session *vs,
+                        struct vk_device *vk_dev,
+                        const VkAllocationCallbacks *alloc,
+                        uint32_t width, uint32_t height)
+{
+   int ret;
+
+   ret = v4l2_open_fds(vs);
+   if (ret != 0)
+      return ret;
+
+   ret = v4l2_negotiate_formats(vs, width, height);
+   if (ret != 0)
+      goto fail;
+
+   ret = v4l2_reqbufs(vs);
+   if (ret != 0)
+      goto fail;
+
+   ret = v4l2_alloc_request_pool(vs, alloc, vk_dev);
+   if (ret != 0)
+      goto fail;
+
+   /* Device-level H.264 mode controls; failures here are not fatal. */
+   v4l2_set_device_h264_controls(vs);
+
+   /* OUTPUT mappings receive the bitstream copy-in. */
+   ret = v4l2_mmap_output_buffers(vs);
+   if (ret != 0)
+      goto fail;
+
+   /* CAPTURE mappings are used for NV12 frame readback (Phase 1
+    * verification). */
+   ret = v4l2_mmap_capture_buffers(vs);
+   if (ret != 0)
+      goto fail;
+
+   /* Start both queues last, once every buffer is set up. */
+   ret = v4l2_streamon(vs);
+   if (ret != 0)
+      goto fail;
+
+   return 0;
+
+fail:
+   panvk_v4l2_session_finish(vs, vk_dev, alloc);
+   return ret;
+}
+
+/* Allocate one V4L2 OUTPUT-queue buffer index for this submit; round-robin
+ * through the request_fd pool. Returns the request_fd to use.
+ *
+ * REINIT contract: a freshly-allocated request is in QUEUEABLE state; after
+ * MEDIA_REQUEST_IOC_QUEUE + dequeue it's in COMPLETE state and S_EXT_CTRLS
+ * on it returns EBUSY. MEDIA_REQUEST_IOC_REINIT puts it back in QUEUEABLE.
+ * We track per-fd "ever been queued" so the very-first use skips REINIT
+ * (which returns EBUSY on never-queued requests). 
*/
+static int
+v4l2_pick_request_fd(struct panvk_video_session *vs)
+{
+   uint32_t idx = vs->request_fd_next;
+   int rfd = vs->request_fds[idx];
+   if (vs->request_fd_used[idx]) {
+      if (ioctl(rfd, MEDIA_REQUEST_IOC_REINIT) < 0) {
+         mesa_loge("panvk_v4l2: MEDIA_REQUEST_IOC_REINIT rfd=%d failed: %s",
+                   rfd, strerror(errno));
+      }
+   }
+   vs->request_fd_used[idx] = true;
+   vs->request_fd_next = (idx + 1) % vs->num_request_fds;
+   return rfd;
+}
+
+/* The 14-step ioctl dance for one decode op (Phase 2 D7).
+ * Operates synchronously at record time per Phase 1 D8 lock.
+ * Returns 0 on success, -errno on failure.
+ *
+ * Commit 7c MMAP-side: src_bitstream is a CPU pointer (NOT a dma_buf fd).
+ * We copy it into the mmap'd OUTPUT buffer at index `out_idx`.
+ *
+ * Error reporting:
+ *  - -EINVAL  the session has no request fds / OUTPUT / CAPTURE buffers
+ *             (the round-robin cursors would otherwise divide by zero).
+ *  - -ENOSPC  src_bytes exceeds the selected OUTPUT mapping; nothing is
+ *             consumed (no request picked, no cursor advanced).
+ *  - -ETIMEDOUT  the request did not complete within 200 ms.
+ *  - -EIO     the dequeued CAPTURE buffer carries V4L2_BUF_FLAG_ERROR.
+ *  - otherwise the negative errno of the failing ioctl/poll.
+ * errno is always saved before logging; a value clobbered to 0 by the log
+ * call would otherwise make `return -errno` report a successful decode.
+ *
+ * dst_dmabuf_fd_unused is ignored. */
+int
+panvk_v4l2_submit_h264_decode(struct panvk_video_session *vs,
+                              const struct v4l2_ctrl_h264_sps *sps,
+                              const struct v4l2_ctrl_h264_pps *pps,
+                              const struct v4l2_ctrl_h264_scaling_matrix *scaling,
+                              const struct v4l2_ctrl_h264_decode_params *dec,
+                              const void *src_bitstream, uint32_t src_bytes,
+                              int dst_dmabuf_fd_unused,
+                              uint64_t qbuf_ts)
+{
+   /* Every cursor below is advanced modulo one of these counts. */
+   if (vs->num_request_fds == 0 || vs->num_output_buffers == 0 ||
+       vs->num_capture_buffers == 0)
+      return -EINVAL;
+
+   const bool mp = vs->mplane;
+
+   /* Validate the bitstream size against the OUTPUT buffer we are about to
+    * use before consuming a request fd or a buffer slot. */
+   const uint32_t out_idx = vs->output_next;
+   if (src_bytes > vs->output_map_size[out_idx]) {
+      mesa_loge("panvk_v4l2: bitstream %u > buffer %u",
+                src_bytes, vs->output_map_size[out_idx]);
+      return -ENOSPC;
+   }
+
+   int rfd = v4l2_pick_request_fd(vs);
+
+   /* Claim the OUTPUT buffer index + copy bitstream in. */
+   vs->output_next = (out_idx + 1) % vs->num_output_buffers;
+   memcpy(vs->output_map[out_idx], src_bitstream, src_bytes);
+
+   /* 1-7: build extended controls batch with request_fd binding */
+   struct v4l2_ext_control ctrls[4] = { 0 };
+   ctrls[0].id = V4L2_CID_STATELESS_H264_SPS;
+   ctrls[0].ptr = (void *) sps;
+   ctrls[0].size = sizeof(*sps);
+   ctrls[1].id = V4L2_CID_STATELESS_H264_PPS;
+   ctrls[1].ptr = (void *) pps;
+   ctrls[1].size = sizeof(*pps);
+   ctrls[2].id = V4L2_CID_STATELESS_H264_DECODE_PARAMS;
+   ctrls[2].ptr = (void *) dec;
+   ctrls[2].size = sizeof(*dec);
+   ctrls[3].id = V4L2_CID_STATELESS_H264_SCALING_MATRIX;
+   ctrls[3].ptr = (void *) scaling;
+   ctrls[3].size = sizeof(*scaling);
+
+   struct v4l2_ext_controls batch = { 0 };
+   batch.controls = ctrls;
+   batch.count = 4;
+   batch.which = V4L2_CTRL_WHICH_REQUEST_VAL;
+   batch.request_fd = rfd;
+
+   if (ioctl(vs->video_fd, VIDIOC_S_EXT_CTRLS, &batch) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: S_EXT_CTRLS request_fd=%d failed: %s "
+                "(error_idx=%u/%u)",
+                rfd, strerror(err), batch.error_idx, batch.count);
+      return -err;
+   }
+
+   /* 8: QBUF OUTPUT (bitstream input) — MMAP, index out_idx. */
+   struct v4l2_buffer qb = { 0 };
+   struct v4l2_plane planes[VIDEO_MAX_PLANES] = { 0 };
+   qb.memory = V4L2_MEMORY_MMAP;
+   qb.index = out_idx;
+   qb.flags = V4L2_BUF_FLAG_REQUEST_FD;
+   qb.request_fd = rfd;
+   /* qbuf_ts is split as nanoseconds into seconds + microseconds. */
+   qb.timestamp.tv_sec = (uint32_t)(qbuf_ts / 1000000000ULL);
+   qb.timestamp.tv_usec = (uint32_t)((qbuf_ts / 1000ULL) % 1000000ULL);
+   if (mp) {
+      qb.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+      qb.length = 1;
+      qb.m.planes = planes;
+      planes[0].bytesused = src_bytes;
+   } else {
+      qb.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+      qb.bytesused = src_bytes;
+   }
+   if (ioctl(vs->video_fd, VIDIOC_QBUF, &qb) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: QBUF OUTPUT (mmap idx=%u) failed: %s",
+                out_idx, strerror(err));
+      return -err;
+   }
+
+   /* 9: QBUF CAPTURE (output frame) — MMAP-backed kernel-allocated buffer.
+    * dst_dmabuf_fd is ignored in 7c; copy-out to VkImage is 7d. */
+   const uint32_t cap_idx = vs->capture_next;
+   vs->capture_next = (vs->capture_next + 1) % vs->num_capture_buffers;
+   memset(&qb, 0, sizeof(qb));
+   memset(&planes, 0, sizeof(planes));
+   qb.memory = V4L2_MEMORY_MMAP;
+   qb.index = cap_idx;
+   if (mp) {
+      qb.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+      qb.length = 1;
+      qb.m.planes = planes;
+   } else {
+      qb.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+   }
+   if (ioctl(vs->video_fd, VIDIOC_QBUF, &qb) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: QBUF CAPTURE (mmap idx=%u) failed: %s",
+                cap_idx, strerror(err));
+      return -err;
+   }
+
+   /* 10: MEDIA_REQUEST_IOC_QUEUE */
+   if (ioctl(rfd, MEDIA_REQUEST_IOC_QUEUE) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: REQUEST_IOC_QUEUE failed: %s", strerror(err));
+      return -err;
+   }
+
+   /* 11: poll(rfd, POLLPRI) — 200ms timeout per Phase 2 D7.
+    * pr == 0 is a timeout (errno is meaningless there); pr < 0 is a real
+    * poll() failure whose errno is reported as-is. */
+   struct pollfd pfd = { .fd = rfd, .events = POLLPRI };
+   int pr = poll(&pfd, 1, 200);
+   if (pr <= 0) {
+      const int err = (pr < 0) ? errno : ETIMEDOUT;
+      mesa_loge("panvk_v4l2: poll request_fd timeout/err pr=%d errno=%d",
+                pr, err);
+      return -err;
+   }
+
+   /* 12: DQBUF OUTPUT — must match the memory type used at QBUF (MMAP, not
+    * DMABUF). With the wrong memory type the kernel rejects the DQBUF and
+    * the OUTPUT slot stays in flight, which leaks request_fd resources
+    * (mostly cosmetic for Phase 1 single-decode tests, but breaks the
+    * pipelined case). */
+   memset(&qb, 0, sizeof(qb));
+   memset(&planes, 0, sizeof(planes));
+   qb.memory = V4L2_MEMORY_MMAP;
+   qb.type = mp ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE : V4L2_BUF_TYPE_VIDEO_OUTPUT;
+   if (mp) { qb.length = 1; qb.m.planes = planes; }
+   if (ioctl(vs->video_fd, VIDIOC_DQBUF, &qb) < 0) {
+      mesa_loge("panvk_v4l2: DQBUF OUTPUT failed: %s", strerror(errno));
+      /* non-fatal — capture might still have completed */
+   }
+
+   /* 13: DQBUF CAPTURE — MMAP, kernel-allocated. */
+   memset(&qb, 0, sizeof(qb));
+   memset(&planes, 0, sizeof(planes));
+   qb.memory = V4L2_MEMORY_MMAP;
+   qb.type = mp ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE : V4L2_BUF_TYPE_VIDEO_CAPTURE;
+   if (mp) { qb.length = 1; qb.m.planes = planes; }
+   if (ioctl(vs->video_fd, VIDIOC_DQBUF, &qb) < 0) {
+      const int err = errno;
+      mesa_loge("panvk_v4l2: DQBUF CAPTURE failed: %s", strerror(err));
+      return -err;
+   }
+   if (qb.flags & V4L2_BUF_FLAG_ERROR) {
+      mesa_loge("panvk_v4l2: CAPTURE buffer flagged ERROR");
+      return -EIO;
+   }
+
+   return 0;
+}
+
+void
+panvk_v4l2_session_finish(struct panvk_video_session *vs,
+                          struct vk_device *vk_dev,
+                          const VkAllocationCallbacks *alloc)
+{
+   /* Unwind in reverse order of session_init. Each step is guarded by
+    * "have we got far enough to need this" so the function is safe to
+    * call on partially-initialised sessions (the session_init failure
+    * paths jump here via `goto fail`). */
+
+   /* munmap CAPTURE + OUTPUT (no-op for entries left at NULL by an
+    * earlier-failed mmap loop). 
*/ + for (unsigned i = 0; i < 18; i++) { + if (vs->capture_map[i]) { + munmap(vs->capture_map[i], vs->capture_map_size[i]); + vs->capture_map[i] = NULL; + vs->capture_map_size[i] = 0; + } + if (vs->output_map[i]) { + munmap(vs->output_map[i], vs->output_map_size[i]); + vs->output_map[i] = NULL; + vs->output_map_size[i] = 0; + } + } + + if (vs->video_fd >= 0) { + /* STREAMOFF (safe to call even if STREAMON never ran — kernel + * returns EINVAL which we ignore). */ + enum v4l2_buf_type t; + t = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE + : V4L2_BUF_TYPE_VIDEO_OUTPUT; + (void) ioctl(vs->video_fd, VIDIOC_STREAMOFF, &t); + t = vs->mplane ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE + : V4L2_BUF_TYPE_VIDEO_CAPTURE; + (void) ioctl(vs->video_fd, VIDIOC_STREAMOFF, &t); + + /* Release the kernel buffer queues via REQBUFS count=0. */ + struct v4l2_requestbuffers rb; + memset(&rb, 0, sizeof(rb)); + rb.memory = V4L2_MEMORY_MMAP; + rb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE + : V4L2_BUF_TYPE_VIDEO_OUTPUT; + (void) ioctl(vs->video_fd, VIDIOC_REQBUFS, &rb); + rb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE + : V4L2_BUF_TYPE_VIDEO_CAPTURE; + (void) ioctl(vs->video_fd, VIDIOC_REQBUFS, &rb); + } + + if (vs->request_fds) { + for (unsigned i = 0; i < vs->num_request_fds; i++) + if (vs->request_fds[i] >= 0) + close(vs->request_fds[i]); + vk_free(&vk_dev->alloc, vs->request_fds); + vs->request_fds = NULL; + vs->num_request_fds = 0; + } + if (vs->video_fd >= 0) { + close(vs->video_fd); + vs->video_fd = -1; + } + if (vs->media_fd >= 0) { + close(vs->media_fd); + vs->media_fd = -1; + } +} diff -urN a/src/panfrost/vulkan/panvk_v4l2_h264.c b/src/panfrost/vulkan/panvk_v4l2_h264.c --- a/src/panfrost/vulkan/panvk_v4l2_h264.c 1970-01-01 01:00:00.000000000 +0100 +++ b/src/panfrost/vulkan/panvk_v4l2_h264.c 2026-05-22 10:17:41.214043265 +0200 @@ -0,0 +1,478 @@ +/* + * panvk-bifrost-video Phase 4: Vulkan StdVideo H.264 → V4L2 stateless H.264 + * control-struct translation. 
+ * + * This file is the protocol-translation bridge that lets a Vulkan video + * decode session (VkVideoDecodeH264PictureInfoKHR + StdVideoH264*) drive a + * V4L2 stateless H.264 decoder (the hantro VPU on RK3566/RK3568 via + * /dev/video1) by filling the four FRAME_BASED controls: + * + * - V4L2_CID_STATELESS_H264_SPS → struct v4l2_ctrl_h264_sps + * - V4L2_CID_STATELESS_H264_PPS → struct v4l2_ctrl_h264_pps + * - V4L2_CID_STATELESS_H264_SCALING_MATRIX → struct v4l2_ctrl_h264_scaling_matrix + * - V4L2_CID_STATELESS_H264_DECODE_PARAMS → struct v4l2_ctrl_h264_decode_params + * + * The ioctl-side (VIDIOC_S_EXT_CTRLS on a request_fd) is the caller's + * responsibility — see panvk_v4l2.c. This file is pure data-shape + * translation; no syscalls, no GPU/shader work. + * + * Cross-references: + * - V4L2 UAPI structs and field semantics: + * /usr/include/linux/v4l2-controls.h + * Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst + * - Vulkan StdVideo H.264 structs: + * include/vk_video/vulkan_video_codec_h264std.h + * include/vk_video/vulkan_video_codec_h264std_decode.h + * include/vulkan/vulkan_core.h (VkVideoDecodeH264*KHR) + * - VAAPI→V4L2 reference impl (semantically equivalent, different carrier): + * libva-v4l2-request-fourier/src/h264.c + * + * Why every flag is mapped by name, not by bit position: + * StdVideoH264SpsFlags packs its flags as 1-bit bitfield members in a + * compiler-defined layout. V4L2_H264_SPS_FLAG_* are explicit + * bit-position #defines. The flag *names* match the H.264 spec — + * pic-by-pic mapping is mechanical — but the *bit positions* don't + * match between the two. Each flag is therefore translated by its + * spec name, never by raw bitmask copy. 
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "panvk_video_decode.h"
+
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+/* ------------------------------------------------------------------ */
+/* SPS                                                                */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Translate StdVideoH264SequenceParameterSet → struct v4l2_ctrl_h264_sps.
+ *
+ * profile_idc: StdVideoH264ProfileIdc is the literal H.264 profile_idc value
+ * (BASELINE=66, MAIN=77, HIGH=100, …) so a direct cast is correct.
+ *
+ * level_idc: StdVideoH264LevelIdc is an *enum index* (1_0=0, 1_1=1, …, 6_2=18),
+ * NOT the spec-encoded level_idc byte (which V4L2 expects in units of
+ * level*10: Level 4.1 → 41, Level 5.1 → 51, etc). We must encode.
+ * Enum values outside the lookup table are written as level_idc = 0.
+ *
+ * pic_order_cnt_type: StdVideoH264PocType enum values (0/1/2) match the spec
+ * directly; cast is safe.
+ *
+ * constraint_set_flags: V4L2 packs constraint_set{0..5}_flag into a single
+ * __u8 (V4L2_H264_SPS_CONSTRAINT_SETN_FLAG = 0x01..0x20). The StdVideo flag
+ * bitfields hold each one separately.
+ *
+ * `out` is zeroed before translation. in->pOffsetForRefFrame may be NULL, in
+ * which case offset_for_ref_frame[] is left all-zero.
+ */
+void
+panvk_v4l2_h264_std_to_ctrl_sps(const StdVideoH264SequenceParameterSet *in,
+                                struct v4l2_ctrl_h264_sps *out)
+{
+   memset(out, 0, sizeof(*out));
+
+   /* StdVideoH264LevelIdc → level_idc byte (level * 10). */
+   static const __u8 level_idc_lut[] = {
+      [STD_VIDEO_H264_LEVEL_IDC_1_0] = 10,
+      [STD_VIDEO_H264_LEVEL_IDC_1_1] = 11,
+      [STD_VIDEO_H264_LEVEL_IDC_1_2] = 12,
+      [STD_VIDEO_H264_LEVEL_IDC_1_3] = 13,
+      [STD_VIDEO_H264_LEVEL_IDC_2_0] = 20,
+      [STD_VIDEO_H264_LEVEL_IDC_2_1] = 21,
+      [STD_VIDEO_H264_LEVEL_IDC_2_2] = 22,
+      [STD_VIDEO_H264_LEVEL_IDC_3_0] = 30,
+      [STD_VIDEO_H264_LEVEL_IDC_3_1] = 31,
+      [STD_VIDEO_H264_LEVEL_IDC_3_2] = 32,
+      [STD_VIDEO_H264_LEVEL_IDC_4_0] = 40,
+      [STD_VIDEO_H264_LEVEL_IDC_4_1] = 41,
+      [STD_VIDEO_H264_LEVEL_IDC_4_2] = 42,
+      [STD_VIDEO_H264_LEVEL_IDC_5_0] = 50,
+      [STD_VIDEO_H264_LEVEL_IDC_5_1] = 51,
+      [STD_VIDEO_H264_LEVEL_IDC_5_2] = 52,
+      [STD_VIDEO_H264_LEVEL_IDC_6_0] = 60,
+      [STD_VIDEO_H264_LEVEL_IDC_6_1] = 61,
+      [STD_VIDEO_H264_LEVEL_IDC_6_2] = 62,
+   };
+
+   out->profile_idc = (__u8) in->profile_idc;
+   if ((unsigned) in->level_idc < sizeof(level_idc_lut) / sizeof(level_idc_lut[0]))
+      out->level_idc = level_idc_lut[in->level_idc];
+   else
+      out->level_idc = 0;
+   out->seq_parameter_set_id = in->seq_parameter_set_id;
+   out->chroma_format_idc = (__u8) in->chroma_format_idc;
+   out->bit_depth_luma_minus8 = in->bit_depth_luma_minus8;
+   out->bit_depth_chroma_minus8 = in->bit_depth_chroma_minus8;
+   out->log2_max_frame_num_minus4 = in->log2_max_frame_num_minus4;
+   out->pic_order_cnt_type = (__u8) in->pic_order_cnt_type;
+   out->log2_max_pic_order_cnt_lsb_minus4 = in->log2_max_pic_order_cnt_lsb_minus4;
+   out->max_num_ref_frames = in->max_num_ref_frames;
+   out->num_ref_frames_in_pic_order_cnt_cycle =
+      in->num_ref_frames_in_pic_order_cnt_cycle;
+
+   out->offset_for_non_ref_pic = in->offset_for_non_ref_pic;
+   out->offset_for_top_to_bottom_field = in->offset_for_top_to_bottom_field;
+
+   /* offset_for_ref_frame[]: StdVideo passes via pOffsetForRefFrame pointer
+    * sized num_ref_frames_in_pic_order_cnt_cycle. V4L2 has a 255-entry fixed
+    * array. Copy in-bounds entries. */
+   if (in->pOffsetForRefFrame != NULL) {
+      unsigned n = in->num_ref_frames_in_pic_order_cnt_cycle;
+      if (n > 255) n = 255;
+      for (unsigned i = 0; i < n; i++)
+         out->offset_for_ref_frame[i] = in->pOffsetForRefFrame[i];
+   }
+
+   out->pic_width_in_mbs_minus1 = (__u16) in->pic_width_in_mbs_minus1;
+   out->pic_height_in_map_units_minus1 = (__u16) in->pic_height_in_map_units_minus1;
+
+   /* Constraint set flags — V4L2 packs into __u8 constraint_set_flags. */
+   __u8 cs = 0;
+   if (in->flags.constraint_set0_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET0_FLAG;
+   if (in->flags.constraint_set1_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET1_FLAG;
+   if (in->flags.constraint_set2_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET2_FLAG;
+   if (in->flags.constraint_set3_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET3_FLAG;
+   if (in->flags.constraint_set4_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET4_FLAG;
+   if (in->flags.constraint_set5_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET5_FLAG;
+   out->constraint_set_flags = cs;
+
+   /* Plain SPS flags — translated by spec name, NOT by bit position. */
+   __u32 f = 0;
+   if (in->flags.separate_colour_plane_flag)
+      f |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE;
+   if (in->flags.qpprime_y_zero_transform_bypass_flag)
+      f |= V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS;
+   if (in->flags.delta_pic_order_always_zero_flag)
+      f |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO;
+   if (in->flags.gaps_in_frame_num_value_allowed_flag)
+      f |= V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED;
+   if (in->flags.frame_mbs_only_flag)
+      f |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY;
+   if (in->flags.mb_adaptive_frame_field_flag)
+      f |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD;
+   if (in->flags.direct_8x8_inference_flag)
+      f |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE;
+   out->flags = f;
+   /*
+    * StdVideoH264SpsFlags also has: frame_cropping_flag,
+    * seq_scaling_matrix_present_flag, vui_parameters_present_flag.
+    * V4L2 has no equivalent SPS flags for these — frame cropping is
+    * communicated via S_FMT cropping rectangles, scaling matrix presence is
+    * carried in PPS's V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT, and VUI is
+    * not exposed at all. Intentionally dropped.
+    */
+}
+
+/* ------------------------------------------------------------------ */
+/* PPS                                                                */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Translate StdVideoH264PictureParameterSet → struct v4l2_ctrl_h264_pps.
+ *
+ * num_slice_groups_minus1: not in StdVideoH264PictureParameterSet at all
+ * (Vulkan H.264 video core profile excludes FMO). Set to 0 (one slice
+ * group, the only value FMO-free decoders accept).
+ *
+ * V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT: per kernel doc this should be
+ * set when a non-flat matrix applies to the picture. This function does
+ * NOT derive it from StdVideoH264PpsFlags::pic_scaling_matrix_present_flag;
+ * it sets the flag unconditionally, as the libva-v4l2-request-fourier
+ * reference does together with a default-flat matrix, because hantro's
+ * set_params reads the flag to drive G1_REG_DEC_CTRL2_TYPE1_QUANT_E.
+ * The caller must therefore always submit a SCALING_MATRIX control
+ * (explicit lists or the flat-16 default) alongside this PPS.
+ */
+void
+panvk_v4l2_h264_std_to_ctrl_pps(const StdVideoH264PictureParameterSet *in,
+                                struct v4l2_ctrl_h264_pps *out)
+{
+   memset(out, 0, sizeof(*out));
+
+   out->pic_parameter_set_id = in->pic_parameter_set_id;
+   out->seq_parameter_set_id = in->seq_parameter_set_id;
+   out->num_slice_groups_minus1 = 0; /* not exposed by StdVideo; H.264
+                                        core profile assumes 1 group. */
+   out->num_ref_idx_l0_default_active_minus1 = in->num_ref_idx_l0_default_active_minus1;
+   out->num_ref_idx_l1_default_active_minus1 = in->num_ref_idx_l1_default_active_minus1;
+   out->weighted_bipred_idc = (__u8) in->weighted_bipred_idc;
+   out->pic_init_qp_minus26 = in->pic_init_qp_minus26;
+   out->pic_init_qs_minus26 = in->pic_init_qs_minus26;
+   out->chroma_qp_index_offset = in->chroma_qp_index_offset;
+   out->second_chroma_qp_index_offset = in->second_chroma_qp_index_offset;
+
+   __u16 f = 0;
+   if (in->flags.entropy_coding_mode_flag)
+      f |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE;
+   if (in->flags.bottom_field_pic_order_in_frame_present_flag)
+      f |= V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT;
+   if (in->flags.weighted_pred_flag)
+      f |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED;
+   if (in->flags.deblocking_filter_control_present_flag)
+      f |= V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT;
+   if (in->flags.constrained_intra_pred_flag)
+      f |= V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED;
+   if (in->flags.redundant_pic_cnt_present_flag)
+      f |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT;
+   if (in->flags.transform_8x8_mode_flag)
+      f |= V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE;
+   /*
+    * V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT: set UNCONDITIONALLY.
+    *
+    * Hantro VPU2 (rockchip_vpu2_hw_h264_dec.c) and G1 both gate
+    * scaling_list ingestion on this flag (assemble_scaling_list in
+    * hantro_h264.c:215 short-circuits if clear, leaving the priv-table
+    * scaling region zero — dequant then computes 0 * quant = 0 pixels).
+    * libva-v4l2-request-fourier sets the flag together with the spec-
+    * default flat-16 matrix for this exact reason (h264.c:484 lineage).
+    * Validated empirically 2026-05-21: with flag clear hantro produces
+    * all-zero Y plane; with flag set + flat matrix it decodes BBB.
+    *
+    * Vulkan-side: the Std flag tracks the bitstream's pic_scaling_matrix
+    * _present_flag — useful for software decoders but irrelevant to the
+    * hantro hardware path. Always-on is safe here because the caller
+    * pairs this PPS with panvk_v4l2_h264_default_flat_scaling_matrix()
+    * (whose flat-16 values are themselves the H.264 §7.4.2.1.1.1
+    * fall-back when no explicit list is signalled).
+    */
+   f |= V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT;
+   out->flags = f;
+}
+
+/* ------------------------------------------------------------------ */
+/* SCALING_MATRIX                                                     */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Translate StdVideoH264ScalingLists → struct v4l2_ctrl_h264_scaling_matrix.
+ *
+ * StdVideoH264ScalingLists.ScalingList4x4[6][16]: 6 lists in raster order,
+ * indices 0..5 = Intra Y, Intra Cb, Intra Cr, Inter Y, Inter Cb, Inter Cr.
+ * V4L2's scaling_list_4x4[6][16] expects the SAME order (per kernel doc
+ * ext-ctrls-codec-stateless.rst). → straight memcpy.
+ *
+ * StdVideoH264ScalingLists.ScalingList8x8[6][64]: 6 lists in raster order
+ * per the H.264 spec table 7-2 ordering: Intra Y, Inter Y, Intra Cb,
+ * Inter Cb, Intra Cr, Inter Cr. V4L2's scaling_list_8x8[6][64] uses the
+ * SAME order per kernel doc. → straight memcpy.
+ *
+ * IMPORTANT — libva-v4l2-request-fourier's h264_va_matrix_to_v4l2 (h264.c
+ * line 544) does an unusual 8x8 remap (VAMatrix[0]→[0], VAMatrix[1]→[3])
+ * because VAIQMatrixBufferH264 only carries 2 of the 6 8x8 lists (Intra Y
+ * and Inter Y, for YUV420 streams). That's a libva *carrier* limitation,
+ * not a V4L2 ordering quirk — we do NOT replicate it here. Vulkan
+ * provides all 6 lists in spec order; we copy them straight.
+ *
+ * If a caller has no explicit lists, pass NULL — the caller writes a flat-16
+ * default (the H.264 §7.4.2.1.1.1 Flat_4x4_16 / Flat_8x8_16 defaults) into
+ * the struct directly, see panvk_v4l2_h264_default_flat_scaling_matrix.
+ */
+void
+panvk_v4l2_h264_std_to_ctrl_scaling_matrix(const StdVideoH264ScalingLists *in,
+                                           struct v4l2_ctrl_h264_scaling_matrix *out)
+{
+   memset(out, 0, sizeof(*out));
+   if (in == NULL) {
+      /* No explicit lists. An all-zero matrix is never a usable result: the
+       * PPS translation in this file always sets
+       * V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT, and its comment records
+       * that hantro then dequantises with zeros and emits all-zero pixels.
+       * Fall back to the spec-default Flat_16 lists instead; a caller that
+       * writes the flat default afterwards simply rewrites the same
+       * values. */
+      memset(out->scaling_list_4x4, 16, sizeof(out->scaling_list_4x4));
+      memset(out->scaling_list_8x8, 16, sizeof(out->scaling_list_8x8));
+      return;
+   }
+
+   /* Both sides use [6][16] / [6][64] in identical spec-table-7-2 order.
+    * NOTE(review): scaling_list_present_mask and
+    * use_default_scaling_matrix_mask of StdVideoH264ScalingLists are not
+    * consulted; all twelve lists are copied as-is — confirm callers always
+    * hand over fully-populated lists. */
+   memcpy(out->scaling_list_4x4, in->ScalingList4x4,
+          sizeof(out->scaling_list_4x4));
+   memcpy(out->scaling_list_8x8, in->ScalingList8x8,
+          sizeof(out->scaling_list_8x8));
+}
+
+/* Spec-default flat scaling matrix (every element = 16). Use when neither
+ * SPS::seq_scaling_matrix_present_flag nor PPS::pic_scaling_matrix_present_flag
+ * is set. The H.264 spec §7.4.2.1.1.1 defines Flat_4x4_16 and Flat_8x8_16
+ * as the fall-back; the kernel doc recommends always submitting the
+ * SCALING_MATRIX control with these defaults when explicit lists are
+ * absent (drivers like hantro G1 read it unconditionally).
+ * Only the two list arrays are written; `out` must be non-NULL. */
+void
+panvk_v4l2_h264_default_flat_scaling_matrix(
+   struct v4l2_ctrl_h264_scaling_matrix *out)
+{
+   memset(out->scaling_list_4x4, 16, sizeof(out->scaling_list_4x4));
+   memset(out->scaling_list_8x8, 16, sizeof(out->scaling_list_8x8));
+}
+
+/* ------------------------------------------------------------------ */
+/* DECODE_PARAMS                                                      */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Build v4l2_ctrl_h264_decode_params from Vulkan picture info.
+ *
+ * The caller supplies:
+ *   vs            — panvk_video_session; vs->dpb[slot].reference_ts is the
+ *                   V4L2 timestamp (v4l2_buffer.timestamp converted via
+ *                   v4l2_timeval_to_ns) of the previously-decoded CAPTURE
+ *                   buffer associated with that DPB slot index.
+ *   pic_info      — VkVideoDecodeH264PictureInfoKHR for the frame being
+ *                   decoded; pStdPictureInfo carries the per-pic fields.
+ *   active_pps    — the StdVideoH264PictureParameterSet bound for this
+ *                   decode; only needed if num_slice_groups_minus1 > 0 (FMO)
+ *                   which Vulkan core profile excludes. Currently unused.
+ * dst_dpb_slot — output (this-frame) DPB slot, supplied for symmetry; the + * current-frame fields go into the top-level + * v4l2_ctrl_h264_decode_params (NOT into dpb[]). The dpb[] + * array carries reference frames only. + * ref_slots — array of active reference DPB slots from the + * VkVideoDecodeInfoKHR::pReferenceSlots. Each entry's + * slotIndex selects vs->dpb[idx].reference_ts; the + * per-slot StdVideoDecodeH264ReferenceInfo is reachable + * via the VkVideoDecodeH264DpbSlotInfoKHR chained on + * pNext (caller has already resolved it — see helper + * below). + * num_ref_slots — count of entries in ref_slots[]. + * output_ts — V4L2 reference_ts assigned to the CAPTURE buffer for + * *this* frame; recorded into the SETUP slot mapping by + * the caller, not used in this struct itself. + * + * Fields the caller MUST populate post-translation: + * - dec_ref_pic_marking_bit_size + * - pic_order_cnt_bit_size + * - pic_order_cnt_lsb / delta_pic_order_cnt_bottom / delta_pic_order_cnt0 / + * delta_pic_order_cnt1 / idr_pic_id + * These come from the slice header bit-level parse (Vulkan doesn't + * forward them in StdVideoDecodeH264PictureInfo). The hantro G1 reads + * them from registers; without them the decoder produces zeros. + * See libva-v4l2-request-fourier h264.c:394-449 for the parse contract. + * + * - slice_group_change_cycle: from slice header, only meaningful when + * num_slice_groups_minus1 > 0 (not in Vulkan core profile). + */ + +/* Helper: extract StdVideoDecodeH264ReferenceInfo from a VkVideoReferenceSlotInfoKHR + * pNext chain. Returns NULL if the chain doesn't include + * VkVideoDecodeH264DpbSlotInfoKHR. 
*/
+static const StdVideoDecodeH264ReferenceInfo *
+ref_info_from_slot(const VkVideoReferenceSlotInfoKHR *slot)
+{
+   const VkBaseInStructure *p = (const VkBaseInStructure *) slot->pNext;
+   while (p != NULL) {
+      if (p->sType == VK_STRUCTURE_TYPE_VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR) {
+         const VkVideoDecodeH264DpbSlotInfoKHR *dpb =
+            (const VkVideoDecodeH264DpbSlotInfoKHR *) p;
+         return dpb->pStdReferenceInfo;
+      }
+      p = p->pNext;
+   }
+   return NULL;
+}
+
+void
+panvk_v4l2_h264_build_decode_params(
+   const struct panvk_video_session *vs,
+   const VkVideoDecodeH264PictureInfoKHR *pic_info,
+   const StdVideoH264PictureParameterSet *active_pps,
+   uint32_t dst_dpb_slot,
+   const VkVideoReferenceSlotInfoKHR *ref_slots,
+   uint32_t num_ref_slots,
+   uint64_t output_ts,
+   struct v4l2_ctrl_h264_decode_params *out)
+{
+   (void) active_pps;   /* FMO-only; not in Vulkan core profile. */
+   (void) dst_dpb_slot; /* Caller records output_ts → vs->dpb[slot] post-decode. */
+   (void) output_ts;    /* Same. */
+
+   memset(out, 0, sizeof(*out));
+
+   const StdVideoDecodeH264PictureInfo *spic = pic_info->pStdPictureInfo;
+
+   /* Current-frame top-level fields. pic_info->pStdPictureInfo is
+    * dereferenced unchecked, so the caller must have validated it. */
+   out->frame_num = spic->frame_num;
+   out->idr_pic_id = spic->idr_pic_id; /* may be overwritten by
+                                          slice-header parse. */
+   out->top_field_order_cnt = spic->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_TOP];
+   out->bottom_field_order_cnt= spic->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_BOTTOM];
+
+   /* nal_ref_idc: not in StdVideoDecodeH264PictureInfo. The caller derives
+    * it from the first byte of the slice NAL (high 2 bits after the
+    * forbidden-zero-bit). Hantro reads it via DECODE_PARAMS, so this
+    * SHOULD be set post-call. For non-reference frames the H.264 spec
+    * mandates nal_ref_idc == 0; we leave the field at zero and the caller
+    * patches in the parsed value. */
+   out->nal_ref_idc = 0;
+
+   __u32 f = 0;
+   if (spic->flags.IdrPicFlag)
+      f |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC;
+   if (spic->flags.field_pic_flag)
+      f |= V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC;
+   if (spic->flags.bottom_field_flag)
+      f |= V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD;
+   /* PFRAME/BFRAME flags are slice-type-derived and not 1:1 with Vulkan
+    * picture info (slice type is per-slice, not per-pic). Leave clear;
+    * the slice_header parse path or higher-level caller can OR them in if
+    * a FRAME_BASED driver needs them. Hantro G1 does not. */
+   out->flags = f;
+
+   /* DPB array. One entry per active reference slot: out->dpb[i] is filled
+    * from ref_slots[i], so a skipped slot leaves a zeroed (invalid) entry
+    * at position i rather than compacting the array. Slots at index
+    * V4L2_H264_NUM_DPB_ENTRIES and beyond are ignored. */
+   for (uint32_t i = 0; i < num_ref_slots && i < V4L2_H264_NUM_DPB_ENTRIES; i++) {
+      const VkVideoReferenceSlotInfoKHR *slot = &ref_slots[i];
+      struct v4l2_h264_dpb_entry *dpb = &out->dpb[i];
+
+      if (slot->slotIndex < 0)
+         continue; /* "no reference" sentinel; entry stays zeroed (invalid). */
+
+      const uint32_t idx = (uint32_t) slot->slotIndex;
+      if (idx >= 16 || !vs->dpb[idx].valid)
+         continue; /* 16 is presumably the capacity of vs->dpb[] — TODO confirm. */
+
+      const StdVideoDecodeH264ReferenceInfo *rinfo = ref_info_from_slot(slot);
+
+      dpb->reference_ts = vs->dpb[idx].reference_ts;
+
+      if (rinfo != NULL) {
+         dpb->frame_num = rinfo->FrameNum;
+         /* pic_num: for short-term refs this is FrameNumWrap (H.264 §8.2.4.1);
+          * for long-term refs it's LongTermPicNum (§8.2.4.2). StdVideo
+          * doesn't separate the two — FrameNum holds whichever applies for
+          * the kind of reference. The kernel reflist builder uses pic_num
+          * only for short-term ordering; we feed FrameNum straight through
+          * and rely on V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM to disambiguate.
+          *
+          * NOTE: FrameNumWrap requires knowing max_frame_num and the
+          * current frame's frame_num to wrap. Vulkan-side callers that
+          * want spec-perfect pic_num for short-term refs should override
+          * this field after calling. The hantro driver ignores pic_num
+          * (uses reference_ts) so the wrap is empirically not load-bearing
+          * on RK3566/RK3568.
+          */
+         dpb->pic_num = rinfo->FrameNum;
+         dpb->top_field_order_cnt = rinfo->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_TOP];
+         dpb->bottom_field_order_cnt = rinfo->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_BOTTOM];
+
+         __u32 dflags = V4L2_H264_DPB_ENTRY_FLAG_VALID | V4L2_H264_DPB_ENTRY_FLAG_ACTIVE;
+         if (rinfo->flags.used_for_long_term_reference)
+            dflags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM;
+         /* FIELD flag indicates a single-field-coded reference; both
+          * top_field_flag and bottom_field_flag in StdVideo mean the entry
+          * represents only that field.
+          * NOTE(review): rinfo->flags.is_non_existing is not consulted
+          * here — confirm whether such entries should still be ACTIVE. */
+         if (rinfo->flags.top_field_flag || rinfo->flags.bottom_field_flag)
+            dflags |= V4L2_H264_DPB_ENTRY_FLAG_FIELD;
+         dpb->flags = dflags;
+
+         /* fields: per kernel doc, valid values are V4L2_H264_{TOP,BOTTOM,FRAME}_REF.
+          * For frame-coded refs we use FRAME_REF (TOP|BOTTOM). The kernel
+          * reflist builder skips entries with fields == 0 — see hantro and
+          * the v4l2_h264_init_reflist_builder helper. When both Std field
+          * flags are set the entry is also reported as FRAME_REF. */
+         if (rinfo->flags.top_field_flag && !rinfo->flags.bottom_field_flag)
+            dpb->fields = V4L2_H264_TOP_FIELD_REF;
+         else if (rinfo->flags.bottom_field_flag && !rinfo->flags.top_field_flag)
+            dpb->fields = V4L2_H264_BOTTOM_FIELD_REF;
+         else
+            dpb->fields = V4L2_H264_FRAME_REF;
+      } else {
+         /* No StdVideoDecodeH264ReferenceInfo chained: minimal fallback —
+          * the entry is VALID (reference_ts is set) but not ACTIVE, and
+          * frame_num / pic_num / the POCs stay zero. */
+         dpb->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID;
+         dpb->fields = V4L2_H264_FRAME_REF;
+      }
+   }
+}
diff -urN a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c
--- a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c 1970-01-01 01:00:00.000000000 +0100
+++ b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c 2026-05-22 10:17:41.214043265 +0200
@@ -0,0 +1,314 @@
+/*
+ * H.264 slice header bit-parser implementation.
+ * + * Verbatim port of libva-v4l2-request-fourier src/h264_slice_header.c + * with the public symbol renamed to panvk_v4l2_h264_parse_slice_header() + * and the type names prefixed for Mesa namespace hygiene. See + * panvk_v4l2_h264_slice_header.h for context. + * + * SPDX-License-Identifier: MIT + */ + +#include "panvk_v4l2_h264_slice_header.h" + +#include +#include + +struct br { + const uint8_t *data; + size_t length; /* bytes */ + size_t bit_pos; + bool error; +}; + +static uint32_t br_read_u(struct br *b, unsigned n) +{ + uint32_t v = 0; + while (n--) { + if (b->bit_pos >= b->length * 8) { + b->error = true; + return 0; + } + v = (v << 1) | ((b->data[b->bit_pos >> 3] >> + (7 - (b->bit_pos & 7))) & 1u); + b->bit_pos++; + } + return v; +} + +static uint32_t br_read_ue(struct br *b) +{ + unsigned zeros = 0; + while (br_read_u(b, 1) == 0) { + if (b->error || ++zeros >= 32) + return 0; + } + if (zeros == 0) + return 0; + return (1u << zeros) - 1u + br_read_u(b, zeros); +} + +static int32_t br_read_se(struct br *b) +{ + uint32_t v = br_read_ue(b); + if (v & 1u) + return (int32_t)((v + 1u) >> 1); + return -(int32_t)(v >> 1); +} + +#define PANVK_H264_SLICE_HEADER_SCAN_BYTES 64 + +static size_t rbsp_unescape(uint8_t *out, const uint8_t *in, size_t in_len) +{ + size_t out_len = 0; + int zero_run = 0; + size_t i; + size_t cap = in_len < PANVK_H264_SLICE_HEADER_SCAN_BYTES ? + in_len : PANVK_H264_SLICE_HEADER_SCAN_BYTES; + + for (i = 0; i < cap; i++) { + if (zero_run >= 2 && in[i] == 0x03) { + zero_run = 0; + continue; + } + out[out_len++] = in[i]; + zero_run = (in[i] == 0x00) ? 
zero_run + 1 : 0; + } + return out_len; +} + +static void skip_ref_pic_list_modification(struct br *b, uint32_t slice_type) +{ + uint32_t st_mod5 = slice_type % 5; + + if (st_mod5 != 2 && st_mod5 != 4) { + uint32_t flag = br_read_u(b, 1); + if (flag) { + uint32_t mod_idc; + do { + mod_idc = br_read_ue(b); + if (mod_idc == 0 || mod_idc == 1) + br_read_ue(b); + else if (mod_idc == 2) + br_read_ue(b); + if (b->error) + return; + } while (mod_idc != 3); + } + } + if (st_mod5 == 1) { + uint32_t flag = br_read_u(b, 1); + if (flag) { + uint32_t mod_idc; + do { + mod_idc = br_read_ue(b); + if (mod_idc == 0 || mod_idc == 1) + br_read_ue(b); + else if (mod_idc == 2) + br_read_ue(b); + if (b->error) + return; + } while (mod_idc != 3); + } + } +} + +static void skip_pred_weight_table(struct br *b, + uint32_t slice_type, + uint8_t chroma_format_idc, + uint8_t bit_depth_luma_minus8, + uint8_t bit_depth_chroma_minus8, + uint32_t num_ref_idx_l0_active_minus1, + uint32_t num_ref_idx_l1_active_minus1) +{ + uint32_t i, j; + uint32_t st_mod5 = slice_type % 5; + + (void)bit_depth_luma_minus8; + (void)bit_depth_chroma_minus8; + + br_read_ue(b); /* luma_log2_weight_denom */ + if (chroma_format_idc != 0) + br_read_ue(b); /* chroma_log2_weight_denom */ + + for (i = 0; i <= num_ref_idx_l0_active_minus1 && !b->error; i++) { + uint32_t luma_weight_l0_flag = br_read_u(b, 1); + if (luma_weight_l0_flag) { + br_read_se(b); + br_read_se(b); + } + if (chroma_format_idc != 0) { + uint32_t chroma_weight_l0_flag = br_read_u(b, 1); + if (chroma_weight_l0_flag) { + for (j = 0; j < 2; j++) { + br_read_se(b); + br_read_se(b); + } + } + } + } + + if (st_mod5 == 1) { + for (i = 0; i <= num_ref_idx_l1_active_minus1 && !b->error; i++) { + uint32_t luma_weight_l1_flag = br_read_u(b, 1); + if (luma_weight_l1_flag) { + br_read_se(b); + br_read_se(b); + } + if (chroma_format_idc != 0) { + uint32_t chroma_weight_l1_flag = br_read_u(b, 1); + if (chroma_weight_l1_flag) { + for (j = 0; j < 2; j++) { + br_read_se(b); 
+ br_read_se(b); + } + } + } + } + } +} + +int panvk_v4l2_h264_parse_slice_header( + const uint8_t *nal_payload, + size_t nal_payload_length, + const struct panvk_v4l2_h264_slice_header_context *ctx, + struct panvk_v4l2_h264_slice_header_info *out) +{ + uint8_t unescaped[PANVK_H264_SLICE_HEADER_SCAN_BYTES]; + size_t unescaped_len; + struct br b = { 0 }; + bool idr_pic_flag = (ctx->nal_unit_type == 5); + uint32_t slice_type; + uint32_t num_ref_idx_l0_active_minus1; + uint32_t num_ref_idx_l1_active_minus1; + size_t pic_order_cnt_start; + size_t pic_order_cnt_end; + size_t dec_ref_pic_marking_start; + size_t dec_ref_pic_marking_end; + bool field_pic_flag = false; + + memset(out, 0, sizeof(*out)); + + if (!nal_payload || nal_payload_length == 0) + return -EINVAL; + + unescaped_len = rbsp_unescape(unescaped, nal_payload, nal_payload_length); + if (unescaped_len < 2) + return -EINVAL; + + b.data = unescaped; + b.length = unescaped_len; + b.bit_pos = 0; + b.error = false; + + out->first_mb_in_slice = br_read_ue(&b); + slice_type = br_read_ue(&b); + out->slice_type = slice_type; + out->pic_parameter_set_id = br_read_ue(&b); + + if (ctx->separate_colour_plane_flag) + (void)br_read_u(&b, 2); + + out->frame_num = br_read_u(&b, ctx->log2_max_frame_num_minus4 + 4u); + + if (!ctx->frame_mbs_only_flag) { + field_pic_flag = (br_read_u(&b, 1) != 0); + if (field_pic_flag) + (void)br_read_u(&b, 1); + } + + if (idr_pic_flag) + out->idr_pic_id = (uint16_t)br_read_ue(&b); + + pic_order_cnt_start = b.bit_pos; + if (ctx->pic_order_cnt_type == 0) { + out->pic_order_cnt_lsb = (uint16_t)br_read_u( + &b, ctx->log2_max_pic_order_cnt_lsb_minus4 + 4u); + if (ctx->bottom_field_pic_order_in_frame_present_flag && !field_pic_flag) + out->delta_pic_order_cnt_bottom = br_read_se(&b); + } else if (ctx->pic_order_cnt_type == 1 && + !ctx->delta_pic_order_always_zero_flag) { + out->delta_pic_order_cnt0 = br_read_se(&b); + if (ctx->bottom_field_pic_order_in_frame_present_flag && !field_pic_flag) + 
out->delta_pic_order_cnt1 = br_read_se(&b); + } + pic_order_cnt_end = b.bit_pos; + out->pic_order_cnt_bit_size = + (uint32_t)(pic_order_cnt_end - pic_order_cnt_start); + + if (ctx->redundant_pic_cnt_present_flag) + (void)br_read_ue(&b); + + if (slice_type % 5 == 1) + (void)br_read_u(&b, 1); + + num_ref_idx_l0_active_minus1 = ctx->num_ref_idx_l0_default_active_minus1; + num_ref_idx_l1_active_minus1 = ctx->num_ref_idx_l1_default_active_minus1; + + { + uint32_t st = slice_type % 5; + if (st == 0 || st == 3 || st == 1) { + uint32_t override = br_read_u(&b, 1); + if (override) { + num_ref_idx_l0_active_minus1 = br_read_ue(&b); + if (st == 1) + num_ref_idx_l1_active_minus1 = br_read_ue(&b); + } + } + } + + skip_ref_pic_list_modification(&b, slice_type); + if (b.error) + return -EIO; + + { + uint32_t st = slice_type % 5; + bool do_pwt = + (ctx->weighted_pred_flag && (st == 0 || st == 3)) || + (ctx->weighted_bipred_idc == 1 && st == 1); + if (do_pwt) { + skip_pred_weight_table(&b, slice_type, + ctx->chroma_format_idc, + ctx->bit_depth_luma_minus8, + ctx->bit_depth_chroma_minus8, + num_ref_idx_l0_active_minus1, + num_ref_idx_l1_active_minus1); + if (b.error) + return -EIO; + } + } + + dec_ref_pic_marking_start = b.bit_pos; + if (ctx->nal_ref_idc != 0) { + if (idr_pic_flag) { + (void)br_read_u(&b, 1); + (void)br_read_u(&b, 1); + } else { + uint32_t adaptive = br_read_u(&b, 1); + if (adaptive) { + uint32_t mmco; + do { + mmco = br_read_ue(&b); + if (mmco == 1 || mmco == 3) + br_read_ue(&b); + if (mmco == 2) + br_read_ue(&b); + if (mmco == 3 || mmco == 6) + br_read_ue(&b); + if (mmco == 4) + br_read_ue(&b); + if (b.error) + return -EIO; + } while (mmco != 0); + } + } + } + dec_ref_pic_marking_end = b.bit_pos; + out->dec_ref_pic_marking_bit_size = + (uint32_t)(dec_ref_pic_marking_end - dec_ref_pic_marking_start); + + if (b.error) + return -EIO; + + return 0; +} diff -urN a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h 
--- a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h 1970-01-01 01:00:00.000000000 +0100 +++ b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h 2026-05-22 10:17:41.214043265 +0200 @@ -0,0 +1,94 @@ +/* + * H.264 slice header bit-parser for panvk-bifrost-video / V4L2 stateless + * H.264 decode (hantro G1 on RK3566/RK3568 Mali-Bifrost SBCs). + * + * Extracts the slice-header bit-position and value fields that + * V4L2_CID_STATELESS_H264_DECODE_PARAMS requires (idr_pic_id, + * pic_order_cnt_lsb, delta_pic_order_cnt_*, pic_order_cnt_bit_size, + * dec_ref_pic_marking_bit_size). Vulkan's StdVideoDecodeH264PictureInfo + * does not carry these — they live only in the bitstream's slice_header() + * syntax. Hantro G1 (drivers/media/platform/verisilicon/ + * hantro_g1_h264_dec.c::set_params) writes the bit_size fields directly + * into MMIO registers G1_REG_DEC_CTRL5_REFPIC_MK_LEN and + * G1_REG_DEC_CTRL6_POC_LENGTH; with zeros the hardware bitstream parser + * walks past zero bits, lands on garbage, decodes nothing. + * + * Spec reference: ITU-T Rec. H.264 (08/2024) §7.3.3 slice_header + * and §7.3.3.1 ref_pic_list_modification, §7.3.3.2 pred_weight_table, + * §7.3.3.3 dec_ref_pic_marking. + * + * Cross-reference (proven working on hantro G1): libva-v4l2-request-fourier + * src/h264_slice_header.{c,h}. This file is a verbatim port with the + * function renamed from h264_parse_slice_header() to + * panvk_v4l2_h264_parse_slice_header() for namespace hygiene inside Mesa. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef PANVK_V4L2_H264_SLICE_HEADER_H +#define PANVK_V4L2_H264_SLICE_HEADER_H + +#include +#include +#include + +struct panvk_v4l2_h264_slice_header_context { + /* From SPS (the active SPS at slice-time). 
*/ + bool separate_colour_plane_flag; + uint8_t log2_max_frame_num_minus4; + bool frame_mbs_only_flag; + uint8_t pic_order_cnt_type; + uint8_t log2_max_pic_order_cnt_lsb_minus4; + bool delta_pic_order_always_zero_flag; + + /* From PPS (the active PPS at slice-time). */ + bool bottom_field_pic_order_in_frame_present_flag; + bool redundant_pic_cnt_present_flag; + bool weighted_pred_flag; + uint8_t weighted_bipred_idc; + uint8_t num_ref_idx_l0_default_active_minus1; + uint8_t num_ref_idx_l1_default_active_minus1; + uint8_t chroma_format_idc; + uint8_t bit_depth_luma_minus8; + uint8_t bit_depth_chroma_minus8; + + /* From the NAL unit header (already extracted by the caller). */ + uint8_t nal_unit_type; + uint8_t nal_ref_idc; +}; + +struct panvk_v4l2_h264_slice_header_info { + uint16_t idr_pic_id; + uint16_t pic_order_cnt_lsb; + int32_t delta_pic_order_cnt_bottom; + int32_t delta_pic_order_cnt0; + int32_t delta_pic_order_cnt1; + uint32_t pic_order_cnt_bit_size; + uint32_t dec_ref_pic_marking_bit_size; + + /* Diagnostic — useful for cross-checking pre-parsed vs bitstream values. */ + uint32_t first_mb_in_slice; + uint32_t slice_type; + uint32_t pic_parameter_set_id; + uint32_t frame_num; +}; + +/* + * Parse slice_header() up to dec_ref_pic_marking() (inclusive) of the + * H.264 RBSP slice_layer_without_partitioning_rbsp() syntax, extracting + * the V4L2 DECODE_PARAMS fields. Returns 0 on success, negative + * errno-shaped value on parse failure. + * + * @nal_payload: pointer to the byte AFTER the NAL header byte + * (i.e. start of the RBSP proper; caller has already + * skipped any ANNEX_B start code and the 1-byte + * nal_unit_header). Will be RBSP-unescaped internally. + * @nal_payload_length: bytes available at @nal_payload. 
+ */ +int panvk_v4l2_h264_parse_slice_header( + const uint8_t *nal_payload, + size_t nal_payload_length, + const struct panvk_v4l2_h264_slice_header_context *ctx, + struct panvk_v4l2_h264_slice_header_info *out); + +#endif /* PANVK_V4L2_H264_SLICE_HEADER_H */ diff -urN a/src/panfrost/vulkan/panvk_video_decode.c b/src/panfrost/vulkan/panvk_video_decode.c --- a/src/panfrost/vulkan/panvk_video_decode.c 1970-01-01 01:00:00.000000000 +0100 +++ b/src/panfrost/vulkan/panvk_video_decode.c 2026-05-22 10:17:41.214043265 +0200 @@ -0,0 +1,380 @@ +/* + * panvk-bifrost-video: Vulkan video decode entrypoints (H.264). + * + * Drives the V4L2 stateless hantro VPU backend (panvk_v4l2.c) from + * Vulkan vkCmdDecodeVideoKHR. Decode is synchronous at record time — + * the full V4L2 ioctl dance runs to completion inside the command- + * recording call before returning to the application. The queue-side + * `driver_submit` is a no-op signal-everything (see panvk_vX_device.c). + * + * Phase 1 simplifications worth knowing about: + * + * - Cmd-buffer state lives at the DEVICE level (`active_video`) under + * a single mutex, NOT per-cmd-buffer. Concurrent video sessions on + * the same device clobber each other. Sufficient for current single- + * session consumers (mpv-fourier, ffmpeg-vulkan-h264, vk-video- + * samples). Spec-compliant multi-session is a Phase >>1 follow-up. + * + * - Source bitstream is read via `src_buf->mem->addr.host`, i.e. the + * bound VkDeviceMemory's CPU mapping. Works because panvk-bifrost + * only exposes HOST_VISIBLE memory types; an app that bound the + * bitstream buffer to non-HOST_VISIBLE memory would get a logged + * error and a silent decode skip (CmdDecodeVideoKHR is void, so we + * have no clean error-return path). VkPhysicalDeviceVideo* + * constraints would be the right place to make this contractual. + * + * - Requires `PAN_I_WANT_A_BROKEN_VULKAN_DRIVER=1` (mesa-upstream gate + * on panvk-on-Bifrost which is not conformant). 
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "panvk_video_decode.h"
+#include "panvk_v4l2_h264_slice_header.h"
+#include "panvk_buffer.h"
+#include "panvk_device.h"
+#include "panvk_device_memory.h"
+#include "panvk_entrypoints.h"
+#include "panvk_image.h"
+
+#include "vk_image.h"
+
+#include "vk_alloc.h"
+#include "vk_command_buffer.h"
+#include "vk_log.h"
+#include "vk_video.h"
+
+#include "util/macros.h"
+
+#include "kmod/pan_kmod.h"
+
+#include
+#include
+#include
+/*
+ * vkCreateVideoSessionKHR: allocate a panvk_video_session, initialise the
+ * common vk_video_session state, then bring up the V4L2 backend sized to
+ * pCreateInfo->maxCodedExtent. Each failure path releases what was set up
+ * before it; a V4L2 failure is logged and reported as
+ * VK_ERROR_INITIALIZATION_FAILED.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+panvk_CreateVideoSessionKHR(VkDevice _device,
+                            const VkVideoSessionCreateInfoKHR *pCreateInfo,
+                            const VkAllocationCallbacks *pAllocator,
+                            VkVideoSessionKHR *pVideoSession)
+{
+   VK_FROM_HANDLE(panvk_device, device, _device);
+
+   struct panvk_video_session *vs =
+      vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*vs), 8,
+                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!vs)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   VkResult r = vk_video_session_init(&device->vk, &vs->vk, pCreateInfo);
+   if (r != VK_SUCCESS) {
+      vk_free2(&device->vk.alloc, pAllocator, vs);
+      return r;
+   }
+
+   vs->video_fd = -1;
+   vs->media_fd = -1;
+   vs->slice_based = false;
+
+   int v4l2_rc = panvk_v4l2_session_init(vs, &device->vk, pAllocator,
+                                         pCreateInfo->maxCodedExtent.width,
+                                         pCreateInfo->maxCodedExtent.height);
+   if (v4l2_rc) {
+      mesa_loge("panvk_video: V4L2 session init failed rc=%d", v4l2_rc);
+      vk_video_session_finish(&vs->vk);
+      vk_free2(&device->vk.alloc, pAllocator, vs);
+      return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+   }
+
+   *pVideoSession = panvk_video_session_to_handle(vs);
+   return VK_SUCCESS;
+}
+/*
+ * vkDestroyVideoSessionKHR: no-op on a NULL handle. Otherwise drops the
+ * device-level active_video reference if it still points at this session,
+ * then tears down the V4L2 backend, the common session state and the
+ * allocation, in that order.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_DestroyVideoSessionKHR(VkDevice _device,
+                             VkVideoSessionKHR videoSession,
+                             const VkAllocationCallbacks *pAllocator)
+{
+   VK_FROM_HANDLE(panvk_device, device, _device);
+   VK_FROM_HANDLE(panvk_video_session, vs, videoSession);
+   if (!vs) return;
+
+   /* Clear device-level active state if it was this session. */
+   simple_mtx_lock(&device->active_video.lock);
+   if (device->active_video.vs == vs) {
+      device->active_video.vs = NULL;
+      device->active_video.params = NULL;
+   }
+   simple_mtx_unlock(&device->active_video.lock);
+
+   panvk_v4l2_session_finish(vs, &device->vk, pAllocator);
+   vk_video_session_finish(&vs->vk);
+   vk_free2(&device->vk.alloc, pAllocator, vs);
+}
+/*
+ * vkGetVideoSessionMemoryRequirementsKHR: reports that the session needs no
+ * application-bound memory. The count is always set to 0 and
+ * pMemoryRequirements is never written.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+panvk_GetVideoSessionMemoryRequirementsKHR(
+   VkDevice device,
+   VkVideoSessionKHR videoSession,
+   uint32_t *pMemoryRequirementsCount,
+   VkVideoSessionMemoryRequirementsKHR *pMemoryRequirements)
+{
+   *pMemoryRequirementsCount = 0;
+   return VK_SUCCESS;
+}
+/*
+ * vkBindVideoSessionMemoryKHR: accepts any bind request and does nothing
+ * with it.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+panvk_BindVideoSessionMemoryKHR(
+   VkDevice device,
+   VkVideoSessionKHR videoSession,
+   uint32_t bindSessionMemoryInfoCount,
+   const VkBindVideoSessionMemoryInfoKHR *pBindSessionMemoryInfos)
+{
+   return VK_SUCCESS;
+}
+
+/* Helper: device lookup from VkCommandBuffer via the vk_command_buffer base.
+ * Works from arch-agnostic code because it only touches the common
+ * vk_command_buffer, not the per-arch panvk_cmd_buffer. */
+static struct panvk_device *
+cmdbuf_to_device(VkCommandBuffer commandBuffer)
+{
+   VK_FROM_HANDLE(vk_command_buffer, vk_cmdbuf, commandBuffer);
+   return to_panvk_device(vk_cmdbuf->base.device);
+}
+/*
+ * vkCmdBeginVideoCodingKHR: publishes the session, and the parameters
+ * object if one is given (NULL otherwise), as the device-wide active video
+ * state under active_video.lock. This takes effect when the command is
+ * recorded, and any previously active session is overwritten.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_CmdBeginVideoCodingKHR(VkCommandBuffer commandBuffer,
+                             const VkVideoBeginCodingInfoKHR *pBeginInfo)
+{
+   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
+   VK_FROM_HANDLE(panvk_video_session, vs, pBeginInfo->videoSession);
+
+   simple_mtx_lock(&device->active_video.lock);
+   device->active_video.vs = vs;
+   if (pBeginInfo->videoSessionParameters != VK_NULL_HANDLE) {
+      VK_FROM_HANDLE(vk_video_session_parameters, params,
+                     pBeginInfo->videoSessionParameters);
+      device->active_video.params = params;
+   } else {
+      device->active_video.params = NULL;
+   }
+   simple_mtx_unlock(&device->active_video.lock);
+}
+/*
+ * vkCmdEndVideoCodingKHR: clears the device-wide active video state,
+ * whichever session it currently refers to.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_CmdEndVideoCodingKHR(VkCommandBuffer commandBuffer,
+                           const VkVideoEndCodingInfoKHR *pEndCodingInfo)
+{
+   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
+   simple_mtx_lock(&device->active_video.lock);
+   device->active_video.vs = NULL;
+   device->active_video.params = NULL;
+   simple_mtx_unlock(&device->active_video.lock);
+}
+/*
+ * vkCmdControlVideoCodingKHR: on VK_VIDEO_CODING_CONTROL_RESET_BIT_KHR,
+ * marks DPB slots 0..15 of the active session invalid. Other control flags
+ * are ignored, and nothing happens when no session is active.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_CmdControlVideoCodingKHR(VkCommandBuffer commandBuffer,
+                               const VkVideoCodingControlInfoKHR *pCodingControlInfo)
+{
+   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
+   simple_mtx_lock(&device->active_video.lock);
+   if (device->active_video.vs &&
+       (pCodingControlInfo->flags & VK_VIDEO_CODING_CONTROL_RESET_BIT_KHR)) {
+      for (unsigned i = 0; i < 16; i++) /* 16: presumably the vs->dpb[] capacity — TODO confirm */
+         device->active_video.vs->dpb[i].valid = false;
+   }
+   simple_mtx_unlock(&device->active_video.lock);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+panvk_CmdDecodeVideoKHR(VkCommandBuffer commandBuffer,
+                        const VkVideoDecodeInfoKHR *pDecodeInfo)
+{
+   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
+
+   simple_mtx_lock(&device->active_video.lock);
+   struct panvk_video_session *vs = device->active_video.vs;
+   struct vk_video_session_parameters *params = device->active_video.params;
+   simple_mtx_unlock(&device->active_video.lock);
+
+   if (!vs || !params) {
+      mesa_loge("panvk_video: CmdDecodeVideoKHR outside Begin/End scope");
+      return;
+   }
+
+   const VkVideoDecodeH264PictureInfoKHR *h264_pi =
+      vk_find_struct_const(pDecodeInfo->pNext,
+                           VIDEO_DECODE_H264_PICTURE_INFO_KHR);
+   if (!h264_pi || !h264_pi->pStdPictureInfo) {
+      mesa_loge("panvk_video: missing H.264 picture info");
+      return;
+   }
+
+   const StdVideoH264SequenceParameterSet *sps =
+      vk_video_find_h264_dec_std_sps(params,
+                                     h264_pi->pStdPictureInfo->seq_parameter_set_id);
+   const StdVideoH264PictureParameterSet *pps =
+      vk_video_find_h264_dec_std_pps(params,
+                                     h264_pi->pStdPictureInfo->pic_parameter_set_id);
+   if (!sps || !pps) {
+      mesa_loge("panvk_video: SPS or PPS lookup failed");
+      return;
+   }
+
+   /* Translate Std → V4L2 control structs.
*/ + struct v4l2_ctrl_h264_sps c_sps; + struct v4l2_ctrl_h264_pps c_pps; + struct v4l2_ctrl_h264_scaling_matrix c_scaling; + struct v4l2_ctrl_h264_decode_params c_dec; + + panvk_v4l2_h264_std_to_ctrl_sps(sps, &c_sps); + panvk_v4l2_h264_std_to_ctrl_pps(pps, &c_pps); + panvk_v4l2_h264_default_flat_scaling_matrix(&c_scaling); + + /* + * output_ts: V4L2 buffer-identity stamp. Must round-trip cleanly through + * (tv_sec, tv_usec) at QBUF time, because hantro's reflist builder + * matches dpb[i].reference_ts against the kernel-side CAPTURE timestamp + * (which is the OUTPUT-QBUF timestamp re-derived via v4l2_timeval_to_ns: + * `tv_sec * 1e9 + tv_usec * 1e3`). Sub-microsecond bits are dropped, so + * any high-resolution stamp (e.g. a 64-bit pointer cast) makes the + * lookup miss and P/B frames decode against zero references. Use a + * per-session monotonic counter in microseconds (i.e. * 1000 ns) so + * concurrent sessions sharing /dev/video1 don't collide on stamp. + */ + const uint64_t output_ts = ((uint64_t)++vs->ts_counter) * 1000ULL; + uint32_t dst_dpb_slot = pDecodeInfo->pSetupReferenceSlot + ? (uint32_t) pDecodeInfo->pSetupReferenceSlot->slotIndex : 0u; + + panvk_v4l2_h264_build_decode_params(vs, h264_pi, pps, + dst_dpb_slot, + pDecodeInfo->pReferenceSlots, + pDecodeInfo->referenceSlotCount, + output_ts, &c_dec); + + /* Resolve source bitstream CPU pointer via panvk_buffer.mem.addr.host. 
*/ + VK_FROM_HANDLE(panvk_buffer, src_buf, pDecodeInfo->srcBuffer); + if (!src_buf || !src_buf->mem || !src_buf->mem->addr.host) { + mesa_loge("panvk_video: src buffer has no host map"); + return; + } + const void *src_bitstream = + (const uint8_t *) src_buf->mem->addr.host + + src_buf->mem_offset + pDecodeInfo->srcBufferOffset; + + /* + * Slice-header bit-level parse — recovers the DECODE_PARAMS fields + * that StdVideoDecodeH264PictureInfo doesn't carry: idr_pic_id, + * pic_order_cnt_lsb, delta_pic_order_cnt_*, pic_order_cnt_bit_size, + * dec_ref_pic_marking_bit_size, and nal_ref_idc. Hantro G1 writes the + * bit_size fields directly into MMIO registers G1_REG_DEC_CTRL5/CTRL6; + * with zeros the hardware bitstream parser walks past zero bits, lands + * on garbage, and decodes all-zero pixels — observed empirically as the + * "Y plane all zeros" symptom that closed the prior Commit 7e. + * + * Cross-reference (proven fix on hantro): libva-v4l2-request-fourier + * src/h264.c:394-449. The panvk_v4l2_h264_slice_header.{c,h} parser + * is a verbatim port with namespace renames. + * + * Expects ANNEX_B start-code-prefixed VCL NAL at *src_bitstream*. We + * skip the 3- or 4-byte start code then the 1-byte NAL header. + */ + { + const uint8_t *bs = (const uint8_t *) src_bitstream; + uint32_t bs_len = pDecodeInfo->srcBufferRange; + uint32_t off = 0; + /* Skip ANNEX_B start code (0x00 00 01 or 0x00 00 00 01). 
*/ + if (bs_len >= 4 && bs[0] == 0 && bs[1] == 0 && bs[2] == 0 && bs[3] == 1) + off = 4; + else if (bs_len >= 3 && bs[0] == 0 && bs[1] == 0 && bs[2] == 1) + off = 3; + + if (bs_len > off + 1) { + uint8_t nal_hdr = bs[off]; + uint8_t nal_ref_idc = (nal_hdr >> 5) & 0x3; + uint8_t nal_unit_type = nal_hdr & 0x1f; + + const struct panvk_v4l2_h264_slice_header_context sh_ctx = { + .separate_colour_plane_flag = + (sps->flags.separate_colour_plane_flag != 0), + .log2_max_frame_num_minus4 = sps->log2_max_frame_num_minus4, + .frame_mbs_only_flag = (sps->flags.frame_mbs_only_flag != 0), + .pic_order_cnt_type = (uint8_t) sps->pic_order_cnt_type, + .log2_max_pic_order_cnt_lsb_minus4 = + sps->log2_max_pic_order_cnt_lsb_minus4, + .delta_pic_order_always_zero_flag = + (sps->flags.delta_pic_order_always_zero_flag != 0), + .bottom_field_pic_order_in_frame_present_flag = + (pps->flags.bottom_field_pic_order_in_frame_present_flag != 0), + .redundant_pic_cnt_present_flag = + (pps->flags.redundant_pic_cnt_present_flag != 0), + .weighted_pred_flag = + (pps->flags.weighted_pred_flag != 0), + .weighted_bipred_idc = (uint8_t) pps->weighted_bipred_idc, + .num_ref_idx_l0_default_active_minus1 = + pps->num_ref_idx_l0_default_active_minus1, + .num_ref_idx_l1_default_active_minus1 = + pps->num_ref_idx_l1_default_active_minus1, + .chroma_format_idc = (uint8_t) sps->chroma_format_idc, + .bit_depth_luma_minus8 = sps->bit_depth_luma_minus8, + .bit_depth_chroma_minus8 = sps->bit_depth_chroma_minus8, + .nal_unit_type = nal_unit_type, + .nal_ref_idc = nal_ref_idc, + }; + struct panvk_v4l2_h264_slice_header_info sh = { 0 }; + const uint8_t *nal_payload = bs + off + 1; /* past NAL header byte */ + uint32_t nal_payload_len = bs_len - (off + 1); + + int sh_rc = panvk_v4l2_h264_parse_slice_header( + nal_payload, nal_payload_len, &sh_ctx, &sh); + if (sh_rc == 0) { + c_dec.idr_pic_id = sh.idr_pic_id; + c_dec.pic_order_cnt_lsb = sh.pic_order_cnt_lsb; + c_dec.delta_pic_order_cnt_bottom = 
sh.delta_pic_order_cnt_bottom; + c_dec.delta_pic_order_cnt0 = sh.delta_pic_order_cnt0; + c_dec.delta_pic_order_cnt1 = sh.delta_pic_order_cnt1; + c_dec.pic_order_cnt_bit_size = sh.pic_order_cnt_bit_size; + c_dec.dec_ref_pic_marking_bit_size = + sh.dec_ref_pic_marking_bit_size; + c_dec.nal_ref_idc = nal_ref_idc; + /* + * IDR_PIC flag: Vulkan's StdVideoDecodeH264PictureInfo.flags. + * IdrPicFlag is application-supplied and the vk-video-samples + * parser leaves it zero. Recover it from nal_unit_type (==5 is + * IDR per H.264 §7.4.1). Without this flag set, hantro's + * VDPU_REG_IDR_PIC_E stays clear and the hardware treats the + * frame as P/B, hunts for references it doesn't have, and + * writes zero output. + */ + if (nal_unit_type == 5) + c_dec.flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC; + } else { + mesa_loge("panvk_video: slice_header parse FAILED rc=%d " + "(payload_len=%u) — DECODE_PARAMS bit_size fields " + "left zero, hantro will produce zeros", + sh_rc, nal_payload_len); + } + } else { + mesa_loge("panvk_video: bitstream too short for NAL header " + "(bs_len=%u off=%u)", bs_len, off); + } + } + + /* The 14-step ioctl dance synchronously. CPU-copy variant for Phase 1. */ + int rc = panvk_v4l2_submit_h264_decode(vs, &c_sps, &c_pps, &c_scaling, + &c_dec, + src_bitstream, + pDecodeInfo->srcBufferRange, + -1, /* dst unused (MMAP CAPTURE) */ + output_ts); + + if (rc) { + mesa_loge("panvk_video: decode submit failed rc=%d", rc); + return; + } + + /* Update DPB tracking. */ + if (dst_dpb_slot < 16) { + vs->dpb[dst_dpb_slot].valid = true; + vs->dpb[dst_dpb_slot].reference_ts = output_ts; + } +} diff -urN a/src/panfrost/vulkan/panvk_video_decode.h b/src/panfrost/vulkan/panvk_video_decode.h --- a/src/panfrost/vulkan/panvk_video_decode.h 1970-01-01 01:00:00.000000000 +0100 +++ b/src/panfrost/vulkan/panvk_video_decode.h 2026-05-22 10:17:41.214043265 +0200 @@ -0,0 +1,124 @@ +/* + * panvk-bifrost-video Phase 4 commit 3: extended for V4L2 state. 
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef PANVK_VIDEO_DECODE_H
+#define PANVK_VIDEO_DECODE_H
+
+#include "vk_video.h"
+#include "vk_object.h"
+
+#include
+
+/* Forward decls */
+struct panvk_device;
+struct vk_device;
+
+/* iter1: per-session state. Wraps vk_video_session for spec-mandated fields. */
+struct panvk_video_session {
+   struct vk_video_session vk;
+
+   /* V4L2 fds — opened in Commit 3 (per-session). -1 means not opened. */
+   int video_fd;
+   int media_fd;
+
+   /* Negotiated formats per OUTPUT / CAPTURE queue */
+   struct v4l2_format fmt_output;
+   struct v4l2_format fmt_capture;
+
+   /* Request fd pool. PANVK_V4L2_REQUEST_FD_COUNT entries.
+    * Size of request_fd_used[] is bounded by the same compile-time max;
+    * keep them coupled to avoid silent overflow if the pool grows. */
+#define PANVK_VIDEO_REQUEST_FD_MAX 32
+   int *request_fds;
+   bool request_fd_used[PANVK_VIDEO_REQUEST_FD_MAX];
+   unsigned num_request_fds;
+   uint32_t request_fd_next; /* round-robin index */
+
+   /* Per-session V4L2 buffer-identity counter. Multiplied by 1000 ns at
+    * QBUF time so the stamp round-trips losslessly through (tv_sec,
+    * tv_usec) — hantro's reflist builder matches dpb[i].reference_ts
+    * against the kernel-side OUTPUT timestamp. Per-session (not process-
+    * global) so concurrent sessions sharing /dev/video1 don't collide. */
+   uint32_t ts_counter;
+
+   /* DPB slotIndex → V4L2 reference_ts mapping (Phase 1 D5) */
+   struct {
+      bool valid;
+      uint64_t reference_ts;
+   } dpb[16];
+
+   /* Phase 1 lock — FRAME_BASED only. */
+   bool slice_based;
+
+   /* Multi-planar V4L2 buffer type? Detected at session init via
+    * V4L2_CAP_VIDEO_M2M_MPLANE. Hantro: true. rkvdec on rk3399: false. */
+   bool mplane;
+
+   /* iter1 commit 7c: V4L2 buffer counts + round-robin indices.
+    * Both queues use MMAP; bitstream copied CPU-side from VkBuffer host map. */
+   uint32_t num_output_buffers;
+   uint32_t output_next;
+   void *output_map[18]; /* mmap'd OUTPUT buffer CPU pointers */
+   uint32_t output_map_size[18];
+   uint32_t num_capture_buffers;
+   uint32_t capture_next;
+   void *capture_map[18]; /* mmap'd CAPTURE buffer CPU pointers */
+   uint32_t capture_map_size[18];
+};
+
+VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_video_session, vk.base, VkVideoSessionKHR,
+                               VK_OBJECT_TYPE_VIDEO_SESSION_KHR)
+
+/* panvk_v4l2.c API */
+bool panvk_v4l2_probe_hantro(void);
+int panvk_v4l2_session_init(struct panvk_video_session *vs,
+                            struct vk_device *vk_dev,
+                            const VkAllocationCallbacks *alloc,
+                            uint32_t width, uint32_t height);
+void panvk_v4l2_session_finish(struct panvk_video_session *vs,
+                               struct vk_device *vk_dev,
+                               const VkAllocationCallbacks *alloc);
+
+/* 14-step ioctl dance for one H.264 frame. */
+struct v4l2_ctrl_h264_sps;
+struct v4l2_ctrl_h264_pps;
+struct v4l2_ctrl_h264_scaling_matrix;
+struct v4l2_ctrl_h264_decode_params;
+
+int panvk_v4l2_submit_h264_decode(
+   struct panvk_video_session *vs,
+   const struct v4l2_ctrl_h264_sps *sps,
+   const struct v4l2_ctrl_h264_pps *pps,
+   const struct v4l2_ctrl_h264_scaling_matrix *scaling,
+   const struct v4l2_ctrl_h264_decode_params *dec,
+   const void *src_bitstream, uint32_t src_bytes,
+   int dst_dmabuf_fd_unused,
+   uint64_t qbuf_ts);
+
+/* panvk_v4l2_h264.c — Std → V4L2 control translation API (signatures
+ * use full types; consumers must include vk_video headers before this). */
+void panvk_v4l2_h264_std_to_ctrl_sps(
+   const StdVideoH264SequenceParameterSet *in,
+   struct v4l2_ctrl_h264_sps *out);
+void panvk_v4l2_h264_std_to_ctrl_pps(
+   const StdVideoH264PictureParameterSet *in,
+   struct v4l2_ctrl_h264_pps *out);
+void panvk_v4l2_h264_std_to_ctrl_scaling_matrix(
+   const StdVideoH264ScalingLists *in,
+   struct v4l2_ctrl_h264_scaling_matrix *out);
+void panvk_v4l2_h264_default_flat_scaling_matrix(
+   struct v4l2_ctrl_h264_scaling_matrix *out);
+void panvk_v4l2_h264_build_decode_params(
+   const struct panvk_video_session *vs,
+   const VkVideoDecodeH264PictureInfoKHR *pic_info,
+   const StdVideoH264PictureParameterSet *active_pps,
+   uint32_t dst_dpb_slot,
+   const VkVideoReferenceSlotInfoKHR *ref_slots,
+   uint32_t num_ref_slots,
+   uint64_t output_ts,
+   struct v4l2_ctrl_h264_decode_params *out);
+
+#endif /* PANVK_VIDEO_DECODE_H */
diff -urN a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c
--- a/src/panfrost/vulkan/panvk_vX_device.c 2026-05-21 22:46:57.505785441 +0200
+++ b/src/panfrost/vulkan/panvk_vX_device.c 2026-05-22 10:17:41.214043265 +0200
@@ -203,6 +203,27 @@
    }
 }
 
+/* iter1: map a Vulkan-visible queueFamilyIndex to its panvk enum slot by
+ * skipping hidden families. Returns PANVK_QUEUE_FAMILY_COUNT if out of range. */
+static inline enum panvk_queue_family
+panvk_per_arch(vulkan_qfi_to_panvk)(
+   const struct panvk_physical_device *physical_device, uint32_t vulkan_qfi)
+{
+   uint32_t pos = 0;
+   for (uint32_t i = 0; i < PANVK_QUEUE_FAMILY_COUNT; i++) {
+      if (i == PANVK_QUEUE_FAMILY_BIND &&
+          !physical_device->vk.supported_features.sparseBinding)
+         continue;
+      if (i == PANVK_QUEUE_FAMILY_VIDEO_DECODE &&
+          !physical_device->vk.supported_extensions.KHR_video_queue)
+         continue;
+      if (pos == vulkan_qfi)
+         return (enum panvk_queue_family) i;
+      pos++;
+   }
+   return PANVK_QUEUE_FAMILY_COUNT;
+}
+
 static VkResult
 check_global_priority(const struct panvk_physical_device *phys_dev,
                       const VkDeviceQueueCreateInfo *create_info)
@@ -215,7 +236,10 @@
       priority_info ? priority_info->globalPriority
                     : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
 
-   switch (create_info->queueFamilyIndex) {
+   const enum panvk_queue_family panvk_qfi =
+      panvk_per_arch(vulkan_qfi_to_panvk)(phys_dev,
+                                          create_info->queueFamilyIndex);
+   switch (panvk_qfi) {
    case PANVK_QUEUE_FAMILY_GPU: {
       enum pan_kmod_group_allow_priority_flags requested_prio =
          global_priority_to_group_allow_priority_flag(priority);
@@ -242,6 +266,12 @@
       return VK_ERROR_NOT_PERMITTED_KHR;
    }
 
+   case PANVK_QUEUE_FAMILY_VIDEO_DECODE:
+      /* iter1: only MEDIUM priority for now */
+      return priority == VK_QUEUE_GLOBAL_PRIORITY_MEDIUM
+                ? VK_SUCCESS
+                : VK_ERROR_NOT_PERMITTED_KHR;
+
    default:
       UNREACHABLE("Unknown queue family");
    }
@@ -250,11 +280,20 @@
 static VkResult
 panvk_queue_check_status(struct vk_queue *queue)
 {
-   switch (queue->queue_family_index) {
+   struct panvk_device *dev =
+      container_of(queue->base.device, struct panvk_device, vk);
+   struct panvk_physical_device *pdev =
+      to_panvk_physical_device(dev->vk.physical);
+   const enum panvk_queue_family panvk_qfi =
+      panvk_per_arch(vulkan_qfi_to_panvk)(pdev, queue->queue_family_index);
+   switch (panvk_qfi) {
    case PANVK_QUEUE_FAMILY_GPU:
       return panvk_per_arch(gpu_queue_check_status)(queue);
    case PANVK_QUEUE_FAMILY_BIND:
       return panvk_per_arch(bind_queue_check_status)(queue);
+   case PANVK_QUEUE_FAMILY_VIDEO_DECODE:
+      /* iter1: stub — commit 4 implements real status check. */
+      return VK_SUCCESS;
    default:
       UNREACHABLE("Unknown queue family");
    }
@@ -297,18 +336,52 @@
 }
 
 static VkResult
+panvk_video_queue_submit_noop(struct vk_queue *queue,
+                              struct vk_queue_submit *submit)
+{
+   /* All decode work was done synchronously in vkCmdDecodeVideoKHR; the
+    * queue-side submit only has to satisfy the Vulkan fence/semaphore
+    * contract by signaling everything. Waits are guaranteed satisfied by
+    * the time the runtime calls us. */
+   return vk_sync_signal_many(queue->base.device, submit->signal_count,
+                              submit->signals);
+}
+
+static VkResult
 panvk_queue_create(struct panvk_device *dev,
                    const VkDeviceQueueCreateInfo *create_info,
                    uint32_t queue_idx,
                    struct vk_queue **out_queue)
 {
-   switch (create_info->queueFamilyIndex) {
+   struct panvk_physical_device *pdev =
+      to_panvk_physical_device(dev->vk.physical);
+   const enum panvk_queue_family panvk_qfi =
+      panvk_per_arch(vulkan_qfi_to_panvk)(pdev, create_info->queueFamilyIndex);
+   switch (panvk_qfi) {
    case PANVK_QUEUE_FAMILY_GPU:
       return panvk_per_arch(create_gpu_queue)(
          dev, create_info, queue_idx, out_queue);
    case PANVK_QUEUE_FAMILY_BIND:
       return panvk_per_arch(create_bind_queue)(
          dev, create_info, queue_idx, out_queue);
+   case PANVK_QUEUE_FAMILY_VIDEO_DECODE: {
+      /* Decode work is fully synchronous at record time (CmdDecodeVideoKHR
+       * drives the V4L2 14-step dance to completion). At submit time there
+       * is nothing left to dispatch, so we honor the Vulkan contract by
+       * just signaling everything. */
+      struct vk_queue *vkq = vk_zalloc(&dev->vk.alloc, sizeof(*vkq), 8,
+                                       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      if (!vkq)
+         return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
+      VkResult vqr = vk_queue_init(vkq, &dev->vk, create_info, queue_idx);
+      if (vqr != VK_SUCCESS) {
+         vk_free(&dev->vk.alloc, vkq);
+         return panvk_error(dev, vqr);
+      }
+      vkq->driver_submit = panvk_video_queue_submit_noop;
+      *out_queue = vkq;
+      return VK_SUCCESS;
+   }
    default:
       return panvk_error(dev, VK_ERROR_INITIALIZATION_FAILED);
    }
@@ -317,13 +390,26 @@
 static void
 panvk_queue_destroy(struct vk_queue *queue)
 {
-   switch (queue->queue_family_index) {
+   struct panvk_device *dev =
+      container_of(queue->base.device, struct panvk_device, vk);
+   struct panvk_physical_device *pdev =
+      to_panvk_physical_device(dev->vk.physical);
+   const enum panvk_queue_family panvk_qfi =
+      panvk_per_arch(vulkan_qfi_to_panvk)(pdev, queue->queue_family_index);
+   switch (panvk_qfi) {
    case PANVK_QUEUE_FAMILY_GPU:
       panvk_per_arch(destroy_gpu_queue)(queue);
       break;
    case PANVK_QUEUE_FAMILY_BIND:
       panvk_per_arch(destroy_bind_queue)(queue);
       break;
+   case PANVK_QUEUE_FAMILY_VIDEO_DECODE: {
+      /* Plain vk_queue from panvk_queue_create(): finish it and free it with
+       * the device allocator, reusing the `dev` resolved at function entry. */
+      vk_queue_finish(queue);
+      vk_free(&dev->vk.alloc, queue);
+      break;
+   }
    default:
       UNREACHABLE("Unknown queue family");
    }
@@ -511,6 +597,7 @@
 
    vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd);
 
+   simple_mtx_init(&device->active_video.lock, mtx_plain);
    result = panvk_precomp_init(device);
    if (result != VK_SUCCESS)
       goto err_free_priv_bos;
@@ -542,7 +629,13 @@
       if (result != VK_SUCCESS)
          goto err_finish_queues;
 
-      uint32_t qfi = queue_create->queueFamilyIndex;
+      uint32_t vulkan_qfi = queue_create->queueFamilyIndex;
+      enum panvk_queue_family qfi =
+         panvk_per_arch(vulkan_qfi_to_panvk)(physical_device, vulkan_qfi);
+      if (qfi >= PANVK_QUEUE_FAMILY_COUNT) {
+         result = panvk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+         goto err_finish_queues;
+      }
       struct panvk_device_queue_family *qf = &device->queue_families[qfi];
 
       qf->queues =
diff -urN a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c
--- a/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-21 22:46:59.273811425 +0200
+++ b/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-22 10:17:41.214043265 +0200
@@ -12,6 +12,7 @@
 #include
 
 #include "git_sha1.h"
+#include "panvk_video_decode.h"
 
 #include "vk_android.h"
 #include "vk_device.h"
@@ -170,6 +171,14 @@
       .EXT_queue_family_foreign = true,
       .EXT_robustness2 = true,
       .EXT_transform_feedback = PAN_ARCH < 9, /* iter13: JM-class only for now */
+      /* Video extensions are advertised only when (a) we're on a Bifrost
+       * arch (PAN_ARCH < 9) AND (b) a hantro VPU is reachable on the
+       * expected V4L2 nodes — otherwise CreateVideoSessionKHR would
+       * succeed at the panvk layer and then fail at v4l2_open_fds, giving
+       * the app a misleading capability claim. */
+      .KHR_video_queue = PAN_ARCH < 9 && panvk_v4l2_probe_hantro(),
+      .KHR_video_decode_queue = PAN_ARCH < 9 && panvk_v4l2_probe_hantro(),
+      .KHR_video_decode_h264 = PAN_ARCH < 9 && panvk_v4l2_probe_hantro(),
       .EXT_sampler_filter_minmax = PAN_ARCH >= 10,
       .EXT_scalar_block_layout = true,
       .EXT_separate_stencil_usage = true,