diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml index 66e966f6b..3ff16ad6a 100644 --- a/.gitea/workflows/build.yml +++ b/.gitea/workflows/build.yml @@ -1420,3 +1420,142 @@ jobs: - name: wipe secrets if: always() run: rm -f /root/repo_pass /root/.ssh/id_ed25519 + + # Own step: a second `run:` key on the step above would be a duplicate mapping key. + - name: wipe hertz deploy key + if: always() + run: rm -f /root/.ssh/id_ed25519_hertz + + # ------------------------------------------------------------------------- + # mesa-panvk-bifrost-video (aarch64 only) — sibling adding VK_KHR_video_decode_h264 + # via the V4L2 hantro VPU. Phase 4 byte-exact validated 2026-05-21. + # Co-installs at /usr/lib/panvk-bifrost-video/ (parallel to r4); opt-in + # via VK_ICD_FILENAMES (no launcher shipped — uses standard Vulkan loader). + # + # Build is slow (~30-60min on actrunner-aarch64): full Mesa-from-source. + # Standalone job — no `needs:` since it doesn't depend on the fourier + # codec stack. continue-on-error so a build hiccup doesn't block other + # jobs in the same workflow run. + # ------------------------------------------------------------------------- + mesa-panvk-bifrost-video-aarch64: + runs-on: arch-aarch64 + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: skip if already published + id: skip-check + run: | + set -e + result=$(./.gitea/scripts/check-already-published.sh arch/mesa-panvk-bifrost-video) + echo "$result" >> "$GITHUB_OUTPUT" + echo "decision: $result" + + - name: bootstrap runner (idempotent) + if: steps.skip-check.outputs.skip != '1' + run: | + set -e + retry() { for i in 1 2 3; do "$@" && return 0; rc=$?; echo "retry $i (exit=$rc)" >&2; sleep $((i*5)); done; return 1; } + retry pacman -Syu --noconfirm --needed base-devel git rsync gnupg openssh sudo + + - name: import signing key + if: steps.skip-check.outputs.skip != '1' + env: + PRIV: ${{ secrets.MARFRIT_REPO_PRIVATE_KEY }} + PASS: ${{ secrets.MARFRIT_REPO_PASSPHRASE }} + run: | + set -e + gpgconf --homedir /root/.gnupg --kill all 2>/dev/null || true + rm -rf /root/.gnupg /root/repo_pass + 
mkdir -m700 -p /root/.gnupg + printf '%s' "$PASS" > /root/repo_pass + chmod 600 /root/repo_pass + printf '%s\n' "$PRIV" | gpg --batch --import + echo "92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C:6:" | gpg --import-ownertrust + + - name: install deploy ssh key + if: steps.skip-check.outputs.skip != '1' + env: + KEY: ${{ secrets.MARFRIT_REPO_DEPLOY_KEY }} + run: | + mkdir -m700 -p /root/.ssh + printf '%s\n' "$KEY" > /root/.ssh/id_ed25519 + chmod 600 /root/.ssh/id_ed25519 + ssh-keyscan -t ed25519 nc.reauktion.de > /root/.ssh/known_hosts 2>/dev/null + + - name: makepkg mesa-panvk-bifrost-video + if: steps.skip-check.outputs.skip != '1' + run: | + set -e + rm -rf /tmp/build-mesa-panvk-bifrost-video + cp -r arch/mesa-panvk-bifrost-video /tmp/build-mesa-panvk-bifrost-video + chown -R builder:builder /tmp/build-mesa-panvk-bifrost-video + cd /tmp/build-mesa-panvk-bifrost-video + # MAKEFLAGS for parallel build; runner is multi-core. + # --skipinteg because sha256sums=SKIP in PKGBUILD (matches the + # fourier-fork PKGBUILD convention). 
+ sudo -u builder -H env MAKEFLAGS="-j$(nproc)" \ + makepkg --nocheck --noconfirm --syncdeps --cleanbuild --skipinteg + ls -la *.pkg.tar.* | grep -v "\.sig$" + + - name: sign mesa-panvk-bifrost-video + if: steps.skip-check.outputs.skip != '1' + run: | + set -e + cd /tmp/build-mesa-panvk-bifrost-video + for f in *.pkg.tar.xz *.pkg.tar.zst *.pkg.tar.gz; do + [ -f "$f" ] || continue + gpg --batch --pinentry-mode loopback --passphrase-file /root/repo_pass \ + --detach-sign --yes -u 92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C "$f" + done + + - name: update aarch64 repo db + if: steps.skip-check.outputs.skip != '1' + run: | + set -e + mkdir -p /tmp/arch-stage-mesa-panvk-video + cd /tmp/arch-stage-mesa-panvk-video + rm -f * + for f in marfrit.db.tar.gz marfrit.db.tar.gz.sig marfrit.files.tar.gz marfrit.files.tar.gz.sig; do + curl -sSLf --retry 3 --retry-delay 5 "https://packages.reauktion.de/arch/aarch64/$f" -o "$f" || rm -f "$f" + done + for ext in xz zst gz; do + ls /tmp/build-mesa-panvk-bifrost-video/*.pkg.tar.$ext 2>/dev/null && \ + mv /tmp/build-mesa-panvk-bifrost-video/*.pkg.tar.$ext /tmp/build-mesa-panvk-bifrost-video/*.pkg.tar.$ext.sig . 
+ done || true + export GNUPGHOME=/root/.gnupg + printf 'pinentry-mode loopback\npassphrase-file /root/repo_pass\n' > /root/.gnupg/gpg.conf + printf 'allow-loopback-pinentry\n' > /root/.gnupg/gpg-agent.conf + gpg-connect-agent reloadagent /bye + pkgs=() + for ext in xz zst gz; do + for f in *.pkg.tar.$ext; do [ -f "$f" ] && pkgs+=("$f"); done + done + if [ -f marfrit.db.tar.gz ]; then + for f in "${pkgs[@]}"; do + name=$(echo "$f" | sed -E 's/-[0-9].*//') + repo-remove --sign --key 92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C \ + marfrit.db.tar.gz "$name" 2>/dev/null || true + done + fi + repo-add --new --sign --key 92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C \ + --verify marfrit.db.tar.gz "${pkgs[@]}" + ln -sf marfrit.db.tar.gz marfrit.db + ln -sf marfrit.files.tar.gz marfrit.files + ln -sf marfrit.db.tar.gz.sig marfrit.db.sig + rm -f marfrit.files.sig + + - name: publish to aarch64 + if: steps.skip-check.outputs.skip != '1' + run: | + set -e + retry() { for i in 1 2 3; do "$@" && return 0; rc=$?; echo "retry $i (exit=$rc)" >&2; sleep $((i*5)); done; return 1; } + cd /tmp/arch-stage-mesa-panvk-video + retry rsync -avL --copy-unsafe-links \ + -e 'ssh -i /root/.ssh/id_ed25519' \ + ./ mfritsche@nc.reauktion.de:arch/aarch64/ + + - name: wipe secrets + if: always() + run: rm -f /root/repo_pass /root/.ssh/id_ed25519 + diff --git a/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch b/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch new file mode 120000 index 000000000..0dcf8589b --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch @@ -0,0 +1 @@ +../mesa-panvk-bifrost/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch \ No newline at end of file diff --git a/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch b/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch new file mode 120000 
index 000000000..1d7a265b9 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch @@ -0,0 +1 @@ +../mesa-panvk-bifrost/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch \ No newline at end of file diff --git a/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch b/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch new file mode 120000 index 000000000..7aebd6f38 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch @@ -0,0 +1 @@ +../mesa-panvk-bifrost/0003-panvk-bifrost-vk-ext-transform-feedback.patch \ No newline at end of file diff --git a/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch b/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch new file mode 120000 index 000000000..e9ba2ffbb --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch @@ -0,0 +1 @@ +../mesa-panvk-bifrost/0004-panvk-bifrost-xfb-primitive-decomposition.patch \ No newline at end of file diff --git a/arch/mesa-panvk-bifrost-video/0005-panvk-bifrost-video-KHR-video-decode-h264.patch b/arch/mesa-panvk-bifrost-video/0005-panvk-bifrost-video-KHR-video-decode-h264.patch new file mode 100644 index 000000000..e00b3c48a --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0005-panvk-bifrost-video-KHR-video-decode-h264.patch @@ -0,0 +1,2385 @@ +diff -urN a/src/panfrost/vulkan/jm/panvk_cmd_buffer.h b/src/panfrost/vulkan/jm/panvk_cmd_buffer.h +--- a/src/panfrost/vulkan/jm/panvk_cmd_buffer.h 2026-05-21 22:46:57.477785029 +0200 ++++ b/src/panfrost/vulkan/jm/panvk_cmd_buffer.h 2026-05-21 22:47:09.189957157 +0200 +@@ -88,8 +88,18 @@ + struct panvk_cmd_compute_state compute; + struct panvk_push_constant_state push_constants; + } state; ++ ++ /* iter1: panvk-bifrost-video — current bound video session + params ++ * scoped by 
vkCmdBeginVideoCodingKHR..vkCmdEndVideoCodingKHR. */ ++ struct { ++ struct panvk_video_session *vs; ++ struct vk_video_session_parameters *params; ++ } video; + }; + ++struct panvk_video_session; ++struct vk_video_session_parameters; ++ + VK_DEFINE_HANDLE_CASTS(panvk_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) + +diff -urN a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build +--- a/src/panfrost/vulkan/meson.build 2026-05-21 22:46:59.277811484 +0200 ++++ b/src/panfrost/vulkan/meson.build 2026-05-21 22:47:09.189957157 +0200 +@@ -41,6 +41,10 @@ + 'panvk_device_memory.c', + 'panvk_host_copy.c', + 'panvk_image.c', ++ 'panvk_video_decode.c', ++ 'panvk_v4l2.c', ++ 'panvk_v4l2_h264.c', ++ 'panvk_v4l2_h264_slice_header.c', + 'panvk_instance.c', + 'panvk_mempool.c', + 'panvk_physical_device.c', +diff -urN a/src/panfrost/vulkan/panvk_buffer.c b/src/panfrost/vulkan/panvk_buffer.c +--- a/src/panfrost/vulkan/panvk_buffer.c 2026-05-21 22:46:57.485785147 +0200 ++++ b/src/panfrost/vulkan/panvk_buffer.c 2026-05-21 22:47:09.189957157 +0200 +@@ -88,6 +88,8 @@ + *bind_status->pResult = VK_SUCCESS; + + buffer->vk.device_address = mem->addr.dev + pBindInfos[i].memoryOffset; ++ buffer->mem = mem; ++ buffer->mem_offset = pBindInfos[i].memoryOffset; + } + return VK_SUCCESS; + } +diff -urN a/src/panfrost/vulkan/panvk_buffer.h b/src/panfrost/vulkan/panvk_buffer.h +--- a/src/panfrost/vulkan/panvk_buffer.h 2026-05-21 22:46:57.485785147 +0200 ++++ b/src/panfrost/vulkan/panvk_buffer.h 2026-05-21 22:47:09.189957157 +0200 +@@ -14,8 +14,14 @@ + + struct panvk_priv_bo; + ++struct panvk_device_memory; + struct panvk_buffer { + struct vk_buffer vk; ++ ++ /* iter1: panvk-bifrost-video — bound memory tracking for dmabuf export. ++ * Set in panvk_BindBufferMemory. NULL until bound. 
*/ ++ struct panvk_device_memory *mem; ++ uint64_t mem_offset; + }; + + VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_buffer, vk.base, VkBuffer, +diff -urN a/src/panfrost/vulkan/panvk_device.h b/src/panfrost/vulkan/panvk_device.h +--- a/src/panfrost/vulkan/panvk_device.h 2026-05-21 22:46:57.489785206 +0200 ++++ b/src/panfrost/vulkan/panvk_device.h 2026-05-21 22:47:09.189957157 +0200 +@@ -45,6 +45,8 @@ + enum panvk_queue_family { + PANVK_QUEUE_FAMILY_GPU, + PANVK_QUEUE_FAMILY_BIND, ++ /* iter1: video decode via V4L2-stateless hantro (PAN_ARCH < 9, runtime-gated). */ ++ PANVK_QUEUE_FAMILY_VIDEO_DECODE, + PANVK_QUEUE_FAMILY_COUNT, + }; + +@@ -97,6 +99,23 @@ + + struct panvk_device_queue_family queue_families[PANVK_QUEUE_FAMILY_COUNT]; + ++ /* iter1: Phase 1 simplification — device-level active video session + ++ * params, set by Cmd{Begin,End}VideoCodingKHR. Single-session Phase 1 ++ * scope; per-cmdbuf state-tracking lives in Phase >>1 once per-arch ++ * cmd_buffer access from arch-agnostic source is wired. */ ++ struct { ++ simple_mtx_t lock; ++ struct panvk_video_session *vs; ++ struct vk_video_session_parameters *params; ++ } active_video; ++ /* iter1: Vulkan-visible queue family index ↔ panvk_qfi enum mapping. ++ * Needed because hideable families create gaps between the enum slot ++ * and the position the Vulkan loader sees. Populated at vkCreateDevice ++ * from pCreateInfo->pQueueCreateInfos[].queueFamilyIndex by querying ++ * physical-device queue family properties (which is what was advertised). 
*/ ++ uint8_t vulkan_to_panvk_qfi[PANVK_QUEUE_FAMILY_COUNT]; ++ uint8_t num_vulkan_qfi; ++ + struct panvk_precomp_cache *precomp_cache; + + struct { +diff -urN a/src/panfrost/vulkan/panvk_physical_device.c b/src/panfrost/vulkan/panvk_physical_device.c +--- a/src/panfrost/vulkan/panvk_physical_device.c 2026-05-21 22:46:57.497785323 +0200 ++++ b/src/panfrost/vulkan/panvk_physical_device.c 2026-05-21 22:47:09.189957157 +0200 +@@ -577,12 +577,22 @@ + .queueFlags = VK_QUEUE_SPARSE_BINDING_BIT, + .queueCount = 1, + }, ++ [PANVK_QUEUE_FAMILY_VIDEO_DECODE] = { ++ /* iter1: video decode + transfer (Vulkan spec requires VIDEO families ++ * to also advertise TRANSFER for bitstream-buffer copies). */ ++ .queueFlags = VK_QUEUE_VIDEO_DECODE_BIT_KHR | VK_QUEUE_TRANSFER_BIT, ++ .queueCount = 1, ++ .minImageTransferGranularity = {1, 1, 1}, ++ }, + }; + + for (uint32_t family = 0; family < ARRAY_SIZE(qfamily_props); family++) { + if (family == PANVK_QUEUE_FAMILY_BIND && + !physical_device->vk.supported_features.sparseBinding) +- break; ++ continue; /* iter1: was break, but a later family (VIDEO_DECODE) must be reachable */ ++ if (family == PANVK_QUEUE_FAMILY_VIDEO_DECODE && ++ !physical_device->vk.supported_extensions.KHR_video_queue) ++ continue; + + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { + p->queueFamilyProperties = qfamily_props[family]; +@@ -591,6 +601,16 @@ + vk_find_struct(p->pNext, QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR); + if (prio) + panvk_fill_global_priority(physical_device, family, prio); ++ ++ /* iter1: VK_KHR_video_queue advertises codec ops per family. 
*/ ++ VkQueueFamilyVideoPropertiesKHR *vid = ++ vk_find_struct(p->pNext, QUEUE_FAMILY_VIDEO_PROPERTIES_KHR); ++ if (vid) { ++ vid->videoCodecOperations = 0; ++ if (family == PANVK_QUEUE_FAMILY_VIDEO_DECODE) ++ vid->videoCodecOperations |= ++ VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR; ++ } + } + } + } +@@ -1558,3 +1578,87 @@ + .compatibleHandleTypes = handle_types, + }; + } ++ ++/* panvk-bifrost-video Phase 4 commit 2: ++ * Per-physical-device video capability + format reporting. */ ++ ++#include "vk_video.h" ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++panvk_GetPhysicalDeviceVideoCapabilitiesKHR( ++ VkPhysicalDevice physicalDevice, ++ const VkVideoProfileInfoKHR *pVideoProfile, ++ VkVideoCapabilitiesKHR *pCapabilities) ++{ ++ /* iter1: H.264 only; degrade to UNSUPPORTED for anything else. */ ++ if (pVideoProfile->videoCodecOperation != ++ VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR) ++ return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR; ++ ++ pCapabilities->flags = 0; ++ pCapabilities->minBitstreamBufferOffsetAlignment = 16; ++ pCapabilities->minBitstreamBufferSizeAlignment = 16; ++ pCapabilities->pictureAccessGranularity.width = 16; ++ pCapabilities->pictureAccessGranularity.height = 16; ++ pCapabilities->minCodedExtent.width = 16; ++ pCapabilities->minCodedExtent.height = 16; ++ /* RK3566 hantro max H.264 4Kp30 in spec; constrain to 1080p safe baseline ++ * until Commit 6 queries real V4L2 format-size limits. 
*/ ++ pCapabilities->maxCodedExtent.width = 1920; ++ pCapabilities->maxCodedExtent.height = 1088; ++ pCapabilities->maxDpbSlots = 16; ++ pCapabilities->maxActiveReferencePictures = 16; ++ pCapabilities->stdHeaderVersion.extensionName[0] = 0; ++ strcpy(pCapabilities->stdHeaderVersion.extensionName, ++ VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_EXTENSION_NAME); ++ pCapabilities->stdHeaderVersion.specVersion = ++ VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_SPEC_VERSION; ++ ++ VkVideoDecodeCapabilitiesKHR *dec_caps = ++ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_CAPABILITIES_KHR); ++ if (dec_caps) { ++ /* Hantro outputs to a CAPTURE buffer separate from the DPB; expose ++ * DISTINCT mode. (COINCIDE would be the GPU-engine-DPB-as-output mode.) */ ++ dec_caps->flags = ++ VK_VIDEO_DECODE_CAPABILITY_DPB_AND_OUTPUT_DISTINCT_BIT_KHR; ++ } ++ ++ VkVideoDecodeH264CapabilitiesKHR *h264_caps = ++ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H264_CAPABILITIES_KHR); ++ if (h264_caps) { ++ h264_caps->maxLevelIdc = STD_VIDEO_H264_LEVEL_IDC_4_2; ++ h264_caps->fieldOffsetGranularity.x = 0; ++ h264_caps->fieldOffsetGranularity.y = 0; ++ } ++ ++ return VK_SUCCESS; ++} ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++panvk_GetPhysicalDeviceVideoFormatPropertiesKHR( ++ VkPhysicalDevice physicalDevice, ++ const VkPhysicalDeviceVideoFormatInfoKHR *pVideoFormatInfo, ++ uint32_t *pVideoFormatPropertyCount, ++ VkVideoFormatPropertiesKHR *pVideoFormatProperties) ++{ ++ /* iter1: NV12 (8:8:0 4:2:0 2-plane) is the only format hantro emits. 
*/ ++ VK_OUTARRAY_MAKE_TYPED(VkVideoFormatPropertiesKHR, out, ++ pVideoFormatProperties, pVideoFormatPropertyCount); ++ ++ vk_outarray_append_typed(VkVideoFormatPropertiesKHR, &out, p) { ++ p->format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; ++ p->imageType = VK_IMAGE_TYPE_2D; ++ p->imageTiling = VK_IMAGE_TILING_OPTIMAL; ++ p->imageUsageFlags = pVideoFormatInfo->imageUsage & ++ (VK_IMAGE_USAGE_VIDEO_DECODE_DST_BIT_KHR | ++ VK_IMAGE_USAGE_VIDEO_DECODE_DPB_BIT_KHR | ++ VK_IMAGE_USAGE_SAMPLED_BIT | ++ VK_IMAGE_USAGE_TRANSFER_SRC_BIT); ++ p->componentMapping.r = VK_COMPONENT_SWIZZLE_IDENTITY; ++ p->componentMapping.g = VK_COMPONENT_SWIZZLE_IDENTITY; ++ p->componentMapping.b = VK_COMPONENT_SWIZZLE_IDENTITY; ++ p->componentMapping.a = VK_COMPONENT_SWIZZLE_IDENTITY; ++ } ++ ++ return vk_outarray_status(&out); ++} +diff -urN a/src/panfrost/vulkan/panvk_v4l2.c b/src/panfrost/vulkan/panvk_v4l2.c +--- a/src/panfrost/vulkan/panvk_v4l2.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/src/panfrost/vulkan/panvk_v4l2.c 2026-05-21 22:47:09.189957157 +0200 +@@ -0,0 +1,569 @@ ++/* ++ * panvk-bifrost-video Phase 4 commit 3: ++ * ++ * V4L2-stateless hantro VPU bridge for panvk video decode sessions. ++ * ++ * Mirrors the libva-v4l2-request-fourier probe + per-session-init ++ * pattern (proven on RK3566 hantro at 1.16x realtime). ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "panvk_video_decode.h" ++#include "panvk_device.h" ++ ++#include "vk_alloc.h" ++#include "vk_log.h" ++ ++#include <errno.h> ++#include <fcntl.h> ++#include <poll.h> ++#include <stdbool.h> ++#include <string.h> ++#include <sys/ioctl.h> ++#include <sys/mman.h> ++#include <unistd.h> ++ ++#include <linux/media.h> ++#include <linux/v4l2-controls.h> ++#include <linux/videodev2.h> ++ ++/* Phase 2 D9: hard-coded paths first; topology-based enumeration is a ++ * later iter (libva-v4l2-request-fourier has the full version). */ ++#define PANVK_V4L2_VIDEO_NODE "/dev/video1" ++#define PANVK_V4L2_MEDIA_NODE "/dev/media0" ++ ++/* Phase 1 max bitstream buffer: BBB peak ~2.4 MB/frame, 4MB is comfortable. 
*/ ++#define PANVK_V4L2_SOURCE_SIZE_MAX (4 * 1024 * 1024) ++ ++/* Phase 2 D3: request_fd pool size = max_dpb_slots + 2 = 18. ++ * 16 DPB slots + current frame + safety margin. */ ++#define PANVK_V4L2_REQUEST_FD_COUNT 18 ++ ++/* Probe: VIDIOC_QUERYCAP on /dev/video1; match the driver or card string. The fd is opened O_CLOEXEC so it cannot leak into child processes. */ ++bool ++panvk_v4l2_probe_hantro(void) ++{ ++ int fd = open(PANVK_V4L2_VIDEO_NODE, O_RDWR | O_NONBLOCK | O_CLOEXEC); ++ if (fd < 0) ++ return false; ++ ++ struct v4l2_capability cap; ++ memset(&cap, 0, sizeof(cap)); ++ int rc = ioctl(fd, VIDIOC_QUERYCAP, &cap); ++ close(fd); ++ if (rc < 0) ++ return false; ++ ++ /* Hantro VPU on RK3566/RK3568 reports card = "hantro-vpu" or ++ * driver = "hantro-vpu". Accept either field matching. */ ++ bool is_hantro = (strncmp((const char *) cap.driver, "hantro", 6) == 0) || ++ (strncmp((const char *) cap.card, "hantro", 6) == 0); ++ return is_hantro; ++} ++ ++/* Detect whether device requires multi-planar buffer types. ++ * Hantro on rk3568 advertises V4L2_CAP_VIDEO_M2M_MPLANE — multi-planar only. */ ++static bool ++v4l2_device_is_mplane(int video_fd) ++{ ++ struct v4l2_capability cap; ++ memset(&cap, 0, sizeof(cap)); ++ if (ioctl(video_fd, VIDIOC_QUERYCAP, &cap) < 0) ++ return false; ++ uint32_t caps = (cap.capabilities & V4L2_CAP_DEVICE_CAPS) ++ ? cap.device_caps : cap.capabilities; ++ return (caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0; ++} ++ ++/* Open V4L2 fds for one video session. /dev/media0 grants request_fds. 
*/ ++static int ++v4l2_open_fds(struct panvk_video_session *vs) ++{ ++ vs->video_fd = open(PANVK_V4L2_VIDEO_NODE, O_RDWR | O_NONBLOCK | O_CLOEXEC); /* CLOEXEC: session fds must not leak into children of the host app */ ++ if (vs->video_fd < 0) { ++ mesa_loge("panvk_v4l2: open video failed: %s", strerror(errno)); ++ return -errno; ++ } ++ vs->media_fd = open(PANVK_V4L2_MEDIA_NODE, O_RDWR | O_NONBLOCK | O_CLOEXEC); ++ if (vs->media_fd < 0) { ++ mesa_loge("panvk_v4l2: open media failed: %s", strerror(errno)); ++ close(vs->video_fd); ++ vs->video_fd = -1; ++ return -errno; ++ } ++ return 0; ++} ++ ++/* Set OUTPUT (input bitstream) format to H264_SLICE; CAPTURE (output picture) ++ * format to NV12. Width/height come from VkVideoSessionCreateInfo. ++ * Hantro on RK3568 is multi-planar; rkvdec on RK3399 is single-planar. ++ * Detect at runtime via V4L2_CAP_VIDEO_M2M_MPLANE. */ ++static int ++v4l2_negotiate_formats(struct panvk_video_session *vs, ++ uint32_t width, uint32_t height) ++{ ++ const bool mplane = v4l2_device_is_mplane(vs->video_fd); ++ vs->mplane = mplane; ++ ++ struct v4l2_format f; ++ ++ /* OUTPUT — H.264 stateless bitstream */ ++ memset(&f, 0, sizeof(f)); ++ if (mplane) { ++ f.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ f.fmt.pix_mp.width = width; ++ f.fmt.pix_mp.height = height; ++ f.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_H264_SLICE; ++ f.fmt.pix_mp.plane_fmt[0].sizeimage = PANVK_V4L2_SOURCE_SIZE_MAX; ++ f.fmt.pix_mp.num_planes = 1; ++ } else { ++ f.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ f.fmt.pix.width = width; ++ f.fmt.pix.height = height; ++ f.fmt.pix.pixelformat = V4L2_PIX_FMT_H264_SLICE; ++ f.fmt.pix.sizeimage = PANVK_V4L2_SOURCE_SIZE_MAX; ++ } ++ if (ioctl(vs->video_fd, VIDIOC_S_FMT, &f) < 0) { ++ mesa_loge("panvk_v4l2: S_FMT OUTPUT (H264_SLICE, mplane=%d) failed: %s", ++ mplane, strerror(errno)); ++ return -errno; ++ } ++ memcpy(&vs->fmt_output, &f, sizeof(f)); ++ ++ /* CAPTURE — NV12 decoded frames */ ++ memset(&f, 0, sizeof(f)); ++ if (mplane) { ++ f.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ f.fmt.pix_mp.width = width; ++ 
f.fmt.pix_mp.height = height; ++ f.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_NV12; ++ f.fmt.pix_mp.num_planes = 1; ++ } else { ++ f.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ f.fmt.pix.width = width; ++ f.fmt.pix.height = height; ++ f.fmt.pix.pixelformat = V4L2_PIX_FMT_NV12; ++ } ++ if (ioctl(vs->video_fd, VIDIOC_S_FMT, &f) < 0) { ++ mesa_loge("panvk_v4l2: S_FMT CAPTURE (NV12, mplane=%d) failed: %s", ++ mplane, strerror(errno)); ++ return -errno; ++ } ++ memcpy(&vs->fmt_capture, &f, sizeof(f)); ++ ++ return 0; ++} ++ ++/* REQBUFS to register N buffers on each queue. Phase 1: minimal counts to ++ * exercise the path; full pipelining is a later iter. ++ * ++ * Commit 7c: BOTH OUTPUT + CAPTURE use MMAP (mirrors libva-v4l2-request-fourier ++ * working pattern exactly). Bitstream copied in CPU-side from VkBuffer host ++ * map. Decoded frame copied out CPU-side to VkImage (7d). Validates the ++ * IOC_QUEUE path without dma_buf-side variables. */ ++#define PANVK_V4L2_CAPTURE_COUNT 18 ++ ++static int ++v4l2_reqbufs(struct panvk_video_session *vs) ++{ ++ struct v4l2_requestbuffers rb; ++ ++ /* OUTPUT: MMAP (kernel-allocated, mmap'd to CPU for bitstream copy-in). */ ++ memset(&rb, 0, sizeof(rb)); ++ rb.count = PANVK_V4L2_REQUEST_FD_COUNT; ++ rb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE ++ : V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ rb.memory = V4L2_MEMORY_MMAP; ++ if (ioctl(vs->video_fd, VIDIOC_REQBUFS, &rb) < 0) { ++ mesa_loge("panvk_v4l2: REQBUFS OUTPUT failed: %s", strerror(errno)); ++ return -errno; ++ } ++ vs->num_output_buffers = MIN2(rb.count, PANVK_V4L2_REQUEST_FD_COUNT); /* REQBUFS may grant more than requested; never use more than we asked for (per-buffer arrays presumably sized for the request — confirm in panvk_video_decode.h) */ ++ vs->output_next = 0; ++ ++ /* CAPTURE: MMAP — kernel-allocated, mmap to CPU for copy-out path. */ ++ memset(&rb, 0, sizeof(rb)); ++ rb.count = PANVK_V4L2_CAPTURE_COUNT; ++ rb.type = vs->mplane ? 
V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE ++ : V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ rb.memory = V4L2_MEMORY_MMAP; ++ if (ioctl(vs->video_fd, VIDIOC_REQBUFS, &rb) < 0) { ++ mesa_loge("panvk_v4l2: REQBUFS CAPTURE failed: %s", strerror(errno)); ++ return -errno; ++ } ++ vs->num_capture_buffers = MIN2(rb.count, PANVK_V4L2_CAPTURE_COUNT); /* REQBUFS may grant more than requested; keep capture_next inside the range that gets mmap'd below */ ++ vs->capture_next = 0; ++ ++ return 0; ++} ++ ++/* Allocate the request_fd pool via MEDIA_IOC_REQUEST_ALLOC. */ ++static int ++v4l2_alloc_request_pool(struct panvk_video_session *vs, ++ const VkAllocationCallbacks *alloc, ++ struct vk_device *dev) ++{ ++ vs->request_fds = vk_alloc(&dev->alloc, ++ sizeof(int) * PANVK_V4L2_REQUEST_FD_COUNT, 8, ++ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); ++ if (!vs->request_fds) ++ return -ENOMEM; ++ for (unsigned i = 0; i < PANVK_V4L2_REQUEST_FD_COUNT; i++) ++ vs->request_fds[i] = -1; ++ ++ for (unsigned i = 0; i < PANVK_V4L2_REQUEST_FD_COUNT; i++) { ++ int rfd = -1; ++ if (ioctl(vs->media_fd, MEDIA_IOC_REQUEST_ALLOC, &rfd) < 0) { ++ mesa_loge("panvk_v4l2: MEDIA_IOC_REQUEST_ALLOC [%u] failed: %s", ++ i, strerror(errno)); ++ return -errno; ++ } ++ vs->request_fds[i] = rfd; ++ } ++ vs->num_request_fds = PANVK_V4L2_REQUEST_FD_COUNT; ++ vs->request_fd_next = 0; ++ return 0; ++} ++ ++/* QUERYBUF + mmap CAPTURE buffers — NV12-decoded frame source. */ ++static int ++v4l2_mmap_capture_buffers(struct panvk_video_session *vs) ++{ ++ for (unsigned i = 0; i < vs->num_capture_buffers && i < PANVK_V4L2_CAPTURE_COUNT; i++) { ++ struct v4l2_buffer qb = { 0 }; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = { 0 }; ++ qb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE ++ : V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ qb.memory = V4L2_MEMORY_MMAP; ++ qb.index = i; ++ if (vs->mplane) { qb.length = 1; qb.m.planes = planes; } ++ if (ioctl(vs->video_fd, VIDIOC_QUERYBUF, &qb) < 0) { ++ mesa_loge("panvk_v4l2: QUERYBUF CAPTURE[%u] failed: %s", ++ i, strerror(errno)); ++ return -errno; ++ } ++ uint32_t length = vs->mplane ? planes[0].length : qb.length; ++ uint32_t offset = vs->mplane ? 
planes[0].m.mem_offset : qb.m.offset; ++ void *p = mmap(NULL, length, PROT_READ, MAP_SHARED, ++ vs->video_fd, offset); ++ if (p == MAP_FAILED) { ++ mesa_loge("panvk_v4l2: mmap CAPTURE[%u] failed: %s", ++ i, strerror(errno)); ++ return -errno; ++ } ++ vs->capture_map[i] = p; ++ vs->capture_map_size[i] = length; ++ } ++ return 0; ++} ++ ++/* QUERYBUF + mmap each OUTPUT buffer — bitstream-copy-in destination. */ ++static int ++v4l2_mmap_output_buffers(struct panvk_video_session *vs) ++{ ++ for (unsigned i = 0; i < vs->num_output_buffers; i++) { ++ struct v4l2_buffer qb = { 0 }; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = { 0 }; ++ qb.type = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE ++ : V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ qb.memory = V4L2_MEMORY_MMAP; ++ qb.index = i; ++ if (vs->mplane) { qb.length = 1; qb.m.planes = planes; } ++ if (ioctl(vs->video_fd, VIDIOC_QUERYBUF, &qb) < 0) { ++ mesa_loge("panvk_v4l2: QUERYBUF OUTPUT[%u] failed: %s", ++ i, strerror(errno)); ++ return -errno; ++ } ++ uint32_t length = vs->mplane ? planes[0].length : qb.length; ++ uint32_t offset = vs->mplane ? planes[0].m.mem_offset : qb.m.offset; ++ void *p = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, ++ vs->video_fd, offset); ++ if (p == MAP_FAILED) { ++ mesa_loge("panvk_v4l2: mmap OUTPUT[%u] failed: %s", ++ i, strerror(errno)); ++ return -errno; ++ } ++ vs->output_map[i] = p; ++ vs->output_map_size[i] = length; ++ } ++ return 0; ++} ++ ++/* STREAMON both queues. Must happen after REQBUFS, before first QBUF. */ ++static int ++v4l2_streamon(struct panvk_video_session *vs) ++{ ++ enum v4l2_buf_type t; ++ t = vs->mplane ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE ++ : V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ if (ioctl(vs->video_fd, VIDIOC_STREAMON, &t) < 0) { ++ mesa_loge("panvk_v4l2: STREAMON OUTPUT failed: %s", strerror(errno)); ++ return -errno; ++ } ++ t = vs->mplane ? 
V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE ++ : V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ if (ioctl(vs->video_fd, VIDIOC_STREAMON, &t) < 0) { ++ mesa_loge("panvk_v4l2: STREAMON CAPTURE failed: %s", strerror(errno)); ++ return -errno; ++ } ++ return 0; ++} ++ ++/* Set device-level H.264 controls (DECODE_MODE + START_CODE) before STREAMON. ++ * libva-v4l2-request-fourier calls these with request_fd=-1; we mirror. ++ * Errors are non-fatal — some backing drivers may default-only. */ ++static void ++v4l2_set_device_h264_controls(struct panvk_video_session *vs) ++{ ++ struct v4l2_ext_control dev[2] = { 0 }; ++ dev[0].id = V4L2_CID_STATELESS_H264_DECODE_MODE; ++ dev[0].value = V4L2_STATELESS_H264_DECODE_MODE_FRAME_BASED; ++ dev[1].id = V4L2_CID_STATELESS_H264_START_CODE; ++ dev[1].value = V4L2_STATELESS_H264_START_CODE_ANNEX_B; ++ ++ struct v4l2_ext_controls c = { 0 }; ++ c.controls = dev; ++ c.count = 2; ++ (void) ioctl(vs->video_fd, VIDIOC_S_EXT_CTRLS, &c); ++ /* intentionally ignoring rc — device-level probes; some drivers reject. */ ++} ++ ++int ++panvk_v4l2_session_init(struct panvk_video_session *vs, ++ struct vk_device *vk_dev, ++ const VkAllocationCallbacks *alloc, ++ uint32_t width, uint32_t height) ++{ ++ int rc = v4l2_open_fds(vs); ++ if (rc) return rc; ++ ++ rc = v4l2_negotiate_formats(vs, width, height); ++ if (rc) goto fail; ++ ++ rc = v4l2_reqbufs(vs); ++ if (rc) goto fail; ++ ++ rc = v4l2_alloc_request_pool(vs, alloc, vk_dev); ++ if (rc) goto fail; ++ ++ /* Set device-level H.264 mode controls (non-fatal). */ ++ v4l2_set_device_h264_controls(vs); ++ ++ /* mmap OUTPUT buffers for bitstream copy-in. */ ++ rc = v4l2_mmap_output_buffers(vs); ++ if (rc) goto fail; ++ ++ /* mmap CAPTURE buffers for NV12 frame readback (Phase 1 verification). */ ++ rc = v4l2_mmap_capture_buffers(vs); ++ if (rc) goto fail; ++ ++ /* Stream on both queues. 
*/ ++ rc = v4l2_streamon(vs); ++ if (rc) goto fail; ++ ++ return 0; ++ ++fail: ++ panvk_v4l2_session_finish(vs, vk_dev, alloc); ++ return rc; ++} ++ ++/* Pick the request_fd for this submit, round-robin through the request_fd ++ * pool, and return it (a failed REINIT is logged, not propagated). ++ * ++ * REINIT contract: a freshly-allocated request is in QUEUEABLE state; after ++ * MEDIA_REQUEST_IOC_QUEUE + dequeue it's in COMPLETE state and S_EXT_CTRLS ++ * on it returns EBUSY. MEDIA_REQUEST_IOC_REINIT puts it back in QUEUEABLE. ++ * We track per-fd "ever been queued" so the very-first use skips REINIT ++ * (which returns EBUSY on never-queued requests). */ ++static int ++v4l2_pick_request_fd(struct panvk_video_session *vs) ++{ ++ uint32_t idx = vs->request_fd_next; ++ int rfd = vs->request_fds[idx]; ++ if (vs->request_fd_used[idx]) { ++ if (ioctl(rfd, MEDIA_REQUEST_IOC_REINIT) < 0) { ++ mesa_loge("panvk_v4l2: MEDIA_REQUEST_IOC_REINIT rfd=%d failed: %s", ++ rfd, strerror(errno)); ++ } ++ } ++ vs->request_fd_used[idx] = true; ++ vs->request_fd_next = (idx + 1) % vs->num_request_fds; ++ return rfd; ++} ++ ++/* The 14-step ioctl dance for one decode op (Phase 2 D7). ++ * Operates synchronously at record time per Phase 1 D8 lock. ++ * Returns 0 on success, -errno on failure (errno is saved before logging). ++ * ++ * Commit 7c MMAP-side: src_bitstream is a CPU pointer (NOT a dma_buf fd). ++ * We copy it into the mmap'd OUTPUT buffer at index `out_idx`. */ ++int ++panvk_v4l2_submit_h264_decode(struct panvk_video_session *vs, ++ const struct v4l2_ctrl_h264_sps *sps, ++ const struct v4l2_ctrl_h264_pps *pps, ++ const struct v4l2_ctrl_h264_scaling_matrix *scaling, ++ const struct v4l2_ctrl_h264_decode_params *dec, ++ const void *src_bitstream, uint32_t src_bytes, ++ int dst_dmabuf_fd_unused, ++ uint64_t qbuf_ts) ++{ ++ int rfd = v4l2_pick_request_fd(vs); ++ const bool mp = vs->mplane; ++ ++ /* Pick an OUTPUT buffer index + copy bitstream in. 
*/ ++ const uint32_t out_idx = vs->output_next; ++ vs->output_next = (vs->output_next + 1) % vs->num_output_buffers; ++ if (src_bytes > vs->output_map_size[out_idx]) { ++ mesa_loge("panvk_v4l2: bitstream %u > buffer %u", src_bytes, vs->output_map_size[out_idx]); ++ return -ENOSPC; ++ } ++ memcpy(vs->output_map[out_idx], src_bitstream, src_bytes); ++ ++ /* 1-7: build extended controls batch with request_fd binding */ ++ struct v4l2_ext_control ctrls[4] = { 0 }; ++ ctrls[0].id = V4L2_CID_STATELESS_H264_SPS; ++ ctrls[0].ptr = (void *) sps; ++ ctrls[0].size = sizeof(*sps); ++ ctrls[1].id = V4L2_CID_STATELESS_H264_PPS; ++ ctrls[1].ptr = (void *) pps; ++ ctrls[1].size = sizeof(*pps); ++ ctrls[2].id = V4L2_CID_STATELESS_H264_DECODE_PARAMS; ++ ctrls[2].ptr = (void *) dec; ++ ctrls[2].size = sizeof(*dec); ++ ctrls[3].id = V4L2_CID_STATELESS_H264_SCALING_MATRIX; ++ ctrls[3].ptr = (void *) scaling; ++ ctrls[3].size = sizeof(*scaling); ++ ++ struct v4l2_ext_controls batch = { 0 }; ++ batch.controls = ctrls; ++ batch.count = 4; ++ batch.which = V4L2_CTRL_WHICH_REQUEST_VAL; ++ batch.request_fd = rfd; ++ ++ if (ioctl(vs->video_fd, VIDIOC_S_EXT_CTRLS, &batch) < 0) { ++ const int err = errno; /* mesa_loge/strerror may clobber errno */ ++ mesa_loge("panvk_v4l2: S_EXT_CTRLS request_fd=%d failed: %s " ++ "(error_idx=%u/%u)", rfd, strerror(err), batch.error_idx, batch.count); ++ return -err; ++ } ++ ++ /* 8: QBUF OUTPUT (bitstream input) — MMAP, index out_idx. 
*/ ++ struct v4l2_buffer qb = { 0 }; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = { 0 }; ++ qb.memory = V4L2_MEMORY_MMAP; ++ qb.index = out_idx; ++ qb.flags = V4L2_BUF_FLAG_REQUEST_FD; ++ qb.request_fd = rfd; ++ qb.timestamp.tv_sec = (uint32_t)(qbuf_ts / 1000000000ULL); ++ qb.timestamp.tv_usec = (uint32_t)((qbuf_ts / 1000ULL) % 1000000ULL); ++ if (mp) { ++ qb.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ qb.length = 1; ++ qb.m.planes = planes; ++ planes[0].bytesused = src_bytes; ++ } else { ++ qb.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ qb.bytesused = src_bytes; ++ } ++ if (ioctl(vs->video_fd, VIDIOC_QBUF, &qb) < 0) { ++ const int err = errno; ++ mesa_loge("panvk_v4l2: QBUF OUTPUT (mmap idx=%u) failed: %s", out_idx, strerror(err)); ++ return -err; ++ } ++ ++ /* 9: QBUF CAPTURE (output frame) — MMAP-backed kernel-allocated buffer. ++ * dst_dmabuf_fd is ignored in 7c; copy-out to VkImage is 7d. */ ++ const uint32_t cap_idx = vs->capture_next; ++ vs->capture_next = (vs->capture_next + 1) % vs->num_capture_buffers; ++ memset(&qb, 0, sizeof(qb)); ++ memset(&planes, 0, sizeof(planes)); ++ qb.memory = V4L2_MEMORY_MMAP; ++ qb.index = cap_idx; ++ if (mp) { ++ qb.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ qb.length = 1; ++ qb.m.planes = planes; ++ } else { ++ qb.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ } ++ if (ioctl(vs->video_fd, VIDIOC_QBUF, &qb) < 0) { ++ const int err = errno; ++ mesa_loge("panvk_v4l2: QBUF CAPTURE (mmap idx=%u) failed: %s", cap_idx, strerror(err)); ++ return -err; ++ } ++ ++ /* 10: MEDIA_REQUEST_IOC_QUEUE */ ++ if (ioctl(rfd, MEDIA_REQUEST_IOC_QUEUE) < 0) { ++ const int err = errno; ++ mesa_loge("panvk_v4l2: REQUEST_IOC_QUEUE failed: %s", strerror(err)); ++ return -err; ++ } ++ ++ /* 11: poll(rfd, POLLPRI) — 200ms timeout per Phase 2 D7; EINTR restarts it */ ++ struct pollfd pfd = { .fd = rfd, .events = POLLPRI }; ++ int pr; ++ do { pr = poll(&pfd, 1, 200); } while (pr < 0 && errno == EINTR); ++ if (pr <= 0) { ++ const int err = (pr < 0) ? errno : ETIMEDOUT; ++ mesa_loge("panvk_v4l2: poll request_fd timeout/err pr=%d errno=%d", pr, err); ++ return -err; ++ } ++ ++ /* 12: DQBUF OUTPUT — must match the memory type 
used at QBUF (MMAP, not ++ * DMABUF). With the wrong memory type the kernel rejects the DQBUF and ++ * the OUTPUT slot stays in flight, which leaks request_fd resources ++ * (mostly cosmetic for Phase 1 single-decode tests, but breaks the ++ * pipelined case). */ ++ memset(&qb, 0, sizeof(qb)); ++ memset(&planes, 0, sizeof(planes)); ++ qb.memory = V4L2_MEMORY_MMAP; ++ qb.type = mp ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE : V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ if (mp) { qb.length = 1; qb.m.planes = planes; } ++ if (ioctl(vs->video_fd, VIDIOC_DQBUF, &qb) < 0) { ++ mesa_loge("panvk_v4l2: DQBUF OUTPUT failed: %s", strerror(errno)); ++ /* non-fatal — capture might still have completed */ ++ } ++ ++ /* 13: DQBUF CAPTURE — MMAP, kernel-allocated. */ ++ memset(&qb, 0, sizeof(qb)); ++ memset(&planes, 0, sizeof(planes)); ++ qb.memory = V4L2_MEMORY_MMAP; ++ qb.type = mp ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE : V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ if (mp) { qb.length = 1; qb.m.planes = planes; } ++ if (ioctl(vs->video_fd, VIDIOC_DQBUF, &qb) < 0) { ++ const int err = errno; ++ mesa_loge("panvk_v4l2: DQBUF CAPTURE failed: %s", strerror(err)); ++ return -err; ++ } ++ if (qb.flags & V4L2_BUF_FLAG_ERROR) { ++ mesa_loge("panvk_v4l2: CAPTURE buffer flagged ERROR"); ++ return -EIO; ++ }
++ ++ return 0; ++} ++ ++void ++panvk_v4l2_session_finish(struct panvk_video_session *vs, ++ struct vk_device *vk_dev, ++ const VkAllocationCallbacks *alloc) ++{ ++ if (vs->request_fds) { ++ for (unsigned i = 0; i < vs->num_request_fds; i++) ++ if (vs->request_fds[i] >= 0) ++ close(vs->request_fds[i]); ++ vk_free(&vk_dev->alloc, vs->request_fds); ++ vs->request_fds = NULL; ++ vs->num_request_fds = 0; ++ } ++ if (vs->video_fd >= 0) { ++ close(vs->video_fd); ++ vs->video_fd = -1; ++ } ++ if (vs->media_fd >= 0) { ++ close(vs->media_fd); ++ vs->media_fd = -1; ++ } ++} +diff -urN a/src/panfrost/vulkan/panvk_v4l2_h264.c b/src/panfrost/vulkan/panvk_v4l2_h264.c +--- a/src/panfrost/vulkan/panvk_v4l2_h264.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/src/panfrost/vulkan/panvk_v4l2_h264.c 2026-05-21 22:47:09.189957157 +0200 +@@ -0,0 +1,478 @@ ++/* ++ * panvk-bifrost-video Phase 4: Vulkan StdVideo H.264 → V4L2 stateless H.264 ++ * control-struct translation. ++ * ++ * This file is the protocol-translation bridge that lets a Vulkan video ++ * decode session (VkVideoDecodeH264PictureInfoKHR + StdVideoH264*) drive a ++ * V4L2 stateless H.264 decoder (the hantro VPU on RK3566/RK3568 via ++ * /dev/video1) by filling the four FRAME_BASED controls: ++ * ++ * - V4L2_CID_STATELESS_H264_SPS → struct v4l2_ctrl_h264_sps ++ * - V4L2_CID_STATELESS_H264_PPS → struct v4l2_ctrl_h264_pps ++ * - V4L2_CID_STATELESS_H264_SCALING_MATRIX → struct v4l2_ctrl_h264_scaling_matrix ++ * - V4L2_CID_STATELESS_H264_DECODE_PARAMS → struct v4l2_ctrl_h264_decode_params ++ * ++ * The ioctl-side (VIDIOC_S_EXT_CTRLS on a request_fd) is the caller's ++ * responsibility — see panvk_v4l2.c. This file is pure data-shape ++ * translation; no syscalls, no GPU/shader work. 
++ * ++ * Cross-references: ++ * - V4L2 UAPI structs and field semantics: ++ * /usr/include/linux/v4l2-controls.h ++ * Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst ++ * - Vulkan StdVideo H.264 structs: ++ * include/vk_video/vulkan_video_codec_h264std.h ++ * include/vk_video/vulkan_video_codec_h264std_decode.h ++ * include/vulkan/vulkan_core.h (VkVideoDecodeH264*KHR) ++ * - VAAPI→V4L2 reference impl (semantically equivalent, different carrier): ++ * libva-v4l2-request-fourier/src/h264.c ++ * ++ * Why every flag is mapped by name, not by bit position: ++ * StdVideoH264SpsFlags packs its flags as 1-bit bitfield members in a ++ * compiler-defined layout. V4L2_H264_SPS_FLAG_* are explicit ++ * bit-position #defines. The flag *names* match the H.264 spec — ++ * pic-by-pic mapping is mechanical — but the *bit positions* don't ++ * match between the two. Each flag is therefore translated by its ++ * spec name, never by raw bitmask copy. ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "panvk_video_decode.h" ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++/* ------------------------------------------------------------------ */ ++/* SPS */ ++/* ------------------------------------------------------------------ */ ++ ++/* ++ * Translate StdVideoH264SequenceParameterSet → struct v4l2_ctrl_h264_sps. ++ * ++ * profile_idc: StdVideoH264ProfileIdc is the literal H.264 profile_idc value ++ * (BASELINE=66, MAIN=77, HIGH=100, …) so a direct cast is correct. ++ * ++ * level_idc: StdVideoH264LevelIdc is an *enum index* (1_0=0, 1_1=1, …, 6_2=18), ++ * NOT the spec-encoded level_idc byte (which V4L2 expects in units of ++ * level*10: Level 4.1 → 41, Level 5.1 → 51, etc). We must encode. ++ * ++ * pic_order_cnt_type: StdVideoH264PocType enum values (0/1/2) match the spec ++ * directly; cast is safe. 
++ * ++ * constraint_set_flags: V4L2 packs constraint_set{0..5}_flag into a single ++ * __u8 (V4L2_H264_SPS_CONSTRAINT_SETN_FLAG = 0x01..0x20). The StdVideo flag ++ * bitfields hold each one separately. ++ */ ++void ++panvk_v4l2_h264_std_to_ctrl_sps(const StdVideoH264SequenceParameterSet *in, ++ struct v4l2_ctrl_h264_sps *out) ++{ ++ memset(out, 0, sizeof(*out)); ++ ++ /* StdVideoH264LevelIdc → level_idc byte (level * 10). */ ++ static const __u8 level_idc_lut[] = { ++ [STD_VIDEO_H264_LEVEL_IDC_1_0] = 10, ++ [STD_VIDEO_H264_LEVEL_IDC_1_1] = 11, ++ [STD_VIDEO_H264_LEVEL_IDC_1_2] = 12, ++ [STD_VIDEO_H264_LEVEL_IDC_1_3] = 13, ++ [STD_VIDEO_H264_LEVEL_IDC_2_0] = 20, ++ [STD_VIDEO_H264_LEVEL_IDC_2_1] = 21, ++ [STD_VIDEO_H264_LEVEL_IDC_2_2] = 22, ++ [STD_VIDEO_H264_LEVEL_IDC_3_0] = 30, ++ [STD_VIDEO_H264_LEVEL_IDC_3_1] = 31, ++ [STD_VIDEO_H264_LEVEL_IDC_3_2] = 32, ++ [STD_VIDEO_H264_LEVEL_IDC_4_0] = 40, ++ [STD_VIDEO_H264_LEVEL_IDC_4_1] = 41, ++ [STD_VIDEO_H264_LEVEL_IDC_4_2] = 42, ++ [STD_VIDEO_H264_LEVEL_IDC_5_0] = 50, ++ [STD_VIDEO_H264_LEVEL_IDC_5_1] = 51, ++ [STD_VIDEO_H264_LEVEL_IDC_5_2] = 52, ++ [STD_VIDEO_H264_LEVEL_IDC_6_0] = 60, ++ [STD_VIDEO_H264_LEVEL_IDC_6_1] = 61, ++ [STD_VIDEO_H264_LEVEL_IDC_6_2] = 62, ++ }; ++ ++ out->profile_idc = (__u8) in->profile_idc; ++ if ((unsigned) in->level_idc < sizeof(level_idc_lut) / sizeof(level_idc_lut[0])) ++ out->level_idc = level_idc_lut[in->level_idc]; ++ else ++ out->level_idc = 0; ++ out->seq_parameter_set_id = in->seq_parameter_set_id; ++ out->chroma_format_idc = (__u8) in->chroma_format_idc; ++ out->bit_depth_luma_minus8 = in->bit_depth_luma_minus8; ++ out->bit_depth_chroma_minus8 = in->bit_depth_chroma_minus8; ++ out->log2_max_frame_num_minus4 = in->log2_max_frame_num_minus4; ++ out->pic_order_cnt_type = (__u8) in->pic_order_cnt_type; ++ out->log2_max_pic_order_cnt_lsb_minus4 = in->log2_max_pic_order_cnt_lsb_minus4; ++ out->max_num_ref_frames = in->max_num_ref_frames; ++ out->num_ref_frames_in_pic_order_cnt_cycle = 
++ in->num_ref_frames_in_pic_order_cnt_cycle; ++ ++ out->offset_for_non_ref_pic = in->offset_for_non_ref_pic; ++ out->offset_for_top_to_bottom_field = in->offset_for_top_to_bottom_field; ++ ++ /* offset_for_ref_frame[]: StdVideo passes via pOffsetForRefFrame pointer ++ * sized num_ref_frames_in_pic_order_cnt_cycle. V4L2 has a 255-entry fixed ++ * array. Copy in-bounds entries. */ ++ if (in->pOffsetForRefFrame != NULL) { ++ unsigned n = in->num_ref_frames_in_pic_order_cnt_cycle; ++ if (n > 255) n = 255; ++ for (unsigned i = 0; i < n; i++) ++ out->offset_for_ref_frame[i] = in->pOffsetForRefFrame[i]; ++ } ++ ++ out->pic_width_in_mbs_minus1 = (__u16) in->pic_width_in_mbs_minus1; ++ out->pic_height_in_map_units_minus1 = (__u16) in->pic_height_in_map_units_minus1; ++ ++ /* Constraint set flags — V4L2 packs into __u8 constraint_set_flags. */ ++ __u8 cs = 0; ++ if (in->flags.constraint_set0_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET0_FLAG; ++ if (in->flags.constraint_set1_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET1_FLAG; ++ if (in->flags.constraint_set2_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET2_FLAG; ++ if (in->flags.constraint_set3_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET3_FLAG; ++ if (in->flags.constraint_set4_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET4_FLAG; ++ if (in->flags.constraint_set5_flag) cs |= V4L2_H264_SPS_CONSTRAINT_SET5_FLAG; ++ out->constraint_set_flags = cs; ++ ++ /* Plain SPS flags — translated by spec name, NOT by bit position. 
*/ ++ __u32 f = 0; ++ if (in->flags.separate_colour_plane_flag) ++ f |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ if (in->flags.qpprime_y_zero_transform_bypass_flag) ++ f |= V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; ++ if (in->flags.delta_pic_order_always_zero_flag) ++ f |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO; ++ if (in->flags.gaps_in_frame_num_value_allowed_flag) ++ f |= V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED; ++ if (in->flags.frame_mbs_only_flag) ++ f |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY; ++ if (in->flags.mb_adaptive_frame_field_flag) ++ f |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; ++ if (in->flags.direct_8x8_inference_flag) ++ f |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE; ++ out->flags = f; ++ /* ++ * StdVideoH264SpsFlags also has: frame_cropping_flag, ++ * seq_scaling_matrix_present_flag, vui_parameters_present_flag. ++ * V4L2 has no equivalent SPS flags for these — frame cropping is ++ * communicated via S_FMT cropping rectangles, scaling matrix presence is ++ * carried in PPS's V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT, and VUI is ++ * not exposed at all. Intentionally dropped. ++ */ ++} ++ ++/* ------------------------------------------------------------------ */ ++/* PPS */ ++/* ------------------------------------------------------------------ */ ++ ++/* ++ * Translate StdVideoH264PictureParameterSet → struct v4l2_ctrl_h264_pps. ++ * ++ * num_slice_groups_minus1: not in StdVideoH264PictureParameterSet at all ++ * (Vulkan H.264 video core profile excludes FMO). Set to 0 (one slice ++ * group, the only value FMO-free decoders accept). ++ * ++ * V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT: per kernel doc this should be ++ * set when a non-flat matrix applies to the picture. 
It is NOT derived from ++ * StdVideoH264PpsFlags::pic_scaling_matrix_present_flag here: like the ++ * libva-v4l2-request-fourier reference, this function sets the flag ++ * unconditionally, because hantro's set_params reads the flag to drive ++ * G1_REG_DEC_CTRL2_TYPE1_QUANT_E. Callers must therefore always pair this ++ * PPS with a valid SCALING_MATRIX control (flat-16 when the stream signals ++ * none); see the comment at the flag assignment below. ++ */ ++void ++panvk_v4l2_h264_std_to_ctrl_pps(const StdVideoH264PictureParameterSet *in, ++ struct v4l2_ctrl_h264_pps *out) ++{ ++ memset(out, 0, sizeof(*out)); ++ ++ out->pic_parameter_set_id = in->pic_parameter_set_id; ++ out->seq_parameter_set_id = in->seq_parameter_set_id; ++ out->num_slice_groups_minus1 = 0; /* not exposed by StdVideo; H.264 ++ core profile assumes 1 group. */ ++ out->num_ref_idx_l0_default_active_minus1 = in->num_ref_idx_l0_default_active_minus1; ++ out->num_ref_idx_l1_default_active_minus1 = in->num_ref_idx_l1_default_active_minus1; ++ out->weighted_bipred_idc = (__u8) in->weighted_bipred_idc; ++ out->pic_init_qp_minus26 = in->pic_init_qp_minus26; ++ out->pic_init_qs_minus26 = in->pic_init_qs_minus26; ++ out->chroma_qp_index_offset = in->chroma_qp_index_offset; ++ out->second_chroma_qp_index_offset = in->second_chroma_qp_index_offset; ++ ++ __u16 f = 0; ++ if (in->flags.entropy_coding_mode_flag) ++ f |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE; ++ if (in->flags.bottom_field_pic_order_in_frame_present_flag) ++ f |= V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT; ++ if (in->flags.weighted_pred_flag) ++ f |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED; ++ if (in->flags.deblocking_filter_control_present_flag) ++ f |= V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT; ++ if (in->flags.constrained_intra_pred_flag) ++ f |= V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ if (in->flags.redundant_pic_cnt_present_flag) ++ f |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT; ++ if (in->flags.transform_8x8_mode_flag) ++ f |= 
V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE; ++ /* ++ * V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT: set UNCONDITIONALLY. ++ * ++ * Hantro VPU2 (rockchip_vpu2_hw_h264_dec.c) and G1 both gate ++ * scaling_list ingestion on this flag (assemble_scaling_list in ++ * hantro_h264.c:215 short-circuits if clear, leaving the priv-table ++ * scaling region zero — dequant then computes 0 * quant = 0 pixels). ++ * libva-v4l2-request-fourier sets the flag together with the spec- ++ * default flat-16 matrix for this exact reason (h264.c:484 lineage). ++ * Validated empirically 2026-05-21: with flag clear hantro produces ++ * all-zero Y plane; with flag set + flat matrix it decodes BBB. ++ * ++ * Vulkan-side: the Std flag tracks the bitstream's pic_scaling_matrix ++ * _present_flag — useful for software decoders but irrelevant to the ++ * hantro hardware path. Always-on is safe here because the caller ++ * pairs this PPS with panvk_v4l2_h264_default_flat_scaling_matrix() ++ * (whose flat-16 values are themselves the H.264 §7.4.2.1.1.1 ++ * fall-back when no explicit list is signalled). ++ */ ++ f |= V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT; ++ out->flags = f; ++} ++ ++/* ------------------------------------------------------------------ */ ++/* SCALING_MATRIX */ ++/* ------------------------------------------------------------------ */ ++ ++/* ++ * Translate StdVideoH264ScalingLists → struct v4l2_ctrl_h264_scaling_matrix. ++ * ++ * StdVideoH264ScalingLists.ScalingList4x4[6][16]: 6 lists in raster order, ++ * indices 0..5 = Intra Y, Intra Cb, Intra Cr, Inter Y, Inter Cb, Inter Cr. ++ * V4L2's scaling_list_4x4[6][16] expects the SAME order (per kernel doc ++ * ext-ctrls-codec-stateless.rst). → straight memcpy. ++ * ++ * StdVideoH264ScalingLists.ScalingList8x8[6][64]: 6 lists in raster order ++ * per the H.264 spec table 7-2 ordering: Intra Y, Inter Y, Intra Cb, ++ * Inter Cb, Intra Cr, Inter Cr. V4L2's scaling_list_8x8[6][64] uses the ++ * SAME order per kernel doc. → straight memcpy. 
++ * ++ * IMPORTANT — libva-v4l2-request-fourier's h264_va_matrix_to_v4l2 (h264.c ++ * line 544) does an unusual 8x8 remap (VAMatrix[0]→[0], VAMatrix[1]→[3]) ++ * because VAIQMatrixBufferH264 only carries 2 of the 6 8x8 lists (Intra Y ++ * and Inter Y, for YUV420 streams). That's a libva *carrier* limitation, ++ * not a V4L2 ordering quirk — we do NOT replicate it here. Vulkan ++ * provides all 6 lists in spec order; we copy them straight. ++ * ++ * If a caller has no explicit lists, pass NULL — *out is then filled with the ++ * flat-16 default (the H.264 §7.4.2.1.1.1 Flat_4x4_16 / Flat_8x8_16 values), ++ * the same values panvk_v4l2_h264_default_flat_scaling_matrix writes. ++ */ ++void ++panvk_v4l2_h264_std_to_ctrl_scaling_matrix(const StdVideoH264ScalingLists *in, ++ struct v4l2_ctrl_h264_scaling_matrix *out) ++{ ++ memset(out, 0, sizeof(*out)); ++ if (in == NULL) { ++ memset(out->scaling_list_4x4, 16, sizeof(out->scaling_list_4x4)); ++ memset(out->scaling_list_8x8, 16, sizeof(out->scaling_list_8x8)); ++ return; /* never hand the decoder an all-zero matrix */ ++ } ++ /* Both sides use [6][16] / [6][64] in identical spec-table-7-2 order. */ ++ memcpy(out->scaling_list_4x4, in->ScalingList4x4, sizeof(out->scaling_list_4x4)); ++ memcpy(out->scaling_list_8x8, in->ScalingList8x8, sizeof(out->scaling_list_8x8)); ++} ++ ++/* Spec-default flat scaling matrix (every element = 16). Use when neither ++ * SPS::seq_scaling_matrix_present_flag nor PPS::pic_scaling_matrix_present_flag ++ * is set. The H.264 spec §7.4.2.1.1.1 defines Flat_4x4_16 and Flat_8x8_16 ++ * as the fall-back; the kernel doc recommends always submitting the ++ * SCALING_MATRIX control with these defaults when explicit lists are ++ * absent (drivers like hantro G1 read it unconditionally). 
*/ ++void ++panvk_v4l2_h264_default_flat_scaling_matrix( ++ struct v4l2_ctrl_h264_scaling_matrix *out) ++{ ++ memset(out->scaling_list_4x4, 16, sizeof(out->scaling_list_4x4)); ++ memset(out->scaling_list_8x8, 16, sizeof(out->scaling_list_8x8)); ++} ++ ++/* ------------------------------------------------------------------ */ ++/* DECODE_PARAMS */ ++/* ------------------------------------------------------------------ */ ++ ++/* ++ * Build v4l2_ctrl_h264_decode_params from Vulkan picture info. ++ * ++ * The caller supplies: ++ * vs — panvk_video_session; vs->dpb[slot].reference_ts is the ++ * V4L2 timestamp (v4l2_buffer.timestamp converted via ++ * v4l2_timeval_to_ns) of the previously-decoded CAPTURE ++ * buffer associated with that DPB slot index. ++ * pic_info — VkVideoDecodeH264PictureInfoKHR for the frame being ++ * decoded; pStdPictureInfo carries the per-pic fields. ++ * active_pps — the StdVideoH264PictureParameterSet bound for this ++ * decode; only needed if num_slice_groups_minus1 > 0 (FMO) ++ * which Vulkan core profile excludes. Currently unused. ++ * dst_dpb_slot — output (this-frame) DPB slot, supplied for symmetry; the ++ * current-frame fields go into the top-level ++ * v4l2_ctrl_h264_decode_params (NOT into dpb[]). The dpb[] ++ * array carries reference frames only. ++ * ref_slots — array of active reference DPB slots from the ++ * VkVideoDecodeInfoKHR::pReferenceSlots. Each entry's ++ * slotIndex selects vs->dpb[idx].reference_ts; the ++ * per-slot StdVideoDecodeH264ReferenceInfo is reachable ++ * via the VkVideoDecodeH264DpbSlotInfoKHR chained on ++ * pNext (caller has already resolved it — see helper ++ * below). ++ * num_ref_slots — count of entries in ref_slots[]. ++ * output_ts — V4L2 reference_ts assigned to the CAPTURE buffer for ++ * *this* frame; recorded into the SETUP slot mapping by ++ * the caller, not used in this struct itself. 
++ * ++ * Fields the caller MUST populate post-translation: ++ * - dec_ref_pic_marking_bit_size ++ * - pic_order_cnt_bit_size ++ * - pic_order_cnt_lsb / delta_pic_order_cnt_bottom / delta_pic_order_cnt0 / ++ * delta_pic_order_cnt1 / idr_pic_id ++ * These come from the slice header bit-level parse (Vulkan doesn't ++ * forward them in StdVideoDecodeH264PictureInfo). The hantro G1 reads ++ * them from registers; without them the decoder produces zeros. ++ * See libva-v4l2-request-fourier h264.c:394-449 for the parse contract. ++ * ++ * - slice_group_change_cycle: from slice header, only meaningful when ++ * num_slice_groups_minus1 > 0 (not in Vulkan core profile). ++ */ ++ ++/* Helper: extract StdVideoDecodeH264ReferenceInfo from a VkVideoReferenceSlotInfoKHR ++ * pNext chain. Returns NULL if the chain doesn't include ++ * VkVideoDecodeH264DpbSlotInfoKHR. */ ++static const StdVideoDecodeH264ReferenceInfo * ++ref_info_from_slot(const VkVideoReferenceSlotInfoKHR *slot) ++{ ++ const VkBaseInStructure *p = (const VkBaseInStructure *) slot->pNext; ++ while (p != NULL) { ++ if (p->sType == VK_STRUCTURE_TYPE_VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR) { ++ const VkVideoDecodeH264DpbSlotInfoKHR *dpb = ++ (const VkVideoDecodeH264DpbSlotInfoKHR *) p; ++ return dpb->pStdReferenceInfo; ++ } ++ p = p->pNext; ++ } ++ return NULL; ++} ++ ++void ++panvk_v4l2_h264_build_decode_params( ++ const struct panvk_video_session *vs, ++ const VkVideoDecodeH264PictureInfoKHR *pic_info, ++ const StdVideoH264PictureParameterSet *active_pps, ++ uint32_t dst_dpb_slot, ++ const VkVideoReferenceSlotInfoKHR *ref_slots, ++ uint32_t num_ref_slots, ++ uint64_t output_ts, ++ struct v4l2_ctrl_h264_decode_params *out) ++{ ++ (void) active_pps; /* FMO-only; not in Vulkan core profile. */ ++ (void) dst_dpb_slot; /* Caller records output_ts → vs->dpb[slot] post-decode. */ ++ (void) output_ts; /* Same. 
*/ ++ ++ memset(out, 0, sizeof(*out)); ++ ++ const StdVideoDecodeH264PictureInfo *spic = pic_info->pStdPictureInfo; ++ ++ /* Current-frame top-level fields. */ ++ out->frame_num = spic->frame_num; ++ out->idr_pic_id = spic->idr_pic_id; /* may be overwritten by ++ slice-header parse. */ ++ out->top_field_order_cnt = spic->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_TOP]; ++ out->bottom_field_order_cnt= spic->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_BOTTOM]; ++ ++ /* nal_ref_idc: not in StdVideoDecodeH264PictureInfo. The caller derives ++ * it from the first byte of the slice NAL (high 2 bits after the ++ * forbidden-zero-bit). Hantro reads it via DECODE_PARAMS, so this ++ * SHOULD be set post-call. For non-reference frames the H.264 spec ++ * mandates nal_ref_idc == 0; we leave the field at zero and the caller ++ * patches in the parsed value. */ ++ out->nal_ref_idc = 0; ++ ++ __u32 f = 0; ++ if (spic->flags.IdrPicFlag) ++ f |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC; ++ if (spic->flags.field_pic_flag) ++ f |= V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC; ++ if (spic->flags.bottom_field_flag) ++ f |= V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD; ++ /* PFRAME/BFRAME flags are slice-type-derived and not 1:1 with Vulkan ++ * picture info (slice type is per-slice, not per-pic). Leave clear; ++ * the slice_header parse path or higher-level caller can OR them in if ++ * an FRAME_BASED driver needs them. Hantro G1 does not. */ ++ out->flags = f; ++ ++ /* DPB array. One entry per active reference slot. */ ++ for (uint32_t i = 0; i < num_ref_slots && i < V4L2_H264_NUM_DPB_ENTRIES; i++) { ++ const VkVideoReferenceSlotInfoKHR *slot = &ref_slots[i]; ++ struct v4l2_h264_dpb_entry *dpb = &out->dpb[i]; ++ ++ if (slot->slotIndex < 0) ++ continue; /* "no reference" sentinel; entry stays zeroed (invalid). 
*/ ++ ++ const uint32_t idx = (uint32_t) slot->slotIndex; ++ if (idx >= 16 || !vs->dpb[idx].valid) ++ continue; ++ ++ const StdVideoDecodeH264ReferenceInfo *rinfo = ref_info_from_slot(slot); ++ ++ dpb->reference_ts = vs->dpb[idx].reference_ts; ++ ++ if (rinfo != NULL) { ++ dpb->frame_num = rinfo->FrameNum; ++ /* pic_num: for short-term refs this is FrameNumWrap (H.264 §8.2.4.1); ++ * for long-term refs it's LongTermPicNum (§8.2.4.2). StdVideo ++ * doesn't separate the two — FrameNum holds whichever applies for ++ * the kind of reference. The kernel reflist builder uses pic_num ++ * only for short-term ordering; we feed FrameNum straight through ++ * and rely on V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM to disambiguate. ++ * ++ * NOTE: FrameNumWrap requires knowing max_frame_num and the ++ * current frame's frame_num to wrap. Vulkan-side callers that ++ * want spec-perfect pic_num for short-term refs should override ++ * this field after calling. The hantro driver ignores pic_num ++ * (uses reference_ts) so the wrap is empirically not load-bearing ++ * on RK3566/RK3568. ++ */ ++ dpb->pic_num = rinfo->FrameNum; ++ dpb->top_field_order_cnt = rinfo->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_TOP]; ++ dpb->bottom_field_order_cnt = rinfo->PicOrderCnt[STD_VIDEO_DECODE_H264_FIELD_ORDER_COUNT_BOTTOM]; ++ ++ __u32 dflags = V4L2_H264_DPB_ENTRY_FLAG_VALID | V4L2_H264_DPB_ENTRY_FLAG_ACTIVE; ++ if (rinfo->flags.used_for_long_term_reference) ++ dflags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM; ++ /* FIELD flag indicates a single-field-coded reference; both ++ * top_field_flag and bottom_field_flag in StdVideo mean the entry ++ * represents only that field. */ ++ if (rinfo->flags.top_field_flag || rinfo->flags.bottom_field_flag) ++ dflags |= V4L2_H264_DPB_ENTRY_FLAG_FIELD; ++ dpb->flags = dflags; ++ ++ /* fields: per kernel doc, valid values are V4L2_H264_{TOP,BOTTOM,FRAME}_REF. ++ * For frame-coded refs we use FRAME_REF (TOP|BOTTOM). 
The kernel ++ * reflist builder skips entries with fields == 0 — see hantro and ++ * the v4l2_h264_init_reflist_builder helper. */ ++ if (rinfo->flags.top_field_flag && !rinfo->flags.bottom_field_flag) ++ dpb->fields = V4L2_H264_TOP_FIELD_REF; ++ else if (rinfo->flags.bottom_field_flag && !rinfo->flags.top_field_flag) ++ dpb->fields = V4L2_H264_BOTTOM_FIELD_REF; ++ else ++ dpb->fields = V4L2_H264_FRAME_REF; ++ } else { ++ /* No StdVideoDecodeH264ReferenceInfo chained: minimal fallback. */ ++ dpb->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID; ++ dpb->fields = V4L2_H264_FRAME_REF; ++ } ++ } ++} +diff -urN a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c +--- a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c 2026-05-21 22:47:09.189957157 +0200 +@@ -0,0 +1,314 @@ ++/* ++ * H.264 slice header bit-parser implementation. ++ * ++ * Verbatim port of libva-v4l2-request-fourier src/h264_slice_header.c ++ * with the public symbol renamed to panvk_v4l2_h264_parse_slice_header() ++ * and the type names prefixed for Mesa namespace hygiene. See ++ * panvk_v4l2_h264_slice_header.h for context. 
++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "panvk_v4l2_h264_slice_header.h" ++ ++#include ++#include ++ ++struct br { ++ const uint8_t *data; ++ size_t length; /* bytes */ ++ size_t bit_pos; ++ bool error; ++}; ++ ++static uint32_t br_read_u(struct br *b, unsigned n) ++{ ++ uint32_t v = 0; ++ while (n--) { ++ if (b->bit_pos >= b->length * 8) { ++ b->error = true; ++ return 0; ++ } ++ v = (v << 1) | ((b->data[b->bit_pos >> 3] >> ++ (7 - (b->bit_pos & 7))) & 1u); ++ b->bit_pos++; ++ } ++ return v; ++} ++ ++static uint32_t br_read_ue(struct br *b) ++{ ++ unsigned zeros = 0; ++ while (br_read_u(b, 1) == 0) { ++ if (b->error || ++zeros >= 32) ++ return 0; ++ } ++ if (zeros == 0) ++ return 0; ++ return (1u << zeros) - 1u + br_read_u(b, zeros); ++} ++ ++static int32_t br_read_se(struct br *b) ++{ ++ uint32_t v = br_read_ue(b); ++ if (v & 1u) ++ return (int32_t)((v + 1u) >> 1); ++ return -(int32_t)(v >> 1); ++} ++ ++#define PANVK_H264_SLICE_HEADER_SCAN_BYTES 64 ++ ++static size_t rbsp_unescape(uint8_t *out, const uint8_t *in, size_t in_len) ++{ ++ size_t out_len = 0; ++ int zero_run = 0; ++ size_t i; ++ size_t cap = in_len < PANVK_H264_SLICE_HEADER_SCAN_BYTES ? ++ in_len : PANVK_H264_SLICE_HEADER_SCAN_BYTES; ++ ++ for (i = 0; i < cap; i++) { ++ if (zero_run >= 2 && in[i] == 0x03) { ++ zero_run = 0; ++ continue; ++ } ++ out[out_len++] = in[i]; ++ zero_run = (in[i] == 0x00) ? 
zero_run + 1 : 0; ++ } ++ return out_len; ++} ++ ++static void skip_ref_pic_list_modification(struct br *b, uint32_t slice_type) ++{ ++ uint32_t st_mod5 = slice_type % 5; ++ ++ if (st_mod5 != 2 && st_mod5 != 4) { ++ uint32_t flag = br_read_u(b, 1); ++ if (flag) { ++ uint32_t mod_idc; ++ do { ++ mod_idc = br_read_ue(b); ++ if (mod_idc == 0 || mod_idc == 1) ++ br_read_ue(b); ++ else if (mod_idc == 2) ++ br_read_ue(b); ++ if (b->error) ++ return; ++ } while (mod_idc != 3); ++ } ++ } ++ if (st_mod5 == 1) { ++ uint32_t flag = br_read_u(b, 1); ++ if (flag) { ++ uint32_t mod_idc; ++ do { ++ mod_idc = br_read_ue(b); ++ if (mod_idc == 0 || mod_idc == 1) ++ br_read_ue(b); ++ else if (mod_idc == 2) ++ br_read_ue(b); ++ if (b->error) ++ return; ++ } while (mod_idc != 3); ++ } ++ } ++} ++ ++static void skip_pred_weight_table(struct br *b, ++ uint32_t slice_type, ++ uint8_t chroma_format_idc, ++ uint8_t bit_depth_luma_minus8, ++ uint8_t bit_depth_chroma_minus8, ++ uint32_t num_ref_idx_l0_active_minus1, ++ uint32_t num_ref_idx_l1_active_minus1) ++{ ++ uint32_t i, j; ++ uint32_t st_mod5 = slice_type % 5; ++ ++ (void)bit_depth_luma_minus8; ++ (void)bit_depth_chroma_minus8; ++ ++ br_read_ue(b); /* luma_log2_weight_denom */ ++ if (chroma_format_idc != 0) ++ br_read_ue(b); /* chroma_log2_weight_denom */ ++ ++ for (i = 0; i <= num_ref_idx_l0_active_minus1 && !b->error; i++) { ++ uint32_t luma_weight_l0_flag = br_read_u(b, 1); ++ if (luma_weight_l0_flag) { ++ br_read_se(b); ++ br_read_se(b); ++ } ++ if (chroma_format_idc != 0) { ++ uint32_t chroma_weight_l0_flag = br_read_u(b, 1); ++ if (chroma_weight_l0_flag) { ++ for (j = 0; j < 2; j++) { ++ br_read_se(b); ++ br_read_se(b); ++ } ++ } ++ } ++ } ++ ++ if (st_mod5 == 1) { ++ for (i = 0; i <= num_ref_idx_l1_active_minus1 && !b->error; i++) { ++ uint32_t luma_weight_l1_flag = br_read_u(b, 1); ++ if (luma_weight_l1_flag) { ++ br_read_se(b); ++ br_read_se(b); ++ } ++ if (chroma_format_idc != 0) { ++ uint32_t chroma_weight_l1_flag = 
br_read_u(b, 1); ++ if (chroma_weight_l1_flag) { ++ for (j = 0; j < 2; j++) { ++ br_read_se(b); ++ br_read_se(b); ++ } ++ } ++ } ++ } ++ } ++} ++ ++int panvk_v4l2_h264_parse_slice_header( ++ const uint8_t *nal_payload, ++ size_t nal_payload_length, ++ const struct panvk_v4l2_h264_slice_header_context *ctx, ++ struct panvk_v4l2_h264_slice_header_info *out) ++{ ++ uint8_t unescaped[PANVK_H264_SLICE_HEADER_SCAN_BYTES]; ++ size_t unescaped_len; ++ struct br b = { 0 }; ++ bool idr_pic_flag = (ctx->nal_unit_type == 5); ++ uint32_t slice_type; ++ uint32_t num_ref_idx_l0_active_minus1; ++ uint32_t num_ref_idx_l1_active_minus1; ++ size_t pic_order_cnt_start; ++ size_t pic_order_cnt_end; ++ size_t dec_ref_pic_marking_start; ++ size_t dec_ref_pic_marking_end; ++ bool field_pic_flag = false; ++ ++ memset(out, 0, sizeof(*out)); ++ ++ if (!nal_payload || nal_payload_length == 0) ++ return -EINVAL; ++ ++ unescaped_len = rbsp_unescape(unescaped, nal_payload, nal_payload_length); ++ if (unescaped_len < 2) ++ return -EINVAL; ++ ++ b.data = unescaped; ++ b.length = unescaped_len; ++ b.bit_pos = 0; ++ b.error = false; ++ ++ out->first_mb_in_slice = br_read_ue(&b); ++ slice_type = br_read_ue(&b); ++ out->slice_type = slice_type; ++ out->pic_parameter_set_id = br_read_ue(&b); ++ ++ if (ctx->separate_colour_plane_flag) ++ (void)br_read_u(&b, 2); ++ ++ out->frame_num = br_read_u(&b, ctx->log2_max_frame_num_minus4 + 4u); ++ ++ if (!ctx->frame_mbs_only_flag) { ++ field_pic_flag = (br_read_u(&b, 1) != 0); ++ if (field_pic_flag) ++ (void)br_read_u(&b, 1); ++ } ++ ++ if (idr_pic_flag) ++ out->idr_pic_id = (uint16_t)br_read_ue(&b); ++ ++ pic_order_cnt_start = b.bit_pos; ++ if (ctx->pic_order_cnt_type == 0) { ++ out->pic_order_cnt_lsb = (uint16_t)br_read_u( ++ &b, ctx->log2_max_pic_order_cnt_lsb_minus4 + 4u); ++ if (ctx->bottom_field_pic_order_in_frame_present_flag && !field_pic_flag) ++ out->delta_pic_order_cnt_bottom = br_read_se(&b); ++ } else if (ctx->pic_order_cnt_type == 1 && ++ 
!ctx->delta_pic_order_always_zero_flag) { ++ out->delta_pic_order_cnt0 = br_read_se(&b); ++ if (ctx->bottom_field_pic_order_in_frame_present_flag && !field_pic_flag) ++ out->delta_pic_order_cnt1 = br_read_se(&b); ++ } ++ pic_order_cnt_end = b.bit_pos; ++ out->pic_order_cnt_bit_size = ++ (uint32_t)(pic_order_cnt_end - pic_order_cnt_start); ++ ++ if (ctx->redundant_pic_cnt_present_flag) ++ (void)br_read_ue(&b); ++ ++ if (slice_type % 5 == 1) ++ (void)br_read_u(&b, 1); ++ ++ num_ref_idx_l0_active_minus1 = ctx->num_ref_idx_l0_default_active_minus1; ++ num_ref_idx_l1_active_minus1 = ctx->num_ref_idx_l1_default_active_minus1; ++ ++ { ++ uint32_t st = slice_type % 5; ++ if (st == 0 || st == 3 || st == 1) { ++ uint32_t override = br_read_u(&b, 1); ++ if (override) { ++ num_ref_idx_l0_active_minus1 = br_read_ue(&b); ++ if (st == 1) ++ num_ref_idx_l1_active_minus1 = br_read_ue(&b); ++ } ++ } ++ } ++ ++ skip_ref_pic_list_modification(&b, slice_type); ++ if (b.error) ++ return -EIO; ++ ++ { ++ uint32_t st = slice_type % 5; ++ bool do_pwt = ++ (ctx->weighted_pred_flag && (st == 0 || st == 3)) || ++ (ctx->weighted_bipred_idc == 1 && st == 1); ++ if (do_pwt) { ++ skip_pred_weight_table(&b, slice_type, ++ ctx->chroma_format_idc, ++ ctx->bit_depth_luma_minus8, ++ ctx->bit_depth_chroma_minus8, ++ num_ref_idx_l0_active_minus1, ++ num_ref_idx_l1_active_minus1); ++ if (b.error) ++ return -EIO; ++ } ++ } ++ ++ dec_ref_pic_marking_start = b.bit_pos; ++ if (ctx->nal_ref_idc != 0) { ++ if (idr_pic_flag) { ++ (void)br_read_u(&b, 1); ++ (void)br_read_u(&b, 1); ++ } else { ++ uint32_t adaptive = br_read_u(&b, 1); ++ if (adaptive) { ++ uint32_t mmco; ++ do { ++ mmco = br_read_ue(&b); ++ if (mmco == 1 || mmco == 3) ++ br_read_ue(&b); ++ if (mmco == 2) ++ br_read_ue(&b); ++ if (mmco == 3 || mmco == 6) ++ br_read_ue(&b); ++ if (mmco == 4) ++ br_read_ue(&b); ++ if (b.error) ++ return -EIO; ++ } while (mmco != 0); ++ } ++ } ++ } ++ dec_ref_pic_marking_end = b.bit_pos; ++ 
out->dec_ref_pic_marking_bit_size = ++ (uint32_t)(dec_ref_pic_marking_end - dec_ref_pic_marking_start); ++ ++ if (b.error) ++ return -EIO; ++ ++ return 0; ++} +diff -urN a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h +--- a/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h 2026-05-21 22:47:09.189957157 +0200 +@@ -0,0 +1,94 @@ ++/* ++ * H.264 slice header bit-parser for panvk-bifrost-video / V4L2 stateless ++ * H.264 decode (hantro G1 on RK3566/RK3568 Mali-Bifrost SBCs). ++ * ++ * Extracts the slice-header bit-position and value fields that ++ * V4L2_CID_STATELESS_H264_DECODE_PARAMS requires (idr_pic_id, ++ * pic_order_cnt_lsb, delta_pic_order_cnt_*, pic_order_cnt_bit_size, ++ * dec_ref_pic_marking_bit_size). Vulkan's StdVideoDecodeH264PictureInfo ++ * does not carry these — they live only in the bitstream's slice_header() ++ * syntax. Hantro G1 (drivers/media/platform/verisilicon/ ++ * hantro_g1_h264_dec.c::set_params) writes the bit_size fields directly ++ * into MMIO registers G1_REG_DEC_CTRL5_REFPIC_MK_LEN and ++ * G1_REG_DEC_CTRL6_POC_LENGTH; with zeros the hardware bitstream parser ++ * walks past zero bits, lands on garbage, decodes nothing. ++ * ++ * Spec reference: ITU-T Rec. H.264 (08/2024) §7.3.3 slice_header ++ * and §7.3.3.1 ref_pic_list_modification, §7.3.3.2 pred_weight_table, ++ * §7.3.3.3 dec_ref_pic_marking. ++ * ++ * Cross-reference (proven working on hantro G1): libva-v4l2-request-fourier ++ * src/h264_slice_header.{c,h}. This file is a verbatim port with the ++ * function renamed from h264_parse_slice_header() to ++ * panvk_v4l2_h264_parse_slice_header() for namespace hygiene inside Mesa. 
++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#ifndef PANVK_V4L2_H264_SLICE_HEADER_H ++#define PANVK_V4L2_H264_SLICE_HEADER_H ++ ++#include ++#include ++#include ++ ++struct panvk_v4l2_h264_slice_header_context { ++ /* From SPS (the active SPS at slice-time). */ ++ bool separate_colour_plane_flag; ++ uint8_t log2_max_frame_num_minus4; ++ bool frame_mbs_only_flag; ++ uint8_t pic_order_cnt_type; ++ uint8_t log2_max_pic_order_cnt_lsb_minus4; ++ bool delta_pic_order_always_zero_flag; ++ ++ /* From PPS (the active PPS at slice-time). */ ++ bool bottom_field_pic_order_in_frame_present_flag; ++ bool redundant_pic_cnt_present_flag; ++ bool weighted_pred_flag; ++ uint8_t weighted_bipred_idc; ++ uint8_t num_ref_idx_l0_default_active_minus1; ++ uint8_t num_ref_idx_l1_default_active_minus1; ++ uint8_t chroma_format_idc; ++ uint8_t bit_depth_luma_minus8; ++ uint8_t bit_depth_chroma_minus8; ++ ++ /* From the NAL unit header (already extracted by the caller). */ ++ uint8_t nal_unit_type; ++ uint8_t nal_ref_idc; ++}; ++ ++struct panvk_v4l2_h264_slice_header_info { ++ uint16_t idr_pic_id; ++ uint16_t pic_order_cnt_lsb; ++ int32_t delta_pic_order_cnt_bottom; ++ int32_t delta_pic_order_cnt0; ++ int32_t delta_pic_order_cnt1; ++ uint32_t pic_order_cnt_bit_size; ++ uint32_t dec_ref_pic_marking_bit_size; ++ ++ /* Diagnostic — useful for cross-checking pre-parsed vs bitstream values. */ ++ uint32_t first_mb_in_slice; ++ uint32_t slice_type; ++ uint32_t pic_parameter_set_id; ++ uint32_t frame_num; ++}; ++ ++/* ++ * Parse slice_header() up to dec_ref_pic_marking() (inclusive) of the ++ * H.264 RBSP slice_layer_without_partitioning_rbsp() syntax, extracting ++ * the V4L2 DECODE_PARAMS fields. Returns 0 on success, negative ++ * errno-shaped value on parse failure. ++ * ++ * @nal_payload: pointer to the byte AFTER the NAL header byte ++ * (i.e. start of the RBSP proper; caller has already ++ * skipped any ANNEX_B start code and the 1-byte ++ * nal_unit_header). 
Will be RBSP-unescaped internally. ++ * @nal_payload_length: bytes available at @nal_payload. ++ */ ++int panvk_v4l2_h264_parse_slice_header( ++ const uint8_t *nal_payload, ++ size_t nal_payload_length, ++ const struct panvk_v4l2_h264_slice_header_context *ctx, ++ struct panvk_v4l2_h264_slice_header_info *out); ++ ++#endif /* PANVK_V4L2_H264_SLICE_HEADER_H */ +diff -urN a/src/panfrost/vulkan/panvk_video_decode.c b/src/panfrost/vulkan/panvk_video_decode.c +--- a/src/panfrost/vulkan/panvk_video_decode.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/src/panfrost/vulkan/panvk_video_decode.c 2026-05-21 22:47:09.189957157 +0200 +@@ -0,0 +1,362 @@ ++/* ++ * panvk-bifrost-video Phase 4 commit 7b: ++ * Vulkan-side decode dispatch wired to V4L2 hantro via dmabuf. ++ * ++ * Phase 1 simplification: cmd_buffer state tracking via DEVICE-level ++ * active_video struct (under a mutex). Per-cmdbuf state hand-off is ++ * Phase >>1 once arch-agnostic source can access per-arch cmd_buffer ++ * structs without the include-path gymnastics. This works for ++ * single-session decode workloads (mpv, ffmpeg, vk-video-samples). 
++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "panvk_video_decode.h" ++#include "panvk_v4l2_h264_slice_header.h" ++#include "panvk_buffer.h" ++#include "panvk_device.h" ++#include "panvk_device_memory.h" ++#include "panvk_entrypoints.h" ++#include "panvk_image.h" ++ ++#include "vk_image.h" ++ ++#include "vk_alloc.h" ++#include "vk_command_buffer.h" ++#include "vk_log.h" ++#include "vk_video.h" ++ ++#include "util/macros.h" ++ ++#include "kmod/pan_kmod.h" ++ ++#include ++#include ++#include ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++panvk_CreateVideoSessionKHR(VkDevice _device, ++ const VkVideoSessionCreateInfoKHR *pCreateInfo, ++ const VkAllocationCallbacks *pAllocator, ++ VkVideoSessionKHR *pVideoSession) ++{ ++ VK_FROM_HANDLE(panvk_device, device, _device); ++ ++ struct panvk_video_session *vs = ++ vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*vs), 8, ++ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); ++ if (!vs) ++ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); ++ ++ VkResult r = vk_video_session_init(&device->vk, &vs->vk, pCreateInfo); ++ if (r != VK_SUCCESS) { ++ vk_free2(&device->vk.alloc, pAllocator, vs); ++ return r; ++ } ++ ++ vs->video_fd = -1; ++ vs->media_fd = -1; ++ vs->slice_based = false; ++ ++ int v4l2_rc = panvk_v4l2_session_init(vs, &device->vk, pAllocator, ++ pCreateInfo->maxCodedExtent.width, ++ pCreateInfo->maxCodedExtent.height); ++ if (v4l2_rc) { ++ mesa_loge("panvk_video: V4L2 session init failed rc=%d", v4l2_rc); ++ vk_video_session_finish(&vs->vk); ++ vk_free2(&device->vk.alloc, pAllocator, vs); ++ return vk_error(device, VK_ERROR_INITIALIZATION_FAILED); ++ } ++ ++ *pVideoSession = panvk_video_session_to_handle(vs); ++ return VK_SUCCESS; ++} ++ ++VKAPI_ATTR void VKAPI_CALL ++panvk_DestroyVideoSessionKHR(VkDevice _device, ++ VkVideoSessionKHR videoSession, ++ const VkAllocationCallbacks *pAllocator) ++{ ++ VK_FROM_HANDLE(panvk_device, device, _device); ++ VK_FROM_HANDLE(panvk_video_session, vs, videoSession); ++ if (!vs) return; ++ 
++ /* Clear device-level active state if it was this session. */ ++ simple_mtx_lock(&device->active_video.lock); ++ if (device->active_video.vs == vs) { ++ device->active_video.vs = NULL; ++ device->active_video.params = NULL; ++ } ++ simple_mtx_unlock(&device->active_video.lock); ++ ++ panvk_v4l2_session_finish(vs, &device->vk, pAllocator); ++ vk_video_session_finish(&vs->vk); ++ vk_free2(&device->vk.alloc, pAllocator, vs); ++} ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++panvk_GetVideoSessionMemoryRequirementsKHR( ++ VkDevice device, ++ VkVideoSessionKHR videoSession, ++ uint32_t *pMemoryRequirementsCount, ++ VkVideoSessionMemoryRequirementsKHR *pMemoryRequirements) ++{ ++ *pMemoryRequirementsCount = 0; ++ return VK_SUCCESS; ++} ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++panvk_BindVideoSessionMemoryKHR( ++ VkDevice device, ++ VkVideoSessionKHR videoSession, ++ uint32_t bindSessionMemoryInfoCount, ++ const VkBindVideoSessionMemoryInfoKHR *pBindSessionMemoryInfos) ++{ ++ return VK_SUCCESS; ++} ++ ++/* Helper: device lookup from VkCommandBuffer via the vk_command_buffer base. 
*/
++static struct panvk_device *
++cmdbuf_to_device(VkCommandBuffer commandBuffer)
++{
++   VK_FROM_HANDLE(vk_command_buffer, vk_cmdbuf, commandBuffer);
++   return to_panvk_device(vk_cmdbuf->base.device);
++}
++
++VKAPI_ATTR void VKAPI_CALL
++panvk_CmdBeginVideoCodingKHR(VkCommandBuffer commandBuffer,
++                             const VkVideoBeginCodingInfoKHR *pBeginInfo)
++{
++   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
++   VK_FROM_HANDLE(panvk_video_session, vs, pBeginInfo->videoSession);
++
++   simple_mtx_lock(&device->active_video.lock);
++   device->active_video.vs = vs;
++   if (pBeginInfo->videoSessionParameters != VK_NULL_HANDLE) {
++      VK_FROM_HANDLE(vk_video_session_parameters, params,
++                     pBeginInfo->videoSessionParameters);
++      device->active_video.params = params;
++   } else {
++      device->active_video.params = NULL;
++   }
++   simple_mtx_unlock(&device->active_video.lock);
++}
++
++VKAPI_ATTR void VKAPI_CALL
++panvk_CmdEndVideoCodingKHR(VkCommandBuffer commandBuffer,
++                           const VkVideoEndCodingInfoKHR *pEndCodingInfo)
++{
++   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
++   simple_mtx_lock(&device->active_video.lock);
++   device->active_video.vs = NULL;
++   device->active_video.params = NULL;
++   simple_mtx_unlock(&device->active_video.lock);
++}
++
++VKAPI_ATTR void VKAPI_CALL
++panvk_CmdControlVideoCodingKHR(VkCommandBuffer commandBuffer,
++                               const VkVideoCodingControlInfoKHR *pCodingControlInfo)
++{
++   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
++   simple_mtx_lock(&device->active_video.lock);
++   if (device->active_video.vs &&
++       (pCodingControlInfo->flags & VK_VIDEO_CODING_CONTROL_RESET_BIT_KHR)) {
++      for (unsigned i = 0; i < 16; i++)
++         device->active_video.vs->dpb[i].valid = false;
++   }
++   simple_mtx_unlock(&device->active_video.lock);
++}
++
++VKAPI_ATTR void VKAPI_CALL
++panvk_CmdDecodeVideoKHR(VkCommandBuffer commandBuffer,
++                        const VkVideoDecodeInfoKHR *pDecodeInfo)
++{
++   struct panvk_device *device = cmdbuf_to_device(commandBuffer);
++
++   simple_mtx_lock(&device->active_video.lock);
++   struct panvk_video_session *vs = device->active_video.vs;
++   struct vk_video_session_parameters *params = device->active_video.params;
++   simple_mtx_unlock(&device->active_video.lock);
++
++   if (!vs || !params) {
++      mesa_loge("panvk_video: CmdDecodeVideoKHR outside Begin/End scope");
++      return;
++   }
++
++   const VkVideoDecodeH264PictureInfoKHR *h264_pi =
++      vk_find_struct_const(pDecodeInfo->pNext,
++                           VIDEO_DECODE_H264_PICTURE_INFO_KHR);
++   if (!h264_pi || !h264_pi->pStdPictureInfo) {
++      mesa_loge("panvk_video: missing H.264 picture info");
++      return;
++   }
++
++   const StdVideoH264SequenceParameterSet *sps =
++      vk_video_find_h264_dec_std_sps(params,
++         h264_pi->pStdPictureInfo->seq_parameter_set_id);
++   const StdVideoH264PictureParameterSet *pps =
++      vk_video_find_h264_dec_std_pps(params,
++         h264_pi->pStdPictureInfo->pic_parameter_set_id);
++   if (!sps || !pps) {
++      mesa_loge("panvk_video: SPS or PPS lookup failed");
++      return;
++   }
++
++   /* Translate Std → V4L2 control structs. */
++   struct v4l2_ctrl_h264_sps c_sps;
++   struct v4l2_ctrl_h264_pps c_pps;
++   struct v4l2_ctrl_h264_scaling_matrix c_scaling;
++   struct v4l2_ctrl_h264_decode_params c_dec;
++
++   panvk_v4l2_h264_std_to_ctrl_sps(sps, &c_sps);
++   panvk_v4l2_h264_std_to_ctrl_pps(pps, &c_pps);
++   panvk_v4l2_h264_default_flat_scaling_matrix(&c_scaling);
++
++   /*
++    * output_ts: V4L2 buffer-identity stamp. Must round-trip cleanly through
++    * (tv_sec, tv_usec) at QBUF time, because hantro's reflist builder
++    * matches dpb[i].reference_ts against the kernel-side CAPTURE timestamp
++    * (which is the OUTPUT-QBUF timestamp re-derived via v4l2_timeval_to_ns:
++    * `tv_sec * 1e9 + tv_usec * 1e3`). Sub-microsecond bits are dropped, so
++    * any high-resolution stamp (e.g. a 64-bit pointer cast) makes the
++    * lookup miss and P/B frames decode against zero references. Use a
++    * monotonic counter in microseconds (* 1000 ns); static, not per-session.
++    */
++   static uint32_t panvk_video_ts_counter = 0;
++   const uint64_t output_ts = ((uint64_t)++panvk_video_ts_counter) * 1000ULL;
++   uint32_t dst_dpb_slot = pDecodeInfo->pSetupReferenceSlot
++      ? (uint32_t) pDecodeInfo->pSetupReferenceSlot->slotIndex : 0u;
++
++   panvk_v4l2_h264_build_decode_params(vs, h264_pi, pps,
++                                       dst_dpb_slot,
++                                       pDecodeInfo->pReferenceSlots,
++                                       pDecodeInfo->referenceSlotCount,
++                                       output_ts, &c_dec);
++
++   /* Resolve source bitstream CPU pointer via panvk_buffer.mem.addr.host. */
++   VK_FROM_HANDLE(panvk_buffer, src_buf, pDecodeInfo->srcBuffer);
++   if (!src_buf || !src_buf->mem || !src_buf->mem->addr.host) {
++      mesa_loge("panvk_video: src buffer has no host map");
++      return;
++   }
++   const void *src_bitstream =
++      (const uint8_t *) src_buf->mem->addr.host +
++      src_buf->mem_offset + pDecodeInfo->srcBufferOffset;
++
++   /*
++    * Slice-header bit-level parse — recovers the DECODE_PARAMS fields
++    * that StdVideoDecodeH264PictureInfo doesn't carry: idr_pic_id,
++    * pic_order_cnt_lsb, delta_pic_order_cnt_*, pic_order_cnt_bit_size,
++    * dec_ref_pic_marking_bit_size, and nal_ref_idc. Hantro G1 writes the
++    * bit_size fields directly into MMIO registers G1_REG_DEC_CTRL5/CTRL6;
++    * with zeros the hardware bitstream parser walks past zero bits, lands
++    * on garbage, and decodes all-zero pixels — observed empirically as the
++    * "Y plane all zeros" symptom that closed the prior Commit 7e.
++    *
++    * Cross-reference (proven fix on hantro): libva-v4l2-request-fourier
++    * src/h264.c:394-449. The panvk_v4l2_h264_slice_header.{c,h} parser
++    * is a verbatim port with namespace renames.
++    *
++    * Expects ANNEX_B start-code-prefixed VCL NAL at *src_bitstream*. We
++    * skip the 3- or 4-byte start code then the 1-byte NAL header.
++    */
++   {
++      const uint8_t *bs = (const uint8_t *) src_bitstream;
++      uint32_t bs_len = pDecodeInfo->srcBufferRange;
++      uint32_t off = 0;
++      /* Skip ANNEX_B start code (0x00 00 01 or 0x00 00 00 01). */
++      if (bs_len >= 4 && bs[0] == 0 && bs[1] == 0 && bs[2] == 0 && bs[3] == 1)
++         off = 4;
++      else if (bs_len >= 3 && bs[0] == 0 && bs[1] == 0 && bs[2] == 1)
++         off = 3;
++
++      if (bs_len > off + 1) {
++         uint8_t nal_hdr = bs[off];
++         uint8_t nal_ref_idc = (nal_hdr >> 5) & 0x3;
++         uint8_t nal_unit_type = nal_hdr & 0x1f;
++
++         const struct panvk_v4l2_h264_slice_header_context sh_ctx = {
++            .separate_colour_plane_flag =
++               (sps->flags.separate_colour_plane_flag != 0),
++            .log2_max_frame_num_minus4 = sps->log2_max_frame_num_minus4,
++            .frame_mbs_only_flag = (sps->flags.frame_mbs_only_flag != 0),
++            .pic_order_cnt_type = (uint8_t) sps->pic_order_cnt_type,
++            .log2_max_pic_order_cnt_lsb_minus4 =
++               sps->log2_max_pic_order_cnt_lsb_minus4,
++            .delta_pic_order_always_zero_flag =
++               (sps->flags.delta_pic_order_always_zero_flag != 0),
++            .bottom_field_pic_order_in_frame_present_flag =
++               (pps->flags.bottom_field_pic_order_in_frame_present_flag != 0),
++            .redundant_pic_cnt_present_flag =
++               (pps->flags.redundant_pic_cnt_present_flag != 0),
++            .weighted_pred_flag =
++               (pps->flags.weighted_pred_flag != 0),
++            .weighted_bipred_idc = (uint8_t) pps->weighted_bipred_idc,
++            .num_ref_idx_l0_default_active_minus1 =
++               pps->num_ref_idx_l0_default_active_minus1,
++            .num_ref_idx_l1_default_active_minus1 =
++               pps->num_ref_idx_l1_default_active_minus1,
++            .chroma_format_idc = (uint8_t) sps->chroma_format_idc,
++            .bit_depth_luma_minus8 = sps->bit_depth_luma_minus8,
++            .bit_depth_chroma_minus8 = sps->bit_depth_chroma_minus8,
++            .nal_unit_type = nal_unit_type,
++            .nal_ref_idc = nal_ref_idc,
++         };
++         struct panvk_v4l2_h264_slice_header_info sh = { 0 };
++         const uint8_t *nal_payload = bs + off + 1; /* past NAL header byte */
++         uint32_t nal_payload_len = bs_len - (off + 1);
++
++         int sh_rc = panvk_v4l2_h264_parse_slice_header(
++            nal_payload, nal_payload_len, &sh_ctx, &sh);
++         if (sh_rc == 0) {
++            c_dec.idr_pic_id = sh.idr_pic_id;
++            c_dec.pic_order_cnt_lsb = sh.pic_order_cnt_lsb;
++            c_dec.delta_pic_order_cnt_bottom = sh.delta_pic_order_cnt_bottom;
++            c_dec.delta_pic_order_cnt0 = sh.delta_pic_order_cnt0;
++            c_dec.delta_pic_order_cnt1 = sh.delta_pic_order_cnt1;
++            c_dec.pic_order_cnt_bit_size = sh.pic_order_cnt_bit_size;
++            c_dec.dec_ref_pic_marking_bit_size =
++               sh.dec_ref_pic_marking_bit_size;
++            c_dec.nal_ref_idc = nal_ref_idc;
++            /*
++             * IDR_PIC flag: Vulkan's StdVideoDecodeH264PictureInfo.flags.
++             * IdrPicFlag is application-supplied and the vk-video-samples
++             * parser leaves it zero. Recover it from nal_unit_type (==5 is
++             * IDR per H.264 §7.4.1). Without this flag set, hantro's
++             * VDPU_REG_IDR_PIC_E stays clear and the hardware treats the
++             * frame as P/B, hunts for references it doesn't have, and
++             * writes zero output.
++             */
++            if (nal_unit_type == 5)
++               c_dec.flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC;
++         } else {
++            mesa_loge("panvk_video: slice_header parse FAILED rc=%d "
++                      "(payload_len=%u) — DECODE_PARAMS bit_size fields "
++                      "left zero, hantro will produce zeros",
++                      sh_rc, nal_payload_len);
++         }
++      } else {
++         mesa_loge("panvk_video: bitstream too short for NAL header "
++                   "(bs_len=%u off=%u)", bs_len, off);
++      }
++   }
++
++   /* The 14-step ioctl dance synchronously. CPU-copy variant for Phase 1. */
++   int rc = panvk_v4l2_submit_h264_decode(vs, &c_sps, &c_pps, &c_scaling,
++                                          &c_dec,
++                                          src_bitstream,
++                                          pDecodeInfo->srcBufferRange,
++                                          -1, /* dst unused (MMAP CAPTURE) */
++                                          output_ts);
++
++   if (rc) {
++      mesa_loge("panvk_video: decode submit failed rc=%d", rc);
++      return;
++   }
++
++   /* Update DPB tracking; no setup slot means slot 0 must NOT be clobbered. */
++   if (pDecodeInfo->pSetupReferenceSlot && dst_dpb_slot < 16) {
++      vs->dpb[dst_dpb_slot].valid = true;
++      vs->dpb[dst_dpb_slot].reference_ts = output_ts;
++   }
++}
+diff -urN a/src/panfrost/vulkan/panvk_video_decode.h b/src/panfrost/vulkan/panvk_video_decode.h
+--- a/src/panfrost/vulkan/panvk_video_decode.h 1970-01-01 01:00:00.000000000 +0100
++++ b/src/panfrost/vulkan/panvk_video_decode.h 2026-05-21 22:47:09.189957157 +0200
+@@ -0,0 +1,114 @@
++/*
++ * panvk-bifrost-video Phase 4 commit 3: extended for V4L2 state.
++ *
++ * SPDX-License-Identifier: MIT
++ */
++
++#ifndef PANVK_VIDEO_DECODE_H
++#define PANVK_VIDEO_DECODE_H
++
++#include "vk_video.h"
++#include "vk_object.h"
++
++#include
++
++/* Forward decls */
++struct panvk_device;
++struct vk_device;
++
++/* iter1: per-session state. Wraps vk_video_session for spec-mandated fields. */
++struct panvk_video_session {
++   struct vk_video_session vk;
++
++   /* V4L2 fds — opened in Commit 3 (per-session). -1 means not opened. */
++   int video_fd;
++   int media_fd;
++
++   /* Negotiated formats per OUTPUT / CAPTURE queue */
++   struct v4l2_format fmt_output;
++   struct v4l2_format fmt_capture;
++
++   /* Request fd pool. PANVK_V4L2_REQUEST_FD_COUNT entries. */
++   int *request_fds;
++   bool request_fd_used[32]; /* tracks per-fd "ever queued" → REINIT before reuse */
++   unsigned num_request_fds;
++   uint32_t request_fd_next; /* round-robin index */
++
++   /* DPB slotIndex → V4L2 reference_ts mapping (Phase 1 D5) */
++   struct {
++      bool valid;
++      uint64_t reference_ts;
++   } dpb[16];
++
++   /* Phase 1 lock — FRAME_BASED only. */
++   bool slice_based;
++
++   /* Multi-planar V4L2 buffer type? Detected at session init via
++    * V4L2_CAP_VIDEO_M2M_MPLANE. Hantro: true. rkvdec on rk3399: false. */
++   bool mplane;
++
++   /* iter1 commit 7c: V4L2 buffer counts + round-robin indices.
++    * Both queues use MMAP; bitstream copied CPU-side from VkBuffer host map.
*/ ++ uint32_t num_output_buffers; ++ uint32_t output_next; ++ void *output_map[18]; /* mmap'd OUTPUT buffer CPU pointers */ ++ uint32_t output_map_size[18]; ++ uint32_t num_capture_buffers; ++ uint32_t capture_next; ++ void *capture_map[18]; /* mmap'd CAPTURE buffer CPU pointers */ ++ uint32_t capture_map_size[18]; ++}; ++ ++VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_video_session, vk.base, VkVideoSessionKHR, ++ VK_OBJECT_TYPE_VIDEO_SESSION_KHR) ++ ++/* panvk_v4l2.c API */ ++bool panvk_v4l2_probe_hantro(void); ++int panvk_v4l2_session_init(struct panvk_video_session *vs, ++ struct vk_device *vk_dev, ++ const VkAllocationCallbacks *alloc, ++ uint32_t width, uint32_t height); ++void panvk_v4l2_session_finish(struct panvk_video_session *vs, ++ struct vk_device *vk_dev, ++ const VkAllocationCallbacks *alloc); ++ ++/* 14-step ioctl dance for one H.264 frame. */ ++struct v4l2_ctrl_h264_sps; ++struct v4l2_ctrl_h264_pps; ++struct v4l2_ctrl_h264_scaling_matrix; ++struct v4l2_ctrl_h264_decode_params; ++ ++int panvk_v4l2_submit_h264_decode( ++ struct panvk_video_session *vs, ++ const struct v4l2_ctrl_h264_sps *sps, ++ const struct v4l2_ctrl_h264_pps *pps, ++ const struct v4l2_ctrl_h264_scaling_matrix *scaling, ++ const struct v4l2_ctrl_h264_decode_params *dec, ++ const void *src_bitstream, uint32_t src_bytes, ++ int dst_dmabuf_fd_unused, ++ uint64_t qbuf_ts); ++ ++/* panvk_v4l2_h264.c — Std → V4L2 control translation API (signatures ++ * use full types; consumers must include vk_video headers before this. 
*/ ++void panvk_v4l2_h264_std_to_ctrl_sps( ++ const StdVideoH264SequenceParameterSet *in, ++ struct v4l2_ctrl_h264_sps *out); ++void panvk_v4l2_h264_std_to_ctrl_pps( ++ const StdVideoH264PictureParameterSet *in, ++ struct v4l2_ctrl_h264_pps *out); ++void panvk_v4l2_h264_std_to_ctrl_scaling_matrix( ++ const StdVideoH264ScalingLists *in, ++ struct v4l2_ctrl_h264_scaling_matrix *out); ++void panvk_v4l2_h264_default_flat_scaling_matrix( ++ struct v4l2_ctrl_h264_scaling_matrix *out); ++void panvk_v4l2_h264_build_decode_params( ++ const struct panvk_video_session *vs, ++ const VkVideoDecodeH264PictureInfoKHR *pic_info, ++ const StdVideoH264PictureParameterSet *active_pps, ++ uint32_t dst_dpb_slot, ++ const VkVideoReferenceSlotInfoKHR *ref_slots, ++ uint32_t num_ref_slots, ++ uint64_t output_ts, ++ struct v4l2_ctrl_h264_decode_params *out); ++ ++#endif /* PANVK_VIDEO_DECODE_H */ +diff -urN a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c +--- a/src/panfrost/vulkan/panvk_vX_device.c 2026-05-21 22:46:57.505785441 +0200 ++++ b/src/panfrost/vulkan/panvk_vX_device.c 2026-05-21 22:47:09.189957157 +0200 +@@ -203,6 +203,27 @@ + } + } + ++/* iter1: translate Vulkan-visible queueFamilyIndex to panvk enum. ++ * Returns PANVK_QUEUE_FAMILY_COUNT on invalid input. 
*/ ++static inline enum panvk_queue_family ++panvk_per_arch(vulkan_qfi_to_panvk)(struct panvk_physical_device *physical_device, ++ uint32_t vulkan_qfi) ++{ ++ uint32_t pos = 0; ++ for (uint32_t i = 0; i < PANVK_QUEUE_FAMILY_COUNT; i++) { ++ if (i == PANVK_QUEUE_FAMILY_BIND && ++ !physical_device->vk.supported_features.sparseBinding) ++ continue; ++ if (i == PANVK_QUEUE_FAMILY_VIDEO_DECODE && ++ !physical_device->vk.supported_extensions.KHR_video_queue) ++ continue; ++ if (pos == vulkan_qfi) ++ return (enum panvk_queue_family) i; ++ pos++; ++ } ++ return PANVK_QUEUE_FAMILY_COUNT; ++} ++ + static VkResult + check_global_priority(const struct panvk_physical_device *phys_dev, + const VkDeviceQueueCreateInfo *create_info) +@@ -215,7 +236,10 @@ + priority_info ? priority_info->globalPriority + : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + +- switch (create_info->queueFamilyIndex) { ++ const enum panvk_queue_family panvk_qfi = ++ panvk_per_arch(vulkan_qfi_to_panvk)( ++ (struct panvk_physical_device *) phys_dev, create_info->queueFamilyIndex); ++ switch (panvk_qfi) { + case PANVK_QUEUE_FAMILY_GPU: { + enum pan_kmod_group_allow_priority_flags requested_prio = + global_priority_to_group_allow_priority_flag(priority); +@@ -242,6 +266,12 @@ + return VK_ERROR_NOT_PERMITTED_KHR; + } + ++ case PANVK_QUEUE_FAMILY_VIDEO_DECODE: ++ /* iter1: only MEDIUM priority for now */ ++ return priority == VK_QUEUE_GLOBAL_PRIORITY_MEDIUM ++ ? 
VK_SUCCESS ++ : VK_ERROR_NOT_PERMITTED_KHR; ++ + default: + UNREACHABLE("Unknown queue family"); + } +@@ -250,11 +280,20 @@ + static VkResult + panvk_queue_check_status(struct vk_queue *queue) + { +- switch (queue->queue_family_index) { ++ struct panvk_device *dev = ++ container_of(queue->base.device, struct panvk_device, vk); ++ struct panvk_physical_device *pdev = ++ to_panvk_physical_device(dev->vk.physical); ++ const enum panvk_queue_family panvk_qfi = ++ panvk_per_arch(vulkan_qfi_to_panvk)(pdev, queue->queue_family_index); ++ switch (panvk_qfi) { + case PANVK_QUEUE_FAMILY_GPU: + return panvk_per_arch(gpu_queue_check_status)(queue); + case PANVK_QUEUE_FAMILY_BIND: + return panvk_per_arch(bind_queue_check_status)(queue); ++ case PANVK_QUEUE_FAMILY_VIDEO_DECODE: ++ /* iter1: stub — commit 4 implements real status check. */ ++ return VK_SUCCESS; + default: + UNREACHABLE("Unknown queue family"); + } +@@ -297,18 +336,52 @@ + } + + static VkResult ++panvk_video_queue_submit_noop(struct vk_queue *queue, ++ struct vk_queue_submit *submit) ++{ ++ /* All decode work was done synchronously in vkCmdDecodeVideoKHR; the ++ * queue-side submit only has to satisfy the Vulkan fence/semaphore ++ * contract by signaling everything. Waits are guaranteed satisfied by ++ * the time the runtime calls us. 
*/ ++ return vk_sync_signal_many(queue->base.device, submit->signal_count, ++ submit->signals); ++} ++ ++static VkResult + panvk_queue_create(struct panvk_device *dev, + const VkDeviceQueueCreateInfo *create_info, + uint32_t queue_idx, + struct vk_queue **out_queue) + { +- switch (create_info->queueFamilyIndex) { ++ struct panvk_physical_device *pdev = ++ to_panvk_physical_device(dev->vk.physical); ++ const enum panvk_queue_family panvk_qfi = ++ panvk_per_arch(vulkan_qfi_to_panvk)(pdev, create_info->queueFamilyIndex); ++ switch (panvk_qfi) { + case PANVK_QUEUE_FAMILY_GPU: + return panvk_per_arch(create_gpu_queue)( + dev, create_info, queue_idx, out_queue); + case PANVK_QUEUE_FAMILY_BIND: + return panvk_per_arch(create_bind_queue)( + dev, create_info, queue_idx, out_queue); ++ case PANVK_QUEUE_FAMILY_VIDEO_DECODE: { ++ /* Decode work is fully synchronous at record time (CmdDecodeVideoKHR ++ * drives the V4L2 14-step dance to completion). At submit time there ++ * is nothing left to dispatch, so we honor the Vulkan contract by ++ * just signaling everything. 
*/ ++ struct vk_queue *vkq = vk_zalloc(&dev->vk.alloc, sizeof(*vkq), 8, ++ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); ++ if (!vkq) ++ return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); ++ VkResult vqr = vk_queue_init(vkq, &dev->vk, create_info, queue_idx); ++ if (vqr != VK_SUCCESS) { ++ vk_free(&dev->vk.alloc, vkq); ++ return panvk_error(dev, vqr); ++ } ++ vkq->driver_submit = panvk_video_queue_submit_noop; ++ *out_queue = vkq; ++ return VK_SUCCESS; ++ } + default: + return panvk_error(dev, VK_ERROR_INITIALIZATION_FAILED); + } +@@ -317,13 +390,26 @@ + static void + panvk_queue_destroy(struct vk_queue *queue) + { +- switch (queue->queue_family_index) { ++ struct panvk_device *dev = ++ container_of(queue->base.device, struct panvk_device, vk); ++ struct panvk_physical_device *pdev = ++ to_panvk_physical_device(dev->vk.physical); ++ const enum panvk_queue_family panvk_qfi = ++ panvk_per_arch(vulkan_qfi_to_panvk)(pdev, queue->queue_family_index); ++ switch (panvk_qfi) { + case PANVK_QUEUE_FAMILY_GPU: + panvk_per_arch(destroy_gpu_queue)(queue); + break; + case PANVK_QUEUE_FAMILY_BIND: + panvk_per_arch(destroy_bind_queue)(queue); + break; ++ case PANVK_QUEUE_FAMILY_VIDEO_DECODE: { ++ struct panvk_device *dev = ++ container_of(queue->base.device, struct panvk_device, vk); ++ vk_queue_finish(queue); ++ vk_free(&dev->vk.alloc, queue); ++ break; ++ } + default: + UNREACHABLE("Unknown queue family"); + } +@@ -511,6 +597,7 @@ + vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd); + + ++ simple_mtx_init(&device->active_video.lock, mtx_plain); + result = panvk_precomp_init(device); + if (result != VK_SUCCESS) + goto err_free_priv_bos; +@@ -542,7 +629,13 @@ + if (result != VK_SUCCESS) + goto err_finish_queues; + +- uint32_t qfi = queue_create->queueFamilyIndex; ++ uint32_t vulkan_qfi = queue_create->queueFamilyIndex; ++ enum panvk_queue_family qfi = ++ panvk_per_arch(vulkan_qfi_to_panvk)(physical_device, vulkan_qfi); ++ if (qfi >= PANVK_QUEUE_FAMILY_COUNT) { ++ result = 
panvk_error(device, VK_ERROR_INITIALIZATION_FAILED); ++ goto err_finish_queues; ++ } + struct panvk_device_queue_family *qf = &device->queue_families[qfi]; + + qf->queues = +diff -urN a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c +--- a/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-21 22:46:59.273811425 +0200 ++++ b/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-21 22:47:09.189957157 +0200 +@@ -170,6 +170,9 @@ + .EXT_queue_family_foreign = true, + .EXT_robustness2 = true, + .EXT_transform_feedback = PAN_ARCH < 9, /* iter13: JM-class only for now */ ++ .KHR_video_queue = PAN_ARCH < 9, /* panvk-bifrost-video Phase 4 commit 1 */ ++ .KHR_video_decode_queue = PAN_ARCH < 9, /* hantro V4L2-stateless backend */ ++ .KHR_video_decode_h264 = PAN_ARCH < 9, /* H.264 only initially */ + .EXT_sampler_filter_minmax = PAN_ARCH >= 10, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, diff --git a/arch/mesa-panvk-bifrost-video/PKGBUILD b/arch/mesa-panvk-bifrost-video/PKGBUILD new file mode 100644 index 000000000..4224d21f1 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/PKGBUILD @@ -0,0 +1,181 @@ +# Maintainer: Markus Fritsche +# +# mesa-panvk-bifrost-video — sibling of mesa-panvk-bifrost (r4) that adds +# VK_KHR_video_decode_h264 on Mali Bifrost SBCs (PAN_ARCH 6/7) backed by +# the SoC's V4L2-stateless hantro VPU (RK3566/RK3568). +# +# Campaign: ~/src/panvk-bifrost-video/ — Phase 4 byte-exact validated +# 2026-05-21 (48/48 BBB display frames match ffmpeg+libva-v4l2-request- +# fourier byte-for-byte on the same hantro). Phase 5 second-model review +# completed; load-bearing findings (output_map OOB, static counter, +# session_init unwind, probe_hantro gate) all applied. 
+# +# What it does (on top of r4): +# - 0001..0004: inherited from mesa-panvk-bifrost (robustness2/null- +# descriptor, vk1.1/1.2 advertisement, EXT_transform_feedback, XFB +# primitive decomposition) — symlinked from the r4 package directory +# so the patches don't drift between siblings. +# - 0005: VK_KHR_video_queue + VK_KHR_video_decode_queue + +# VK_KHR_video_decode_h264 backed by V4L2-stateless hantro. +# Touches 14 files in src/panfrost/vulkan/; full diff in +# 0005-panvk-bifrost-video-KHR-video-decode-h264.patch. +# +# Co-existence: +# - Installs to /usr/lib/panvk-bifrost-video/ (parallel to r4's +# /usr/lib/panvk-bifrost/). Pick at runtime via VK_ICD_FILENAMES. +# - r4 stays the recommended default for the Chromium-GPU-process +# consumer (no video needed there). Use this package when the +# consumer wants Vulkan video decode (mpv-fourier, ffmpeg-vulkan, +# future Chromium-VulkanVideoDecoder). +# +# Phase 1 limitations to know about (documented in source comments): +# - Single video session per device (active_video singleton) +# - Synchronous decode at record time — no pipelining yet +# - Hardcoded /dev/video1 + /dev/media0 (matches RK3566/68, blocks +# other SoCs without a topology-walk port) +# - Bitstream source buffer assumed HOST_VISIBLE (true on panvk- +# bifrost, would need fallback on other backends) +# +# Build target: arch-aarch64 runner via marfrit-packages Gitea Actions. +# Mesa build is slow (~30-60min on Cortex-A55). 
+
+pkgname=mesa-panvk-bifrost-video
+_mesaver=26.0.6
+pkgver=26.0.6.r5.video1
+pkgrel=1
+pkgdesc="Patched Mesa libvulkan_panfrost.so adding VK_KHR_video_decode_h264 on Bifrost SBCs (sibling of mesa-panvk-bifrost-r4)"
+arch=('aarch64')
+url="https://github.com/marfrit/panvk-bifrost"
+license=('MIT')
+
+depends=(
+    'mesa' # for shared mesa runtime libs
+    'libdrm'
+    'wayland'
+    'libxcb'
+    'libx11'
+    'libxshmfence'
+    'zlib'
+    'zstd'
+    'libelf'
+    'libffi'
+    'expat'
+    'llvm-libs'
+    'lm_sensors'
+)
+makedepends=(
+    'meson'
+    'ninja'
+    'glslang'
+    'python-mako'
+    'python-packaging'
+    'wayland-protocols'
+    'libxrandr'
+    'xorgproto'
+    'libdrm'
+    'llvm'
+    'libclc'
+    'spirv-llvm-translator'
+    'spirv-tools'
+    'rust-bindgen'
+    'patch'
+)
+
+source=(
+    "https://archive.mesa3d.org/mesa-${_mesaver}.tar.xz"
+    "0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch"
+    "0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch"
+    "0003-panvk-bifrost-vk-ext-transform-feedback.patch"
+    "0004-panvk-bifrost-xfb-primitive-decomposition.patch"
+    "0005-panvk-bifrost-video-KHR-video-decode-h264.patch"
+    "icd.json"
+)
+# Mesa tarball checksum matches the sibling r4 package — same upstream version.
+sha256sums=(
+    'SKIP' # mesa tarball — co-trust w/ r4 sibling
+    'SKIP' # patches are local
+    'SKIP'
+    'SKIP'
+    'SKIP'
+    'SKIP'
+    'SKIP' # icd.json
+)
+
+prepare() {
+    cd "mesa-${_mesaver}"
+
+    # r1+r2: small sed-based edits inherited from r4 (verbatim from the
+    # sibling PKGBUILD — keep in sync).
+    sed -i 's|\.KHR_robustness2 = PAN_ARCH >= 10,|.KHR_robustness2 = true,|' src/panfrost/vulkan/panvk_vX_physical_device.c
+    sed -i 's|\.EXT_robustness2 = PAN_ARCH >= 10,|.EXT_robustness2 = true,|' src/panfrost/vulkan/panvk_vX_physical_device.c
+    sed -i 's|\.nullDescriptor = PAN_ARCH >= 10,|.nullDescriptor = true,|' src/panfrost/vulkan/panvk_vX_physical_device.c
+    sed -i 's|bool has_vk1_1 = PAN_ARCH >= 10;|bool has_vk1_1 = true;|' src/panfrost/vulkan/panvk_vX_physical_device.c
+    sed -i 's|bool has_vk1_2 = PAN_ARCH >= 10;|bool has_vk1_2 = true;|' src/panfrost/vulkan/panvk_vX_physical_device.c
+
+    # r3: EXT_transform_feedback for Bifrost.
+    patch -p1 < "${srcdir}/0003-panvk-bifrost-vk-ext-transform-feedback.patch"
+
+    # r4: XFB primitive decomposition NIR pass.
+    patch -p1 < "${srcdir}/0004-panvk-bifrost-xfb-primitive-decomposition.patch"
+
+    # video: VK_KHR_video_decode_h264 via V4L2-hantro.
+    patch -p1 < "${srcdir}/0005-panvk-bifrost-video-KHR-video-decode-h264.patch"
+
+    # Sanity-check r1..r4 (inherited).
+    grep -q "KHR_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
+    grep -q "EXT_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
+    grep -q "nullDescriptor = true," src/panfrost/vulkan/panvk_vX_physical_device.c
+    grep -q "has_vk1_1 = true;" src/panfrost/vulkan/panvk_vX_physical_device.c
+    grep -q "has_vk1_2 = true;" src/panfrost/vulkan/panvk_vX_physical_device.c
+    grep -q "EXT_transform_feedback = PAN_ARCH < 9," src/panfrost/vulkan/panvk_vX_physical_device.c
+    test -f src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c
+    grep -q "panvk_per_arch(nir_lower_xfb)" src/panfrost/vulkan/panvk_vX_shader.c
+    test -f src/panfrost/vulkan/panvk_vX_xfb_lower.c
+
+    # Sanity-check video patch landed (0005 may write the gated or ungated KHR_video_queue form).
+    grep -Eq 'KHR_video_queue = PAN_ARCH < 9( && panvk_v4l2_probe_hantro\(\))?,' \
+        src/panfrost/vulkan/panvk_vX_physical_device.c
+    grep -q "PANVK_QUEUE_FAMILY_VIDEO_DECODE" src/panfrost/vulkan/panvk_device.h
+    test -f src/panfrost/vulkan/panvk_video_decode.c
+    test -f src/panfrost/vulkan/panvk_video_decode.h
+    test -f src/panfrost/vulkan/panvk_v4l2.c
+    test -f src/panfrost/vulkan/panvk_v4l2_h264.c
+    test -f src/panfrost/vulkan/panvk_v4l2_h264_slice_header.c
+    test -f src/panfrost/vulkan/panvk_v4l2_h264_slice_header.h
+    grep -q "panvk_v4l2_h264_slice_header.c" src/panfrost/vulkan/meson.build
+    grep -q "panvk_video_queue_submit_noop" src/panfrost/vulkan/panvk_vX_device.c
+}
+
+build() {
+    cd "mesa-${_mesaver}"
+    # Mirror r4's narrow build profile.
+    meson setup build/ \
+        --prefix=/usr \
+        --libdir=lib \
+        --buildtype=release \
+        -Dvulkan-drivers=panfrost \
+        -Dgallium-drivers= \
+        -Dplatforms=wayland,x11 \
+        -Dglx=disabled \
+        -Degl=disabled \
+        -Dgles1=disabled \
+        -Dgles2=disabled \
+        -Dvulkan-layers= \
+        -Dtools= \
+        -Dgallium-rusticl=false \
+        -Dmicrosoft-clc=disabled
+    meson compile -C build
+}
+
+package() {
+    cd "${srcdir}/mesa-${_mesaver}"
+
+    # Co-install path — parallel to r4's /usr/lib/panvk-bifrost/.
+    install -Dm755 build/src/panfrost/vulkan/libvulkan_panfrost.so \
+        "$pkgdir/usr/lib/panvk-bifrost-video/libvulkan_panfrost.so"
+
+    # ICD JSON pointing at the video build. Opt-in via VK_ICD_FILENAMES;
+    # NOT in /usr/share/vulkan/icd.d/ so it doesn't override stock or r4.
+ install -Dm644 "$srcdir/icd.json" \ + "$pkgdir/usr/lib/panvk-bifrost-video/icd.json" +} diff --git a/arch/mesa-panvk-bifrost-video/README.md b/arch/mesa-panvk-bifrost-video/README.md new file mode 100644 index 000000000..848c9f748 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/README.md @@ -0,0 +1,40 @@ +# mesa-panvk-bifrost-video + +Patched Mesa `libvulkan_panfrost.so` that **adds `VK_KHR_video_decode_h264`** on Mali Bifrost SBCs (PAN_ARCH 6/7, RK3566/RK3568 class hardware), backed by the SoC's V4L2-stateless **hantro** VPU. + +This is a **sibling** of [mesa-panvk-bifrost](../mesa-panvk-bifrost/) (the r4 package that exposes Bifrost to Chromium's Vulkan compositor). Pick this one when the consumer wants Vulkan **video decode** in addition; pick r4 for compositor-only. + +## Status + +Phase 4 byte-exact validated 2026-05-21: 48/48 unique BBB display frames decoded by this package are byte-identical to `ffmpeg+libva-v4l2-request-fourier` running on the same hantro hardware. Phase 5 second-model review completed; all load-bearing findings addressed. + +## How to use + +```sh +# Co-installs alongside r4 and stock mesa. +sudo pacman -S mesa-panvk-bifrost-video + +# Opt in (not on the default loader search path). +export VK_ICD_FILENAMES=/usr/lib/panvk-bifrost-video/icd.json +export PAN_I_WANT_A_BROKEN_VULKAN_DRIVER=1 # mesa-upstream gate + +# Run a Vulkan video consumer. +vulkan-video-dec-simple-test -i your.h264 --codec h264 --noPresent --maxFrameCount 50 +# or +ffmpeg -hwaccel vulkan -i your.mp4 ... +``` + +## Phase 1 limitations + +Documented in source comments and worth knowing before relying on this in production: + +- **Single video session per device.** Concurrent `VkVideoSessionKHR` on the same device clobber each other (`active_video` singleton). Sufficient for current single-stream consumers. +- **Synchronous decode at record time.** The full V4L2 ioctl dance runs to completion inside `vkCmdDecodeVideoKHR`. No pipelining. 
Throughput is bounded by hantro's ~1.16× realtime on 1080p H.264. +- **Hardcoded `/dev/video1` + `/dev/media0`.** Matches RK3566/68 but won't work on other SoCs without a topology-walk port (see `libva-v4l2-request-fourier` for the full version). +- **Bitstream source buffer assumed HOST_VISIBLE.** True on panvk-bifrost (no DEVICE_LOCAL-only memory types exist), but the code silently skips decode if the app bound the buffer to non-host-visible memory. + +## Co-existence + +- Installs to `/usr/lib/panvk-bifrost-video/` — parallel to r4's `/usr/lib/panvk-bifrost/` and stock `/usr/lib/`. +- Opt-in via `VK_ICD_FILENAMES`; does NOT register itself in `/usr/share/vulkan/icd.d/`. +- Three drivers coexist without conflict; the user picks at runtime which to use. diff --git a/arch/mesa-panvk-bifrost-video/icd.json b/arch/mesa-panvk-bifrost-video/icd.json new file mode 100644 index 000000000..1d29bed46 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/icd.json @@ -0,0 +1,7 @@ +{ + "ICD": { + "api_version": "1.4.335", + "library_path": "/usr/lib/panvk-bifrost-video/libvulkan_panfrost.so" + }, + "file_format_version": "1.0.1" +}