initial seed: retrofit campaign lineage from local working trees

panvk-bifrost campaigns (r1..r4 Vulkan compositor + r5.video1 Vulkan video decode) shipped before this repo existed; the deliverable patches live in marfrit-packages, but the reasoning chain, phase docs, and source-state evidence lived only in local working trees on the development host. This retrofit imports: - mesa-panvk-bifrost/ — r1..r4 era phase docs (iter1..iter18) (libmali stub blobs at iter18/blob/ excluded — 109MB of RE artifacts replaced with a README pointer) - mesa-panvk-bifrost-video/ — sibling campaign phase docs + probe - evidence/ — frozen .tgz source snapshots at each milestone (basis for the 0005 patch diff generation) Future iterations should branch off here from day one, so each iter is a commit rather than a snapshot. See [[feedback-session-local-process-pins]] for the process drift this retrofit closes. Total: 1.9 MB across 124 files. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 05:25:37 +02:00
parent 430d0da278
commit a4e7d8ab90
124 changed files with 22551 additions and 1 deletions
@@ -0,0 +1,39 @@
+# iter13 XFB probe — build glue.
+#
+# Builds the two probe binaries (one per C source listed below) and compiles
+# the vertex shader to SPIR-V. CC, CFLAGS and LDLIBS use `?=`, so they are
+# only assigned here when not already set.
+
+CC ?= cc
+CFLAGS ?= -O0 -g -Wall -Wextra -std=c11
+LDLIBS ?= -lvulkan
+
+PROBE = probe_xfb
+NOPROBE = probe_xfb_nodraw
+SRC   = probe_xfb.c
+NOSRC = probe_xfb_nodraw.c
+VERT  = probe_xfb.vert
+VSPV  = probe_xfb.vert.spv
+
+all: $(PROBE) $(NOPROBE) $(VSPV)
+
+$(PROBE): $(SRC)
+	$(CC) $(CFLAGS) -o $@ $< $(LDLIBS)
+
+$(NOPROBE): $(NOSRC)
+	$(CC) $(CFLAGS) -o $@ $< $(LDLIBS)
+
+# glslangValidator + xfb-aware compile. The -V flag enables Vulkan SPIR-V output.
+# xfb_buffer / xfb_offset / xfb_stride decorations are honored when the SPIR-V
+# is targeted at Vulkan (which is the default for -V).
+$(VSPV): $(VERT)
+	glslangValidator -V $< -o $@
+
+# Run the draw probe against whatever Vulkan ICD the loader picks up.
+run: all
+	PAN_I_WANT_A_BROKEN_VULKAN_DRIVER=1 ./$(PROBE)
+
+# Same as `run`, but point the loader at the ICD manifest under
+# /usr/lib/panvk-bifrost.
+run-patched-mesa: all
+	VK_ICD_FILENAMES=/usr/lib/panvk-bifrost/icd.json \
+	PAN_I_WANT_A_BROKEN_VULKAN_DRIVER=1 \
+	./$(PROBE)
+
+# Remove everything `all` produces, including the no-draw probe binary.
+clean:
+	rm -f $(PROBE) $(NOPROBE) $(VSPV)
+
+.PHONY: all run run-patched-mesa clean
@@ -0,0 +1,484 @@
+/*
+ * Copyright © 2021 Collabora Ltd.
+ *
+ * Derived from tu_cmd_buffer.c which is:
+ * Copyright © 2016 Red Hat.
+ * Copyright © 2016 Bas Nieuwenhuizen
+ * Copyright © 2015 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "genxml/gen_macros.h"
+
+#include "panvk_buffer.h"
+#include "panvk_cmd_alloc.h"
+#include "panvk_cmd_buffer.h"
+#include "panvk_cmd_desc_state.h"
+#include "panvk_cmd_draw.h"
+#include "panvk_cmd_fb_preload.h"
+#include "panvk_cmd_pool.h"
+#include "panvk_cmd_push_constant.h"
+#include "panvk_device.h"
+#include "panvk_entrypoints.h"
+#include "panvk_instance.h"
+#include "panvk_meta.h"
+#include "panvk_physical_device.h"
+#include "panvk_priv_bo.h"
+
+#include "pan_desc.h"
+#include "pan_encoder.h"
+#include "pan_props.h"
+#include "pan_samples.h"
+
+#include "vk_descriptor_update_template.h"
+#include "vk_format.h"
+
+/*
+ * Allocate a FRAGMENT_JOB descriptor for the framebuffer descriptor `fbd`,
+ * fill in its payload and header, and queue it on the current batch's
+ * fragment job chain. The CPU pointer is also recorded in batch->jobs.
+ *
+ * Returns VK_ERROR_OUT_OF_DEVICE_MEMORY if the descriptor allocation fails,
+ * VK_SUCCESS otherwise.
+ */
+static VkResult
+panvk_cmd_prepare_fragment_job(struct panvk_cmd_buffer *cmdbuf, uint64_t fbd)
+{
+   struct pan_ptr frag_job = panvk_cmd_alloc_desc(cmdbuf, FRAGMENT_JOB);
+
+   if (!frag_job.gpu)
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+   struct panvk_batch *cur = cmdbuf->cur_batch;
+
+   /* Payload first, then the job header section. */
+   GENX(pan_emit_fragment_job_payload)(&cmdbuf->state.gfx.render.fb.info, fbd,
+                                       frag_job.cpu);
+
+   pan_section_pack(frag_job.cpu, FRAGMENT_JOB, HEADER, hdr) {
+      hdr.type = MALI_JOB_TYPE_FRAGMENT;
+      hdr.index = 1;
+   }
+
+   pan_jc_add_job(&cur->frag_jc, MALI_JOB_TYPE_FRAGMENT, false, false, 0, 0,
+                  &frag_job, false);
+   util_dynarray_append(&cur->jobs, frag_job.cpu);
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Close the batch currently being recorded on `cmdbuf`, if any.
+ *
+ * - A batch with no framebuffer descriptor and no vertex/tiler/compute job
+ *   is either dropped (no event ops) or kept with a single NULL job so it
+ *   can still be submitted for synchronization.
+ * - Otherwise the batch is appended to cmdbuf->batches, TLS/WLS memory is
+ *   allocated, the TLS descriptor is emitted and, when a framebuffer
+ *   descriptor exists, one FBD + fragment job is emitted per enabled layer.
+ *
+ * On every path that gets past the initial NULL check, cmdbuf->cur_batch is
+ * NULL on return. This includes the framebuffer-preload failure path: the
+ * batch is already linked into cmdbuf->batches at that point, so leaving
+ * cur_batch set would let a later close link the same node a second time.
+ */
+void
+panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
+{
+   struct panvk_batch *batch = cmdbuf->cur_batch;
+
+   if (!batch)
+      return;
+
+   struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
+
+   if (!batch->fb.desc.gpu && !batch->vtc_jc.first_job) {
+      if (util_dynarray_num_elements(&batch->event_ops,
+                                     struct panvk_cmd_event_op) == 0) {
+         /* Content-less batch, let's drop it */
+         vk_free(&cmdbuf->vk.pool->alloc, batch);
+      } else {
+         /* Batch has no jobs but is needed for synchronization, let's add a
+          * NULL job so the SUBMIT ioctl doesn't choke on it.
+          */
+         struct pan_ptr ptr = panvk_cmd_alloc_desc(cmdbuf, JOB_HEADER);
+
+         if (ptr.gpu) {
+            util_dynarray_append(&batch->jobs, ptr.cpu);
+            pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_NULL, false, false, 0,
+                           0, &ptr, false);
+         }
+
+         list_addtail(&batch->node, &cmdbuf->batches);
+      }
+      cmdbuf->cur_batch = NULL;
+      return;
+   }
+
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   struct panvk_physical_device *phys_dev =
+      to_panvk_physical_device(dev->vk.physical);
+
+   /* From here on the batch is owned by the command buffer's batch list. */
+   list_addtail(&batch->node, &cmdbuf->batches);
+
+   if (batch->tlsinfo.tls.size) {
+      unsigned thread_tls_alloc =
+         pan_query_thread_tls_alloc(&phys_dev->kmod.dev->props);
+      unsigned core_id_range;
+
+      pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
+
+      unsigned size = pan_get_total_stack_size(batch->tlsinfo.tls.size,
+                                               thread_tls_alloc, core_id_range);
+      batch->tlsinfo.tls.ptr =
+         panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
+   }
+
+   if (batch->tlsinfo.wls.size) {
+      assert(batch->wls_total_size);
+      batch->tlsinfo.wls.ptr =
+         panvk_cmd_alloc_dev_mem(cmdbuf, tls, batch->wls_total_size, 4096).gpu;
+   }
+
+   if (batch->tls.cpu)
+      GENX(pan_emit_tls)(&batch->tlsinfo, batch->tls.cpu);
+
+   if (batch->fb.desc.cpu) {
+      panvk_per_arch(cmd_select_tile_size)(cmdbuf);
+
+      /* At this point, we should know sample count and the tile size should have
+       * been calculated */
+      assert(fbinfo->nr_samples > 0 && fbinfo->tile_size > 0);
+
+      fbinfo->sample_positions =
+         dev->sample_positions->addr.dev +
+         pan_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
+      fbinfo->first_provoking_vertex =
+         cmdbuf->state.gfx.render.first_provoking_vertex != U_TRISTATE_NO;
+
+      VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, fbinfo);
+      if (result != VK_SUCCESS)
+         goto out_clear_batch;
+
+      uint32_t view_mask = cmdbuf->state.gfx.render.view_mask;
+      assert(view_mask == 0 || util_bitcount(view_mask) <= batch->fb.layer_count);
+      uint32_t enabled_layer_count = view_mask ?
+         util_bitcount(view_mask) :
+         batch->fb.layer_count;
+
+      for (uint32_t i = 0; i < enabled_layer_count; i++) {
+         /* With multiview, walk the set bits of the view mask; otherwise
+          * layers are simply 0..layer_count-1. */
+         uint32_t layer_id = (view_mask != 0) ? u_bit_scan(&view_mask) : i;
+
+         uint64_t fbd = batch->fb.desc.gpu + (batch->fb.desc_stride * layer_id);
+
+         result = panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, layer_id);
+         if (result != VK_SUCCESS)
+            break;
+
+         /* The value returned by pan_emit_fbd is OR-ed into the FBD address
+          * handed to the fragment job. */
+         fbd |= GENX(pan_emit_fbd)(
+            &cmdbuf->state.gfx.render.fb.info, layer_id, &batch->tlsinfo,
+            &batch->tiler.ctx,
+            batch->fb.desc.cpu + (batch->fb.desc_stride * layer_id));
+
+         result = panvk_cmd_prepare_fragment_job(cmdbuf, fbd);
+         if (result != VK_SUCCESS)
+            break;
+      }
+   }
+
+out_clear_batch:
+   cmdbuf->cur_batch = NULL;
+}
+
+/*
+ * Allocate the framebuffer descriptor memory for the current batch, one
+ * descriptor per layer, unless it has been allocated already.
+ *
+ * Also snapshots the render state's layer count and BO list into the batch
+ * and clears the bifrost pre/post DCDs of the framebuffer info.
+ *
+ * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_DEVICE_MEMORY if the allocation
+ * yields no GPU address.
+ */
+VkResult
+panvk_per_arch(cmd_alloc_fb_desc)(struct panvk_cmd_buffer *cmdbuf)
+{
+   struct panvk_batch *batch = cmdbuf->cur_batch;
+
+   /* Nothing to do when this batch already has its descriptors. */
+   if (batch->fb.desc.gpu)
+      return VK_SUCCESS;
+
+   const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
+
+   batch->fb.layer_count = cmdbuf->state.gfx.render.layer_count;
+
+   /* Per-layer size: FRAMEBUFFER, then an optional ZS_CRC_EXTENSION, then at
+    * least one RENDER_TARGET, each aligned to its own requirement. */
+   unsigned layer_size = pan_size(FRAMEBUFFER);
+
+   if (fb->zs.view.zs || fb->zs.view.s) {
+      layer_size = ALIGN_POT(layer_size, pan_alignment(ZS_CRC_EXTENSION));
+      layer_size += pan_size(ZS_CRC_EXTENSION);
+   }
+
+   layer_size = ALIGN_POT(layer_size, pan_alignment(RENDER_TARGET));
+   layer_size += MAX2(fb->rt_count, 1) * pan_size(RENDER_TARGET);
+
+   batch->fb.bo_count = cmdbuf->state.gfx.render.fb.bo_count;
+   memcpy(batch->fb.bos, cmdbuf->state.gfx.render.fb.bos,
+          batch->fb.bo_count * sizeof(batch->fb.bos[0]));
+
+   batch->fb.desc =
+      panvk_cmd_alloc_dev_mem(cmdbuf, desc, layer_size * batch->fb.layer_count,
+                              pan_alignment(FRAMEBUFFER));
+   batch->fb.desc_stride = layer_size;
+
+   memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0,
+          sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds));
+
+   if (!batch->fb.desc.gpu)
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Make sure the current batch has a LOCAL_STORAGE descriptor, allocating
+ * one on first use. The `gfx` argument is not read here.
+ *
+ * Returns VK_ERROR_OUT_OF_DEVICE_MEMORY if the allocation fails.
+ */
+VkResult
+panvk_per_arch(cmd_alloc_tls_desc)(struct panvk_cmd_buffer *cmdbuf, bool gfx)
+{
+   struct panvk_batch *batch = cmdbuf->cur_batch;
+
+   assert(batch);
+
+   if (batch->tls.gpu)
+      return VK_SUCCESS;
+
+   batch->tls = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
+
+   return batch->tls.gpu ? VK_SUCCESS : VK_ERROR_OUT_OF_DEVICE_MEMORY;
+}
+
+/*
+ * Point the batch's tiler context at the TILER_CONTEXT descriptor for
+ * `layer_idx`, allocating and initializing the tiler heap descriptor and the
+ * per-layer context array the first time this is called for a batch.
+ *
+ * Returns VK_ERROR_OUT_OF_DEVICE_MEMORY if either descriptor allocation
+ * fails, VK_SUCCESS otherwise.
+ */
+VkResult
+panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf,
+                                          uint32_t layer_idx)
+{
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   struct panvk_physical_device *phys_dev =
+      to_panvk_physical_device(cmdbuf->vk.base.device->physical);
+   struct panvk_batch *batch = cmdbuf->cur_batch;
+   uint64_t tiler_desc;
+
+   /* Fast path: the context array already exists, only select the layer. */
+   if (batch->tiler.ctx_descs.gpu) {
+      tiler_desc =
+         batch->tiler.ctx_descs.gpu + (pan_size(TILER_CONTEXT) * layer_idx);
+      goto out_set_layer_ctx;
+   }
+
+   const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
+   uint32_t layer_count = cmdbuf->state.gfx.render.layer_count;
+   batch->tiler.heap_desc = panvk_cmd_alloc_desc(cmdbuf, TILER_HEAP);
+   batch->tiler.ctx_descs =
+      panvk_cmd_alloc_desc_array(cmdbuf, layer_count, TILER_CONTEXT);
+   if (!batch->tiler.heap_desc.gpu || !batch->tiler.ctx_descs.gpu)
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+   tiler_desc =
+      batch->tiler.ctx_descs.gpu + (pan_size(TILER_CONTEXT) * layer_idx);
+
+   /* Heap template: covers the whole device tiler heap BO. */
+   pan_pack(&batch->tiler.heap_templ, TILER_HEAP, cfg) {
+      cfg.size = pan_kmod_bo_size(dev->tiler_heap->bo);
+      cfg.base = dev->tiler_heap->addr.dev;
+      cfg.bottom = dev->tiler_heap->addr.dev;
+      cfg.top = cfg.base + cfg.size;
+   }
+
+   /* Context template: shared by every layer, references the heap above. */
+   pan_pack(&batch->tiler.ctx_templ, TILER_CONTEXT, cfg) {
+      cfg.hierarchy_mask = panvk_select_tiler_hierarchy_mask(
+         phys_dev, &cmdbuf->state.gfx, pan_kmod_bo_size(dev->tiler_heap->bo));
+      cfg.fb_width = fbinfo->width;
+      cfg.fb_height = fbinfo->height;
+      cfg.heap = batch->tiler.heap_desc.gpu;
+      cfg.sample_pattern = pan_sample_pattern(fbinfo->nr_samples);
+   }
+
+   memcpy(batch->tiler.heap_desc.cpu, &batch->tiler.heap_templ,
+          sizeof(batch->tiler.heap_templ));
+
+   struct mali_tiler_context_packed *ctxs = batch->tiler.ctx_descs.cpu;
+
+   /* Stamp the same context template into every layer's slot. */
+   assert(layer_count > 0);
+   for (uint32_t i = 0; i < layer_count; i++) {
+      /* The descriptor size must be a multiple of its alignment so that
+       * consecutive array entries stay aligned. */
+      STATIC_ASSERT(
+         !(pan_size(TILER_CONTEXT) & (pan_alignment(TILER_CONTEXT) - 1)));
+
+      memcpy(&ctxs[i], &batch->tiler.ctx_templ, sizeof(*ctxs));
+   }
+
+out_set_layer_ctx:
+   /* The descriptor address lives in a different member depending on arch. */
+   if (PAN_ARCH >= 9)
+      batch->tiler.ctx.valhall.desc = tiler_desc;
+   else
+      batch->tiler.ctx.bifrost.desc = tiler_desc;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Allocate a new zero-initialized batch and make it the command buffer's
+ * current batch. There must be no batch open when this is called.
+ *
+ * Returns the new batch. If the host allocation fails, debug builds hit the
+ * assert and other builds return NULL (cur_batch is NULL in that case)
+ * instead of dereferencing the failed allocation.
+ */
+struct panvk_batch *
+panvk_per_arch(cmd_open_batch)(struct panvk_cmd_buffer *cmdbuf)
+{
+   assert(!cmdbuf->cur_batch);
+
+   struct panvk_batch *batch =
+      vk_zalloc(&cmdbuf->vk.pool->alloc, sizeof(*batch), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+   cmdbuf->cur_batch = batch;
+
+   /* Validate the allocation before initializing any of its members. */
+   assert(batch);
+   if (!batch)
+      return NULL;
+
+   batch->jobs = UTIL_DYNARRAY_INIT;
+   batch->event_ops = UTIL_DYNARRAY_INIT;
+
+   return batch;
+}
+
+/*
+ * vkEndCommandBuffer: close any open batch, flush the descriptor pool maps,
+ * then let the common runtime finish the command buffer and report its
+ * result.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   VkResult end_result;
+
+   panvk_per_arch(cmd_close_batch)(cmdbuf);
+   panvk_pool_flush_maps(&cmdbuf->desc_pool);
+
+   end_result = vk_command_buffer_end(&cmdbuf->vk);
+   return end_result;
+}
+
+/*
+ * vkCmdPipelineBarrier2: implemented by splitting the current batch (if one
+ * is open) and then applying the image layout transitions listed in
+ * pDependencyInfo.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
+                                    const VkDependencyInfo *pDependencyInfo)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   struct panvk_batch *open_batch = cmdbuf->cur_batch;
+
+   /* Caches are flushed/invalidated at batch boundaries for now, nothing to do
+    * for memory barriers assuming we implement barriers with the creation of a
+    * new batch.
+    * FIXME: We can probably do better with a CacheFlush job that has the
+    * barrier flag set to true.
+    */
+   if (open_batch) {
+      /* Sample this before closing: the close may free the batch. */
+      bool had_tiler_job = open_batch->vtc_jc.first_tiler != 0;
+
+      panvk_per_arch(cmd_close_batch)(cmdbuf);
+
+      if (had_tiler_job)
+         panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf);
+
+      panvk_per_arch(cmd_open_batch)(cmdbuf);
+   }
+
+   const uint32_t img_barrier_count = pDependencyInfo->imageMemoryBarrierCount;
+
+   for (uint32_t idx = 0; idx < img_barrier_count; idx++) {
+      panvk_per_arch(cmd_transition_image_layout)(
+         commandBuffer, &pDependencyInfo->pImageMemoryBarriers[idx]);
+   }
+
+   /* If we had any layout transition dispatches, the batch will be closed at
+    * this point, therefore establishing the sync between itself and the
+    * commands that follow.
+    */
+}
+
+/*
+ * vk_command_buffer_ops::reset hook: reset the common runtime state, free
+ * every recorded batch, reset the three memory pools and the push-set list,
+ * and zero the driver-side command buffer state. `flags` is not read.
+ */
+static void
+panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
+                   VkCommandBufferResetFlags flags)
+{
+   struct panvk_cmd_buffer *cmdbuf =
+      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
+
+   vk_command_buffer_reset(&cmdbuf->vk);
+
+   /* Unlink and free each batch together with its dynamic arrays. */
+   list_for_each_entry_safe(struct panvk_batch, b, &cmdbuf->batches, node) {
+      list_del(&b->node);
+
+      util_dynarray_fini(&b->jobs);
+      util_dynarray_fini(&b->event_ops);
+      vk_free(&cmdbuf->vk.pool->alloc, b);
+   }
+
+   panvk_pool_reset(&cmdbuf->desc_pool);
+   panvk_pool_reset(&cmdbuf->tls_pool);
+   panvk_pool_reset(&cmdbuf->varying_pool);
+   panvk_cmd_buffer_obj_list_reset(cmdbuf, push_sets);
+
+   memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
+}
+
+/*
+ * vk_command_buffer_ops::destroy hook: free every recorded batch, tear down
+ * the three memory pools and the push-set list, finish the common runtime
+ * object and release the command buffer allocation itself.
+ */
+static void
+panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
+{
+   struct panvk_cmd_buffer *cmdbuf =
+      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
+   struct panvk_device *owner = to_panvk_device(cmdbuf->vk.base.device);
+
+   /* Unlink and free each batch together with its dynamic arrays. */
+   list_for_each_entry_safe(struct panvk_batch, b, &cmdbuf->batches, node) {
+      list_del(&b->node);
+
+      util_dynarray_fini(&b->jobs);
+      util_dynarray_fini(&b->event_ops);
+      vk_free(&cmdbuf->vk.pool->alloc, b);
+   }
+
+   panvk_pool_cleanup(&cmdbuf->desc_pool);
+   panvk_pool_cleanup(&cmdbuf->tls_pool);
+   panvk_pool_cleanup(&cmdbuf->varying_pool);
+   panvk_cmd_buffer_obj_list_cleanup(cmdbuf, push_sets);
+
+   vk_command_buffer_finish(&cmdbuf->vk);
+   vk_free(&owner->vk.alloc, cmdbuf);
+}
+
+/*
+ * vk_command_buffer_ops::create hook: allocate a panvk command buffer,
+ * initialize the common runtime part, wire up the dynamic-state storage and
+ * set up the descriptor, TLS and varying memory pools.
+ *
+ * On success the runtime object is returned through `cmdbuf_out`.
+ * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the host allocation fails, or the
+ * error reported by vk_command_buffer_init().
+ */
+static VkResult
+panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
+                    struct vk_command_buffer **cmdbuf_out)
+{
+   struct panvk_device *dev =
+      container_of(vk_pool->base.device, struct panvk_device, vk);
+   struct panvk_cmd_pool *cmd_pool =
+      container_of(vk_pool, struct panvk_cmd_pool, vk);
+
+   struct panvk_cmd_buffer *cmdbuf =
+      vk_zalloc(&dev->vk.alloc, sizeof(*cmdbuf), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!cmdbuf)
+      return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   VkResult init_res = vk_command_buffer_init(
+      &cmd_pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
+   if (init_res != VK_SUCCESS) {
+      vk_free(&dev->vk.alloc, cmdbuf);
+      return init_res;
+   }
+
+   panvk_cmd_buffer_obj_list_init(cmdbuf, push_sets);
+
+   /* Storage for the vertex-input and sample-location dynamic state lives in
+    * the driver state; point the runtime at it. */
+   cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
+   cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
+      &cmdbuf->state.gfx.dynamic.sl;
+
+   /* Descriptor pool: write-back CPU mapping requested, preallocated. */
+   struct panvk_pool_properties desc_props = {
+      .label = "Command buffer descriptor pool",
+      .slab_size = 64 * 1024,
+      .create_flags =
+         panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_WB_MMAP),
+      .prealloc = true,
+      .owns_bos = true,
+      .needs_locking = false,
+   };
+   panvk_pool_init(&cmdbuf->desc_pool, dev, &cmd_pool->desc_bo_pool, NULL,
+                   &desc_props);
+
+   /* TLS pool: no CPU mapping requested; also gets the big-BO pool. */
+   struct panvk_pool_properties tls_props = {
+      .label = "TLS pool",
+      .slab_size = 64 * 1024,
+      .create_flags =
+         panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP),
+      .prealloc = false,
+      .owns_bos = true,
+      .needs_locking = false,
+   };
+   panvk_pool_init(&cmdbuf->tls_pool, dev, &cmd_pool->tls_bo_pool,
+                   &cmd_pool->tls_big_bo_pool, &tls_props);
+
+   /* Varying pool: no CPU mapping requested. */
+   struct panvk_pool_properties varying_props = {
+      .label = "Varying pool",
+      .slab_size = 64 * 1024,
+      .create_flags =
+         panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP),
+      .prealloc = false,
+      .owns_bos = true,
+      .needs_locking = false,
+   };
+   panvk_pool_init(&cmdbuf->varying_pool, dev, &cmd_pool->varying_bo_pool, NULL,
+                   &varying_props);
+
+   list_inithead(&cmdbuf->batches);
+
+   *cmdbuf_out = &cmdbuf->vk;
+   return VK_SUCCESS;
+}
+
+/* Command buffer lifecycle hooks handed to vk_command_buffer_init(). */
+const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
+   .destroy = panvk_destroy_cmdbuf,
+   .reset = panvk_reset_cmdbuf,
+   .create = panvk_create_cmdbuf,
+};
+
+/*
+ * vkBeginCommandBuffer: hand off to the common runtime, then (on PAN_ARCH
+ * below 9) wipe the transform-feedback state. Always returns VK_SUCCESS.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
+                                   const VkCommandBufferBeginInfo *pBeginInfo)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+
+   vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
+
+#if PAN_ARCH < 9
+   /* iter13: a command buffer that is recorded again must not see
+    * xfb.buffer_count / xfb.active / xfb.buffers[] left over from its
+    * previous recording, so zero the whole XFB state here. */
+   const size_t xfb_state_size = sizeof(cmdbuf->state.gfx.xfb);
+
+   memset(&cmdbuf->state.gfx.xfb, 0, xfb_state_size);
+#endif
+
+   return VK_SUCCESS;
+}
@@ -0,0 +1,111 @@
+/*
+ * Copyright © 2026 mfritsche / claude-noether
+ * SPDX-License-Identifier: MIT
+ *
+ * iter13: VK_EXT_transform_feedback command handlers for the JM
+ * architecture path (Bifrost v6/v7 + Valhall-JM v9).
+ *
+ * The runtime contract:
+ *   - vkCmdBindTransformFeedbackBuffersEXT: stash (gpu_addr, offset, size)
+ *     for each slot into cmdbuf->state.gfx.xfb.buffers[].
+ *   - vkCmdBeginTransformFeedbackEXT: set cmdbuf->state.gfx.xfb.active = true.
+ *     No explicit dirty marking is done here; the per-draw sysval update is
+ *     relied on to notice the change and re-emit vs.xfb_address[].
+ *   - vkCmdEndTransformFeedbackEXT: set active = false.
+ *
+ * Counter buffers (firstCounterBuffer/counterBufferCount/pCounterBuffers/
+ * pCounterBufferOffsets) are accepted by API but ignored — v1 doesn't
+ * support pause/resume. transformFeedbackDraw is advertised as false.
+ *
+ * Per-draw integration: jm/panvk_vX_cmd_draw.c reads cmdbuf->state.gfx.xfb
+ * and populates vs.xfb_address[i] for shader use. The pan_nir_lower_xfb
+ * pass in panvk_vX_shader.c emits nir_load_xfb_address(i) which lowers
+ * (via panvk_vX_shader.c sysval handler) to a load from the per-draw
+ * sysval push area.
+ */
+
+#include "vk_log.h"
+#include "util/log.h"
+
+#include "panvk_cmd_buffer.h"
+#include "panvk_cmd_draw.h"
+#include "panvk_buffer.h"
+#include "panvk_entrypoints.h"
+
+/*
+ * vkCmdBindTransformFeedbackBuffersEXT: record address, offset and size of
+ * each bound buffer in cmdbuf->state.gfx.xfb.buffers[].
+ *
+ * Only binding slots 0..3 are stored; bindings beyond that are ignored.
+ * xfb.buffer_count is raised to cover the highest stored slot but is never
+ * pushed past the number of storable slots, so it cannot describe entries
+ * that were never written.
+ *
+ * A NULL pSizes, or a pSizes[i] of VK_WHOLE_SIZE, means "from pOffsets[i]
+ * to the end of the buffer".
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdBindTransformFeedbackBuffersEXT)(
+   VkCommandBuffer commandBuffer,
+   uint32_t firstBinding,
+   uint32_t bindingCount,
+   const VkBuffer *pBuffers,
+   const VkDeviceSize *pOffsets,
+   const VkDeviceSize *pSizes)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
+   /* Highest number of XFB binding slots this function stores. */
+   const uint64_t max_slots = 4;
+
+   for (uint32_t i = 0; i < bindingCount; i++) {
+      /* 64-bit so firstBinding + i cannot wrap back into the valid range. */
+      uint64_t slot = (uint64_t)firstBinding + i;
+      if (slot >= max_slots)
+         continue;
+
+      VK_FROM_HANDLE(panvk_buffer, buf, pBuffers[i]);
+      gfx->xfb.buffers[slot].addr = panvk_buffer_gpu_ptr(buf, 0);
+      gfx->xfb.buffers[slot].offset = pOffsets[i];
+      gfx->xfb.buffers[slot].size =
+         (pSizes != NULL && pSizes[i] != VK_WHOLE_SIZE)
+            ? pSizes[i]
+            : (buf->vk.size - pOffsets[i]);
+   }
+
+   /* One past the last requested slot, clamped to what was actually stored. */
+   uint64_t end_slot = (uint64_t)firstBinding + bindingCount;
+   if (end_slot > max_slots)
+      end_slot = max_slots;
+
+   if (end_slot > gfx->xfb.buffer_count)
+      gfx->xfb.buffer_count = (uint32_t)end_slot;
+}
+
+/*
+ * vkCmdBeginTransformFeedbackEXT: mark transform feedback as active on the
+ * command buffer. Counter buffers are not consumed; a warning is logged when
+ * the caller supplies any.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdBeginTransformFeedbackEXT)(
+   VkCommandBuffer commandBuffer,
+   uint32_t firstCounterBuffer,
+   uint32_t counterBufferCount,
+   const VkBuffer *pCounterBuffers,
+   const VkDeviceSize *pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+
+   (void)pCounterBufferOffsets;
+   (void)firstCounterBuffer;
+
+   /* Counter buffers ignored in v1 — see VkPhysicalDeviceTransformFeedback
+    * PropertiesEXT.transformFeedbackDraw = false in panvk_vX_physical_device.c.
+    * An app that passes none is unaffected; one that does pass them gets a
+    * loud warning rather than silently wrong capture state. */
+   const bool has_counters = counterBufferCount > 0 && pCounterBuffers != NULL;
+
+   if (has_counters) {
+      mesa_logw("panvk: CmdBeginTransformFeedbackEXT: counter buffers not "
+                "implemented (transformFeedbackDraw=false); XFB resume will "
+                "restart at buffer offset 0");
+   }
+
+   /* Per-draw set_gfx_sysval picks up the change automatically — no
+    * explicit dirty marking required (set_gfx_sysval uses memcmp +
+    * BITSET to detect state diffs and re-emit sysvals). */
+   cmdbuf->state.gfx.xfb.active = true;
+}
+
+/*
+ * vkCmdEndTransformFeedbackEXT: mark transform feedback as inactive on the
+ * command buffer. All counter-buffer arguments are ignored.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdEndTransformFeedbackEXT)(
+   VkCommandBuffer commandBuffer,
+   uint32_t firstCounterBuffer,
+   uint32_t counterBufferCount,
+   const VkBuffer *pCounterBuffers,
+   const VkDeviceSize *pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+
+   /* Counter buffers are not used by this implementation. */
+   (void)pCounterBufferOffsets;
+   (void)pCounterBuffers;
+   (void)counterBufferCount;
+   (void)firstCounterBuffer;
+
+   cmdbuf->state.gfx.xfb.active = false;
+}
@@ -0,0 +1,275 @@
+# Copyright © 2021 Collabora Ltd.
+#
+# Derived from the freedreno driver which is:
+# Copyright © 2017 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+# Generate panvk_entrypoints.h / panvk_entrypoints.c from the Vulkan API XML.
+# One --device-prefix is passed for each per-arch entrypoint namespace
+# (panvk_v6, v7, v9, v10, v12, v13).
+panvk_entrypoints = custom_target(
+  'panvk_entrypoints.[ch]',
+  input : [vk_entrypoints_gen, vk_api_xml],
+  output : ['panvk_entrypoints.h', 'panvk_entrypoints.c'],
+  command : [
+    prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak',
+    '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'panvk',
+    '--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7',
+    '--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10',
+    '--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13',
+    '--beta', with_vulkan_beta.to_string()
+  ],
+  depend_files : vk_entrypoints_gen_depend_files,
+)
+
+# Generate the tracepoint sources from panvk_tracepoints.py. Outputs, in
+# order: the utrace header, the perfetto header, and the utrace C source.
+panvk_tracepoints = custom_target(
+  'panvk_tracepoints.[ch]',
+  input: 'panvk_tracepoints.py',
+  output: ['panvk_tracepoints.h',
+           'panvk_tracepoints_perfetto.h',
+           'panvk_tracepoints.c'],
+  command: [
+    prog_python, '@INPUT@',
+    '--import-path', join_paths(dir_source_root, 'src/util/perf/'),
+    '--utrace-hdr', '@OUTPUT0@',
+    '--perfetto-hdr', '@OUTPUT1@',
+    '--utrace-src', '@OUTPUT2@',
+  ],
+  depend_files: u_trace_py,
+)
+
+# Sources compiled directly into the shared library (not once per arch).
+libpanvk_files = files(
+  'panvk_buffer.c',
+  'panvk_cmd_pool.c',
+  'panvk_device_memory.c',
+  'panvk_host_copy.c',
+  'panvk_image.c',
+  'panvk_instance.c',
+  'panvk_mempool.c',
+  'panvk_physical_device.c',
+  'panvk_priv_bo.c',
+  'panvk_sparse.c',
+  'panvk_utrace.c',
+  'panvk_wsi.c',
+)
+libpanvk_files += [sha1_h]
+
+# Accumulators filled in further down: extra dependencies, extra C flags and
+# the list of per-arch static libraries.
+panvk_deps = []
+panvk_flags = []
+panvk_per_arch_libs = []
+
+# Per hardware generation: the PAN_ARCH values it covers, its extra include
+# directory and its extra per-arch sources. These are consumed by the
+# per-arch foreach loop below.
+bifrost_archs = [6, 7]
+bifrost_inc_dir = ['bifrost']
+bifrost_files = [
+  'bifrost/panvk_vX_meta_desc_copy.c',
+]
+
+# Note: 9 is listed here but is not in the foreach list below, so no
+# panvk_v9 library is built by this file.
+valhall_archs = [9, 10]
+valhall_inc_dir = ['valhall']
+valhall_files = []
+
+fifthgen_archs = [12, 13]
+fifthgen_inc_dir = ['fifthgen']
+fifthgen_files = []
+
+# Archs in jm_archs get the jm/ include directory and sources in the
+# per-arch loop below. This includes the iter13 transform-feedback command
+# handlers (jm/panvk_vX_cmd_xfb.c).
+jm_archs = [6, 7]
+jm_inc_dir = ['jm']
+jm_files = [
+  'jm/panvk_vX_bind_queue.c',
+  'jm/panvk_vX_cmd_xfb.c',   # iter13
+  'jm/panvk_vX_cmd_buffer.c',
+  'jm/panvk_vX_cmd_dispatch.c',
+  'jm/panvk_vX_cmd_draw.c',
+  'jm/panvk_vX_cmd_event.c',
+  'jm/panvk_vX_cmd_query.c',
+  'jm/panvk_vX_cmd_precomp.c',
+  'jm/panvk_vX_event.c',
+  'jm/panvk_vX_gpu_queue.c',
+]
+
+# Archs in csf_archs get the csf/ include directory and sources in the
+# per-arch loop below.
+csf_archs = [10, 12, 13]
+csf_inc_dir = ['csf']
+csf_files = [
+  'csf/panvk_vX_bind_queue.c',
+  'csf/panvk_vX_cmd_buffer.c',
+  'csf/panvk_vX_cmd_dispatch.c',
+  'csf/panvk_vX_cmd_draw.c',
+  'csf/panvk_vX_cmd_event.c',
+  'csf/panvk_vX_cmd_query.c',
+  'csf/panvk_vX_cmd_precomp.c',
+  'csf/panvk_vX_event.c',
+  'csf/panvk_vX_exception_handler.c',
+  'csf/panvk_vX_gpu_queue.c',
+  'csf/panvk_vX_instr.c',
+  'csf/panvk_vX_utrace.c',
+]
+
+# Sources compiled once for every arch in the foreach loop below, plus the
+# generated entrypoint and tracepoint headers (output index 0 of each
+# generator target).
+common_per_arch_files = [
+  panvk_entrypoints[0],
+  panvk_tracepoints[0],
+  'panvk_vX_blend.c',
+  'panvk_vX_buffer_view.c',
+  'panvk_vX_cmd_fb_preload.c',
+  'panvk_vX_cmd_desc_state.c',
+  'panvk_vX_cmd_dispatch.c',
+  'panvk_vX_cmd_draw.c',
+  'panvk_vX_cmd_meta.c',
+  'panvk_vX_cmd_push_constant.c',
+  'panvk_vX_descriptor_set.c',
+  'panvk_vX_descriptor_set_layout.c',
+  'panvk_vX_device.c',
+  'panvk_vX_physical_device.c',
+  'panvk_vX_precomp_cache.c',
+  'panvk_vX_query_pool.c',
+  'panvk_vX_image_view.c',
+  'panvk_vX_nir_lower_descriptors.c',
+  'panvk_vX_nir_lower_input_attachment_loads.c',
+  'panvk_vX_sampler.c',
+  'panvk_vX_shader.c',
+  sha1_h,
+]
+
+# Build one static library per supported arch. Each library compiles the
+# common per-arch sources plus the sources of the arch's hardware generation
+# and of its command-stream flavour (jm or csf), with -DPAN_ARCH=<arch>.
+foreach arch : [6, 7, 10, 12, 13]
+  per_arch_files = common_per_arch_files
+  inc_panvk_per_arch = []
+
+  # Hardware-generation specific include directory and sources.
+  if arch in bifrost_archs
+    inc_panvk_per_arch += bifrost_inc_dir
+    per_arch_files += bifrost_files
+  elif arch in valhall_archs
+    inc_panvk_per_arch += valhall_inc_dir
+    per_arch_files += valhall_files
+  elif arch in fifthgen_archs
+    inc_panvk_per_arch += fifthgen_inc_dir
+    per_arch_files += fifthgen_files
+  endif
+
+  # Command-stream flavour specific include directory and sources.
+  if arch in jm_archs
+    inc_panvk_per_arch += jm_inc_dir
+    per_arch_files += jm_files
+  elif arch in csf_archs
+    inc_panvk_per_arch += csf_inc_dir
+    per_arch_files += csf_files
+  endif
+
+  panvk_per_arch_libs += static_library(
+    'panvk_v@0@'.format(arch),
+    per_arch_files,
+    include_directories : [
+      inc_include,
+      inc_src,
+      inc_panfrost,
+      inc_panvk_per_arch,
+    ],
+    dependencies : [
+      idep_nir_headers,
+      idep_pan_packers,
+      idep_vulkan_util_headers,
+      idep_vulkan_runtime_headers,
+      idep_vulkan_wsi_headers,
+      idep_mesautil,
+      dep_libdrm,
+      dep_valgrind,
+      idep_libpan_per_arch[arch.to_string()],
+    ],
+    c_args : [no_override_init_args, panvk_flags, '-DPAN_ARCH=@0@'.format(arch)],
+    gnu_symbol_visibility : 'hidden',
+  )
+endforeach
+
+# Optional features: each one adds dependencies and/or sources to the shared
+# library defined below.
+if with_perfetto
+  panvk_deps += dep_perfetto
+  libpanvk_files += ['panvk_utrace_perfetto.cc']
+endif
+
+if with_platform_wayland
+  panvk_deps += dep_wayland_client
+endif
+
+if with_platform_android
+  libpanvk_files += files('panvk_android.c')
+endif
+
+# The installed driver library: the arch-independent sources and generated
+# files, with every per-arch static library linked in whole.
+libvulkan_panfrost = shared_library(
+  'vulkan_panfrost',
+  [libpanvk_files, panvk_entrypoints, panvk_tracepoints],
+  include_directories : [
+    inc_include,
+    inc_src,
+    inc_panfrost,
+  ],
+  link_whole : [panvk_per_arch_libs],
+  link_with : [
+    libpanfrost_shared,
+    libpanfrost_decode,
+    libpanfrost_lib,
+    libpanfrost_compiler,
+  ],
+  dependencies : [
+    dep_dl,
+    dep_elf,
+    dep_libdrm,
+    dep_m,
+    dep_thread,
+    dep_valgrind,
+    idep_nir,
+    idep_pan_packers,
+    panvk_deps,
+    idep_vulkan_util,
+    idep_vulkan_runtime,
+    idep_vulkan_wsi,
+    idep_mesautil,
+  ],
+  c_args : [no_override_init_args, panvk_flags],
+  link_args : [vulkan_icd_link_args, ld_args_bsymbolic, ld_args_gc_sections, ld_args_build_id],
+  gnu_symbol_visibility : 'hidden',
+  install : true,
+)
+
+# When symbol checking is enabled, register a test that runs the symbols
+# checker on the built library against the Vulkan ICD symbols file.
+if with_symbols_check
+  test(
+    'panvk symbols check',
+    symbols_check,
+    args : [
+      '--lib', libvulkan_panfrost,
+      '--symbols-file', vulkan_icd_symbols,
+      symbols_check_args,
+    ],
+    suite : ['panfrost'],
+  )
+endif
+
+# Library file name written into the ICD manifests below.
+icd_file_name = libname_prefix + 'vulkan_panfrost.' + libname_suffix
+
+# Installed ICD manifest; its library path is vulkan_icd_lib_path.
+panfrost_icd = custom_target(
+  'panfrost_icd',
+  input : [vk_icd_gen, vk_api_xml],
+  output : 'panfrost_icd.' + vulkan_manifest_suffix,
+  command : [
+    prog_python, '@INPUT0@',
+    '--api-version', '1.4', '--xml', '@INPUT1@',
+    '--sizeof-pointer', sizeof_pointer,
+    '--icd-lib-path', vulkan_icd_lib_path,
+    '--icd-filename', icd_file_name,
+    '--out', '@OUTPUT@',
+  ],
+  build_by_default : true,
+  install_dir : with_vulkan_icd_dir,
+  install_tag : 'runtime',
+  install : true,
+)
+
+# Development ICD manifest: same generator as the installed one, but the
+# library path is the current build directory and the target is not
+# installed. Its full path is appended to VK_DRIVER_FILES in the devenv.
+_dev_icdname = 'panfrost_devenv_icd.@0@.json'.format(host_machine.cpu())
+_dev_icd = custom_target(
+  'panfrost_devenv_icd',
+  input : [vk_icd_gen, vk_api_xml],
+  output : _dev_icdname,
+  command : [
+    prog_python, '@INPUT0@',
+    '--api-version', '1.4', '--xml', '@INPUT1@',
+    '--sizeof-pointer', sizeof_pointer,
+    '--icd-lib-path', meson.current_build_dir(),
+    '--icd-filename', icd_file_name,
+    '--out', '@OUTPUT@',
+  ],
+  build_by_default : true,
+)
+
+devenv.append('VK_DRIVER_FILES', _dev_icd.full_path())
@@ -0,0 +1,501 @@
+/*
+ * Copyright © 2024 Collabora Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef PANVK_CMD_DRAW_H
+#define PANVK_CMD_DRAW_H
+
+#ifndef PAN_ARCH
+#error "PAN_ARCH must be defined"
+#endif
+
+#include "panvk_blend.h"
+#include "panvk_cmd_desc_state.h"
+#include "panvk_cmd_query.h"
+#include "panvk_entrypoints.h"
+#include "panvk_image.h"
+#include "panvk_image_view.h"
+#include "panvk_physical_device.h"
+#include "panvk_shader.h"
+
+#include "vk_command_buffer.h"
+#include "vk_format.h"
+#include "util/u_tristate.h"
+
+#include "pan_props.h"
+
+#define MAX_VBS 16
+
+struct panvk_cmd_buffer;
+
+struct panvk_attrib_buf {
+   uint64_t address;
+   unsigned size;
+};
+
+struct panvk_resolve_attachment {
+   VkResolveModeFlagBits mode;
+   struct panvk_image_view *dst_iview;
+};
+
+struct panvk_rendering_state {
+   VkRenderingFlags flags;
+   uint32_t layer_count;
+   uint32_t view_mask;
+   enum u_tristate first_provoking_vertex;
+
+   enum vk_rp_attachment_flags bound_attachments;
+   struct {
+      struct panvk_image_view *iviews[MAX_RTS];
+      /* If non-null, preload_iviews[i] overrides iviews[i] for preloads. */
+      struct panvk_image_view *preload_iviews[MAX_RTS];
+      VkFormat fmts[MAX_RTS];
+      uint8_t samples[MAX_RTS];
+      struct panvk_resolve_attachment resolve[MAX_RTS];
+   } color_attachments;
+
+   struct pan_image_view zs_pview;
+   struct pan_image_view s_pview;
+
+   struct {
+      struct panvk_image_view *iview;
+      /* If non-null, preload_iview overrides iview for preloads. */
+      struct panvk_image_view *preload_iview;
+      VkFormat fmt;
+      struct panvk_resolve_attachment resolve;
+   } z_attachment, s_attachment;
+
+   struct {
+      struct pan_fb_info info;
+      bool crc_valid[MAX_RTS];
+
+      /* nr_samples to be used before framebuffer / tiler descriptor are emitted */
+      uint32_t nr_samples;
+
+#if PAN_ARCH < 9
+      uint32_t bo_count;
+      struct pan_kmod_bo *bos[(MAX_RTS * PANVK_MAX_PLANES) + 2];
+#endif
+   } fb;
+
+#if PAN_ARCH >= 10
+   struct pan_ptr fbds;
+   uint64_t tiler;
+
+   /* When a secondary command buffer has to flush draws, it disturbs the
+    * inherited context, and the primary command buffer needs to know. */
+   bool invalidate_inherited_ctx;
+
+   /* True if the last render pass was suspended. */
+   bool suspended;
+
+   /* Blocks that can patch to flip the provoking vertex mode if we need to
+    * emit FBDs/TDs before we know which mode the application is using */
+   struct cs_maybe *maybe_set_tds_provoking_vertex;
+   struct cs_maybe *maybe_set_fbds_provoking_vertex;
+
+   struct {
+      /* != 0 if the render pass contains one or more occlusion queries to
+       * signal. */
+      uint64_t chain;
+
+      /* Point to the syncobj of the last occlusion query that was passed
+       * to a draw. */
+      uint64_t last;
+   } oq;
+#endif
+};
+
+enum panvk_cmd_graphics_dirty_state {
+   PANVK_CMD_GRAPHICS_DIRTY_VS,
+   PANVK_CMD_GRAPHICS_DIRTY_FS,
+   PANVK_CMD_GRAPHICS_DIRTY_VB,
+   PANVK_CMD_GRAPHICS_DIRTY_IB,
+   PANVK_CMD_GRAPHICS_DIRTY_OQ,
+   PANVK_CMD_GRAPHICS_DIRTY_DESC_STATE,
+   PANVK_CMD_GRAPHICS_DIRTY_RENDER_STATE,
+   PANVK_CMD_GRAPHICS_DIRTY_VS_PUSH_UNIFORMS,
+   PANVK_CMD_GRAPHICS_DIRTY_FS_PUSH_UNIFORMS,
+   PANVK_CMD_GRAPHICS_DIRTY_STATE_COUNT,
+};
+
+struct panvk_cmd_graphics_state {
+   struct panvk_descriptor_state desc_state;
+
+   struct {
+      struct vk_vertex_input_state vi;
+      struct vk_sample_locations_state sl;
+   } dynamic;
+
+   struct panvk_occlusion_query_state occlusion_query;
+#if PAN_ARCH >= 10
+   struct panvk_prims_generated_query_state prims_generated_query;
+#endif
+   struct panvk_graphics_sysvals sysvals;
+
+#if PAN_ARCH < 9
+   /* iter13: VK_EXT_transform_feedback state (JM-class only for now). */
+   struct {
+      bool active;
+      uint32_t buffer_count;
+      struct {
+         uint64_t addr;
+         uint64_t offset;
+         uint64_t size;
+      } buffers[4];
+   } xfb;
+#endif
+
+#if PAN_ARCH < 9
+   struct panvk_shader_link link;
+#endif
+
+   struct {
+      const struct panvk_shader *shader;
+      struct panvk_shader_desc_state desc;
+      uint64_t blend_descs[MAX_RTS];
+      uint64_t push_uniforms;
+      bool required;
+#if PAN_ARCH < 9
+      uint64_t rsd;
+#endif
+   } fs;
+
+   struct {
+      const struct panvk_shader *shader;
+      struct panvk_shader_desc_state desc;
+      uint64_t push_uniforms;
+#if PAN_ARCH < 9
+      uint64_t attribs;
+      uint64_t attrib_bufs;
+      uint64_t indirect_attribs_infos;
+      uint64_t indirect_attrib_bufs_infos;
+      uint64_t indirect_varying_bufs_infos;
+      bool previous_draw_was_indirect;
+#endif
+   } vs;
+
+   struct {
+      struct panvk_attrib_buf bufs[MAX_VBS];
+      unsigned count;
+   } vb;
+
+#if PAN_ARCH >= 10
+   struct {
+      uint32_t attribs_changing_on_base_instance;
+   } vi;
+#endif
+
+   /* Index buffer */
+   struct {
+      uint64_t dev_addr;
+      uint64_t size;
+      uint8_t index_size;
+   } ib;
+
+   struct {
+      struct panvk_blend_info info;
+   } cb;
+
+   struct panvk_rendering_state render;
+
+   bool vk_meta;
+
+#if PAN_ARCH < 9
+   uint64_t vpd;
+#endif
+
+#if PAN_ARCH >= 10
+   uint64_t tsd;
+#endif
+
+   BITSET_DECLARE(dirty, PANVK_CMD_GRAPHICS_DIRTY_STATE_COUNT);
+};
+
+#define dyn_gfx_state_dirty(__cmdbuf, __name)                                  \
+   BITSET_TEST((__cmdbuf)->vk.dynamic_graphics_state.dirty,                    \
+               MESA_VK_DYNAMIC_##__name)
+
+#define gfx_state_dirty(__cmdbuf, __name)                                      \
+   BITSET_TEST((__cmdbuf)->state.gfx.dirty, PANVK_CMD_GRAPHICS_DIRTY_##__name)
+
+#define gfx_state_set_dirty(__cmdbuf, __name)                                  \
+   BITSET_SET((__cmdbuf)->state.gfx.dirty, PANVK_CMD_GRAPHICS_DIRTY_##__name)
+
+#define gfx_state_clear_all_dirty(__cmdbuf)                                    \
+   BITSET_ZERO((__cmdbuf)->state.gfx.dirty)
+
+#define gfx_state_set_all_dirty(__cmdbuf)                                      \
+   BITSET_ONES((__cmdbuf)->state.gfx.dirty)
+
+#define set_gfx_sysval(__cmdbuf, __dirty, __name, __val)                       \
+   do {                                                                        \
+      struct panvk_graphics_sysvals __new_sysval;                              \
+      __new_sysval.__name = __val;                                             \
+      if (memcmp(&(__cmdbuf)->state.gfx.sysvals.__name, &__new_sysval.__name,  \
+                 sizeof(__new_sysval.__name))) {                               \
+         (__cmdbuf)->state.gfx.sysvals.__name = __new_sysval.__name;           \
+         BITSET_SET_RANGE(__dirty, sysval_fau_start(graphics, __name),         \
+                          sysval_fau_end(graphics, __name));                   \
+      }                                                                        \
+   } while (0)
+
+#if PAN_ARCH >= 10
+struct panvk_device_draw_context {
+   struct panvk_priv_bo *fns_bo;
+   uint64_t fn_set_fbds_provoking_vertex_stride;
+};
+#endif
+
+static inline void
+panvk_depth_range(const struct panvk_cmd_graphics_state *state,
+                  const struct vk_viewport_state *vp,
+                  float *z_min, float *z_max)
+{
+   const float off = state->sysvals.viewport.offset.z;
+   const float scale = state->sysvals.viewport.scale.z;
+   const float z0 = vp->depth_clip_negative_one_to_one ? off - scale : off;
+   const float z1 = off + scale;
+   *z_min = MIN2(z0, z1); /* smaller of the two bounds */
+   *z_max = MAX2(z0, z1); /* larger of the two bounds */
+}
+
+static inline uint32_t
+panvk_select_tiler_hierarchy_mask(const struct panvk_physical_device *phys_dev,
+                                  const struct panvk_cmd_graphics_state *state,
+                                  unsigned bin_ptr_mem_budget)
+{
+   const struct pan_fb_info *fb = &state->render.fb.info;
+   const struct pan_tiler_features feats =
+      pan_query_tiler_features(&phys_dev->kmod.dev->props);
+
+   /* The mask is selected from the framebuffer size and tile size, the
+    * tiler's max_levels and the caller-provided bin pointer memory budget. */
+   return GENX(pan_select_tiler_hierarchy_mask)(
+      fb->width, fb->height, feats.max_levels, fb->tile_size,
+      bin_ptr_mem_budget);
+}
+
+static inline bool
+fs_required(const struct panvk_cmd_graphics_state *state,
+            const struct vk_dynamic_graphics_state *dyn_state)
+{
+   const struct panvk_shader_variant *variant =
+      panvk_shader_only_variant(state->fs.shader);
+
+   /* No fragment work if rasterization is discarded or no FS is bound. */
+   if (dyn_state->rs.rasterizer_discard_enable || variant == NULL)
+      return false;
+
+   const struct pan_shader_info *info = &variant->info;
+
+   /* Shaders with side effects always have to run. */
+   if (info->fs.sidefx)
+      return true;
+
+   /* Any attachment with color writes enabled and a non-zero write mask
+    * requires the shader to execute. */
+   for (unsigned rt = 0; rt < dyn_state->cb.attachment_count; rt++) {
+      const bool write_enabled =
+         (dyn_state->cb.color_write_enables & BITFIELD_BIT(rt)) != 0;
+      if (write_enabled && dyn_state->cb.attachments[rt].write_mask)
+         return true;
+   }
+
+   /* With alpha-to-coverage the shader's alpha can zero the coverage and
+    * thereby discard depth/stencil updates even with no color attachment. */
+   if (dyn_state->ms.alpha_to_coverage_enable)
+      return true;
+
+   /* A shader-written sample mask must be honoured, otherwise the
+    * fixed-function depth/stencil results would apply to every sample. */
+   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK))
+      return true;
+
+   /* Finally, shader-written depth or stencil requires execution.
+    * TODO: Predicate on Z/S writes being enabled */
+   return info->fs.writes_depth || info->fs.writes_stencil;
+}
+
+static inline bool
+cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state,
+                   ASSERTED const struct vk_dynamic_graphics_state *dyn_state,
+                   bool cached_value)
+{
+   /* Cross-check the cache against a fresh fs_required() evaluation. */
+   assert(fs_required(state, dyn_state) == cached_value);
+   return cached_value;
+}
+
+#define get_fs(__cmdbuf)                                                       \
+   (cached_fs_required(&(__cmdbuf)->state.gfx,                                 \
+                       &(__cmdbuf)->vk.dynamic_graphics_state,                 \
+                       (__cmdbuf)->state.gfx.fs.required)                      \
+       ? (__cmdbuf)->state.gfx.fs.shader                                       \
+       : NULL)
+
+/* Anything that might change the value returned by get_fs() makes users of the
+ * fragment shader dirty, because not using the fragment shader (when
+ * fs_required() returns false) impacts various other things, like VS -> FS
+ * linking in the JM backend, or the update of the fragment shader pointer in
+ * the CSF backend. Call gfx_state_dirty(cmdbuf, FS) if you only care about
+ * fragment shader updates. */
+
+#define fs_user_dirty(__cmdbuf)                                                \
+   (gfx_state_dirty(__cmdbuf, FS) ||                                           \
+    dyn_gfx_state_dirty(__cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||             \
+    dyn_gfx_state_dirty(__cmdbuf, CB_ATTACHMENT_COUNT) ||                      \
+    dyn_gfx_state_dirty(__cmdbuf, CB_COLOR_WRITE_ENABLES) ||                   \
+    dyn_gfx_state_dirty(__cmdbuf, CB_WRITE_MASKS) ||                           \
+    dyn_gfx_state_dirty(__cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE))
+
+/* After a draw, all dirty flags are cleared except the FS dirty flag which
+ * needs to be set again if the draw didn't use the fragment shader. */
+
+#define clear_dirty_after_draw(__cmdbuf)                                       \
+   do {                                                                        \
+      bool __set_fs_dirty =                                                    \
+         (__cmdbuf)->state.gfx.fs.shader != get_fs(__cmdbuf);                  \
+      bool __set_fs_push_dirty =                                               \
+         __set_fs_dirty && gfx_state_dirty(__cmdbuf, FS_PUSH_UNIFORMS);        \
+      vk_dynamic_graphics_state_clear_dirty(                                   \
+         &(__cmdbuf)->vk.dynamic_graphics_state);                              \
+      gfx_state_clear_all_dirty(__cmdbuf);                                     \
+      if (__set_fs_dirty)                                                      \
+         gfx_state_set_dirty(__cmdbuf, FS);                                    \
+      if (__set_fs_push_dirty)                                                 \
+         gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS);                      \
+   } while (0)
+
+
+#if PAN_ARCH >= 10
+VkResult
+panvk_per_arch(device_draw_context_init)(struct panvk_device *dev);
+
+void
+panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev);
+#endif
+
+void
+panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf,
+                                      const VkRenderingInfo *pRenderingInfo);
+
+void
+panvk_per_arch(cmd_force_fb_preload)(struct panvk_cmd_buffer *cmdbuf,
+                                     const VkRenderingInfo *render_info);
+
+void
+panvk_per_arch(cmd_preload_render_area_border)(struct panvk_cmd_buffer *cmdbuf,
+                                               const VkRenderingInfo *render_info);
+
+void panvk_per_arch(cmd_select_tile_size)(struct panvk_cmd_buffer *cmdbuf);
+
+struct panvk_draw_info {
+   struct {
+      uint32_t size;
+      uint32_t offset;
+   } index;
+
+   struct {
+#if PAN_ARCH < 9
+      int32_t raw_offset;
+#endif
+      int32_t base;
+      uint32_t count;
+   } vertex;
+
+   struct {
+      int32_t base;
+      uint32_t count;
+   } instance;
+
+   struct {
+      uint64_t buffer_dev_addr;
+      uint64_t count_buffer_dev_addr;
+      uint32_t draw_count;
+      uint32_t stride;
+   } indirect;
+
+#if PAN_ARCH < 9
+   uint32_t layer_id;
+#endif
+};
+
+void
+panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
+                                         const struct panvk_draw_info *info);
+
+/* Map the color outputs written by the FS onto color attachment bits. */
+static inline uint32_t
+color_attachment_written_mask(
+   const struct panvk_shader_variant *fs,
+   const struct vk_color_attachment_location_state *cal)
+{
+   /* Eight output bits, starting at FRAG_RESULT_DATA0. */
+   const uint32_t shader_outputs =
+      (fs->info.outputs_written >> FRAG_RESULT_DATA0) & BITFIELD_MASK(8);
+   uint32_t mask = 0;
+
+   for (uint32_t att = 0; att < MAX_RTS; att++) {
+      const bool mapped = cal->color_map[att] != MESA_VK_ATTACHMENT_UNUSED;
+      const uint32_t shader_rt = cal->color_map[att];
+
+      if (mapped && (shader_outputs & BITFIELD_BIT(shader_rt)))
+         mask |= BITFIELD_BIT(att);
+   }
+
+   return mask;
+}
+
+static inline uint32_t
+color_attachment_read_mask(const struct panvk_shader_variant *fs,
+                           const struct vk_input_attachment_location_state *ial,
+                           uint8_t color_attachment_mask)
+{
+   /* Unknown count: derive it from the highest bit of the attachment mask. */
+   const bool count_known =
+      ial->color_attachment_count != MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN;
+   const uint32_t count = count_known ? ial->color_attachment_count
+                                      : util_last_bit(color_attachment_mask);
+   uint32_t read_mask = 0;
+
+   for (uint32_t att = 0; att < count; att++) {
+      if (ial->color_map[att] == MESA_VK_ATTACHMENT_UNUSED)
+         continue;
+      /* Mapped index N is tested at bit N + 1 of input_attachment_read. */
+      const uint32_t ia_bit = BITFIELD_BIT(ial->color_map[att] + 1);
+      if (fs->fs.input_attachment_read & ia_bit) {
+         assert(color_attachment_mask & BITFIELD_BIT(att));
+         read_mask |= BITFIELD_BIT(att);
+      }
+   }
+   return read_mask;
+}
+
+static inline bool
+z_attachment_read(const struct panvk_shader_variant *fs,
+                  const struct vk_input_attachment_location_state *ial)
+{
+   /* NO_INDEX maps to bit 0, an explicit index N to bit N + 1. */
+   if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX)
+      return fs->fs.input_attachment_read & BITFIELD_BIT(0);
+   if (ial->depth_att == MESA_VK_ATTACHMENT_UNUSED)
+      return false;
+   return fs->fs.input_attachment_read & BITFIELD_BIT(ial->depth_att + 1);
+}
+
+static inline bool
+s_attachment_read(const struct panvk_shader_variant *fs,
+                  const struct vk_input_attachment_location_state *ial)
+{
+   /* NO_INDEX selects bit 0 of input_attachment_read, an explicit index N
+    * selects bit N + 1, and UNUSED selects nothing. */
+   if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX)
+      return fs->fs.input_attachment_read & BITFIELD_BIT(0);
+   if (ial->stencil_att == MESA_VK_ATTACHMENT_UNUSED)
+      return false;
+   return fs->fs.input_attachment_read & BITFIELD_BIT(ial->stencil_att + 1);
+}
+
+#endif
@@ -0,0 +1,572 @@
+/*
+ * Copyright © 2021 Collabora Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef PANVK_SHADER_H
+#define PANVK_SHADER_H
+
+#ifndef PAN_ARCH
+#error "PAN_ARCH must be defined"
+#endif
+
+#include "compiler/pan_compiler.h"
+
+#include "pan_desc.h"
+#include "pan_earlyzs.h"
+
+#include "panvk_cmd_push_constant.h"
+#include "panvk_descriptor_set.h"
+#include "panvk_macros.h"
+#include "panvk_mempool.h"
+
+#include "vk_pipeline_layout.h"
+
+#include "vk_shader.h"
+
+extern const struct vk_device_shader_ops panvk_per_arch(device_shader_ops);
+
+#define MAX_RTS 8
+#define MAX_VS_ATTRIBS 16
+
+#if PAN_ARCH < 9
+
+/* We could theoretically use the MAX_PER_SET values here (except for UBOs
+ * where we're really limited to 256 on the shader side), but on Bifrost we
+ * have to copy some tables around, which comes at an extra memory/processing
+ * cost, so let's pick something smaller. */
+#define MAX_PER_STAGE_SAMPLED_IMAGES 256
+#define MAX_PER_STAGE_SAMPLERS 128
+#define MAX_PER_STAGE_UNIFORM_BUFFERS MAX_PER_SET_UNIFORM_BUFFERS
+#define MAX_PER_STAGE_STORAGE_BUFFERS 64
+#define MAX_PER_STAGE_STORAGE_IMAGES 32
+#define MAX_PER_STAGE_INPUT_ATTACHMENTS MAX_PER_SET_INPUT_ATTACHMENTS
+
+#else
+
+#define MAX_PER_STAGE_SAMPLED_IMAGES MAX_PER_SET_SAMPLED_IMAGES
+#define MAX_PER_STAGE_SAMPLERS MAX_PER_SET_SAMPLERS
+#define MAX_PER_STAGE_UNIFORM_BUFFERS MAX_PER_SET_UNIFORM_BUFFERS
+#define MAX_PER_STAGE_STORAGE_BUFFERS MAX_PER_SET_STORAGE_BUFFERS
+#define MAX_PER_STAGE_STORAGE_IMAGES MAX_PER_SET_STORAGE_IMAGES
+#define MAX_PER_STAGE_INPUT_ATTACHMENTS MAX_PER_SET_INPUT_ATTACHMENTS
+
+#endif
+
+#define MAX_PER_STAGE_RESOURCES (                                              \
+   MAX_PER_STAGE_SAMPLED_IMAGES + MAX_PER_STAGE_SAMPLERS +                     \
+   MAX_PER_STAGE_UNIFORM_BUFFERS + MAX_PER_STAGE_STORAGE_BUFFERS +             \
+   MAX_PER_STAGE_STORAGE_IMAGES + MAX_PER_STAGE_INPUT_ATTACHMENTS)
+
+struct nir_shader;
+struct pan_blend_state;
+struct panvk_device;
+
+enum panvk_varying_buf_id {
+   PANVK_VARY_BUF_GENERAL,
+   PANVK_VARY_BUF_POSITION,
+   PANVK_VARY_BUF_PSIZ,
+
+   /* Keep last */
+   PANVK_VARY_BUF_MAX,
+};
+
+#if PAN_ARCH < 9
+enum panvk_desc_table_id {
+   PANVK_DESC_TABLE_USER = 0,
+   PANVK_DESC_TABLE_CS_DYN_SSBOS = MAX_SETS,
+   PANVK_DESC_TABLE_COMPUTE_COUNT = PANVK_DESC_TABLE_CS_DYN_SSBOS + 1,
+   PANVK_DESC_TABLE_VS_DYN_SSBOS = MAX_SETS,
+   PANVK_DESC_TABLE_FS_DYN_SSBOS = MAX_SETS + 1,
+   PANVK_DESC_TABLE_GFX_COUNT = PANVK_DESC_TABLE_FS_DYN_SSBOS + 1,
+};
+#endif
+
+#define PANVK_COLOR_ATTACHMENT(x) (x)
+#define PANVK_ZS_ATTACHMENT       255
+
+struct panvk_input_attachment_info {
+   uint32_t target;
+   uint32_t conversion;
+};
+
+/* One attachment per color, one for depth, one for stencil, and the last one
+ * for the attachment without an InputAttachmentIndex attribute. */
+#define INPUT_ATTACHMENT_MAP_SIZE 11
+
+#define FAU_WORD_SIZE sizeof(uint64_t)
+
+#define aligned_u64 __attribute__((aligned(sizeof(uint64_t)))) uint64_t
+
+/* System values which are common to both graphics and compute.  These are
+ * always at the same offset in both graphics and compute allowing us to
+ * compile the shader without knowing which queue it will be dispatched on.
+ */
+struct panvk_common_sysvals_inner {
+   /* Address of sysval/push constant buffer used for indirect loads */
+   aligned_u64 push_uniforms;
+
+   /* Address of the printf buffer */
+   aligned_u64 printf_buffer_address;
+} __attribute__((aligned(FAU_WORD_SIZE)));
+
+struct panvk_common_sysvals {
+   uint32_t _pad[4];
+   struct panvk_common_sysvals_inner common;
+} __attribute__((aligned(FAU_WORD_SIZE)));
+
+static_assert((offsetof(struct panvk_common_sysvals, common) %
+               FAU_WORD_SIZE) == 0,
+              "panvk_common_sysvals::common must be 8-byte aligned");
+static_assert((sizeof(struct panvk_common_sysvals_inner) %
+               FAU_WORD_SIZE) == 0,
+              "struct panvk_common_sysvals_inner must be 8-byte aligned");
+
+#define SYSVALS_COMMON_START \
+   (offsetof(struct panvk_common_sysvals, common) / FAU_WORD_SIZE)
+
+#define SYSVALS_COMMON_COUNT \
+   (sizeof(struct panvk_common_sysvals_inner) / FAU_WORD_SIZE)
+
+#define SYSVALS_COMMON_END (SYSVALS_COMMON_START + SYSVALS_COMMON_COUNT)
+
+struct panvk_graphics_sysvals {
+   /* Blend constants MUST come first because their position cannot depend on
+    * the FAU packing of the fragment shader.
+    */
+   struct {
+      float constants[4];
+   } blend;
+
+   /* This must be at the same offset for both compute and graphics */
+   struct panvk_common_sysvals_inner common;
+
+   struct {
+      struct {
+         float x, y, z;
+      } scale, offset;
+   } viewport;
+
+   struct {
+#if PAN_ARCH < 9
+      int32_t raw_vertex_offset;
+      uint32_t num_vertices;       /* iter13: XFB needs per-draw vertex count */
+      /* raw_vertex_offset and num_vertices together fill one 8-byte word, so
+       * the aligned_u64 array below needs no explicit pad in front of it. */
+      aligned_u64 xfb_address[4];  /* iter13: 4 transform feedback buffer base addresses */
+#endif
+      int32_t first_vertex;
+      int32_t base_instance;
+      uint32_t noperspective_varyings;
+   } vs;
+
+   struct {
+      aligned_u64 blend_descs[MAX_RTS];
+   } fs;
+
+   struct panvk_input_attachment_info iam[INPUT_ATTACHMENT_MAP_SIZE];
+
+#if PAN_ARCH < 9
+   /* gl_Layer on Bifrost is a bit of hack. We have to issue one draw per
+    * layer, and filter primitives at the VS level.
+    */
+   int32_t layer_id;
+
+   struct {
+      aligned_u64 sets[PANVK_DESC_TABLE_GFX_COUNT];
+   } desc;
+#endif
+} __attribute__((aligned(FAU_WORD_SIZE)));
+
+static_assert(offsetof(struct panvk_graphics_sysvals, blend) == 0,
+              "panvk_graphics_sysvals::blend must be at the start");
+static_assert(offsetof(struct panvk_graphics_sysvals, common) ==
+                 offsetof(struct panvk_common_sysvals, common),
+              "Common sysvals must be at the same offset everywhere");
+static_assert((sizeof(struct panvk_graphics_sysvals) % FAU_WORD_SIZE) == 0,
+              "struct panvk_graphics_sysvals must be 8-byte aligned");
+#if PAN_ARCH < 9
+static_assert((offsetof(struct panvk_graphics_sysvals, desc) % FAU_WORD_SIZE) ==
+                 0,
+              "panvk_graphics_sysvals::desc must be 8-byte aligned");
+#endif
+
+struct panvk_compute_sysvals {
+   struct {
+      uint32_t x, y, z;
+   } base;
+
+   uint32_t _pad;
+
+   /* This must be at the same offset for both compute and graphics */
+   struct panvk_common_sysvals_inner common;
+
+   struct {
+      uint32_t x, y, z;
+   } num_work_groups;
+   struct {
+      uint32_t x, y, z;
+   } local_group_size;
+
+#if PAN_ARCH < 9
+   struct {
+      aligned_u64 sets[PANVK_DESC_TABLE_COMPUTE_COUNT];
+   } desc;
+#endif
+} __attribute__((aligned(FAU_WORD_SIZE)));
+
+static_assert(offsetof(struct panvk_compute_sysvals, common) ==
+                 offsetof(struct panvk_common_sysvals, common),
+              "Common sysvals must be at the same offset everywhere");
+static_assert((sizeof(struct panvk_compute_sysvals) % FAU_WORD_SIZE) == 0,
+              "struct panvk_compute_sysvals must be 8-byte aligned");
+#if PAN_ARCH < 9
+static_assert((offsetof(struct panvk_compute_sysvals, desc) % FAU_WORD_SIZE) ==
+                 0,
+              "panvk_compute_sysvals::desc must be 8-byte aligned");
+#endif
+
+/* This is not the final offset in the push constant buffer (AKA FAU), but
+ * just a magic offset we use before packing push constants so we can easily
+ * identify the type of push constant (driver sysvals vs user push constants).
+ */
+#define SYSVALS_PUSH_CONST_BASE MAX_PUSH_CONSTANTS_SIZE
+
+#define common_sysval_size(__name)                                             \
+   sizeof(((struct panvk_common_sysvals *)NULL)->common.__name)
+
+#define graphics_sysval_size(__name)                                           \
+   sizeof(((struct panvk_graphics_sysvals *)NULL)->__name)
+
+#define compute_sysval_size(__name)                                            \
+   sizeof(((struct panvk_compute_sysvals *)NULL)->__name)
+
+#define sysval_size(__ptype, __name) __ptype##_sysval_size(__name)
+
+#define common_sysval_offset(__name)                                           \
+   offsetof(struct panvk_common_sysvals, common.__name)
+
+#define graphics_sysval_offset(__name)                                         \
+   offsetof(struct panvk_graphics_sysvals, __name)
+
+#define compute_sysval_offset(__name)                                          \
+   offsetof(struct panvk_compute_sysvals, __name)
+
+#define sysval_offset(__ptype, __name) __ptype##_sysval_offset(__name)
+
+#define sysval_entry_size(__ptype, __name)                                     \
+   sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0])
+
+#define sysval_entry_offset(__ptype, __name, __idx)                            \
+   (sysval_offset(__ptype, __name) + /* byte offset of entry __idx */          \
+    (sysval_entry_size(__ptype, __name) * (__idx)))
+
+#define sysval_fau_start(__ptype, __name)                                      \
+   (sysval_offset(__ptype, __name) / FAU_WORD_SIZE)
+
+#define sysval_fau_end(__ptype, __name)                                        \
+   ((sysval_offset(__ptype, __name) + sysval_size(__ptype, __name) - 1) /      \
+    FAU_WORD_SIZE)
+
+#define sysval_fau_entry_start(__ptype, __name, __idx)                         \
+   (sysval_entry_offset(__ptype, __name, __idx) / FAU_WORD_SIZE)
+
+#define sysval_fau_entry_end(__ptype, __name, __idx)                           \
+   ((sysval_entry_offset(__ptype, __name, __idx + 1) - 1) / FAU_WORD_SIZE)
+
+#define shader_remapped_fau_offset(__shader, __kind, __offset)                 \
+   ((FAU_WORD_SIZE * BITSET_PREFIX_SUM((__shader)->fau.used_##__kind,          \
+                                       (__offset) / FAU_WORD_SIZE)) +          \
+    ((__offset) % FAU_WORD_SIZE))
+
+#define shader_remapped_sysval_offset(__shader, __offset)                      \
+   shader_remapped_fau_offset(__shader, sysvals, __offset)
+
+#define shader_remapped_push_const_offset(__shader, __offset)                  \
+   (((__shader)->fau.sysval_count * FAU_WORD_SIZE) +                     \
+    shader_remapped_fau_offset(__shader, push_consts, __offset))
+
+#define shader_use_sysval(__shader, __ptype, __name)                           \
+   BITSET_SET_RANGE((__shader)->fau.used_sysvals,                              \
+                    sysval_fau_start(__ptype, __name),                         \
+                    sysval_fau_end(__ptype, __name))
+
+#define shader_uses_sysval(__shader, __ptype, __name)                          \
+   BITSET_TEST_RANGE((__shader)->fau.used_sysvals,                             \
+                     sysval_fau_start(__ptype, __name),                        \
+                     sysval_fau_end(__ptype, __name))
+
+#define shader_uses_sysval_entry(__shader, __ptype, __name, __idx)             \
+   BITSET_TEST_RANGE((__shader)->fau.used_sysvals,                             \
+                     sysval_fau_entry_start(__ptype, __name, __idx),           \
+                     sysval_fau_entry_end(__ptype, __name, __idx))
+
+#define shader_use_sysval_range(__shader, __base, __range)                     \
+   BITSET_SET_RANGE((__shader)->fau.used_sysvals, (__base) / FAU_WORD_SIZE,    \
+                    ((__base) + (__range) - 1) / FAU_WORD_SIZE)
+
+#define shader_use_push_const_range(__shader, __base, __range)                 \
+   BITSET_SET_RANGE((__shader)->fau.used_push_consts,                          \
+                    (__base) / FAU_WORD_SIZE,                                  \
+                    ((__base) + (__range) - 1) / FAU_WORD_SIZE)
+
+#define load_sysval(__b, __ptype, __bitsz, __name)                             \
+   nir_load_push_constant(                                                     \
+      __b, sysval_size(__ptype, __name) / ((__bitsz) / 8), __bitsz,            \
+      nir_imm_int(__b, sysval_offset(__ptype, __name)),                        \
+      .base = SYSVALS_PUSH_CONST_BASE)
+
+#define load_sysval_entry(__b, __ptype, __bitsz, __name, __dyn_idx)            \
+   nir_load_push_constant(                                                     \
+      __b, sysval_entry_size(__ptype, __name) / ((__bitsz) / 8), __bitsz,      \
+      nir_imul_imm(__b, __dyn_idx, sysval_entry_size(__ptype, __name)),        \
+      .base = SYSVALS_PUSH_CONST_BASE + sysval_offset(__ptype, __name),        \
+      .range = sysval_size(__ptype, __name))
+
+#if PAN_ARCH < 9
+enum panvk_bifrost_desc_table_type {
+   PANVK_BIFROST_DESC_TABLE_INVALID = -1,
+
+   /* UBO is encoded on 8 bytes */
+   PANVK_BIFROST_DESC_TABLE_UBO = 0,
+
+   /* Images are using a <3DAttributeBuffer,Attribute> pair, each
+    * of them being stored in a separate table. */
+   PANVK_BIFROST_DESC_TABLE_IMG,
+
+   /* Texture and sampler are encoded on 32 bytes */
+   PANVK_BIFROST_DESC_TABLE_TEXTURE,
+   PANVK_BIFROST_DESC_TABLE_SAMPLER,
+
+   PANVK_BIFROST_DESC_TABLE_COUNT,
+};
+#endif
+/* Handle layout: table id in bits 31:28, entry index in the low 28 bits. */
+#define COPY_DESC_HANDLE(table, idx)           (((table) << 28) | (idx))
+#define COPY_DESC_HANDLE_EXTRACT_INDEX(handle) ((handle) & BITFIELD_MASK(28))
+#define COPY_DESC_HANDLE_EXTRACT_TABLE(handle) ((handle) >> 28)
+
+#define MAX_COMPUTE_SYSVAL_FAUS                                                \
+   (sizeof(struct panvk_compute_sysvals) / FAU_WORD_SIZE)
+#define MAX_GFX_SYSVAL_FAUS                                                    \
+   (sizeof(struct panvk_graphics_sysvals) / FAU_WORD_SIZE)
+#define MAX_SYSVAL_FAUS     MAX2(MAX_COMPUTE_SYSVAL_FAUS, MAX_GFX_SYSVAL_FAUS)
+#define MAX_PUSH_CONST_FAUS (MAX_PUSH_CONSTANTS_SIZE / FAU_WORD_SIZE)
+
+struct panvk_shader_fau_info {
+   BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS);
+   BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS);
+   uint32_t sysval_count;
+   uint32_t total_count;
+};
+
+struct panvk_shader_desc_info {
+   uint32_t used_set_mask;
+
+#if PAN_ARCH < 9
+   struct {
+      uint32_t map[MAX_DYNAMIC_UNIFORM_BUFFERS];
+      uint32_t count;
+   } dyn_ubos;
+   struct {
+      uint32_t map[MAX_DYNAMIC_STORAGE_BUFFERS];
+      uint32_t count;
+   } dyn_ssbos;
+   struct {
+      struct panvk_priv_mem map;
+      uint32_t count[PANVK_BIFROST_DESC_TABLE_COUNT];
+   } others;
+#else
+   struct {
+      uint32_t map[MAX_DYNAMIC_BUFFERS];
+      uint32_t count;
+   } dyn_bufs;
+   uint32_t fs_varying_attr_desc_count;
+#endif
+};
+
+/* One compiled variant of a panvk_shader: compiler-provided info, stage
+ * specific data, descriptor/FAU usage, the binary and the memory objects
+ * derived from it. */
+struct panvk_shader_variant {
+   struct pan_shader_info info;
+
+   /* Stage-specific data; which member is valid depends on the stage. */
+   union {
+      struct {
+         struct pan_compute_dim local_size;
+      } cs;
+
+      struct {
+         struct pan_earlyzs_lut earlyzs_lut;
+         uint32_t input_attachment_read;
+      } fs;
+   };
+
+   struct panvk_shader_desc_info desc_info;
+
+   struct panvk_shader_fau_info fau;
+
+   /* Shader binary. NOTE(review): own_bin presumably tells whether bin_ptr
+    * must be freed by this object -- confirm against the destroy path. */
+   const void *bin_ptr;
+   uint32_t bin_size;
+   bool own_bin;
+
+   /* Memory whose device address is returned by
+    * panvk_shader_variant_get_dev_addr(). */
+   struct panvk_priv_mem code_mem;
+
+#if PAN_ARCH < 9
+   struct panvk_priv_mem rsd;
+#else
+   /* Either a single spd, or a set of per-topology ones whose layout differs
+    * before/after v12. */
+   union {
+      struct panvk_priv_mem spd;
+      struct {
+#if PAN_ARCH < 12
+         struct panvk_priv_mem pos_points;
+         struct panvk_priv_mem pos_triangles;
+         struct panvk_priv_mem var;
+#else
+         struct panvk_priv_mem all_points;
+         struct panvk_priv_mem all_triangles;
+#endif
+      } spds;
+   };
+#endif
+
+   /* Textual dumps (presumably only populated for debugging/executable
+    * properties -- TODO confirm). */
+   const char *nir_str;
+   const char *asm_str;
+};
+
+/* Indices into panvk_shader::variants for vertex shaders. */
+enum panvk_vs_variant {
+   /* Hardware vertex shader, when next stage is fragment */
+   PANVK_VS_VARIANT_HW,
+
+   /* Number of vertex shader variants; must stay last. */
+   PANVK_VS_VARIANTS,
+};
+
+/* A shader object: the common vk_shader base followed by a flexible array of
+ * variants. The number of variants for a stage is given by
+ * panvk_shader_num_variants(). */
+struct panvk_shader {
+   struct vk_shader vk;
+
+   struct panvk_shader_variant variants[];
+};
+
+/* Number of variants a shader of the given stage carries: PANVK_VS_VARIANTS
+ * for vertex shaders, exactly one for every other stage. */
+static inline unsigned
+panvk_shader_num_variants(mesa_shader_stage stage)
+{
+   return stage == MESA_SHADER_VERTEX ? PANVK_VS_VARIANTS : 1;
+}
+
+/* Names of the vertex shader variants, indexed by enum panvk_vs_variant.
+ * A NULL entry means the variant has no dedicated name. */
+static const char *panvk_vs_shader_variant_name[] = {
+   [PANVK_VS_VARIANT_HW] = NULL,
+};
+
+/* Return the name of @variant, which must point into @shader->variants.
+ * Only vertex shaders have a name table; every other stage has a single
+ * variant and yields NULL.
+ *
+ * Declared inline, like the other helpers of this header, so that files
+ * including the header without calling it do not end up with an unused
+ * non-inline static function. */
+static inline const char *
+panvk_shader_variant_name(const struct panvk_shader *shader,
+                          struct panvk_shader_variant *variant)
+{
+   /* Index of the variant inside the flexible array. */
+   unsigned i = variant - shader->variants;
+   assert(i < panvk_shader_num_variants(shader->vk.stage));
+
+   if (shader->vk.stage == MESA_SHADER_VERTEX) {
+      assert(i < ARRAY_SIZE(panvk_vs_shader_variant_name));
+      return panvk_vs_shader_variant_name[i];
+   }
+
+   assert(panvk_shader_num_variants(shader->vk.stage) == 1);
+
+   return NULL;
+}
+
+/* Return the single variant of @shader, or NULL when @shader is NULL.
+ * Must only be used on stages that have exactly one variant (asserted).
+ *
+ * Declared inline, like the other helpers of this header, so that files
+ * including the header without calling it do not end up with an unused
+ * non-inline static function. */
+static inline const struct panvk_shader_variant *
+panvk_shader_only_variant(const struct panvk_shader *shader)
+{
+   if (!shader)
+      return NULL;
+
+   assert(panvk_shader_num_variants(shader->vk.stage) == 1);
+   return &shader->variants[0];
+}
+
+/* Return the first variant of @shader (index 0, which is
+ * PANVK_VS_VARIANT_HW for vertex shaders), or NULL when @shader is NULL.
+ *
+ * Declared inline, like the other helpers of this header, so that files
+ * including the header without calling it do not end up with an unused
+ * non-inline static function. */
+static inline const struct panvk_shader_variant *
+panvk_shader_hw_variant(const struct panvk_shader *shader)
+{
+   if (!shader)
+      return NULL;
+
+   return &shader->variants[0];
+}
+
+/* Device address of the variant's code memory, or 0 for a NULL variant. */
+static inline uint64_t
+panvk_shader_variant_get_dev_addr(const struct panvk_shader_variant *shader)
+{
+   if (shader == NULL)
+      return 0;
+
+   return panvk_priv_mem_dev_addr(shader->code_mem);
+}
+
+/* Iterate over every variant of __shader. __var is declared by the macro as
+ * a struct panvk_shader_variant pointer scoped to the loop; the bound is
+ * panvk_shader_num_variants() for the shader's stage. */
+#define panvk_shader_foreach_variant(__shader, __var)                          \
+   for (struct panvk_shader_variant *__var = (__shader)->variants;             \
+        __var < (__shader)->variants +                                         \
+                   panvk_shader_num_variants((__shader)->vk.stage);            \
+        ++__var)
+
+#if PAN_ARCH < 9
+/* Link state for a vertex/fragment shader pair (only built for
+ * PAN_ARCH < 9). */
+struct panvk_shader_link {
+   struct {
+      /* Per-stage private memory, released by panvk_shader_link_cleanup().
+       * NOTE(review): presumably holds the varying attribute descriptors --
+       * confirm against link_shaders(). */
+      struct panvk_priv_mem attribs;
+   } vs, fs;
+   /* One entry per varying buffer. */
+   unsigned buf_strides[PANVK_VARY_BUF_MAX];
+};
+
+/* Per-arch entry point filling @link for the @vs/@fs pair.
+ * NOTE(review): presumably allocates link->{vs,fs}.attribs from @desc_pool
+ * -- confirm in the implementation. */
+VkResult panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
+                                      const struct panvk_shader_variant *vs,
+                                      const struct panvk_shader_variant *fs,
+                                      struct panvk_shader_link *link);
+
+/* Release the per-stage attribute memory held by @link. */
+static inline void
+panvk_shader_link_cleanup(struct panvk_shader_link *link)
+{
+   panvk_pool_free_mem(&link->vs.attribs);
+   panvk_pool_free_mem(&link->fs.attribs);
+}
+#endif
+
+/* Per-arch NIR pass for input attachment loads. Returns a bool (presumably
+ * "progress was made", as is usual for NIR passes -- TODO confirm) and
+ * reports through @input_attachment_read_out. */
+bool panvk_per_arch(nir_lower_input_attachment_loads)(
+   nir_shader *nir,
+   const struct vk_graphics_pipeline_state *state,
+   uint32_t *input_attachment_read_out);
+
+/* Per-arch NIR pass for descriptor accesses; takes the set layouts and
+ * robustness state as input and has @desc_info as its only non-const
+ * output-like parameter besides @nir. */
+void panvk_per_arch(nir_lower_descriptors)(
+   nir_shader *nir, struct panvk_device *dev,
+   const struct vk_pipeline_robustness_state *rs, uint32_t set_layout_count,
+   struct vk_descriptor_set_layout *const *set_layouts,
+   const struct vk_graphics_pipeline_state *state,
+   struct panvk_shader_desc_info *desc_info);
+
+/* This is a stripped-down version of panvk_shader for internal shaders that
+ * are managed by vk_meta (blend and preload shaders). Those don't need the
+ * complexity inherent to user-provided shaders as they're not exposed. */
+struct panvk_internal_shader {
+   struct vk_shader vk;
+   struct pan_shader_info info;
+   struct panvk_priv_mem code_mem;
+
+   /* Same arch split as in panvk_shader_variant: rsd before v9, spd after. */
+#if PAN_ARCH < 9
+   struct panvk_priv_mem rsd;
+#else
+   struct panvk_priv_mem spd;
+#endif
+};
+
+/* Handle <-> object casts for panvk_internal_shader, exposed as VkShaderEXT
+ * through the embedded vk.base object. */
+VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_internal_shader, vk.base, VkShaderEXT,
+                               VK_OBJECT_TYPE_SHADER_EXT)
+
+/* NOTE(review): presumably serialize access to the shader compiler -- the
+ * locking scope is not visible from this header; confirm in the
+ * implementation. */
+void panvk_per_arch(compiler_lock)(void);
+void panvk_per_arch(compiler_unlock)(void);
+
+/* Create an internal shader from @nir; the new object is returned through
+ * @shader_out. */
+VkResult panvk_per_arch(create_internal_shader)(
+   struct panvk_device *dev, nir_shader *nir,
+   struct pan_compile_inputs *compiler_inputs,
+   struct panvk_internal_shader **shader_out);
+
+/* Create a shader from an already compiled binary (@bin_ptr/@bin_size) and
+ * its @info; the new object is returned through @shader_out. */
+VkResult panvk_per_arch(create_shader_from_binary)(
+   struct panvk_device *dev, const struct pan_shader_info *info,
+   struct pan_compute_dim local_size, const void *bin_ptr, size_t bin_size,
+   struct panvk_shader **shader_out);
+
+#endif
@@ -0,0 +1,956 @@
+/*
+ * Copyright © 2024 Collabora Ltd.
+ * Copyright © 2024 Arm Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "panvk_buffer.h"
+#include "panvk_cmd_buffer.h"
+#include "panvk_device_memory.h"
+#include "panvk_entrypoints.h"
+
+#include "pan_desc.h"
+#include "pan_compiler.h"   /* PAN_SHADER_OOB_ADDRESS */
+#include "pan_util.h"
+
+/* Translate the attachment loadOp into the clear/preload flags of a
+ * framebuffer slot. Flags are only ever raised here, never lowered. */
+static void
+att_set_clear_preload(const VkRenderingAttachmentInfo *att, bool *clear, bool *preload)
+{
+   const VkAttachmentLoadOp load_op = att->loadOp;
+
+   if (load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+      *clear = true;
+      return;
+   }
+
+   if (load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
+      *preload = true;
+      return;
+   }
+
+   if (load_op != VK_ATTACHMENT_LOAD_OP_NONE &&
+       load_op != VK_ATTACHMENT_LOAD_OP_DONT_CARE)
+      UNREACHABLE("Unsupported loadOp");
+
+   /* LOAD_OP_NONE / LOAD_OP_DONT_CARE: an awkward corner case. The spec says:
+    *
+    *     VK_ATTACHMENT_STORE_OP_NONE specifies the contents within the
+    *     render area are not accessed by the store operation as long as
+    *     no values are written to the attachment during the render pass.
+    *
+    * So when such a load op is paired with VK_ATTACHMENT_STORE_OP_NONE, the
+    * contents must survive partial renders. Forcing a preload is the simplest
+    * way to get there: partial stores of untouched attachments then write
+    * back what was already in memory.
+    *
+    * TODO: skip the preload when clean_pixel_write_enable = false, as an
+    * optimization.
+    */
+   if (att->storeOp == VK_ATTACHMENT_STORE_OP_NONE)
+      *preload = true;
+}
+
+/* Return the multisampled companion view of a single-sampled
+ * multisampled-render-to-single-sampled image view, for the requested sample
+ * count. The companion views live in iview->ms_views[], ordered by
+ * increasing sample count starting at 2. */
+static struct panvk_image_view *
+get_ms2ss_image_view(struct panvk_image_view *iview, uint32_t nr_samples)
+{
+   /* ms_views[] slot -> sample count held in that slot. */
+   static const uint32_t slot_samples[] = {
+      VK_SAMPLE_COUNT_2_BIT,
+      VK_SAMPLE_COUNT_4_BIT,
+      VK_SAMPLE_COUNT_8_BIT,
+      VK_SAMPLE_COUNT_16_BIT,
+   };
+   uint32_t slot = 0;
+
+   assert(nr_samples >= 2 && nr_samples <= 16);
+   assert(iview->pview.nr_samples == 1);
+   assert(iview->vk.image->create_flags &
+          VK_IMAGE_CREATE_MULTISAMPLED_RENDER_TO_SINGLE_SAMPLED_BIT_EXT);
+
+   while (slot < ARRAY_SIZE(slot_samples) && slot_samples[slot] != nr_samples)
+      slot++;
+
+   if (slot == ARRAY_SIZE(slot_samples))
+      UNREACHABLE("unhandled sample count");
+
+   assert(iview->ms_views[slot] != VK_NULL_HANDLE);
+
+   struct panvk_image_view *ms_view =
+      panvk_image_view_from_handle(iview->ms_views[slot]);
+
+   assert(ms_view->pview.nr_samples == nr_samples);
+
+   return ms_view;
+}
+
+/* Record color attachment @index of the rendering instance being started in
+ * the command buffer's graphics render state and in the matching
+ * pan_fb_info render target slot.
+ *
+ * The caller is expected to pass an attachment with a non-NULL image view
+ * (iview is dereferenced unconditionally). */
+static void
+render_state_set_color_attachment(struct panvk_cmd_buffer *cmdbuf,
+                                  const VkRenderingAttachmentInfo *att,
+                                  uint32_t index)
+{
+   struct panvk_physical_device *phys_dev =
+         to_panvk_physical_device(cmdbuf->vk.base.device->physical);
+   struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
+   struct pan_fb_info *fbinfo = &state->render.fb.info;
+   VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
+
+   /* Multisampled-render-to-single-sampled: the render sample count is
+    * already > 1 while the attachment view is single-sampled. Must be
+    * evaluated before fb.nr_samples is updated below. */
+   struct panvk_image_view *iview_ss = NULL;
+   const bool ms2ss = cmdbuf->state.gfx.render.fb.nr_samples > 1 &&
+                      iview->pview.nr_samples == 1;
+
+   /* Render to the multisampled companion view and remember the original
+    * single-sampled one. */
+   if (ms2ss) {
+      iview_ss = iview;
+      iview =
+         get_ms2ss_image_view(iview, cmdbuf->state.gfx.render.fb.nr_samples);
+   }
+
+   struct panvk_image *img =
+      container_of(iview->vk.image, struct panvk_image, vk);
+
+   state->render.bound_attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(index);
+   state->render.color_attachments.iviews[index] = iview;
+   state->render.color_attachments.preload_iviews[index] =
+      ms2ss ? iview_ss : NULL;
+   state->render.color_attachments.fmts[index] = iview->vk.format;
+   state->render.color_attachments.samples[index] = img->vk.samples;
+
+#if PAN_ARCH < 9
+   /* Append the BO backing every plane referenced by the view to the
+    * framebuffer BO list. */
+   for (uint8_t p = 0; p < ARRAY_SIZE(iview->pview.planes); p++) {
+      struct pan_image_plane_ref pref =
+         pan_image_view_get_plane(&iview->pview, p);
+
+      if (!pref.image)
+         continue;
+
+      assert(pref.plane_idx < ARRAY_SIZE(img->planes));
+      assert(img->planes[pref.plane_idx].mem->bo != NULL);
+      state->render.fb.bos[state->render.fb.bo_count++] =
+         img->planes[pref.plane_idx].mem->bo;
+   }
+#endif
+
+   fbinfo->rts[index].view = &iview->pview;
+   fbinfo->rts[index].crc_valid = &state->render.fb.crc_valid[index];
+   state->render.fb.nr_samples =
+      MAX2(state->render.fb.nr_samples,
+           pan_image_view_get_nr_samples(&iview->pview));
+
+   /* Pre-pack the clear color in the attachment format. */
+   if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+      enum pipe_format fmt = vk_format_to_pipe_format(iview->vk.format);
+      union pipe_color_union *col =
+         (union pipe_color_union *)&att->clearValue.color;
+      pan_pack_color(phys_dev->formats.blendable,
+                     fbinfo->rts[index].clear_value, col, fmt, false);
+   }
+
+   att_set_clear_preload(att, &fbinfo->rts[index].clear,
+                         &fbinfo->rts[index].preload);
+
+   if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
+      struct panvk_resolve_attachment *resolve_info =
+         &state->render.color_attachments.resolve[index];
+      VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView);
+
+      /* VUID-VkRenderingAttachmentInfo-imageView-06862 and
+       * VUID-VkRenderingAttachmentInfo-imageView-06863:
+       * If resolveMode != NONE, then
+       * resolveView == NULL iff. multisampledRenderToSingleSampledEnable */
+      assert(ms2ss == (resolve_iview == NULL));
+
+      /* With ms2ss the resolve destination is the original single-sampled
+       * view; otherwise it is the application-provided resolve view. */
+      resolve_info->mode = att->resolveMode;
+      if (!ms2ss) {
+         resolve_info->dst_iview = resolve_iview;
+      } else {
+         assert(iview_ss);
+         resolve_info->dst_iview = iview_ss;
+         assert(resolve_info->dst_iview->pview.nr_samples == 1);
+      }
+   }
+}
+
+/* Record the depth attachment of the rendering instance being started in the
+ * command buffer's graphics render state and in pan_fb_info.
+ *
+ * The caller is expected to pass an attachment with a non-NULL image view
+ * (iview is dereferenced unconditionally). */
+static void
+render_state_set_z_attachment(struct panvk_cmd_buffer *cmdbuf,
+                              const VkRenderingAttachmentInfo *att)
+{
+   struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
+   struct pan_fb_info *fbinfo = &state->render.fb.info;
+   VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
+
+   /* Multisampled-render-to-single-sampled: the render sample count is
+    * already > 1 while the attachment view is single-sampled. Must be
+    * evaluated before fb.nr_samples is updated below. */
+   struct panvk_image_view *iview_ss = NULL;
+   const bool ms2ss = cmdbuf->state.gfx.render.fb.nr_samples > 1 &&
+                      iview->pview.nr_samples == 1;
+
+   if (ms2ss) {
+      iview_ss = iview;
+      iview =
+         get_ms2ss_image_view(iview, cmdbuf->state.gfx.render.fb.nr_samples);
+   }
+
+   struct panvk_image *img =
+      container_of(iview->vk.image, struct panvk_image, vk);
+
+#if PAN_ARCH < 9
+   /* Depth plane always comes first. */
+   state->render.fb.bos[state->render.fb.bo_count++] = img->planes[0].mem->bo;
+#endif
+
+   state->render.z_attachment.fmt = iview->vk.format;
+   state->render.bound_attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
+
+   /* Work on a copy of the view stored in the render state, so the format
+    * and plane references can be patched below without touching the
+    * application's image view. */
+   state->render.zs_pview = iview->pview;
+   fbinfo->zs.view.zs = &state->render.zs_pview;
+
+   /* Fixup view format when the image is multiplanar. */
+   if (panvk_image_is_planar_depth_stencil(img))
+      state->render.zs_pview.format = panvk_image_depth_only_pfmt(img);
+
+   /* The depth view only references plane 0. */
+   state->render.zs_pview.planes[0] = (struct pan_image_plane_ref){
+      .image = &img->planes[0].image,
+      .plane_idx = 0,
+   };
+   state->render.zs_pview.planes[1] = (struct pan_image_plane_ref){0};
+   state->render.fb.nr_samples =
+      MAX2(state->render.fb.nr_samples,
+           pan_image_view_get_nr_samples(&iview->pview));
+   state->render.z_attachment.iview = iview;
+   state->render.z_attachment.preload_iview = ms2ss ? iview_ss : NULL;
+
+   /* D24S8 is a single plane format where the depth/stencil are interleaved.
+    * If we touch the depth component, we need to make sure the stencil
+    * component is preserved, hence the preload, and the view format adjustment.
+    */
+   if (panvk_image_is_interleaved_depth_stencil(img)) {
+      fbinfo->zs.preload.s = true;
+      cmdbuf->state.gfx.render.zs_pview.format =
+         img->planes[0].image.props.format;
+   } else {
+      state->render.zs_pview.format = panvk_image_depth_only_pfmt(img);
+   }
+
+   if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR)
+      fbinfo->zs.clear_value.depth = att->clearValue.depthStencil.depth;
+
+   att_set_clear_preload(att, &fbinfo->zs.clear.z, &fbinfo->zs.preload.z);
+
+   if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
+      struct panvk_resolve_attachment *resolve_info =
+         &state->render.z_attachment.resolve;
+      VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView);
+
+      /* With ms2ss the resolve destination is the original single-sampled
+       * view; otherwise it is the application-provided resolve view. */
+      resolve_info->mode = att->resolveMode;
+      if (!ms2ss) {
+         resolve_info->dst_iview = resolve_iview;
+      } else {
+         assert(iview_ss);
+         resolve_info->dst_iview = iview_ss;
+         assert(resolve_info->dst_iview->pview.nr_samples == 1);
+      }
+   }
+}
+
+/* Record the stencil attachment of the rendering instance being started in
+ * the command buffer's graphics render state and in pan_fb_info.
+ *
+ * This reads state written by render_state_set_z_attachment()
+ * (z_attachment.iview, zs_pview, fbinfo->zs.view.zs), so it has to run after
+ * the depth attachment has been processed.
+ *
+ * The caller is expected to pass an attachment with a non-NULL image view
+ * (iview is dereferenced unconditionally). */
+static void
+render_state_set_s_attachment(struct panvk_cmd_buffer *cmdbuf,
+                              const VkRenderingAttachmentInfo *att)
+{
+   struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
+   struct pan_fb_info *fbinfo = &state->render.fb.info;
+   VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
+
+   /* Multisampled-render-to-single-sampled: the render sample count is
+    * already > 1 while the attachment view is single-sampled. Must be
+    * evaluated before fb.nr_samples is updated below. */
+   struct panvk_image_view *iview_ss = NULL;
+   const bool ms2ss = cmdbuf->state.gfx.render.fb.nr_samples > 1 &&
+                      iview->pview.nr_samples == 1;
+
+   if (ms2ss) {
+      iview_ss = iview;
+      iview =
+         get_ms2ss_image_view(iview, cmdbuf->state.gfx.render.fb.nr_samples);
+   }
+
+   struct panvk_image *img =
+      container_of(iview->vk.image, struct panvk_image, vk);
+
+#if PAN_ARCH < 9
+   /* The stencil plane is always last. */
+   state->render.fb.bos[state->render.fb.bo_count++] =
+      img->planes[img->plane_count - 1].mem->bo;
+#endif
+
+   state->render.s_attachment.fmt = iview->vk.format;
+   state->render.bound_attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
+
+   /* Work on a copy of the view stored in the render state, so the format
+    * and plane references can be patched below. */
+   state->render.s_pview = iview->pview;
+   fbinfo->zs.view.s = &state->render.s_pview;
+
+   /* Planar depth/stencil images keep stencil in image plane 1, referenced
+    * from view plane slot 1; otherwise stencil lives in plane 0. */
+   if (panvk_image_is_planar_depth_stencil(img)) {
+      state->render.s_pview.format = panvk_image_stencil_only_pfmt(img);
+      state->render.s_pview.planes[0] = (struct pan_image_plane_ref){0};
+      state->render.s_pview.planes[1] = (struct pan_image_plane_ref){
+         .image = &img->planes[1].image,
+         .plane_idx = 0,
+      };
+   } else {
+      state->render.s_pview.format = panvk_image_stencil_only_pfmt(img);
+      state->render.s_pview.planes[0] = (struct pan_image_plane_ref){
+         .image = &img->planes[0].image,
+         .plane_idx = 0,
+      };
+      state->render.s_pview.planes[1] = (struct pan_image_plane_ref){0};
+   }
+
+   state->render.fb.nr_samples =
+      MAX2(state->render.fb.nr_samples,
+           pan_image_view_get_nr_samples(&iview->pview));
+   state->render.s_attachment.iview = iview;
+   state->render.s_attachment.preload_iview = ms2ss ? iview_ss : NULL;
+
+   /* If the depth and stencil attachments point to the same image,
+    * and the format is D24S8, we can combine them in a single view
+    * addressing both components.
+    */
+   if (state->render.s_pview.format == PIPE_FORMAT_X24S8_UINT &&
+       state->render.z_attachment.iview &&
+       state->render.z_attachment.iview->vk.image == iview->vk.image) {
+      state->render.zs_pview.format = PIPE_FORMAT_Z24_UNORM_S8_UINT;
+      fbinfo->zs.preload.s = false;
+      fbinfo->zs.view.s = NULL;
+
+   /* If there was no depth attachment, and the image format is D24S8,
+    * we use the depth+stencil slot, so we can benefit from AFBC, which
+    * is not supported on the stencil-only slot on Bifrost.
+    */
+   } else if (img->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
+              state->render.s_pview.format == PIPE_FORMAT_X24S8_UINT &&
+              fbinfo->zs.view.zs == NULL) {
+      fbinfo->zs.view.zs = &state->render.s_pview;
+      state->render.s_pview.format = PIPE_FORMAT_Z24_UNORM_S8_UINT;
+      fbinfo->zs.preload.z = true;
+      fbinfo->zs.view.s = NULL;
+   }
+
+   if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR)
+      fbinfo->zs.clear_value.stencil = att->clearValue.depthStencil.stencil;
+
+   /* Runs after the combined-view handling above, so a stencil preload
+    * required by the load/store ops is raised again even if preload.s was
+    * just reset. */
+   att_set_clear_preload(att, &fbinfo->zs.clear.s, &fbinfo->zs.preload.s);
+
+   if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
+      struct panvk_resolve_attachment *resolve_info =
+         &state->render.s_attachment.resolve;
+      VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView);
+
+      /* With ms2ss the resolve destination is the original single-sampled
+       * view; otherwise it is the application-provided resolve view. */
+      resolve_info->mode = att->resolveMode;
+      if (!ms2ss) {
+         resolve_info->dst_iview = resolve_iview;
+      } else {
+         assert(iview_ss);
+         resolve_info->dst_iview = iview_ss;
+         assert(resolve_info->dst_iview->pview.nr_samples == 1);
+      }
+   }
+}
+
+/* Reset the graphics render state of @cmdbuf and populate it (including
+ * pan_fb_info) from @pRenderingInfo: flags, layer/view setup, color, depth
+ * and stencil attachments, draw extent and framebuffer dimensions. */
+void
+panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf,
+                                      const VkRenderingInfo *pRenderingInfo)
+{
+   struct panvk_physical_device *phys_dev =
+         to_panvk_physical_device(cmdbuf->vk.base.device->physical);
+   struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
+   struct pan_fb_info *fbinfo = &state->render.fb.info;
+   /* Smallest extent across all bound attachments; only meaningful when at
+    * least one attachment ends up bound. */
+   uint32_t att_width = UINT32_MAX, att_height = UINT32_MAX;
+
+   state->render.flags = pRenderingInfo->flags;
+
+   BITSET_SET(state->dirty, PANVK_CMD_GRAPHICS_DIRTY_RENDER_STATE);
+
+#if PAN_ARCH < 9
+   state->render.fb.bo_count = 0;
+   memset(state->render.fb.bos, 0, sizeof(state->render.fb.bos));
+#endif
+
+   state->render.first_provoking_vertex = U_TRISTATE_UNSET;
+#if PAN_ARCH >= 10
+   state->render.maybe_set_tds_provoking_vertex = NULL;
+   state->render.maybe_set_fbds_provoking_vertex = NULL;
+#endif
+   memset(state->render.fb.crc_valid, 0, sizeof(state->render.fb.crc_valid));
+   memset(&state->render.color_attachments, 0,
+          sizeof(state->render.color_attachments));
+   memset(&state->render.z_attachment, 0, sizeof(state->render.z_attachment));
+   memset(&state->render.s_attachment, 0, sizeof(state->render.s_attachment));
+   state->render.bound_attachments = 0;
+
+   const VkMultisampledRenderToSingleSampledInfoEXT *ms2ss_info =
+      vk_find_struct_const(pRenderingInfo,
+                           MULTISAMPLED_RENDER_TO_SINGLE_SAMPLED_INFO_EXT);
+   const bool ms2ss = ms2ss_info
+                         ? ms2ss_info->multisampledRenderToSingleSampledEnable
+                         : VK_FALSE;
+
+   /* With multiview, the layer count is derived from the highest view bit. */
+   cmdbuf->state.gfx.render.layer_count = pRenderingInfo->viewMask ?
+      util_last_bit(pRenderingInfo->viewMask) :
+      pRenderingInfo->layerCount;
+   cmdbuf->state.gfx.render.view_mask = pRenderingInfo->viewMask;
+   /* nr_samples is left at 0 here; cmd_select_tile_size() uses that value
+    * to detect that the sample count has not been latched yet. */
+   *fbinfo = (struct pan_fb_info){
+      .tile_buf_budget = pan_query_optimal_tib_size(PAN_ARCH, phys_dev->model),
+      .z_tile_buf_budget = pan_query_optimal_z_tib_size(PAN_ARCH, phys_dev->model),
+      .nr_samples = 0,
+      .rt_count = pRenderingInfo->colorAttachmentCount,
+   };
+   /* In case ms2ss is enabled, use the provided sample count.
+    * All attachments need to have sample count == 1 or the provided value.
+    * But, if all attachments have 1, we would end up choosing the wrong value
+    * if we don't set it here already. */
+   cmdbuf->state.gfx.render.fb.nr_samples =
+      ms2ss ? ms2ss_info->rasterizationSamples : 1;
+
+   assert(pRenderingInfo->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts));
+
+   for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
+      const VkRenderingAttachmentInfo *att =
+         &pRenderingInfo->pColorAttachments[i];
+      VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
+
+      /* Unbound color slots are skipped and keep their zeroed state. */
+      if (!iview)
+         continue;
+
+      render_state_set_color_attachment(cmdbuf, att, i);
+      att_width = MIN2(iview->vk.extent.width, att_width);
+      att_height = MIN2(iview->vk.extent.height, att_height);
+   }
+
+   /* Depth is processed before stencil: the stencil setup looks at the
+    * depth attachment state. */
+   if (pRenderingInfo->pDepthAttachment &&
+       pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE) {
+      const VkRenderingAttachmentInfo *att = pRenderingInfo->pDepthAttachment;
+      VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
+
+      if (iview) {
+         assert(iview->vk.image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
+         render_state_set_z_attachment(cmdbuf, att);
+         att_width = MIN2(iview->vk.extent.width, att_width);
+         att_height = MIN2(iview->vk.extent.height, att_height);
+      }
+   }
+
+   if (pRenderingInfo->pStencilAttachment &&
+       pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE) {
+      const VkRenderingAttachmentInfo *att = pRenderingInfo->pStencilAttachment;
+      VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
+
+      if (iview) {
+         assert(iview->vk.image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
+         render_state_set_s_attachment(cmdbuf, att);
+         att_width = MIN2(iview->vk.extent.width, att_width);
+         att_height = MIN2(iview->vk.extent.height, att_height);
+      }
+   }
+
+   /* draw_extent uses inclusive max coordinates. */
+   fbinfo->draw_extent.minx = pRenderingInfo->renderArea.offset.x;
+   fbinfo->draw_extent.maxx = pRenderingInfo->renderArea.offset.x +
+                              pRenderingInfo->renderArea.extent.width - 1;
+   fbinfo->draw_extent.miny = pRenderingInfo->renderArea.offset.y;
+   fbinfo->draw_extent.maxy = pRenderingInfo->renderArea.offset.y +
+                              pRenderingInfo->renderArea.extent.height - 1;
+
+   fbinfo->frame_bounding_box = fbinfo->draw_extent;
+
+   /* Without any attachment, size the framebuffer from the render area. */
+   if (state->render.bound_attachments) {
+      fbinfo->width = att_width;
+      fbinfo->height = att_height;
+   } else {
+      fbinfo->width = fbinfo->draw_extent.maxx + 1;
+      fbinfo->height = fbinfo->draw_extent.maxy + 1;
+   }
+
+   assert(fbinfo->width && fbinfo->height);
+}
+
+/* Latch the render sample count into pan_fb_info and pick the tile size,
+ * unless that already happened for the current render state. */
+void
+panvk_per_arch(cmd_select_tile_size)(struct panvk_cmd_buffer *cmdbuf)
+{
+   struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
+
+   if (fbinfo->nr_samples != 0) {
+      /* Tiler/framebuffer descriptors were already emitted: the sample
+       * count must not have changed since (this should never happen). */
+      assert(fbinfo->nr_samples == cmdbuf->state.gfx.render.fb.nr_samples);
+      return;
+   }
+
+   /* No tiler/framebuffer descriptor emitted so far: record the current
+    * sample count and compute the tile size from it. */
+   fbinfo->nr_samples = cmdbuf->state.gfx.render.fb.nr_samples;
+   GENX(pan_select_tile_size)(fbinfo);
+
+#if PAN_ARCH != 6
+   if (fbinfo->cbuf_allocation > fbinfo->tile_buf_budget) {
+      vk_perf(VK_LOG_OBJS(&cmdbuf->vk.base),
+              "Using too much tile-memory, disabling pipelining");
+   }
+#endif
+}
+
+/* Force a preload of every bound attachment and turn pending attachment
+ * clears into an explicit CmdClearAttachments() call.
+ *
+ * @render_info may be NULL (see below); in that case pending clear flags are
+ * dropped without emitting any clear. */
+void
+panvk_per_arch(cmd_force_fb_preload)(struct panvk_cmd_buffer *cmdbuf,
+                                     const VkRenderingInfo *render_info)
+{
+   /* We force preloading for all active attachments when the render area is
+    * unaligned or when a barrier flushes prior draw calls in the middle of a
+    * render pass.  The two cases can be distinguished by whether a
+    * render_info is provided.
+    *
+    * When the render area is unaligned, we force preloading to preserve
+    * contents falling outside of the render area.  We also make sure the
+    * initial attachment clears are performed.
+    */
+   struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
+   struct pan_fb_info *fbinfo = &state->render.fb.info;
+   /* Room for every color attachment plus one depth and one stencil clear. */
+   VkClearAttachment clear_atts[MAX_RTS + 2];
+   uint32_t clear_att_count = 0;
+
+   if (!state->render.bound_attachments)
+      return;
+
+   /* Color attachments: preload, and convert a pending clear into an
+    * explicit clear entry when render_info is available. */
+   for (unsigned i = 0; i < fbinfo->rt_count; i++) {
+      if (!fbinfo->rts[i].view)
+         continue;
+
+      fbinfo->rts[i].preload = true;
+
+      if (fbinfo->rts[i].clear) {
+         if (render_info) {
+            const VkRenderingAttachmentInfo *att =
+               &render_info->pColorAttachments[i];
+
+            clear_atts[clear_att_count++] = (VkClearAttachment){
+               .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+               .colorAttachment = i,
+               .clearValue = att->clearValue,
+            };
+         }
+         fbinfo->rts[i].clear = false;
+      }
+   }
+
+   /* Depth. */
+   if (fbinfo->zs.view.zs) {
+      fbinfo->zs.preload.z = true;
+
+      if (fbinfo->zs.clear.z) {
+         if (render_info) {
+            const VkRenderingAttachmentInfo *att =
+               render_info->pDepthAttachment;
+
+            clear_atts[clear_att_count++] = (VkClearAttachment){
+               .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
+               .clearValue = att->clearValue,
+            };
+         }
+         fbinfo->zs.clear.z = false;
+      }
+   }
+
+   /* Stencil, either through a dedicated view or through a combined
+    * depth+stencil view. */
+   if (fbinfo->zs.view.s ||
+       (fbinfo->zs.view.zs &&
+        util_format_is_depth_and_stencil(fbinfo->zs.view.zs->format))) {
+      fbinfo->zs.preload.s = true;
+
+      if (fbinfo->zs.clear.s) {
+         if (render_info) {
+            const VkRenderingAttachmentInfo *att =
+               render_info->pStencilAttachment;
+
+            clear_atts[clear_att_count++] = (VkClearAttachment){
+               .aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT,
+               .clearValue = att->clearValue,
+            };
+         }
+
+         fbinfo->zs.clear.s = false;
+      }
+   }
+
+#if PAN_ARCH >= 10
+   /* insert a barrier for preload */
+   const VkMemoryBarrier2 mem_barrier = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+      .srcStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
+                      VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
+                      VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+      .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
+                       VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+      .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
+      .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
+   };
+   const VkDependencyInfo dep_info = {
+      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+      .memoryBarrierCount = 1,
+      .pMemoryBarriers = &mem_barrier,
+   };
+   panvk_per_arch(CmdPipelineBarrier2)(panvk_cmd_buffer_to_handle(cmdbuf),
+                                       &dep_info);
+#endif
+
+   /* Replay the collected clears over the render area. With multiview a
+    * single layer is passed, otherwise the render pass layer count. */
+   if (clear_att_count && render_info) {
+      VkClearRect clear_rect = {
+         .rect = render_info->renderArea,
+         .baseArrayLayer = 0,
+         .layerCount = render_info->viewMask ? 1 : render_info->layerCount,
+      };
+
+      panvk_per_arch(CmdClearAttachments)(panvk_cmd_buffer_to_handle(cmdbuf),
+                                          clear_att_count, clear_atts, 1,
+                                          &clear_rect);
+   }
+}
+
+/* Force a framebuffer preload when the render area does not line up with the
+ * meta tile grid. The right/bottom edges are also considered fine when they
+ * coincide with the framebuffer edge. */
+void
+panvk_per_arch(cmd_preload_render_area_border)(
+   struct panvk_cmd_buffer *cmdbuf, const VkRenderingInfo *render_info)
+{
+   struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
+   const unsigned tile = pan_meta_tile_size(PAN_ARCH);
+
+   const bool origin_aligned =
+      ((fbinfo->draw_extent.minx | fbinfo->draw_extent.miny) % tile) == 0;
+   const bool right_edge_ok =
+      fbinfo->draw_extent.maxx + 1 == fbinfo->width ||
+      (fbinfo->draw_extent.maxx % tile) == (tile - 1);
+   const bool bottom_edge_ok =
+      fbinfo->draw_extent.maxy + 1 == fbinfo->height ||
+      (fbinfo->draw_extent.maxy % tile) == (tile - 1);
+
+   /* Nothing to do when the render area is aligned on the meta tile size. */
+   if (origin_aligned && right_edge_ok && bottom_edge_ok)
+      return;
+
+   panvk_per_arch(cmd_force_fb_preload)(cmdbuf, render_info);
+}
+
+/* Build the input attachment map from the dynamic input attachment location
+ * state and the bound attachments, then push every entry through
+ * set_gfx_sysval().
+ *
+ * Map layout as used below: slot 0 is taken by a depth/stencil attachment
+ * that has no explicit index (MESA_VK_ATTACHMENT_NO_INDEX); an attachment
+ * mapped to input attachment index N lands in slot N + 1. */
+static void
+prepare_iam_sysvals(struct panvk_cmd_buffer *cmdbuf, BITSET_WORD *dirty_sysvals)
+{
+   const struct vk_input_attachment_location_state *ial =
+      &cmdbuf->vk.dynamic_graphics_state.ial;
+   struct panvk_input_attachment_info iam[INPUT_ATTACHMENT_MAP_SIZE];
+   /* An unknown color attachment count is treated as MAX_RTS. */
+   uint32_t catt_count =
+      ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN
+         ? MAX_RTS
+         : ial->color_attachment_count;
+
+   /* Fill every byte with 0xff; entries not overwritten below keep that
+    * pattern. */
+   memset(iam, ~0, sizeof(iam));
+
+   assert(catt_count <= MAX_RTS);
+
+   for (uint32_t i = 0; i < catt_count; i++) {
+      /* Skip color attachments that are unmapped or not bound. */
+      if (ial->color_map[i] == MESA_VK_ATTACHMENT_UNUSED ||
+          !(cmdbuf->state.gfx.render.bound_attachments &
+            MESA_VK_RP_ATTACHMENT_COLOR_BIT(i)))
+         continue;
+
+      VkFormat fmt = cmdbuf->state.gfx.render.color_attachments.fmts[i];
+      enum pipe_format pfmt = vk_format_to_pipe_format(fmt);
+      struct mali_internal_conversion_packed conv;
+      uint32_t ia_idx = ial->color_map[i] + 1;
+      assert(ia_idx < ARRAY_SIZE(iam));
+
+      iam[ia_idx].target = PANVK_COLOR_ATTACHMENT(i);
+
+      pan_pack(&conv, INTERNAL_CONVERSION, cfg) {
+         cfg.memory_format =
+            GENX(pan_dithered_format_from_pipe_format)(pfmt, false);
+#if PAN_ARCH < 9
+         /* Register format follows the numeric class of the attachment
+          * format: uint, sint, or float for everything else. */
+         cfg.register_format =
+            vk_format_is_uint(fmt)   ? MALI_REGISTER_FILE_FORMAT_U32
+            : vk_format_is_sint(fmt) ? MALI_REGISTER_FILE_FORMAT_I32
+                                     : MALI_REGISTER_FILE_FORMAT_F32;
+#endif
+      }
+
+      iam[ia_idx].conversion = conv.opaque[0];
+   }
+
+   if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
+      uint32_t ia_idx =
+         ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX ? 0 : ial->depth_att + 1;
+
+      assert(ia_idx < ARRAY_SIZE(iam));
+      iam[ia_idx].target = PANVK_ZS_ATTACHMENT;
+
+#if PAN_ARCH < 9
+      /* On v7, we need to pass the depth format around. If we use a conversion
+       * of zero, like we do on v9+, the GPU reports an INVALID_INSTR_ENC. */
+      VkFormat fmt = cmdbuf->state.gfx.render.z_attachment.fmt;
+      enum pipe_format pfmt = vk_format_to_pipe_format(fmt);
+      struct mali_internal_conversion_packed conv;
+
+      pan_pack(&conv, INTERNAL_CONVERSION, cfg) {
+         cfg.register_format = MALI_REGISTER_FILE_FORMAT_F32;
+         cfg.memory_format =
+            GENX(pan_dithered_format_from_pipe_format)(pfmt, false);
+      }
+      iam[ia_idx].conversion = conv.opaque[0];
+#endif
+   }
+
+   /* Stencil only sets the target; no conversion is written for it here. */
+   if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
+      uint32_t ia_idx =
+         ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX ? 0 : ial->stencil_att + 1;
+
+      assert(ia_idx < ARRAY_SIZE(iam));
+      iam[ia_idx].target = PANVK_ZS_ATTACHMENT;
+   }
+
+   /* Upload the whole map, including untouched entries. */
+   for (uint32_t i = 0; i < ARRAY_SIZE(iam); i++)
+      set_gfx_sysval(cmdbuf, dirty_sysvals, iam[i], iam[i]);
+}
+
+/* This value has been selected to get
+ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero passing.
+ */
+#define MIN_DEPTH_CLIP_RANGE 37.7E-06f
+
+/* Refresh the graphics sysvals for the upcoming draw and flag the VS/FS push
+ * uniforms dirty when a sysval that changed is also used by that shader.
+ *
+ * Values are written through set_gfx_sysval(), which collects the sysvals
+ * whose value changed in the local dirty_sysvals bitset. At the end, that
+ * bitset is intersected with each shader variant's fau.used_sysvals to
+ * decide whether VS_PUSH_UNIFORMS / FS_PUSH_UNIFORMS must be flagged dirty.
+ *
+ * cmdbuf: command buffer whose graphics state is read and updated.
+ * info:   parameters of the draw being prepared.
+ */
+void
+panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
+                                         const struct panvk_draw_info *info)
+{
+   struct vk_color_blend_state *cb = &cmdbuf->vk.dynamic_graphics_state.cb;
+   const struct panvk_shader_variant *fs =
+      panvk_shader_only_variant(get_fs(cmdbuf));
+   uint32_t noperspective_varyings = fs ? fs->info.varyings.noperspective : 0;
+   BITSET_DECLARE(dirty_sysvals, MAX_SYSVAL_FAUS) = {0};
+
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.noperspective_varyings,
+                  noperspective_varyings);
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.first_vertex, info->vertex.base);
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.base_instance, info->instance.base);
+
+#if PAN_ARCH < 9
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
+                  info->vertex.raw_offset);
+   set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
+
+   /* iter13: VK_EXT_transform_feedback sysvals — always set (per draw),
+    * reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count);
+   {
+      const struct panvk_cmd_graphics_state *gfx_state = &cmdbuf->state.gfx;
+
+      /* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS
+       * (= 1<<63). This is the Panfrost-Gallium memory-sink idiom — the
+       * Bifrost MMU silently discards stores to this address, so a pipeline
+       * with XFB outputs used in a non-XFB draw (or in an XFB draw with
+       * fewer bound buffers than the shader declares) is safe instead of
+       * faulting. See gallium/drivers/panfrost/pan_cmdstream.c PAN_SYSVAL_XFB.
+       *
+       * A slot only gets a real address when transform feedback is active,
+       * the slot is below the bound buffer count, and its base address is
+       * non-zero. */
+      for (uint32_t i = 0; i < ARRAY_SIZE(gfx_state->xfb.buffers); i++) {
+         uint64_t xfb_addr = PAN_SHADER_OOB_ADDRESS;
+
+         if (gfx_state->xfb.active && i < gfx_state->xfb.buffer_count &&
+             gfx_state->xfb.buffers[i].addr) {
+            xfb_addr = gfx_state->xfb.buffers[i].addr +
+                       gfx_state->xfb.buffers[i].offset;
+         }
+
+         set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[i], xfb_addr);
+      }
+   }
+#endif
+
+   if (dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS)) {
+      for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++) {
+         set_gfx_sysval(cmdbuf, dirty_sysvals, blend.constants[i],
+                        cb->blend_constants[i]);
+      }
+   }
+
+   for (unsigned i = 0; i < MAX_RTS; i++) {
+      set_gfx_sysval(cmdbuf, dirty_sysvals, fs.blend_descs[i],
+                     cmdbuf->state.gfx.fs.blend_descs[i]);
+   }
+
+   if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
+       dyn_gfx_state_dirty(cmdbuf, VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
+       dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
+       dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE)) {
+      const struct vk_rasterization_state *rs =
+         &cmdbuf->vk.dynamic_graphics_state.rs;
+      const struct vk_viewport_state *vp =
+         &cmdbuf->vk.dynamic_graphics_state.vp;
+      const VkViewport *viewport = &vp->viewports[0];
+
+      /* Doing the viewport transform in the vertex shader and then depth
+       * clipping with the viewport depth range gets a similar result to
+       * clipping in clip-space, but loses precision when the viewport depth
+       * range is very small. When minDepth == maxDepth, this completely
+       * flattens the clip-space depth and results in never clipping.
+       *
+       * To work around this, set a lower limit on depth range when clipping is
+       * enabled. This results in slightly incorrect fragment depth values, and
+       * doesn't help with the precision loss, but at least clipping isn't
+       * completely broken.
+       */
+      float z_min = viewport->minDepth;
+      float z_max = viewport->maxDepth;
+      if (vk_rasterization_state_depth_clip_enable(rs) &&
+          fabsf(z_max - z_min) < MIN_DEPTH_CLIP_RANGE) {
+         /* Preserve the direction of an inverted (max < min) depth range. */
+         float z_sign = z_min <= z_max ? 1.0f : -1.0f;
+
+         float z_center = 0.5f * (z_max + z_min);
+         /* Bump offset off-center if necessary, to not go out of range */
+         z_center = CLAMP(z_center, 0.5f * MIN_DEPTH_CLIP_RANGE,
+                          1.0f - 0.5f * MIN_DEPTH_CLIP_RANGE);
+
+         z_min = z_center - 0.5f * z_sign * MIN_DEPTH_CLIP_RANGE;
+         z_max = z_center + 0.5f * z_sign * MIN_DEPTH_CLIP_RANGE;
+      }
+
+      /* Upload the viewport scale. Defined as (px/2, py/2, pz) at the start of
+       * section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the
+       * end of the section, the spec defines:
+       *
+       * px = width
+       * py = height
+       * pz = maxDepth - minDepth         if negativeOneToOne is false
+       * pz = (maxDepth - minDepth) / 2   if negativeOneToOne is true
+       */
+      set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.x,
+                     0.5f * viewport->width);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.y,
+                     0.5f * viewport->height);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.z,
+                     vp->depth_clip_negative_one_to_one ?
+                        0.5f * (z_max - z_min) : z_max - z_min);
+
+      /* Upload the viewport offset. Defined as (ox, oy, oz) at the start of
+       * section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the
+       * end of the section, the spec defines:
+       *
+       * ox = x + width/2
+       * oy = y + height/2
+       * oz = minDepth                    if negativeOneToOne is false
+       * oz = (maxDepth + minDepth) / 2   if negativeOneToOne is true
+       */
+      set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.x,
+                     (0.5f * viewport->width) + viewport->x);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.y,
+                     (0.5f * viewport->height) + viewport->y);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.z,
+                     vp->depth_clip_negative_one_to_one ?
+                        0.5f * (z_min + z_max) : z_min);
+   }
+
+   if (dyn_gfx_state_dirty(cmdbuf, INPUT_ATTACHMENT_MAP))
+      prepare_iam_sysvals(cmdbuf, dirty_sysvals);
+
+   const struct panvk_shader_variant *vs =
+      panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader);
+
+#if PAN_ARCH < 9
+   struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
+   struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
+   struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
+
+   if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS)) {
+      set_gfx_sysval(cmdbuf, dirty_sysvals,
+                     desc.sets[PANVK_DESC_TABLE_VS_DYN_SSBOS],
+                     vs_desc_state->dyn_ssbos);
+   }
+
+   if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, FS)) {
+      set_gfx_sysval(cmdbuf, dirty_sysvals,
+                     desc.sets[PANVK_DESC_TABLE_FS_DYN_SSBOS],
+                     fs_desc_state->dyn_ssbos);
+   }
+
+   /* The mask of descriptor sets used by the bound shaders does not depend
+    * on the set index, so compute it once outside the loop. */
+   const uint32_t used_set_mask =
+      vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);
+
+   for (uint32_t i = 0; i < MAX_SETS; i++) {
+      if (used_set_mask & BITFIELD_BIT(i)) {
+         set_gfx_sysval(cmdbuf, dirty_sysvals, desc.sets[i],
+                        desc_state->sets[i]->descs.dev);
+      }
+   }
+#endif
+
+   /* We mask the dirty sysvals by the shader usage, and only flag
+    * the push uniforms dirty if those intersect. */
+   BITSET_DECLARE(dirty_shader_sysvals, MAX_SYSVAL_FAUS);
+   BITSET_AND(dirty_shader_sysvals, dirty_sysvals, vs->fau.used_sysvals);
+   if (!BITSET_IS_EMPTY(dirty_shader_sysvals))
+      gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
+
+   if (fs) {
+      BITSET_AND(dirty_shader_sysvals, dirty_sysvals, fs->fau.used_sysvals);
+
+      /* If blend constants are not read by the blend shader, we can consider
+       * they are not read at all, so clear the dirty bits to avoid re-emitting
+       * FAUs when we can. */
+      if (!cmdbuf->state.gfx.cb.info.shader_loads_blend_const)
+         BITSET_CLEAR_COUNT(dirty_shader_sysvals, 0, 4);
+
+      if (!BITSET_IS_EMPTY(dirty_shader_sysvals))
+         gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
+   }
+}
+
+/* vkCmdBindVertexBuffers2 entry point.
+ *
+ * Records the GPU address and size of bindingCount vertex buffers in slots
+ * [firstBinding, firstBinding + bindingCount), forwards pStrides to the
+ * common dynamic state when provided, grows the tracked binding count if
+ * needed, and flags the VB state dirty. A null buffer handle clears the
+ * slot (address and size set to 0).
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdBindVertexBuffers2)(VkCommandBuffer commandBuffer,
+                                      uint32_t firstBinding,
+                                      uint32_t bindingCount,
+                                      const VkBuffer *pBuffers,
+                                      const VkDeviceSize *pOffsets,
+                                      const VkDeviceSize *pSizes,
+                                      const VkDeviceSize *pStrides)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   const uint32_t end_binding = firstBinding + bindingCount;
+
+   assert(end_binding <= MAX_VBS);
+
+   if (pStrides != NULL)
+      vk_cmd_set_vertex_binding_strides(&cmdbuf->vk, firstBinding,
+                                        bindingCount, pStrides);
+
+   for (uint32_t idx = 0; idx < bindingCount; idx++) {
+      const uint32_t slot = firstBinding + idx;
+      VK_FROM_HANDLE(panvk_buffer, vbuf, pBuffers[idx]);
+
+      if (!vbuf) {
+         /* Null handle: leave the slot empty. */
+         cmdbuf->state.gfx.vb.bufs[slot].address = 0;
+         cmdbuf->state.gfx.vb.bufs[slot].size = 0;
+         continue;
+      }
+
+      /* Without an explicit size array, bind up to the end of the buffer. */
+      const VkDeviceSize range = pSizes ? pSizes[idx] : VK_WHOLE_SIZE;
+
+      cmdbuf->state.gfx.vb.bufs[slot].address =
+         panvk_buffer_gpu_ptr(vbuf, pOffsets[idx]);
+      cmdbuf->state.gfx.vb.bufs[slot].size =
+         panvk_buffer_range(vbuf, pOffsets[idx], range);
+   }
+
+   /* The binding count only ever grows; it tracks the highest slot bound. */
+   if (cmdbuf->state.gfx.vb.count < end_binding)
+      cmdbuf->state.gfx.vb.count = end_binding;
+
+   gfx_state_set_dirty(cmdbuf, VB);
+}
+
+/* vkCmdBindIndexBuffer2 entry point.
+ *
+ * Stores the index buffer address, size and per-index byte size in the
+ * command buffer's graphics state and flags the IB state dirty.
+ */
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdBindIndexBuffer2)(VkCommandBuffer commandBuffer,
+                                    VkBuffer buffer, VkDeviceSize offset,
+                                    VkDeviceSize size, VkIndexType indexType)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   VK_FROM_HANDLE(panvk_buffer, buf, buffer);
+
+   if (!buf) {
+      cmdbuf->state.gfx.ib.size = 0;
+      /* In case of NullDescriptors, we need to set a non-NULL address and rely
+       * on out-of-bounds behavior against the zero size of the buffer. Note
+       * that this only works for v10+, as v9 does not have a way to specify the
+       * index buffer size. */
+      cmdbuf->state.gfx.ib.dev_addr = PAN_ARCH >= 10 ? 0x1000 : 0;
+   } else {
+      cmdbuf->state.gfx.ib.size = panvk_buffer_range(buf, offset, size);
+      /* The stored size is expected to fit in 32 bits. */
+      assert(cmdbuf->state.gfx.ib.size <= UINT32_MAX);
+      cmdbuf->state.gfx.ib.dev_addr = panvk_buffer_gpu_ptr(buf, offset);
+   }
+
+   cmdbuf->state.gfx.ib.index_size = vk_index_type_to_bytes(indexType);
+   gfx_state_set_dirty(cmdbuf, IB);
+}
@@ -0,0 +1,442 @@
+#!/usr/bin/env python3
+"""
+iter13: apply VK_EXT_transform_feedback implementation to Mesa 26.0.6 PanVk.
+
+Run from inside /home/mfritsche/mesa-build/mesa-26.0.6/ on ohm.
+Idempotent — checks if changes are already present and skips if so.
+
+The implementation is single-variant (Vulkan spec allows undefined behavior
+for XFB-output shaders bound outside Begin/EndTransformFeedback, so we
+don't need defensive two-variant compilation for v1).
+
+Files modified:
+  1. src/panfrost/vulkan/panvk_shader.h
+  2. src/panfrost/vulkan/panvk_vX_physical_device.c
+  3. src/panfrost/vulkan/panvk_vX_shader.c
+  4. src/panfrost/vulkan/panvk_cmd_draw.h
+  5. src/panfrost/vulkan/panvk_vX_cmd_draw.c   (arch-templated, not under jm/)
+  6. src/panfrost/vulkan/meson.build
+Files created:
+  7. src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c
+"""
+
+import os
+import sys
+
+# Resolve the Mesa tree to patch. Precedence:
+#   1. $MESA_ROOT, when set and non-empty (an explicit override always wins);
+#   2. the current directory, when its name looks like a Mesa tree ("mesa-*");
+#   3. the directory containing this script.
+_mesa_root_env = os.environ.get("MESA_ROOT")
+if _mesa_root_env:
+    ROOT = _mesa_root_env
+elif os.path.basename(os.getcwd()).startswith("mesa-"):
+    ROOT = os.getcwd()
+else:
+    ROOT = os.path.abspath(os.path.dirname(__file__))
+
+print(f"[iter13] applying patches under {ROOT}")
+
+
+def replace_once(path, old, new, marker_in_new=None):
+    """Replace the single occurrence of `old` with `new` in ROOT/path.
+
+    path:          file to patch, relative to ROOT.
+    old:           exact text expected to occur exactly once in the file.
+    new:           replacement text.
+    marker_in_new: optional text whose presence in the file means the patch
+                   was already applied; the file is then left untouched.
+
+    Exits the process with status 2 when `old` is missing or ambiguous
+    (matches more than once), so a half-applied patch set is never produced
+    silently.
+    """
+    full = os.path.join(ROOT, path)
+    # Read and write as UTF-8 explicitly rather than relying on the locale
+    # default encoding.
+    with open(full, encoding="utf-8") as f:
+        content = f.read()
+    if marker_in_new and marker_in_new in content:
+        print(f"  [skip] {path} — already patched ({marker_in_new!r} present)")
+        return
+    count = content.count(old)
+    if count == 0:
+        print(f"  [FAIL] {path} — expected pattern not found:\n    {old[:100]!r}")
+        sys.exit(2)
+    if count > 1:
+        print(f"  [FAIL] {path} — pattern matches {count} times, need exactly 1")
+        sys.exit(2)
+    with open(full, "w", encoding="utf-8") as f:
+        f.write(content.replace(old, new, 1))
+    print(f"  [ok] {path}")
+
+
+def create_file(path, content, skip_if_exists=True):
+    """Write `content` to ROOT/path, creating parent directories as needed.
+
+    path:           file to create, relative to ROOT.
+    content:        full text of the file.
+    skip_if_exists: when true (the default), an existing file is left
+                    untouched, which keeps the script idempotent.
+    """
+    full = os.path.join(ROOT, path)
+    if skip_if_exists and os.path.exists(full):
+        print(f"  [skip] {path} — exists")
+        return
+    parent = os.path.dirname(full)
+    if parent:  # os.makedirs("") raises, so skip it for a bare file name
+        os.makedirs(parent, exist_ok=True)
+    # The generated sources contain non-ASCII characters ("©", "—"); write
+    # them as UTF-8 regardless of the locale default encoding.
+    with open(full, "w", encoding="utf-8") as f:
+        f.write(content)
+    print(f"  [ok] {path} (created)")
+
+
+# ============================================================
+# 1. panvk_shader.h — extend vs sysval struct (PAN_ARCH < 9)
+# ============================================================
+#
+# Layout note: raw_vertex_offset + num_vertices are two 32-bit fields, i.e.
+# exactly 8 bytes, so the 8-byte-aligned xfb_address[] array that follows is
+# naturally aligned without any explicit padding. (A third 32-bit pad field
+# here would have the opposite effect: the compiler would have to insert
+# another 4 bytes of implicit padding in front of the array.)
+
+print("\n[1/7] panvk_shader.h — add num_vertices + xfb_address[4] to vs sysvals")
+replace_once(
+    "src/panfrost/vulkan/panvk_shader.h",
+    """   struct {
+#if PAN_ARCH < 9
+      int32_t raw_vertex_offset;
+#endif
+      int32_t first_vertex;
+      int32_t base_instance;
+      uint32_t noperspective_varyings;
+   } vs;""",
+    """   struct {
+#if PAN_ARCH < 9
+      int32_t raw_vertex_offset;
+      uint32_t num_vertices;       /* iter13: XFB needs per-draw vertex count */
+      aligned_u64 xfb_address[4];  /* iter13: 4 transform feedback buffer base addresses */
+#endif
+      int32_t first_vertex;
+      int32_t base_instance;
+      uint32_t noperspective_varyings;
+   } vs;""",
+    marker_in_new="xfb_address[4]",
+)
+
+
+# ============================================================
+# 2. panvk_vX_physical_device.c — expose ext + features + properties
+# ============================================================
+#
+# Three independent edits to the same file: the extension bit, the feature
+# bits, and the property limits. Each has its own already-applied marker.
+
+print("\n[2/7] panvk_vX_physical_device.c — expose VK_EXT_transform_feedback")
+
+# A. Add extension to the ext list (find a stable nearby line)
+replace_once(
+    "src/panfrost/vulkan/panvk_vX_physical_device.c",
+    "      .EXT_robustness2 = true,",
+    """      .EXT_robustness2 = true,
+      .EXT_transform_feedback = PAN_ARCH < 9,   /* iter13: JM-class only for now */""",
+    marker_in_new="EXT_transform_feedback",
+)
+
+# B. Add features. The features block has /* VK_KHR_robustness2 */ nearby.
+# Only the base transformFeedback feature is enabled; geometryStreams stays
+# off.
+replace_once(
+    "src/panfrost/vulkan/panvk_vX_physical_device.c",
+    """      /* VK_KHR_robustness2 */
+      .robustBufferAccess2 = PAN_ARCH >= 11,
+      .robustImageAccess2 = false,
+      .nullDescriptor = true,""",
+    """      /* VK_KHR_robustness2 */
+      .robustBufferAccess2 = PAN_ARCH >= 11,
+      .robustImageAccess2 = false,
+      .nullDescriptor = true,
+
+      /* VK_EXT_transform_feedback (iter13) */
+      .transformFeedback = PAN_ARCH < 9,
+      .geometryStreams = false,""",
+    marker_in_new=".transformFeedback = PAN_ARCH < 9",
+)
+
+# C. Add properties. Anchor to the existing /* VK_KHR_robustness2 */ properties
+# block near line 1019. We'll add right after it.
+# maxTransformFeedbackBuffers = 4 matches the xfb_address[4] sysval array
+# (step 1) and the buffers[4] command-buffer state (step 4).
+# transformFeedbackDraw = false goes with the command handlers (step 6)
+# ignoring counter buffers.
+replace_once(
+    "src/panfrost/vulkan/panvk_vX_physical_device.c",
+    """      /* VK_KHR_robustness2 */
+      .robustStorageBufferAccessSizeAlignment = 1,
+      .robustUniformBufferAccessSizeAlignment = 1,""",
+    """      /* VK_KHR_robustness2 */
+      .robustStorageBufferAccessSizeAlignment = 1,
+      .robustUniformBufferAccessSizeAlignment = 1,
+
+      /* VK_EXT_transform_feedback (iter13) */
+      .maxTransformFeedbackStreams = 1,
+      .maxTransformFeedbackBuffers = 4,
+      .maxTransformFeedbackBufferSize = UINT32_MAX,
+      .maxTransformFeedbackStreamDataSize = 512,
+      .maxTransformFeedbackBufferDataSize = 512,
+      .maxTransformFeedbackBufferDataStride = 2048,
+      .transformFeedbackQueries = false,
+      .transformFeedbackStreamsLinesTriangles = false,
+      .transformFeedbackRasterizationStreamSelect = false,
+      .transformFeedbackDraw = false,""",
+    marker_in_new="maxTransformFeedbackStreams",
+)
+
+
+# ============================================================
+# 3. panvk_vX_shader.c — intrinsic lowering + NIR pass wiring
+# ============================================================
+
+print("\n[3/7] panvk_vX_shader.c — intrinsic lowering + pan_nir_lower_xfb wiring")
+
+# A. Add intrinsic cases inside the PAN_ARCH < 9 block.
+# Anchor to the existing `vs.raw_vertex_offset` case.
+# The buffer index is dispatched through a switch so that every load_sysval()
+# call names a constant array element; indices outside 0..3 are left
+# unlowered (return false).
+replace_once(
+    "src/panfrost/vulkan/panvk_vX_shader.c",
+    """#if PAN_ARCH < 9
+   case nir_intrinsic_load_raw_vertex_offset_pan:
+      val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
+      break;""",
+    """#if PAN_ARCH < 9
+   case nir_intrinsic_load_raw_vertex_offset_pan:
+      val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
+      break;
+   case nir_intrinsic_load_num_vertices:    /* iter13: XFB index calc */
+      val = load_sysval(b, graphics, bit_size, vs.num_vertices);
+      break;
+   case nir_intrinsic_load_xfb_address: {   /* iter13: XFB buffer N base address */
+      unsigned idx = nir_intrinsic_base(intr);
+      switch (idx) {
+      case 0: val = load_sysval(b, graphics, bit_size, vs.xfb_address[0]); break;
+      case 1: val = load_sysval(b, graphics, bit_size, vs.xfb_address[1]); break;
+      case 2: val = load_sysval(b, graphics, bit_size, vs.xfb_address[2]); break;
+      case 3: val = load_sysval(b, graphics, bit_size, vs.xfb_address[3]); break;
+      default: return false;
+      }
+      break;
+   }""",
+    marker_in_new="load_num_vertices",
+)
+
+# B. Wire pan_nir_lower_xfb into the lowering chain.
+# We want it right after nir_lower_system_values runs.
+# Look for the existing call.
+# NOTE(review): the injected C comment below states where stores land when
+# XFB is not active; that is decided by the default address step 5 programs
+# into vs.xfb_address[] — keep the two in sync.
+replace_once(
+    "src/panfrost/vulkan/panvk_vX_shader.c",
+    """   NIR_PASS(_, nir, nir_lower_system_values);
+
+   nir_lower_compute_system_values_options options = {""",
+    """   NIR_PASS(_, nir, nir_lower_system_values);
+
+#if PAN_ARCH < 9
+   /* iter13: VK_EXT_transform_feedback — if the shader has XFB output
+    * decorations, run the Mesa standard XFB-info NIR pass + Panfrost's
+    * own NIR lowering that turns store_output into nir_store_global
+    * to the per-buffer base address (the panvk lowering above wires
+    * nir_load_xfb_address to vs.xfb_address[N]). Single-variant: if
+    * an app binds an XFB pipeline outside vkCmdBeginTransformFeedback,
+    * the writes go to address 0 — undefined behavior per spec. */
+   if (nir->info.stage == MESA_SHADER_VERTEX &&
+       nir->xfb_info != NULL) {
+      NIR_PASS(_, nir, pan_nir_lower_xfb);
+   }
+#endif
+
+   nir_lower_compute_system_values_options options = {""",
+    marker_in_new="pan_nir_lower_xfb",
+)
+
+# C. Add #include for pan_nir.h at the top (where pan_nir_lower_xfb is declared)
+# The marker is the trailing comment rather than the header name, so a
+# pre-existing pan_nir.h include elsewhere in the file does not count as
+# "already patched".
+replace_once(
+    "src/panfrost/vulkan/panvk_vX_shader.c",
+    '#include "panvk_shader.h"',
+    '#include "panvk_shader.h"\n#include "pan_nir.h"   /* iter13: pan_nir_lower_xfb */',
+    marker_in_new='/* iter13: pan_nir_lower_xfb */',
+)
+
+
+# ============================================================
+# 4. panvk_cmd_draw.h — add XFB state struct + pipeline state member
+# ============================================================
+
+print("\n[4/7] panvk_cmd_draw.h — add panvk_xfb_state to cmd buffer state")
+
+# The XFB state is added as an anonymous struct member named `xfb`, placed
+# directly after the existing `sysvals` member of the struct that holds the
+# per-command-buffer graphics state. The `sysvals` declaration is the anchor
+# because it is a single, stable line; the replacement fails loudly if that
+# line is missing or not unique.
+#
+# State recorded:
+#   active        — set/cleared by the Begin/End transform feedback handlers
+#   buffer_count  — number of slots considered bound
+#   buffers[4]    — per-slot (addr, offset, size), four slots to match the
+#                   xfb_address[4] sysval array from step 1
+replace_once(
+    "src/panfrost/vulkan/panvk_cmd_draw.h",
+    "   struct panvk_graphics_sysvals sysvals;",
+    """   struct panvk_graphics_sysvals sysvals;
+
+#if PAN_ARCH < 9
+   /* iter13: VK_EXT_transform_feedback state (JM-class only for now). */
+   struct {
+      bool active;
+      uint32_t buffer_count;
+      struct {
+         uint64_t addr;
+         uint64_t offset;
+         uint64_t size;
+      } buffers[4];
+   } xfb;
+#endif""",
+    marker_in_new="iter13: VK_EXT_transform_feedback state",
+)
+
+
+# ============================================================
+# 5. panvk_vX_cmd_draw.c (arch-templated, NOT jm/) — populate XFB sysvals
+# ============================================================
+
+print("\n[5/7] panvk_vX_cmd_draw.c — populate vs.num_vertices + vs.xfb_address[] inside the PAN_ARCH<9 block")
+
+# Insert just inside the existing `#if PAN_ARCH < 9` block where
+# raw_vertex_offset is set. info->vertex.count is available in scope.
+#
+# Every XFB address defaults to PAN_SHADER_OOB_ADDRESS rather than 0, and a
+# slot only gets a real address when transform feedback is active, the slot
+# is below the bound buffer count, and its base address is non-zero. A shader
+# with XFB outputs can therefore be used in a non-XFB draw (or with fewer
+# buffers bound than it declares) without its stores targeting address 0.
+replace_once(
+    "src/panfrost/vulkan/panvk_vX_cmd_draw.c",
+    """#if PAN_ARCH < 9
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
+                  info->vertex.raw_offset);
+   set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
+#endif""",
+    """#if PAN_ARCH < 9
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
+                  info->vertex.raw_offset);
+   set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
+
+   /* iter13: VK_EXT_transform_feedback sysvals — always set (per draw),
+    * reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */
+   set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count);
+   {
+      const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx;
+      /* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS
+       * (= 1<<63). This is the Panfrost-Gallium memory-sink idiom — the
+       * Bifrost MMU silently discards stores to this address, so a pipeline
+       * with XFB outputs used in a non-XFB draw (or in an XFB draw with
+       * fewer bound buffers than the shader declares) is safe instead of
+       * faulting. See gallium/drivers/panfrost/pan_cmdstream.c PAN_SYSVAL_XFB. */
+      uint64_t _xa0 = PAN_SHADER_OOB_ADDRESS, _xa1 = PAN_SHADER_OOB_ADDRESS,
+               _xa2 = PAN_SHADER_OOB_ADDRESS, _xa3 = PAN_SHADER_OOB_ADDRESS;
+      if (_gfx->xfb.active) {
+         if (_gfx->xfb.buffer_count > 0 && _gfx->xfb.buffers[0].addr)
+            _xa0 = _gfx->xfb.buffers[0].addr + _gfx->xfb.buffers[0].offset;
+         if (_gfx->xfb.buffer_count > 1 && _gfx->xfb.buffers[1].addr)
+            _xa1 = _gfx->xfb.buffers[1].addr + _gfx->xfb.buffers[1].offset;
+         if (_gfx->xfb.buffer_count > 2 && _gfx->xfb.buffers[2].addr)
+            _xa2 = _gfx->xfb.buffers[2].addr + _gfx->xfb.buffers[2].offset;
+         if (_gfx->xfb.buffer_count > 3 && _gfx->xfb.buffers[3].addr)
+            _xa3 = _gfx->xfb.buffers[3].addr + _gfx->xfb.buffers[3].offset;
+      }
+      set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[0], _xa0);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[1], _xa1);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[2], _xa2);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[3], _xa3);
+   }
+#endif""",
+    marker_in_new="iter13: VK_EXT_transform_feedback sysvals",
+)
+
+
+# ============================================================
+# 6. NEW: jm/panvk_vX_cmd_xfb.c — Vulkan command handlers
+# ============================================================
+#
+# The generated file is written only if it does not exist yet (create_file
+# skips existing files), so edits to the template below do not reach a tree
+# that was already patched.
+
+print("\n[6/7] jm/panvk_vX_cmd_xfb.c — XFB Vulkan command handlers (NEW FILE)")
+
+xfb_c = r'''/*
+ * Copyright © 2026 mfritsche / claude-noether
+ * SPDX-License-Identifier: MIT
+ *
+ * iter13: VK_EXT_transform_feedback command handlers for the JM
+ * architecture path. All state touched here (cmdbuf->state.gfx.xfb and the
+ * vs.xfb_address[] sysvals) only exists when PAN_ARCH < 9, i.e. Bifrost
+ * v6/v7.
+ *
+ * The runtime contract:
+ *   - vkCmdBindTransformFeedbackBuffersEXT: stash (gpu_addr, offset, size)
+ *     for each slot into cmdbuf->state.gfx.xfb.buffers[].
+ *   - vkCmdBeginTransformFeedbackEXT: set cmdbuf->state.gfx.xfb.active = true.
+ *     No explicit dirty marking happens here: the next draw recomputes
+ *     vs.xfb_address[] and set_gfx_sysval flags whatever changed.
+ *   - vkCmdEndTransformFeedbackEXT: set active = false.
+ *
+ * Counter buffers (firstCounterBuffer/counterBufferCount/pCounterBuffers/
+ * pCounterBufferOffsets) are accepted by API but ignored — v1 doesn't
+ * support pause/resume. transformFeedbackDraw is advertised as false.
+ *
+ * Per-draw integration: panvk_vX_cmd_draw.c (arch-templated, not under
+ * jm/) reads cmdbuf->state.gfx.xfb and populates vs.xfb_address[i] for
+ * shader use. The pan_nir_lower_xfb pass in panvk_vX_shader.c emits
+ * nir_load_xfb_address(i) which lowers (via panvk_vX_shader.c sysval
+ * handler) to a load from the per-draw sysval push area.
+ */
+
+#include "vk_log.h"
+
+#include "panvk_cmd_buffer.h"
+#include "panvk_cmd_draw.h"
+#include "panvk_buffer.h"
+#include "panvk_entrypoints.h"
+
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdBindTransformFeedbackBuffersEXT)(
+   VkCommandBuffer commandBuffer,
+   uint32_t firstBinding,
+   uint32_t bindingCount,
+   const VkBuffer *pBuffers,
+   const VkDeviceSize *pOffsets,
+   const VkDeviceSize *pSizes)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
+   const uint32_t max_slots = ARRAY_SIZE(gfx->xfb.buffers);
+
+   for (uint32_t i = 0; i < bindingCount; i++) {
+      uint32_t slot = firstBinding + i;
+
+      /* Bindings beyond the slots we track are dropped. */
+      if (slot >= max_slots)
+         continue;
+
+      VK_FROM_HANDLE(panvk_buffer, buf, pBuffers[i]);
+      gfx->xfb.buffers[slot].addr = panvk_buffer_gpu_ptr(buf, 0);
+      gfx->xfb.buffers[slot].offset = pOffsets[i];
+      gfx->xfb.buffers[slot].size =
+         (pSizes != NULL && pSizes[i] != VK_WHOLE_SIZE)
+            ? pSizes[i]
+            : (buf->vk.size - pOffsets[i]);
+   }
+
+   /* Track the highest bound slot, never exceeding the slots we store. */
+   uint32_t end_binding = firstBinding + bindingCount;
+   if (end_binding > max_slots)
+      end_binding = max_slots;
+   if (end_binding > gfx->xfb.buffer_count)
+      gfx->xfb.buffer_count = end_binding;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdBeginTransformFeedbackEXT)(
+   VkCommandBuffer commandBuffer,
+   uint32_t firstCounterBuffer,
+   uint32_t counterBufferCount,
+   const VkBuffer *pCounterBuffers,
+   const VkDeviceSize *pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
+
+   /* Counter buffers ignored in v1 — see VkPhysicalDeviceTransformFeedback
+    * PropertiesEXT.transformFeedbackDraw = false in panvk_vX_physical_device.c.
+    */
+   (void)firstCounterBuffer;
+   (void)counterBufferCount;
+   (void)pCounterBuffers;
+   (void)pCounterBufferOffsets;
+
+   gfx->xfb.active = true;
+   /* Per-draw set_gfx_sysval picks up the change automatically — no
+    * explicit dirty marking required (set_gfx_sysval uses memcmp +
+    * BITSET to detect state diffs and re-emit sysvals). */
+}
+
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdEndTransformFeedbackEXT)(
+   VkCommandBuffer commandBuffer,
+   uint32_t firstCounterBuffer,
+   uint32_t counterBufferCount,
+   const VkBuffer *pCounterBuffers,
+   const VkDeviceSize *pCounterBufferOffsets)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
+
+   (void)firstCounterBuffer;
+   (void)counterBufferCount;
+   (void)pCounterBuffers;
+   (void)pCounterBufferOffsets;
+
+   gfx->xfb.active = false;
+}
+'''
+create_file("src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c", xfb_c)
+
+
+# ============================================================
+# 7. meson.build — register the new file in the jm_files array
+# ============================================================
+
+print("\n[7/7] meson.build — register jm/panvk_vX_cmd_xfb.c")
+# The already-applied marker is the file name being registered. A bare
+# "iter13" would also match any unrelated iter13 comment in meson.build and
+# wrongly skip the registration.
+replace_once(
+    "src/panfrost/vulkan/meson.build",
+    "jm_files = [\n  'jm/panvk_vX_bind_queue.c',",
+    "jm_files = [\n  'jm/panvk_vX_bind_queue.c',\n  'jm/panvk_vX_cmd_xfb.c',   # iter13",
+    marker_in_new="'jm/panvk_vX_cmd_xfb.c'",
+)
+
+
+print("\n[iter13] all patches applied — run incremental ninja build next")
@@ -0,0 +1,438 @@
+/*
+ * iter13 minimal Vulkan transform feedback probe.
+ *
+ * Goal: drive a single-stream, single-buffer VK_EXT_transform_feedback
+ * capture end-to-end on (patched) PanVk-Bifrost — 3 vertices, each emitting
+ * one vec4 with a known pattern, captured into a host-visible buffer, read
+ * back and verified byte-exactly.
+ *
+ * Uses VK_EXT_transform_feedback. If the extension isn't exposed by the
+ * driver, the probe exits with an error before doing any GPU work.
+ *
+ * Pipeline shape:
+ *   - vertex shader (probe_xfb.vert) writes a vec4 per vertex
+ *   - no fragment shader needed (rasterizerDiscardEnable=VK_TRUE)
+ *   - dynamic rendering with 0 color attachments
+ *   - vkCmdBindTransformFeedbackBuffersEXT + vkCmdBeginTransformFeedbackEXT
+ *     wrap a vkCmdDraw(3, 1, 0, 0)
+ *   - readback buffer is 3*16 = 48 bytes
+ *
+ * Pure Vulkan 1.0 core + VK_KHR_dynamic_rendering + VK_EXT_transform_feedback.
+ */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <vulkan/vulkan.h>
+
+#define VERTEX_COUNT 3
+#define XFB_BUFFER_BYTES (VERTEX_COUNT * 16)   /* 3 vec4s = 48 bytes */
+#define VSPV_PATH "probe_xfb.vert.spv"
+
+/* Print a "[step] <label>" progress line on stderr and flush it. The label is
+ * passed as a "%s" argument instead of being pasted into the format string, so
+ * a '%' inside a label cannot be interpreted as a conversion specifier. */
+#define STEP(name) do { fprintf(stderr, "[step] %s\n", (name)); fflush(stderr); } while (0)
+
+/* Evaluate a Vulkan call exactly once. Any result other than VK_SUCCESS is
+ * reported with the call text, numeric result and source location, and the
+ * process exits with status 2. */
+#define VK_CHECK(call) do {                                                    \
+    VkResult _r = (call);                                                      \
+    if (_r != VK_SUCCESS) {                                                    \
+        fprintf(stderr, "[fail] " #call " => %d at %s:%d\n",                   \
+                (int)_r, __FILE__, __LINE__);                                  \
+        exit(2);                                                               \
+    }                                                                          \
+} while (0)
+
+/*
+ * Load a whole SPIR-V binary into a freshly malloc'd buffer.
+ *
+ * path      - file to read (opened in binary mode).
+ * out_bytes - receives the file size in bytes.
+ *
+ * Returns the buffer; the caller frees it. Every failure (open, seek, size
+ * query, empty or non-word-sized file, allocation, short read) prints a
+ * "[fail]" line on stderr and exits with status 3, so the function never
+ * returns NULL and never hands back a partially filled buffer.
+ */
+static uint32_t *read_spv(const char *path, size_t *out_bytes)
+{
+    FILE *f = fopen(path, "rb");
+    if (!f) { fprintf(stderr, "[fail] open %s: %s\n", path, strerror(errno)); exit(3); }
+
+    if (fseek(f, 0, SEEK_END) != 0) {
+        fprintf(stderr, "[fail] seek %s: %s\n", path, strerror(errno));
+        exit(3);
+    }
+    long n = ftell(f);
+    if (n < 0) {
+        fprintf(stderr, "[fail] tell %s: %s\n", path, strerror(errno));
+        exit(3);
+    }
+    /* SPIR-V is a stream of 32-bit words, and VkShaderModuleCreateInfo
+     * requires codeSize to be a non-zero multiple of 4. */
+    if (n == 0 || (n % 4) != 0) {
+        fprintf(stderr, "[fail] %s: size %ld is not a non-zero multiple of 4\n", path, n);
+        exit(3);
+    }
+    if (fseek(f, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "[fail] rewind %s: %s\n", path, strerror(errno));
+        exit(3);
+    }
+
+    uint32_t *buf = malloc((size_t)n);
+    if (!buf) {
+        fprintf(stderr, "[fail] out of memory reading %s (%ld bytes)\n", path, n);
+        exit(3);
+    }
+    if (fread(buf, 1, (size_t)n, f) != (size_t)n) {
+        fprintf(stderr, "[fail] short read on %s\n", path);
+        exit(3);
+    }
+    fclose(f);
+
+    *out_bytes = (size_t)n;
+    return buf;
+}
+
+/*
+ * Return the index of the first memory type that is permitted by type_bits
+ * and whose property flags contain every bit in want. Prints "[fail] no
+ * memtype" and exits with status 4 when no type qualifies.
+ *
+ * Not called by main() in this probe.
+ */
+static uint32_t pick_memtype(const VkPhysicalDeviceMemoryProperties *mp,
+                             uint32_t type_bits, VkMemoryPropertyFlags want)
+{
+    uint32_t idx = 0;
+    while (idx < mp->memoryTypeCount) {
+        const VkMemoryPropertyFlags have = mp->memoryTypes[idx].propertyFlags;
+        const uint32_t allowed = type_bits & (1u << idx);
+        if (allowed != 0 && (have & want) == want) {
+            return idx;
+        }
+        idx++;
+    }
+    fprintf(stderr, "[fail] no memtype\n");
+    exit(4);
+}
+
+/*
+ * Choose a host-mappable memory type out of those permitted by type_bits.
+ *
+ * Candidates are tried in preference order:
+ *   1. DEVICE_LOCAL | HOST_VISIBLE | HOST_COHERENT
+ *   2. HOST_VISIBLE | HOST_COHERENT
+ *   3. HOST_VISIBLE (may be non-coherent, so the caller must flush and
+ *      invalidate mapped ranges itself)
+ *
+ * The middle tier keeps a coherent type from being passed over in favour of
+ * a non-coherent one that merely has a lower index.
+ *
+ * Returns the memory type index. Prints "[fail] no HOST_VISIBLE" and exits
+ * with status 4 when nothing is host-visible.
+ */
+static uint32_t pick_host_visible(const VkPhysicalDeviceMemoryProperties *mp,
+                                  uint32_t type_bits)
+{
+    const VkMemoryPropertyFlags tiers[] = {
+        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+        VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+        VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
+    };
+
+    for (size_t t = 0; t < sizeof(tiers) / sizeof(tiers[0]); t++) {
+        for (uint32_t i = 0; i < mp->memoryTypeCount; i++) {
+            if ((type_bits & (1u << i)) &&
+                (mp->memoryTypes[i].propertyFlags & tiers[t]) == tiers[t])
+                return i;
+        }
+    }
+    fprintf(stderr, "[fail] no HOST_VISIBLE\n"); exit(4);
+}
+
+/*
+ * Probe entry point: capture VERTEX_COUNT vec4s through transform feedback
+ * into a host-visible buffer and compare them with the pattern the vertex
+ * shader is expected to write.
+ *
+ * Exit status:
+ *    0  capture matched
+ *    1  capture did not match
+ *    2  a VK_CHECK'd Vulkan call failed
+ *    3  SPIR-V file could not be opened (read_spv)
+ *    4  no host-visible memory type (pick_host_visible)
+ *    5  vkGetPhysicalDeviceFeatures2KHR not available
+ *    6  no physical device (or out of memory listing them)
+ *    7  fence wait failed or timed out
+ *    8  no graphics-capable queue family (or out of memory listing them)
+ *    9  VK_EXT_transform_feedback not exposed
+ *   10  transformFeedback feature bit is false
+ *   11  a required device entry point is missing
+ */
+int main(void)
+{
+    STEP("vkCreateInstance");
+    const char *inst_exts[] = { "VK_KHR_get_physical_device_properties2" };
+    VkApplicationInfo app = {
+        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+        .pApplicationName = "panvk-bifrost iter13 XFB probe",
+        .apiVersion = VK_API_VERSION_1_0,
+    };
+    VkInstanceCreateInfo ici = {
+        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+        .pApplicationInfo = &app,
+        .enabledExtensionCount = 1,
+        .ppEnabledExtensionNames = inst_exts,
+    };
+    VkInstance inst;
+    VK_CHECK(vkCreateInstance(&ici, NULL, &inst));
+
+    uint32_t n_phys = 0;
+    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, NULL));
+    /* phys[0] is used unconditionally below, so an empty list must stop here. */
+    if (n_phys == 0) {
+        fprintf(stderr, "[fail] no Vulkan physical devices found\n");
+        return 6;
+    }
+    VkPhysicalDevice *phys = calloc(n_phys, sizeof(*phys));
+    if (!phys) {
+        fprintf(stderr, "[fail] out of memory listing physical devices\n");
+        return 6;
+    }
+    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, phys));
+    VkPhysicalDevice gpu = phys[0];
+
+    /* Check VK_EXT_transform_feedback is exposed before we proceed. */
+    uint32_t ext_count = 0;
+    vkEnumerateDeviceExtensionProperties(gpu, NULL, &ext_count, NULL);
+    VkExtensionProperties *exts = calloc(ext_count, sizeof(*exts));
+    vkEnumerateDeviceExtensionProperties(gpu, NULL, &ext_count, exts);
+    int has_xfb = 0;
+    /* If the allocation failed, exts is NULL and the loop must not touch it;
+     * the probe then reports the extension as missing. */
+    for (uint32_t i = 0; exts && i < ext_count; i++) {
+        if (!strcmp(exts[i].extensionName, "VK_EXT_transform_feedback"))
+            has_xfb = 1;
+    }
+    free(exts);
+    if (!has_xfb) {
+        fprintf(stderr, "[fail] VK_EXT_transform_feedback NOT exposed by driver "
+                "(this is the iter13 implementation gap — re-run on a Mesa "
+                "build with the iter13 patches applied)\n");
+        return 9;
+    }
+    fprintf(stderr, "[info] VK_EXT_transform_feedback present on device\n");
+
+    VkPhysicalDeviceMemoryProperties mp;
+    vkGetPhysicalDeviceMemoryProperties(gpu, &mp);
+
+    /* Query the transform feedback features struct via vkGetPhysicalDeviceFeatures2. */
+    PFN_vkGetPhysicalDeviceFeatures2KHR pGetFeats2 =
+        (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(
+            inst, "vkGetPhysicalDeviceFeatures2KHR");
+    if (!pGetFeats2) { fprintf(stderr, "[fail] no vkGetPhysicalDeviceFeatures2KHR\n"); return 5; }
+
+    VkPhysicalDeviceTransformFeedbackFeaturesEXT xfb_feats = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT,
+    };
+    VkPhysicalDeviceFeatures2 feats2 = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+        .pNext = &xfb_feats,
+    };
+    pGetFeats2(gpu, &feats2);
+    fprintf(stderr, "[info] transformFeedback=%u geometryStreams=%u\n",
+            xfb_feats.transformFeedback, xfb_feats.geometryStreams);
+    if (!xfb_feats.transformFeedback) {
+        fprintf(stderr, "[fail] transformFeedback feature is FALSE — driver exposes ext but not feature\n");
+        return 10;
+    }
+
+    /* ---- queue family ---- */
+    uint32_t n_qf = 0;
+    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, NULL);
+    VkQueueFamilyProperties *qfp = calloc(n_qf, sizeof(*qfp));
+    if (n_qf != 0 && !qfp) {
+        fprintf(stderr, "[fail] out of memory listing queue families\n");
+        return 8;
+    }
+    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, qfp);
+    uint32_t qfam = UINT32_MAX;
+    for (uint32_t i = 0; i < n_qf; i++) {
+        if (qfp[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { qfam = i; break; }
+    }
+    /* Without this check UINT32_MAX would be handed to vkCreateDevice as a
+     * queue family index. */
+    if (qfam == UINT32_MAX) {
+        fprintf(stderr, "[fail] no graphics-capable queue family\n");
+        return 8;
+    }
+
+    /* ---- device with XFB + dynamic_rendering enabled ---- */
+    STEP("vkCreateDevice (+VK_EXT_transform_feedback, +dynamic_rendering chain)");
+    const char *dev_exts[] = {
+        "VK_KHR_multiview", "VK_KHR_maintenance2",
+        "VK_KHR_create_renderpass2", "VK_KHR_depth_stencil_resolve",
+        "VK_KHR_dynamic_rendering",
+        "VK_EXT_transform_feedback",
+    };
+
+    VkPhysicalDeviceTransformFeedbackFeaturesEXT enable_xfb = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT,
+        .transformFeedback = VK_TRUE,
+        .geometryStreams = VK_FALSE,
+    };
+    VkPhysicalDeviceDynamicRenderingFeaturesKHR dyn_feat = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES_KHR,
+        .pNext = &enable_xfb,
+        .dynamicRendering = VK_TRUE,
+    };
+    float qprio = 1.0f;
+    VkDeviceQueueCreateInfo qci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+        .queueFamilyIndex = qfam, .queueCount = 1, .pQueuePriorities = &qprio,
+    };
+    VkDeviceCreateInfo dci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+        .pNext = &dyn_feat,
+        .queueCreateInfoCount = 1, .pQueueCreateInfos = &qci,
+        .enabledExtensionCount = sizeof(dev_exts)/sizeof(dev_exts[0]),
+        .ppEnabledExtensionNames = dev_exts,
+    };
+    VkDevice dev;
+    VK_CHECK(vkCreateDevice(gpu, &dci, NULL, &dev));
+
+    VkQueue queue;
+    vkGetDeviceQueue(dev, qfam, 0, &queue);
+
+    /* ---- XFB function pointers ---- */
+    PFN_vkCmdBindTransformFeedbackBuffersEXT pBindXfb =
+        (PFN_vkCmdBindTransformFeedbackBuffersEXT)vkGetDeviceProcAddr(
+            dev, "vkCmdBindTransformFeedbackBuffersEXT");
+    PFN_vkCmdBeginTransformFeedbackEXT pBeginXfb =
+        (PFN_vkCmdBeginTransformFeedbackEXT)vkGetDeviceProcAddr(
+            dev, "vkCmdBeginTransformFeedbackEXT");
+    PFN_vkCmdEndTransformFeedbackEXT pEndXfb =
+        (PFN_vkCmdEndTransformFeedbackEXT)vkGetDeviceProcAddr(
+            dev, "vkCmdEndTransformFeedbackEXT");
+    PFN_vkCmdBeginRenderingKHR pBeginRendering =
+        (PFN_vkCmdBeginRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdBeginRenderingKHR");
+    PFN_vkCmdEndRenderingKHR pEndRendering =
+        (PFN_vkCmdEndRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdEndRenderingKHR");
+    if (!pBindXfb || !pBeginXfb || !pEndXfb || !pBeginRendering || !pEndRendering) {
+        fprintf(stderr, "[fail] one or more XFB / dynamic_rendering entry points missing\n");
+        return 11;
+    }
+
+    /* ---- XFB capture buffer (host-visible) ---- */
+    STEP("vkCreateBuffer XFB capture (host-visible)");
+    VkBufferCreateInfo xfb_bci = {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .size = XFB_BUFFER_BYTES,
+        .usage = VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT |
+                 VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+    };
+    VkBuffer xfb_buf;
+    VK_CHECK(vkCreateBuffer(dev, &xfb_bci, NULL, &xfb_buf));
+
+    VkMemoryRequirements xfb_mr;
+    vkGetBufferMemoryRequirements(dev, xfb_buf, &xfb_mr);
+    VkMemoryAllocateInfo xfb_mai = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .allocationSize = xfb_mr.size,
+        .memoryTypeIndex = pick_host_visible(&mp, xfb_mr.memoryTypeBits),
+    };
+    VkDeviceMemory xfb_mem;
+    VK_CHECK(vkAllocateMemory(dev, &xfb_mai, NULL, &xfb_mem));
+    VK_CHECK(vkBindBufferMemory(dev, xfb_buf, xfb_mem, 0));
+
+    /* Pre-fill with sentinel so we can detect "GPU never wrote" vs "wrong write". */
+    void *mapped = NULL;
+    VK_CHECK(vkMapMemory(dev, xfb_mem, 0, VK_WHOLE_SIZE, 0, &mapped));
+    uint32_t *u32 = (uint32_t *)mapped;
+    for (uint32_t i = 0; i < XFB_BUFFER_BYTES / 4; i++) u32[i] = 0xDEADBEEFu;
+
+    /* pick_host_visible() can fall back to a memory type without
+     * HOST_COHERENT. Flush the sentinel fill explicitly so it reaches the
+     * device before the draw; on coherent memory this is a harmless no-op. */
+    VkMappedMemoryRange sentinel_range = {
+        .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+        .memory = xfb_mem, .offset = 0, .size = VK_WHOLE_SIZE,
+    };
+    VK_CHECK(vkFlushMappedMemoryRanges(dev, 1, &sentinel_range));
+
+    /* ---- pipeline (vertex stage only, raster-discard, no color attachment) ---- */
+    STEP("vkCreatePipelineLayout + vert shader");
+    VkPipelineLayoutCreateInfo plci = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+    };
+    VkPipelineLayout pl;
+    VK_CHECK(vkCreatePipelineLayout(dev, &plci, NULL, &pl));
+
+    size_t spv_bytes = 0;
+    uint32_t *spv = read_spv(VSPV_PATH, &spv_bytes);
+    VkShaderModuleCreateInfo smci = {
+        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .codeSize = spv_bytes, .pCode = spv,
+    };
+    VkShaderModule vsm;
+    VK_CHECK(vkCreateShaderModule(dev, &smci, NULL, &vsm));
+    free(spv);
+
+    VkPipelineShaderStageCreateInfo stages[1] = {
+        { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+          .stage = VK_SHADER_STAGE_VERTEX_BIT, .module = vsm, .pName = "main" },
+    };
+    VkPipelineVertexInputStateCreateInfo vi = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+    };
+    VkPipelineInputAssemblyStateCreateInfo ia = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+    };
+    VkViewport vp_dummy = { 0, 0, 1, 1, 0.0f, 1.0f };
+    VkRect2D sc_dummy = {{0,0}, {1,1}};
+    VkPipelineViewportStateCreateInfo vp = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+        .viewportCount = 1, .pViewports = &vp_dummy,
+        .scissorCount = 1, .pScissors = &sc_dummy,
+    };
+    VkPipelineRasterizationStateCreateInfo rs = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+        .rasterizerDiscardEnable = VK_TRUE,   /* THE point — no rasterization */
+        .polygonMode = VK_POLYGON_MODE_FILL,
+        .cullMode = VK_CULL_MODE_NONE,
+        .lineWidth = 1.0f,
+    };
+    VkPipelineMultisampleStateCreateInfo ms = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+        .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+    };
+    VkPipelineRenderingCreateInfoKHR pri = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO_KHR,
+        .colorAttachmentCount = 0,   /* No color attachment with raster discard. */
+    };
+    VkGraphicsPipelineCreateInfo gpci = {
+        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+        .pNext = &pri,
+        .stageCount = 1, .pStages = stages,
+        .pVertexInputState = &vi,
+        .pInputAssemblyState = &ia,
+        .pViewportState = &vp,
+        .pRasterizationState = &rs,
+        .pMultisampleState = &ms,
+        .layout = pl,
+    };
+    STEP("vkCreateGraphicsPipelines (raster-discard + XFB-output VS)");
+    VkPipeline pipe;
+    VK_CHECK(vkCreateGraphicsPipelines(dev, VK_NULL_HANDLE, 1, &gpci, NULL, &pipe));
+
+    /* ---- command buffer ---- */
+    VkCommandPoolCreateInfo cpoolci = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+        .queueFamilyIndex = qfam,
+    };
+    VkCommandPool cpool;
+    VK_CHECK(vkCreateCommandPool(dev, &cpoolci, NULL, &cpool));
+    VkCommandBufferAllocateInfo cbai = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .commandPool = cpool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1,
+    };
+    VkCommandBuffer cb;
+    VK_CHECK(vkAllocateCommandBuffers(dev, &cbai, &cb));
+
+    STEP("record (bind XFB buffer + begin XFB + draw + end XFB)");
+    VkCommandBufferBeginInfo cbbi = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+    };
+    VK_CHECK(vkBeginCommandBuffer(cb, &cbbi));
+
+    /* Bind XFB buffer to slot 0 */
+    VkDeviceSize xfb_offset = 0, xfb_size = XFB_BUFFER_BYTES;
+    pBindXfb(cb, 0, 1, &xfb_buf, &xfb_offset, &xfb_size);
+
+    /* Dynamic rendering with NO color attachments (raster-discard).
+     * Render-area is required by the spec to be > 0 even if discarded;
+     * use 1x1. */
+    VkRenderingInfoKHR ri = {
+        .sType = VK_STRUCTURE_TYPE_RENDERING_INFO_KHR,
+        .renderArea = {{0,0}, {1,1}},
+        .layerCount = 1,
+        .colorAttachmentCount = 0,
+    };
+    pBeginRendering(cb, &ri);
+
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, pipe);
+    pBeginXfb(cb, 0, 0, NULL, NULL);
+    vkCmdDraw(cb, VERTEX_COUNT, 1, 0, 0);
+    pEndXfb(cb, 0, 0, NULL, NULL);
+
+    pEndRendering(cb);
+
+    /* Sync XFB writes for host read. */
+    VkBufferMemoryBarrier bb = {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+        .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT,
+        .dstAccessMask = VK_ACCESS_HOST_READ_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = xfb_buf, .offset = 0, .size = VK_WHOLE_SIZE,
+    };
+    vkCmdPipelineBarrier(cb,
+        VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT,
+        VK_PIPELINE_STAGE_HOST_BIT,
+        0, 0, NULL, 1, &bb, 0, NULL);
+
+    VK_CHECK(vkEndCommandBuffer(cb));
+
+    /* ---- submit ---- */
+    VkFenceCreateInfo fci = { .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO };
+    VkFence fence;
+    VK_CHECK(vkCreateFence(dev, &fci, NULL, &fence));
+    VkSubmitInfo si = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .commandBufferCount = 1, .pCommandBuffers = &cb,
+    };
+    STEP("submit + wait (10s)");
+    VK_CHECK(vkQueueSubmit(queue, 1, &si, fence));
+    VkResult wr = vkWaitForFences(dev, 1, &fence, VK_TRUE, 10ULL * 1000 * 1000 * 1000);
+    if (wr != VK_SUCCESS) {
+        fprintf(stderr, "[fail] vkWaitForFences => %d\n", (int)wr); return 7;
+    }
+
+    /* ---- verify ---- */
+    STEP("readback + verify");
+    /* Counterpart of the flush above: make device writes visible to the host
+     * when the memory type is not HOST_COHERENT. */
+    VkMappedMemoryRange mmr = {
+        .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+        .memory = xfb_mem, .offset = 0, .size = VK_WHOLE_SIZE,
+    };
+    VK_CHECK(vkInvalidateMappedMemoryRanges(dev, 1, &mmr));
+
+    /* Expected: each vec4 = (vertex_id, 0, 4660.0, 51966.0) as float32 */
+    int mismatches = 0;
+    float *floats = (float *)mapped;
+    for (uint32_t v = 0; v < VERTEX_COUNT; v++) {
+        float got[4] = { floats[v*4 + 0], floats[v*4 + 1], floats[v*4 + 2], floats[v*4 + 3] };
+        float want[4] = { (float)v, 0.0f, (float)0x1234, (float)0xcafe };
+        for (int c = 0; c < 4; c++) {
+            if (got[c] != want[c]) {
+                fprintf(stderr, "[diff] vertex %u comp %d: got=%f want=%f\n",
+                        v, c, got[c], want[c]);
+                mismatches++;
+            }
+        }
+        fprintf(stderr, "[info] vertex %u: (%f, %f, %f, %f)\n",
+                v, got[0], got[1], got[2], got[3]);
+    }
+
+    /* ---- teardown ---- */
+    vkUnmapMemory(dev, xfb_mem);
+    vkDestroyFence(dev, fence, NULL);
+    vkDestroyCommandPool(dev, cpool, NULL);
+    vkDestroyPipeline(dev, pipe, NULL);
+    vkDestroyShaderModule(dev, vsm, NULL);
+    vkDestroyPipelineLayout(dev, pl, NULL);
+    vkDestroyBuffer(dev, xfb_buf, NULL);
+    vkFreeMemory(dev, xfb_mem, NULL);
+    vkDestroyDevice(dev, NULL);
+    vkDestroyInstance(inst, NULL);
+    free(phys); free(qfp);
+
+    if (mismatches == 0) {
+        fprintf(stderr, "[PASS] PanVk-Bifrost transform feedback: 3 vertices captured correctly.\n");
+        return 0;
+    } else {
+        fprintf(stderr, "[FAIL] %d mismatches across 3 vertices.\n", mismatches);
+        return 1;
+    }
+}
@@ -0,0 +1,24 @@
+#version 450
+
+// iter13 XFB probe vertex shader.
+// Writes a known pattern per vertex into transform feedback buffer 0.
+// Each vertex emits one vec4: (vertex_id, instance_id, 0x1234, 0xcafe).
+// With a 3-vertex single-instance draw + buffer offset 0,
+// expected capture (LE float32 array of vec4s):
+//   vertex 0: 0.0, 0.0, 4660.0, 51966.0
+//   vertex 1: 1.0, 0.0, 4660.0, 51966.0
+//   vertex 2: 2.0, 0.0, 4660.0, 51966.0
+
+// One vec4 per vertex, captured into transform feedback buffer 0 at byte
+// offset 0 with a 16-byte stride (tightly packed vec4s).
+layout(xfb_buffer = 0, xfb_offset = 0, xfb_stride = 16, location = 0) out vec4 captured;
+
+void main() {
+    // The pipeline discards rasterization, so the position value is never
+    // consumed; it is still written to keep the shader valid.
+    gl_Position = vec4(0.0, 0.0, 0.0, 1.0);
+
+    // x/y identify the invocation, z/w are fixed marker values.
+    float vertexId   = float(gl_VertexIndex);
+    float instanceId = float(gl_InstanceIndex);
+    float markerLo   = float(0x1234);
+    float markerHi   = float(0xcafe);
+
+    captured = vec4(vertexId, instanceId, markerLo, markerHi);
+}
@@ -0,0 +1,266 @@
+/*
+ * iter13 Janet-CRITICAL regression: XFB-capable pipeline used WITHOUT
+ * vkCmdBeginTransformFeedbackEXT must NOT fault the GPU.
+ *
+ * Same pipeline shape as probe_xfb.c, but the draw is not wrapped in
+ * Begin/End XFB and no XFB buffer is bound. The vertex shader still
+ * emits a store_global instruction (xfb_address[0] is read from sysval).
+ *
+ * With the memory-sink fix (xfb_address defaults to PAN_SHADER_OOB_ADDRESS
+ * = 0x8000_0000_0000_0000), the store is silently discarded by the MMU.
+ * Without that fix, the store goes to address 0 → page fault → GPU job
+ * failure.
+ *
+ * Pass criterion: vkQueueSubmit + vkWaitForFences returns VK_SUCCESS
+ * (no DEVICE_LOST). No buffer to read back — we only care that the GPU
+ * survives the draw.
+ */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <vulkan/vulkan.h>
+
+#define VSPV_PATH "probe_xfb.vert.spv"
+
+/* Print a "[step] <label>" progress line on stderr and flush it. The label is
+ * passed as a "%s" argument instead of being pasted into the format string, so
+ * a '%' inside a label cannot be interpreted as a conversion specifier. */
+#define STEP(name) do { fprintf(stderr, "[step] %s\n", (name)); fflush(stderr); } while (0)
+
+/* Evaluate a Vulkan call exactly once. Any result other than VK_SUCCESS is
+ * reported with the call text, numeric result and source location, and the
+ * process exits with status 2. */
+#define VK_CHECK(call) do {                                                    \
+    VkResult _r = (call);                                                      \
+    if (_r != VK_SUCCESS) {                                                    \
+        fprintf(stderr, "[fail] " #call " => %d at %s:%d\n",                   \
+                (int)_r, __FILE__, __LINE__);                                  \
+        exit(2);                                                               \
+    }                                                                          \
+} while (0)
+
+/*
+ * Load a whole SPIR-V binary into a freshly malloc'd buffer.
+ *
+ * path      - file to read (opened in binary mode).
+ * out_bytes - receives the file size in bytes.
+ *
+ * Returns the buffer; the caller frees it. Every failure (open, seek, size
+ * query, empty or non-word-sized file, allocation, short read) prints a
+ * "[fail]" line on stderr and exits with status 3, so the function never
+ * returns NULL and never hands back a partially filled buffer.
+ */
+static uint32_t *read_spv(const char *path, size_t *out_bytes)
+{
+    FILE *f = fopen(path, "rb");
+    if (!f) { fprintf(stderr, "[fail] open %s: %s\n", path, strerror(errno)); exit(3); }
+
+    if (fseek(f, 0, SEEK_END) != 0) {
+        fprintf(stderr, "[fail] seek %s: %s\n", path, strerror(errno));
+        exit(3);
+    }
+    long n = ftell(f);
+    if (n < 0) {
+        fprintf(stderr, "[fail] tell %s: %s\n", path, strerror(errno));
+        exit(3);
+    }
+    /* SPIR-V is a stream of 32-bit words, and VkShaderModuleCreateInfo
+     * requires codeSize to be a non-zero multiple of 4. */
+    if (n == 0 || (n % 4) != 0) {
+        fprintf(stderr, "[fail] %s: size %ld is not a non-zero multiple of 4\n", path, n);
+        exit(3);
+    }
+    if (fseek(f, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "[fail] rewind %s: %s\n", path, strerror(errno));
+        exit(3);
+    }
+
+    uint32_t *buf = malloc((size_t)n);
+    if (!buf) {
+        fprintf(stderr, "[fail] out of memory reading %s (%ld bytes)\n", path, n);
+        exit(3);
+    }
+    if (fread(buf, 1, (size_t)n, f) != (size_t)n) {
+        fprintf(stderr, "[fail] short read on %s\n", path);
+        exit(3);
+    }
+    fclose(f);
+
+    *out_bytes = (size_t)n;
+    return buf;
+}
+
+/*
+ * Regression probe entry point: build the same XFB-capable, raster-discard
+ * pipeline as the capture probe, then draw 3 vertices with no transform
+ * feedback buffer bound and no Begin/End XFB around the draw. The only thing
+ * checked is that the submission completes.
+ *
+ * Exit status:
+ *    0  fence signalled with VK_SUCCESS
+ *    1  fence wait returned VK_ERROR_DEVICE_LOST
+ *    2  a VK_CHECK'd call failed, or the fence wait failed some other way
+ *    3  SPIR-V file could not be opened (read_spv)
+ *    6  no physical device (or out of memory listing them)
+ *    8  no graphics-capable queue family (or out of memory listing them)
+ *   11  a dynamic_rendering entry point is missing
+ */
+int main(void)
+{
+    STEP("vkCreateInstance");
+    VkApplicationInfo app = {
+        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+        .pApplicationName = "panvk-bifrost iter13 XFB no-draw probe",
+        .apiVersion = VK_API_VERSION_1_0,
+    };
+    const char *inst_exts[] = { "VK_KHR_get_physical_device_properties2" };
+    VkInstanceCreateInfo ici = {
+        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+        .pApplicationInfo = &app,
+        .enabledExtensionCount = 1,
+        .ppEnabledExtensionNames = inst_exts,
+    };
+    VkInstance inst;
+    VK_CHECK(vkCreateInstance(&ici, NULL, &inst));
+
+    uint32_t n_phys = 0;
+    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, NULL));
+    /* phys[0] is used unconditionally below, so an empty list must stop here. */
+    if (n_phys == 0) {
+        fprintf(stderr, "[FAIL] no Vulkan physical devices found\n");
+        return 6;
+    }
+    VkPhysicalDevice *phys = calloc(n_phys, sizeof(*phys));
+    if (!phys) {
+        fprintf(stderr, "[FAIL] out of memory listing physical devices\n");
+        return 6;
+    }
+    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, phys));
+    VkPhysicalDevice gpu = phys[0];
+
+    uint32_t n_qf = 0;
+    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, NULL);
+    VkQueueFamilyProperties *qfp = calloc(n_qf, sizeof(*qfp));
+    if (n_qf != 0 && !qfp) {
+        fprintf(stderr, "[FAIL] out of memory listing queue families\n");
+        return 8;
+    }
+    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, qfp);
+    uint32_t qfam = UINT32_MAX;
+    for (uint32_t i = 0; i < n_qf; i++) {
+        if (qfp[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { qfam = i; break; }
+    }
+    /* Without this check UINT32_MAX would be handed to vkCreateDevice as a
+     * queue family index. */
+    if (qfam == UINT32_MAX) {
+        fprintf(stderr, "[FAIL] no graphics-capable queue family\n");
+        return 8;
+    }
+
+    STEP("vkCreateDevice (+XFB feature enabled + dynamic_rendering)");
+    const char *dev_exts[] = {
+        "VK_KHR_multiview", "VK_KHR_maintenance2",
+        "VK_KHR_create_renderpass2", "VK_KHR_depth_stencil_resolve",
+        "VK_KHR_dynamic_rendering",
+        "VK_EXT_transform_feedback",
+    };
+    VkPhysicalDeviceTransformFeedbackFeaturesEXT enable_xfb = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT,
+        .transformFeedback = VK_TRUE,
+        .geometryStreams = VK_FALSE,
+    };
+    VkPhysicalDeviceDynamicRenderingFeaturesKHR dyn_feat = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES_KHR,
+        .pNext = &enable_xfb,
+        .dynamicRendering = VK_TRUE,
+    };
+    float qprio = 1.0f;
+    VkDeviceQueueCreateInfo qci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+        .queueFamilyIndex = qfam, .queueCount = 1, .pQueuePriorities = &qprio,
+    };
+    VkDeviceCreateInfo dci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+        .pNext = &dyn_feat,
+        .queueCreateInfoCount = 1, .pQueueCreateInfos = &qci,
+        .enabledExtensionCount = sizeof(dev_exts)/sizeof(dev_exts[0]),
+        .ppEnabledExtensionNames = dev_exts,
+    };
+    VkDevice dev;
+    VK_CHECK(vkCreateDevice(gpu, &dci, NULL, &dev));
+
+    VkQueue queue;
+    vkGetDeviceQueue(dev, qfam, 0, &queue);
+
+    PFN_vkCmdBeginRenderingKHR pBeginRendering =
+        (PFN_vkCmdBeginRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdBeginRenderingKHR");
+    PFN_vkCmdEndRenderingKHR pEndRendering =
+        (PFN_vkCmdEndRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdEndRenderingKHR");
+    /* Both pointers are called while recording; a NULL here would crash the
+     * probe instead of reporting a driver gap. */
+    if (!pBeginRendering || !pEndRendering) {
+        fprintf(stderr, "[FAIL] dynamic_rendering entry points missing\n");
+        return 11;
+    }
+
+    /* Same XFB-bearing vertex shader as probe_xfb — its SPIR-V has the
+     * xfb_buffer / xfb_offset decorations on `captured`. PanVk's driver
+     * will run pan_nir_lower_xfb on it, producing nir_store_global to
+     * vs.xfb_address[0]. We rely on the driver setting that sysval to
+     * PAN_SHADER_OOB_ADDRESS when xfb is inactive. */
+    STEP("vkCreateGraphicsPipelines (XFB-capable VS, no XFB buffer bound)");
+    VkPipelineLayoutCreateInfo plci = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+    };
+    VkPipelineLayout pl;
+    VK_CHECK(vkCreatePipelineLayout(dev, &plci, NULL, &pl));
+
+    size_t spv_bytes = 0;
+    uint32_t *spv = read_spv(VSPV_PATH, &spv_bytes);
+    VkShaderModuleCreateInfo smci = {
+        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .codeSize = spv_bytes, .pCode = spv,
+    };
+    VkShaderModule vsm;
+    VK_CHECK(vkCreateShaderModule(dev, &smci, NULL, &vsm));
+    free(spv);
+
+    VkPipelineShaderStageCreateInfo stages[1] = {
+        { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+          .stage = VK_SHADER_STAGE_VERTEX_BIT, .module = vsm, .pName = "main" },
+    };
+    VkPipelineVertexInputStateCreateInfo vi = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+    };
+    VkPipelineInputAssemblyStateCreateInfo ia = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+    };
+    VkViewport vp_dummy = { 0, 0, 1, 1, 0.0f, 1.0f };
+    VkRect2D sc_dummy = {{0,0}, {1,1}};
+    VkPipelineViewportStateCreateInfo vp = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+        .viewportCount = 1, .pViewports = &vp_dummy,
+        .scissorCount = 1, .pScissors = &sc_dummy,
+    };
+    VkPipelineRasterizationStateCreateInfo rs = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+        .rasterizerDiscardEnable = VK_TRUE,
+        .polygonMode = VK_POLYGON_MODE_FILL,
+        .cullMode = VK_CULL_MODE_NONE,
+        .lineWidth = 1.0f,
+    };
+    VkPipelineMultisampleStateCreateInfo ms = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+        .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+    };
+    VkPipelineRenderingCreateInfoKHR pri = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO_KHR,
+        .colorAttachmentCount = 0,
+    };
+    VkGraphicsPipelineCreateInfo gpci = {
+        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+        .pNext = &pri,
+        .stageCount = 1, .pStages = stages,
+        .pVertexInputState = &vi,
+        .pInputAssemblyState = &ia,
+        .pViewportState = &vp,
+        .pRasterizationState = &rs,
+        .pMultisampleState = &ms,
+        .layout = pl,
+    };
+    VkPipeline pipe;
+    VK_CHECK(vkCreateGraphicsPipelines(dev, VK_NULL_HANDLE, 1, &gpci, NULL, &pipe));
+
+    VkCommandPoolCreateInfo cpoolci = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+        .queueFamilyIndex = qfam,
+    };
+    VkCommandPool cpool;
+    VK_CHECK(vkCreateCommandPool(dev, &cpoolci, NULL, &cpool));
+    VkCommandBufferAllocateInfo cbai = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .commandPool = cpool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1,
+    };
+    VkCommandBuffer cb;
+    VK_CHECK(vkAllocateCommandBuffers(dev, &cbai, &cb));
+
+    STEP("record (draw WITHOUT XFB Begin/End; no buffer bound)");
+    VkCommandBufferBeginInfo cbbi = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+    };
+    VK_CHECK(vkBeginCommandBuffer(cb, &cbbi));
+
+    VkRenderingInfoKHR ri = {
+        .sType = VK_STRUCTURE_TYPE_RENDERING_INFO_KHR,
+        .renderArea = {{0,0}, {1,1}},
+        .layerCount = 1,
+        .colorAttachmentCount = 0,
+    };
+    pBeginRendering(cb, &ri);
+
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, pipe);
+    /* No vkCmdBindTransformFeedbackBuffersEXT.
+     * No vkCmdBeginTransformFeedbackEXT.
+     * Just draw — the XFB store in the shader must be silently discarded. */
+    vkCmdDraw(cb, 3, 1, 0, 0);
+
+    pEndRendering(cb);
+
+    VK_CHECK(vkEndCommandBuffer(cb));
+
+    VkFenceCreateInfo fci = { .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO };
+    VkFence fence;
+    VK_CHECK(vkCreateFence(dev, &fci, NULL, &fence));
+    VkSubmitInfo si = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .commandBufferCount = 1, .pCommandBuffers = &cb,
+    };
+    STEP("submit + wait (10s) — expect VK_SUCCESS, not DEVICE_LOST");
+    VK_CHECK(vkQueueSubmit(queue, 1, &si, fence));
+    VkResult wr = vkWaitForFences(dev, 1, &fence, VK_TRUE, 10ULL * 1000 * 1000 * 1000);
+    if (wr == VK_ERROR_DEVICE_LOST) {
+        fprintf(stderr, "[FAIL] DEVICE_LOST — the XFB store-global probably faulted "
+                "(memory-sink sentinel not applied).\n");
+        return 1;
+    }
+    if (wr != VK_SUCCESS) {
+        fprintf(stderr, "[FAIL] vkWaitForFences => %d\n", (int)wr);
+        return 2;
+    }
+
+    vkDestroyFence(dev, fence, NULL);
+    vkDestroyCommandPool(dev, cpool, NULL);
+    vkDestroyPipeline(dev, pipe, NULL);
+    vkDestroyShaderModule(dev, vsm, NULL);
+    vkDestroyPipelineLayout(dev, pl, NULL);
+    vkDestroyDevice(dev, NULL);
+    vkDestroyInstance(inst, NULL);
+    free(phys); free(qfp);
+
+    fprintf(stderr, "[PASS] XFB-capable pipeline survives non-XFB draw — memory-sink active.\n");
+    return 0;
+}