Merge pull request 'mesa-panvk-bifrost: r3 -> r4 — iter17 XFB primitive decomposition' (#70) from claude-noether/marfrit-packages:noether/mesa-panvk-bifrost-r4-iter17-xfb-decomp into main

Reviewed-on: #70
2026-05-21 12:18:23 +00:00
parent 1c8c186681 83e8eca56d
commit a9f1b833b9
2 changed files with 646 additions and 2 deletions
@@ -0,0 +1,629 @@
+diff -urN a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build
+--- a/src/panfrost/vulkan/meson.build	2026-05-21 14:04:02.529474145 +0200
+++ b/src/panfrost/vulkan/meson.build	2026-05-21 14:04:04.106755486 +0200
+@@ -123,6 +123,7 @@
+   'panvk_vX_nir_lower_input_attachment_loads.c',
+   'panvk_vX_sampler.c',
+   'panvk_vX_shader.c',
+  'panvk_vX_xfb_lower.c',
+   sha1_h,
+ ]
+ 
+diff -urN a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h
+--- a/src/panfrost/vulkan/panvk_shader.h	2026-05-21 14:04:02.525251986 +0200
+++ b/src/panfrost/vulkan/panvk_shader.h	2026-05-21 14:04:04.084251800 +0200
+@@ -154,6 +154,8 @@
+       /* aligned_u64 attribute below inserts the 4-byte alignment gap
+        * after num_vertices automatically — no explicit pad needed. */
+       aligned_u64 xfb_address[4];  /* iter13: 4 transform feedback buffer base addresses */
+      uint32_t xfb_topology;       /* iter17: panvk_xfb_topology enum value */
+      uint32_t xfb_output_count;   /* iter17: per-instance output verts after decomp */
+ #endif
+       int32_t first_vertex;
+       int32_t base_instance;
+@@ -569,4 +571,76 @@
+    struct pan_compute_dim local_size, const void *bin_ptr, size_t bin_size,
+    struct panvk_shader **shader_out);
+ 
+
+#if PAN_ARCH < 9
+/* iter17: encoding for vs.xfb_topology sysval. Maps VkPrimitiveTopology values
+ * we need to distinguish at shader runtime for XFB capture. LIST topologies
+ * use the iter13 single-store fast path; non-LIST need per-vertex decomposition. */
+enum panvk_xfb_topology {
+   PANVK_XFB_TOPO_LIST            = 0,
+   PANVK_XFB_TOPO_LINE_STRIP      = 1,
+   PANVK_XFB_TOPO_TRI_STRIP       = 2,
+   PANVK_XFB_TOPO_TRI_FAN         = 3,
+   PANVK_XFB_TOPO_LINE_LIST_ADJ   = 4,
+   PANVK_XFB_TOPO_LINE_STRIP_ADJ  = 5,
+   PANVK_XFB_TOPO_TRI_LIST_ADJ    = 6,
+   PANVK_XFB_TOPO_TRI_STRIP_ADJ   = 7,
+};
+
+#include "panvk_macros.h"
+struct nir_shader;
+bool panvk_per_arch(nir_lower_xfb)(struct nir_shader *nir);
+
+/* Map VkPrimitiveTopology to panvk_xfb_topology enum (driver-side helper). */
+static inline uint32_t
+panvk_vk_topology_to_xfb_enum(VkPrimitiveTopology topo)
+{
+   switch (topo) {
+   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+      return PANVK_XFB_TOPO_LINE_STRIP;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+      return PANVK_XFB_TOPO_TRI_STRIP;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+      return PANVK_XFB_TOPO_TRI_FAN;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+      return PANVK_XFB_TOPO_LINE_LIST_ADJ;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+      return PANVK_XFB_TOPO_LINE_STRIP_ADJ;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+      return PANVK_XFB_TOPO_TRI_LIST_ADJ;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+      return PANVK_XFB_TOPO_TRI_STRIP_ADJ;
+   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+   default:
+      return PANVK_XFB_TOPO_LIST;
+   }
+}
+
+/* Compute the per-instance output vertex count for a given (topology, input count). */
+static inline uint32_t
+panvk_xfb_output_count(VkPrimitiveTopology topo, uint32_t input_count)
+{
+   switch (topo) {
+   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+      return input_count >= 1 ? 2u * (input_count - 1u) : 0u;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+      return input_count >= 2 ? 3u * (input_count - 2u) : 0u;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+      return (input_count / 4u) * 2u;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+      return input_count >= 3 ? 2u * (input_count - 3u) : 0u;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+      return (input_count / 6u) * 3u;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+      return input_count >= 6 ? 3u * (input_count / 2u - 2u) : 0u;
+   default:
+      return input_count;  /* LIST topologies: 1:1 mapping */
+   }
+}
+#endif
+
+
+ #endif
+diff -urN a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c
+--- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c	2026-05-21 14:04:02.528576354 +0200
+++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c	2026-05-21 14:04:04.091357598 +0200
+@@ -727,6 +727,20 @@
+    /* iter13: VK_EXT_transform_feedback sysvals — always set (per draw),
+     * reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */
+    set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count);
+
+   /* iter17: XFB primitive-decomposition sysvals.
+    * xfb_topology = enum value for the current bound topology.
+    * xfb_output_count = per-instance output vertex count after decomposition.
+    * For LIST topologies, output_count == input vertex count and the shader
+    * takes the iter13 single-store fast path. */
+   {
+      VkPrimitiveTopology vk_topo =
+         cmdbuf->vk.dynamic_graphics_state.ia.primitive_topology;
+      uint32_t topo_enum = panvk_vk_topology_to_xfb_enum(vk_topo);
+      uint32_t out_count = panvk_xfb_output_count(vk_topo, info->vertex.count);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_topology, topo_enum);
+      set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_output_count, out_count);
+   }
+    {
+       const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx;
+       /* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS
+diff -urN a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c
+--- a/src/panfrost/vulkan/panvk_vX_shader.c	2026-05-21 14:04:02.527576494 +0200
+++ b/src/panfrost/vulkan/panvk_vX_shader.c	2026-05-21 14:04:04.098356619 +0200
+@@ -895,7 +895,10 @@
+        nir->info.has_transform_feedback_varyings) {
+       NIR_PASS(_, nir, nir_opt_constant_folding);
+       NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info);
+-      NIR_PASS(_, nir, pan_nir_lower_xfb);
+      /* iter17: panvk-specific replacement for pan_nir_lower_xfb that handles
+       * primitive decomposition for non-LIST topologies. Single-store LIST
+       * fast path matches iter13 behavior. */
+      NIR_PASS(_, nir, panvk_per_arch(nir_lower_xfb));
+    }
+ #endif
+ }
+diff -urN a/src/panfrost/vulkan/panvk_vX_xfb_lower.c b/src/panfrost/vulkan/panvk_vX_xfb_lower.c
+--- a/src/panfrost/vulkan/panvk_vX_xfb_lower.c	1970-01-01 01:00:00.000000000 +0100
+++ b/src/panfrost/vulkan/panvk_vX_xfb_lower.c	2026-05-21 14:04:04.115354242 +0200
+@@ -0,0 +1,486 @@
+/*
+ * Copyright © 2026 mfritsche / claude-noether
+ * SPDX-License-Identifier: MIT
+ *
+ * iter17: panvk-specific replacement for pan_nir_lower_xfb that handles
+ * primitive decomposition for transform_feedback on non-LIST topologies
+ * (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY).
+ *
+ * Approach: emit a topology dispatch at the start of each store_output
+ * lowering. The shader reads vs.xfb_topology sysval at runtime and branches
+ * into per-topology emission logic. For each affected topology, the lowered
+ * code emits guarded conditional stores — one per primitive this vertex
+ * contributes to, computing the output buffer position via primitive index
+ * and slot within the decomposed primitive.
+ *
+ * For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that
+ * matches iter13's single-store behavior.
+ *
+ * For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives
+ * as slot 2 — handled via a NIR loop bounded by num_vertices.
+ *
+ * See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context.
+ */
+
+#include "panvk_macros.h"
+
+#if PAN_ARCH < 9
+
+#include "panvk_shader.h"
+
+#include "compiler/nir/nir_builder.h"
+#include "pan_nir.h"
+
+#include <vulkan/vulkan_core.h>
+
+/* ----- Address arithmetic ----- */
+
+static nir_def *
+xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx,
+               uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *byte_off = nir_iadd_imm(b,
+      nir_imul_imm(b, out_idx, stride), offset_bytes);
+   return nir_iadd(b, buf, nir_u2u64(b, byte_off));
+}
+
+static void
+emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count,
+                nir_def *instance_id, nir_def *raw_vid, nir_def *value,
+                uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *out_idx = nir_iadd(b,
+      nir_imul(b, instance_id, output_count), raw_vid);
+   nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
+   nir_store_global(b, value, addr);
+}
+
+static void
+emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count,
+                nir_def *instance_id, nir_def *eligible,
+                nir_def *prim_idx, nir_def *slot,
+                uint32_t verts_per_prim,
+                nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_push_if(b, eligible);
+   {
+      nir_def *out_idx = nir_iadd(b,
+         nir_imul(b, instance_id, output_count),
+         nir_iadd(b, nir_imul_imm(b, prim_idx, verts_per_prim), slot));
+      nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
+      nir_store_global(b, value, addr);
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* ----- Per-topology emission ----- */
+
+/* TRIANGLE_STRIP: vertex v contributes to prims v, v-1, v-2 (per eligibility). */
+static void
+emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N,
+               nir_def *buf, nir_def *output_count, nir_def *instance_id,
+               nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm2 = nir_iadd_imm(b, N, -2);
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+
+   /* Prim v, slot 0: v < N-2 */
+   emit_prim_store(b, buf, output_count, instance_id,
+      nir_ult(b, v, Nm2),
+      v, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
+
+   /* Prim v-1, slot = 1 if prim even else 2: 1 <= v < N-1 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      nir_def *parity = nir_iand_imm(b, prim, 1u);
+      nir_def *slot = nir_iadd_imm(b, parity, 1);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, Nm1));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, slot, 3, value, stride, offset_bytes);
+   }
+
+   /* Prim v-2, slot = 2 if prim even else 1: 2 <= v < N */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -2);
+      nir_def *parity = nir_iand_imm(b, prim, 1u);
+      nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 2)),
+         nir_ult(b, v, N));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, slot, 3, value, stride, offset_bytes);
+   }
+}
+
+/* LINE_STRIP: vertex v contributes to prim v slot 0 + prim v-1 slot 1. */
+static void
+emit_line_strip(nir_builder *b, nir_def *v, nir_def *N,
+                nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+
+   /* Prim v, slot 0: v < N-1 */
+   emit_prim_store(b, buf, output_count, instance_id,
+      nir_ult(b, v, Nm1),
+      v, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
+
+   /* Prim v-1, slot 1: 1 <= v < N */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, N));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
+   }
+}
+
+/* TRIANGLE_FAN: prim p emits {p+1, p+2, 0}.
+ *   vertex v=0: contributes to ALL prims as slot 2 (loop required)
+ *   vertex v>=1: contributes to prim v-1 as slot 0 (if 1 <= v <= N-2)
+ *   vertex v>=2: contributes to prim v-2 as slot 1 (if 2 <= v <= N-1)
+ */
+static void
+emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N,
+             nir_def *buf, nir_def *output_count, nir_def *instance_id,
+             nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+   nir_def *Nm2 = nir_iadd_imm(b, N, -2);
+
+   /* Prim v-1, slot 0: 1 <= v < N-1 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, Nm1));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
+   }
+
+   /* Prim v-2, slot 1: 2 <= v < N */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -2);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 2)),
+         nir_ult(b, v, N));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 1), 3, value, stride, offset_bytes);
+   }
+
+   /* Central vertex (v == 0): loop over all prims, write to slot 2. */
+   nir_push_if(b, nir_ieq_imm(b, v, 0));
+   {
+      nir_variable *p_var = nir_local_variable_create(b->impl,
+         glsl_uint_type(), "fan_p");
+      nir_store_var(b, p_var, nir_imm_int(b, 0), 0x1);
+      nir_push_loop(b);
+      {
+         nir_def *p = nir_load_var(b, p_var);
+         nir_push_if(b, nir_uge(b, p, Nm2));
+         {
+            nir_jump(b, nir_jump_break);
+         }
+         nir_pop_if(b, NULL);
+
+         nir_def *out_idx = nir_iadd(b,
+            nir_imul(b, instance_id, output_count),
+            nir_iadd_imm(b, nir_imul_imm(b, p, 3), 2));
+         nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
+         nir_store_global(b, value, addr);
+
+         nir_store_var(b, p_var, nir_iadd_imm(b, p, 1), 0x1);
+      }
+      nir_pop_loop(b, NULL);
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}.
+ *   v%4 == 1 -> prim v/4 slot 0;  v%4 == 2 -> prim v/4 slot 1.
+ *   Trailing partial groups (prim >= N/4) are skipped: output_count only
+ *   reserves (N/4)*2 slots per instance, so they must not be stored. */
+static void
+emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N,
+                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *vmod4 = nir_iand_imm(b, v, 3u);
+   nir_def *prim = nir_ushr_imm(b, v, 2);  /* v / 4 */
+   nir_def *complete = nir_ult(b, prim, nir_ushr_imm(b, N, 2));
+
+   emit_prim_store(b, buf, output_count, instance_id,
+      nir_iand(b, complete, nir_ieq_imm(b, vmod4, 1)),
+      prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
+
+   emit_prim_store(b, buf, output_count, instance_id,
+      nir_iand(b, complete, nir_ieq_imm(b, vmod4, 2)),
+      prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
+}
+
+/* LINE_STRIP_WITH_ADJACENCY: N vertices give N-3 segments; prim p (0 <= p
+ * <= N-4) emits {p+1, p+2}. Vertices 0 and N-1 are adjacency-only.
+ *   v contributes to prim v-1 slot 0 (1 <= v <= N-3)
+ *   v contributes to prim v-2 slot 1 (2 <= v <= N-2)
+ * Callers gate on output_count > 0 (N >= 4), so N-2 / N-1 cannot wrap. */
+static void
+emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
+                    nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                    nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+   nir_def *Nm2 = nir_iadd_imm(b, N, -2);
+
+   /* Prim v-1, slot 0: 1 <= v <= N-3 ⇔ v >= 1 AND v < N-2 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, Nm2));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
+   }
+
+   /* Prim v-2, slot 1: 2 <= v <= N-2 ⇔ v >= 2 AND v < N-1 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -2);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 2)),
+         nir_ult(b, v, Nm1));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
+   }
+}
+
+/* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}.
+ *   v contributes if v%6 == 0: prim v/6 slot 0
+ *   v contributes if v%6 == 2: prim v/6 slot 1
+ *   v contributes if v%6 == 4: prim v/6 slot 2
+ * Partial trailing groups (prim >= N/6) are skipped: only (N/6)*3 slots exist. */
+static void
+emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N,
+                  nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                  nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *vmod6 = nir_umod_imm(b, v, 6);
+   nir_def *prim = nir_udiv_imm(b, v, 6);
+   nir_def *complete = nir_ult(b, prim, nir_udiv_imm(b, N, 6));
+
+   for (uint32_t slot = 0; slot < 3; slot++) {
+      emit_prim_store(b, buf, output_count, instance_id,
+         nir_iand(b, complete, nir_ieq_imm(b, vmod6, slot * 2)),
+         prim, nir_imm_int(b, slot), 3, value, stride, offset_bytes);
+   }
+}
+
+/* TRIANGLE_STRIP_WITH_ADJACENCY: prim i emits:
+ *   even i: {2i, 2i+2, 2i+4}    (slots 0, 1, 2 ← input indices 2i, 2i+2, 2i+4)
+ *   odd  i: {2i, 2i+4, 2i+2}    (slots 0, 1, 2 ← input indices 2i, 2i+4, 2i+2)
+ *
+ * Only EVEN input vertices contribute (since all output indices are 2*something).
+ * For even input v:
+ *   prim v/2 slot 0 (always, if v/2 < N/2-2)
+ *   prim (v-2)/2 slot 1 if (v-2)/2 even, slot 2 if odd   (when v >= 2)
+ *   prim (v-4)/2 slot 2 if (v-4)/2 even, slot 1 if odd   (when v >= 4)
+ */
+static void
+emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
+                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   /* Bail for odd input vertices — they never contribute. */
+   nir_def *v_is_even = nir_ieq_imm(b, nir_iand_imm(b, v, 1u), 0);
+   nir_push_if(b, v_is_even);
+   {
+      nir_def *N_half = nir_ushr_imm(b, N, 1);
+      nir_def *max_prim = nir_iadd_imm(b, N_half, -2);  /* N/2 - 2 */
+      nir_def *v_half = nir_ushr_imm(b, v, 1);
+
+      /* Prim v/2 slot 0: v/2 < N/2 - 2 */
+      emit_prim_store(b, buf, output_count, instance_id,
+         nir_ult(b, v_half, max_prim),
+         v_half, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
+
+      /* Prim (v-2)/2 = v/2 - 1: v >= 2 AND prim < N/2-2 */
+      {
+         nir_def *prim = nir_iadd_imm(b, v_half, -1);
+         nir_def *parity = nir_iand_imm(b, prim, 1u);
+         nir_def *slot = nir_iadd_imm(b, parity, 1);  /* even→1, odd→2 */
+         nir_def *eligible = nir_iand(b,
+            nir_uge(b, v, nir_imm_int(b, 2)),
+            nir_ult(b, prim, max_prim));
+         emit_prim_store(b, buf, output_count, instance_id, eligible,
+                         prim, slot, 3, value, stride, offset_bytes);
+      }
+
+      /* Prim (v-4)/2 = v/2 - 2: v >= 4 AND prim < N/2-2 */
+      {
+         nir_def *prim = nir_iadd_imm(b, v_half, -2);
+         nir_def *parity = nir_iand_imm(b, prim, 1u);
+         nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity);  /* even→2, odd→1 */
+         nir_def *eligible = nir_iand(b,
+            nir_uge(b, v, nir_imm_int(b, 4)),
+            nir_ult(b, prim, max_prim));
+         emit_prim_store(b, buf, output_count, instance_id, eligible,
+                         prim, slot, 3, value, stride, offset_bytes);
+      }
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* ----- Main lowering: per store_output XFB channel ----- */
+
+static void
+lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr,
+                        unsigned channel_idx, unsigned num_components,
+                        unsigned buffer, unsigned offset_words)
+{
+   assert(buffer < MAX_XFB_BUFFERS);
+   assert(nir_intrinsic_component(intr) == 0);
+
+   uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
+   assert(stride != 0);
+   uint16_t offset_bytes = offset_words * 4;
+
+   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
+   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+
+   nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology);
+   nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count);
+   nir_def *N = nir_load_num_vertices(b);
+   nir_def *v = nir_load_raw_vertex_id_pan(b);
+   nir_def *instance = nir_load_instance_id(b);
+   nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
+
+   nir_def *src = intr->src[0].ssa;
+   nir_component_mask_t mask = nir_component_mask(num_components);
+   nir_def *value = nir_channels(b, src, mask << channel_idx);
+
+   /* Topology dispatch ladder. LIST first (fast path). */
+   nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST));
+   {
+      emit_list_store(b, buf, out_count, instance, v, value,
+                      stride, offset_bytes);
+   }
+   nir_push_else(b, NULL);
+   {
+      /* iter17 Janet Finding 3: gate all non-LIST emission on
+       * output_count > 0. For degenerate input counts (N < min required
+       * for the topology), output_count is 0 and we must emit NO stores
+       * — otherwise N-2 / N-3 / etc. arithmetic underflows in the
+       * eligibility predicates and we falsely fire stores. */
+      nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count));
+      {
+      nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP));
+      {
+         emit_tri_strip(b, v, N, buf, out_count, instance, value,
+                        stride, offset_bytes);
+      }
+      nir_push_else(b, NULL);
+      {
+         nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP));
+         {
+            emit_line_strip(b, v, N, buf, out_count, instance, value,
+                            stride, offset_bytes);
+         }
+         nir_push_else(b, NULL);
+         {
+            nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_FAN));
+            {
+               emit_tri_fan(b, v, N, buf, out_count, instance, value,
+                            stride, offset_bytes);
+            }
+            nir_push_else(b, NULL);
+            {
+               nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_LIST_ADJ));
+               {
+                  emit_line_list_adj(b, v, N, buf, out_count, instance, value,
+                                     stride, offset_bytes);
+               }
+               nir_push_else(b, NULL);
+               {
+                  nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP_ADJ));
+                  {
+                     emit_line_strip_adj(b, v, N, buf, out_count, instance, value,
+                                         stride, offset_bytes);
+                  }
+                  nir_push_else(b, NULL);
+                  {
+                     nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_LIST_ADJ));
+                     {
+                        emit_tri_list_adj(b, v, N, buf, out_count, instance, value,
+                                          stride, offset_bytes);
+                     }
+                     nir_push_else(b, NULL);
+                     {
+                        /* TRI_STRIP_ADJ — last case */
+                        emit_tri_strip_adj(b, v, N, buf, out_count, instance, value,
+                                           stride, offset_bytes);
+                     }
+                     nir_pop_if(b, NULL);
+                  }
+                  nir_pop_if(b, NULL);
+               }
+               nir_pop_if(b, NULL);
+            }
+            nir_pop_if(b, NULL);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_pop_if(b, NULL);
+      }
+      nir_pop_if(b, NULL);  /* Janet Finding 3: close output_count > 0 guard */
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* Mirror of pan_nir_lower_xfb's lower_xfb: load_vertex_id rewrite +
+ * dispatch store_output through our topology-aware emission. */
+static bool
+lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr,
+                 UNUSED void *data)
+{
+   if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
+      b->cursor = nir_instr_remove(&intr->instr);
+      nir_def *repl = nir_iadd(b, nir_load_raw_vertex_id_pan(b),
+                               nir_load_raw_vertex_offset_pan(b));
+      nir_def_rewrite_uses(&intr->def, repl);
+      return true;
+   }
+
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   bool progress = false;
+   b->cursor = nir_before_instr(&intr->instr);
+
+   /* io_xfb has only out[0,1]; the other 2 channels are in io_xfb2.
+    * Outer loop selects which annotation; inner picks which channel. */
+   for (unsigned i = 0; i < 2; ++i) {
+      nir_io_xfb xfb = i ? nir_intrinsic_io_xfb2(intr)
+                         : nir_intrinsic_io_xfb(intr);
+      for (unsigned j = 0; j < 2; ++j) {
+         if (!xfb.out[j].num_components)
+            continue;
+         lower_xfb_output_iter17(b, intr, i * 2 + j, xfb.out[j].num_components,
+                                 xfb.out[j].buffer, xfb.out[j].offset);
+         progress = true;
+      }
+   }
+
+   if (progress)
+      nir_instr_remove(&intr->instr);
+   return progress;
+}
+
+bool
+panvk_per_arch(nir_lower_xfb)(nir_shader *nir)
+{
+   /* Lowering inserts new ifs/loops, so block-index/dominance metadata is stale. */
+   return nir_shader_intrinsics_pass(nir, lower_xfb_iter17, nir_metadata_none, NULL);
+}
+
+#endif /* PAN_ARCH < 9 */
@@ -30,7 +30,7 @@

 pkgname=mesa-panvk-bifrost
 _mesaver=26.0.6
-pkgver=26.0.6.r3
+pkgver=26.0.6.r4
 pkgrel=1
 pkgdesc="Patched Mesa libvulkan_panfrost.so exposing Bifrost-gen Mali to Vulkan apps (panvk-bifrost campaign)"
 arch=('aarch64')
@@ -80,6 +80,7 @@ source=(
    "0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch"
    "0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch"
    "0003-panvk-bifrost-vk-ext-transform-feedback.patch"
+    "0004-panvk-bifrost-xfb-primitive-decomposition.patch"
    "brave-vulkan"
    "icd.json"
 )
@@ -90,6 +91,7 @@ sha256sums=(
    'SKIP'
    'SKIP'
    'SKIP'
+    'SKIP'
 )

 prepare() {
@@ -116,6 +118,15 @@ prepare() {
    # reports "Hardware accelerated" across the board for the affected paths).
    patch -p1 < "${srcdir}/0003-panvk-bifrost-vk-ext-transform-feedback.patch"

+    # iter17: XFB primitive decomposition for non-LIST topologies (TRI_STRIP,
+    # TRI_FAN, LINE_STRIP, *_WITH_ADJACENCY). Replacement panvk-specific
+    # NIR pass (panvk_per_arch(nir_lower_xfb)) substituted for upstream
+    # pan_nir_lower_xfb. Closes the 162 dEQP-VK winding_* failures from
+    # iter15 (958 P / 81 F / 0 Crash on full XFB CTS — remaining 81 fails
+    # are by-design resume_* tests, transformFeedbackDraw=false).
+    # Phase-doc context: ~/src/panvk-bifrost/iter17/phase{0,1,2,4,5,6,8}_*.md.
+    patch -p1 < "${srcdir}/0004-panvk-bifrost-xfb-primitive-decomposition.patch"
+
    # Sanity-check the patches landed.
    grep -q "KHR_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
    grep -q "EXT_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
@@ -124,8 +135,12 @@ prepare() {
    grep -q "has_vk1_2 = true;" src/panfrost/vulkan/panvk_vX_physical_device.c
    # iter13 sanity:
    grep -q "EXT_transform_feedback = PAN_ARCH < 9," src/panfrost/vulkan/panvk_vX_physical_device.c
-    grep -q "pan_nir_lower_xfb" src/panfrost/vulkan/panvk_vX_shader.c
    test -f src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c
+    # iter17 sanity: pan_nir_lower_xfb call site has been replaced; new file present.
+    grep -q "panvk_per_arch(nir_lower_xfb)" src/panfrost/vulkan/panvk_vX_shader.c
+    grep -q "xfb_topology" src/panfrost/vulkan/panvk_shader.h
+    grep -q "panvk_xfb_topology" src/panfrost/vulkan/panvk_shader.h
+    test -f src/panfrost/vulkan/panvk_vX_xfb_lower.c
 }

 build() {