diff -urN a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build --- a/src/panfrost/vulkan/meson.build 2026-05-21 14:04:02.529474145 +0200 +++ b/src/panfrost/vulkan/meson.build 2026-05-21 14:04:04.106755486 +0200 @@ -123,6 +123,7 @@ 'panvk_vX_nir_lower_input_attachment_loads.c', 'panvk_vX_sampler.c', 'panvk_vX_shader.c', + 'panvk_vX_xfb_lower.c', sha1_h, ] diff -urN a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h --- a/src/panfrost/vulkan/panvk_shader.h 2026-05-21 14:04:02.525251986 +0200 +++ b/src/panfrost/vulkan/panvk_shader.h 2026-05-21 14:04:04.084251800 +0200 @@ -154,6 +154,8 @@ /* aligned_u64 attribute below inserts the 4-byte alignment gap * after num_vertices automatically — no explicit pad needed. */ aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */ + uint32_t xfb_topology; /* iter17: panvk_xfb_topology enum value */ + uint32_t xfb_output_count; /* iter17: per-instance output verts after decomp */ #endif int32_t first_vertex; int32_t base_instance; @@ -569,4 +571,76 @@ struct pan_compute_dim local_size, const void *bin_ptr, size_t bin_size, struct panvk_shader **shader_out); + +#if PAN_ARCH < 9 +/* iter17: encoding for vs.xfb_topology sysval. Maps VkPrimitiveTopology values + * we need to distinguish at shader runtime for XFB capture. LIST topologies + * use the iter13 single-store fast path; non-LIST need per-vertex decomposition. */ +enum panvk_xfb_topology { + PANVK_XFB_TOPO_LIST = 0, + PANVK_XFB_TOPO_LINE_STRIP = 1, + PANVK_XFB_TOPO_TRI_STRIP = 2, + PANVK_XFB_TOPO_TRI_FAN = 3, + PANVK_XFB_TOPO_LINE_LIST_ADJ = 4, + PANVK_XFB_TOPO_LINE_STRIP_ADJ = 5, + PANVK_XFB_TOPO_TRI_LIST_ADJ = 6, + PANVK_XFB_TOPO_TRI_STRIP_ADJ = 7, +}; + +#include "panvk_macros.h" +struct nir_shader; +bool panvk_per_arch(nir_lower_xfb)(struct nir_shader *nir); + +/* Map VkPrimitiveTopology to panvk_xfb_topology enum (driver-side helper). 
*/ +static inline uint32_t +panvk_vk_topology_to_xfb_enum(VkPrimitiveTopology topo) +{ + switch (topo) { + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return PANVK_XFB_TOPO_LINE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + return PANVK_XFB_TOPO_TRI_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return PANVK_XFB_TOPO_TRI_FAN; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + return PANVK_XFB_TOPO_LINE_LIST_ADJ; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return PANVK_XFB_TOPO_LINE_STRIP_ADJ; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + return PANVK_XFB_TOPO_TRI_LIST_ADJ; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return PANVK_XFB_TOPO_TRI_STRIP_ADJ; + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + default: + return PANVK_XFB_TOPO_LIST; + } +} + +/* Compute the per-instance output vertex count for a given (topology, input count). */ +static inline uint32_t +panvk_xfb_output_count(VkPrimitiveTopology topo, uint32_t input_count) +{ + switch (topo) { + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return input_count >= 1 ? 2u * (input_count - 1u) : 0u; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return input_count >= 2 ? 3u * (input_count - 2u) : 0u; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + return (input_count / 4u) * 2u; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return input_count >= 3 ? 2u * (input_count - 3u) : 0u; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + return (input_count / 6u) * 3u; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return input_count >= 6 ? 
3u * (input_count / 2u - 2u) : 0u; + default: + return input_count; /* LIST topologies: 1:1 mapping */ + } +} +#endif + + #endif diff -urN a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c --- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-21 14:04:02.528576354 +0200 +++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-21 14:04:04.091357598 +0200 @@ -727,6 +727,20 @@ /* iter13: VK_EXT_transform_feedback sysvals — always set (per draw), * reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count); + + /* iter17: XFB primitive-decomposition sysvals. + * xfb_topology = enum value for the current bound topology. + * xfb_output_count = per-instance output vertex count after decomposition. + * For LIST topologies, output_count == input vertex count and the shader + * takes the iter13 single-store fast path. */ + { + VkPrimitiveTopology vk_topo = + cmdbuf->vk.dynamic_graphics_state.ia.primitive_topology; + uint32_t topo_enum = panvk_vk_topology_to_xfb_enum(vk_topo); + uint32_t out_count = panvk_xfb_output_count(vk_topo, info->vertex.count); + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_topology, topo_enum); + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_output_count, out_count); + } { const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx; /* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS diff -urN a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c --- a/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-21 14:04:02.527576494 +0200 +++ b/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-21 14:04:04.098356619 +0200 @@ -895,7 +895,10 @@ nir->info.has_transform_feedback_varyings) { NIR_PASS(_, nir, nir_opt_constant_folding); NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info); - NIR_PASS(_, nir, pan_nir_lower_xfb); + /* iter17: panvk-specific replacement for pan_nir_lower_xfb that handles + * primitive 
decomposition for non-LIST topologies. Single-store LIST + * fast path matches iter13 behavior. */ + NIR_PASS(_, nir, panvk_per_arch(nir_lower_xfb)); } #endif } diff -urN a/src/panfrost/vulkan/panvk_vX_xfb_lower.c b/src/panfrost/vulkan/panvk_vX_xfb_lower.c --- a/src/panfrost/vulkan/panvk_vX_xfb_lower.c 1970-01-01 01:00:00.000000000 +0100 +++ b/src/panfrost/vulkan/panvk_vX_xfb_lower.c 2026-05-21 14:04:04.115354242 +0200 @@ -0,0 +1,486 @@ +/* + * Copyright © 2026 mfritsche / claude-noether + * SPDX-License-Identifier: MIT + * + * iter17: panvk-specific replacement for pan_nir_lower_xfb that handles + * primitive decomposition for transform_feedback on non-LIST topologies + * (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY). + * + * Approach: emit a topology dispatch at the start of each store_output + * lowering. The shader reads vs.xfb_topology sysval at runtime and branches + * into per-topology emission logic. For each affected topology, the lowered + * code emits guarded conditional stores — one per primitive this vertex + * contributes to, computing the output buffer position via primitive index + * and slot within the decomposed primitive. + * + * For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that + * matches iter13's single-store behavior. + * + * For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives + * as slot 2 — handled via a NIR loop bounded by num_vertices. + * + * See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context. 
+ */
+
+#include "panvk_macros.h"
+
+#if PAN_ARCH < 9
+
+#include "panvk_shader.h"
+
+#include "compiler/nir/nir_builder.h"
+#include "pan_nir.h"
+
+#include <assert.h>
+
+/* ----- Address arithmetic: every capture store lands at
+ * buf + u2u64(out_idx * stride + offset_bytes); the offset math is 32-bit. */
+static nir_def *
+xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx,
+               uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *byte_off = nir_iadd_imm(b,
+      nir_imul_imm(b, out_idx, stride), offset_bytes);
+   return nir_iadd(b, buf, nir_u2u64(b, byte_off));
+}
+
+static void
+emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count,
+                nir_def *instance_id, nir_def *raw_vid, nir_def *value,
+                uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *out_idx = nir_iadd(b,
+      nir_imul(b, instance_id, output_count), raw_vid);
+   nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
+   nir_store_global(b, value, addr);
+}
+
+/* Guarded store: only if `eligible`, at prim_idx * verts_per_prim + slot. */
+static void
+emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count,
+                nir_def *instance_id, nir_def *eligible,
+                nir_def *prim_idx, nir_def *slot, uint32_t verts_per_prim,
+                nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_push_if(b, eligible);
+   {
+      nir_def *out_idx = nir_iadd(b,
+         nir_imul(b, instance_id, output_count),
+         nir_iadd(b, nir_imul_imm(b, prim_idx, verts_per_prim), slot));
+      nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
+      nir_store_global(b, value, addr);
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* ----- Per-topology emission ----- */
+
+/* TRIANGLE_STRIP: vertex v contributes to prims v, v-1, v-2 (per eligibility). */
+static void
+emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N,
+               nir_def *buf, nir_def *output_count, nir_def *instance_id,
+               nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm2 = nir_iadd_imm(b, N, -2);
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+
+   /* Prim v, slot 0: v < N-2 */
+   emit_prim_store(b, buf, output_count, instance_id,
+                   nir_ult(b, v, Nm2),
+                   v, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
+
+   /* Prim v-1, slot = 1 if prim even else 2: 1 <= v < N-1 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      nir_def *parity = nir_iand_imm(b, prim, 1u);
+      nir_def *slot = nir_iadd_imm(b, parity, 1);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, Nm1));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, slot, 3, value, stride, offset_bytes);
+   }
+
+   /* Prim v-2, slot = 2 if prim even else 1: 2 <= v < N */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -2);
+      nir_def *parity = nir_iand_imm(b, prim, 1u);
+      nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 2)),
+         nir_ult(b, v, N));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, slot, 3, value, stride, offset_bytes);
+   }
+}
+
+/* LINE_STRIP: vertex v contributes to prim v slot 0 + prim v-1 slot 1. */
+static void
+emit_line_strip(nir_builder *b, nir_def *v, nir_def *N,
+                nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+
+   /* Prim v, slot 0: v < N-1 */
+   emit_prim_store(b, buf, output_count, instance_id,
+                   nir_ult(b, v, Nm1),
+                   v, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
+
+   /* Prim v-1, slot 1: 1 <= v < N */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, N));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
+   }
+}
+
+/* TRIANGLE_FAN: prim p emits {p+1, p+2, 0}.
+ *   vertex v=0:  contributes to ALL prims as slot 2 (loop required)
+ *   vertex v>=1: contributes to prim v-1 as slot 0 (if 1 <= v <= N-2)
+ *   vertex v>=2: contributes to prim v-2 as slot 1 (if 2 <= v <= N-1)
+ */
+static void
+emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N,
+             nir_def *buf, nir_def *output_count, nir_def *instance_id,
+             nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+   nir_def *Nm2 = nir_iadd_imm(b, N, -2);
+
+   /* Prim v-1, slot 0: 1 <= v < N-1 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, Nm1));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
+   }
+
+   /* Prim v-2, slot 1: 2 <= v < N */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -2);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 2)),
+         nir_ult(b, v, N));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 1), 3, value, stride, offset_bytes);
+   }
+
+   /* Central vertex (v == 0): loop over all N-2 prims, write to slot 2. */
+   nir_push_if(b, nir_ieq_imm(b, v, 0));
+   {
+      nir_variable *p_var = nir_local_variable_create(b->impl,
+         glsl_uint_type(), "fan_p");
+      nir_store_var(b, p_var, nir_imm_int(b, 0), 0x1);
+      nir_push_loop(b);
+      {
+         nir_def *p = nir_load_var(b, p_var);
+         nir_push_if(b, nir_uge(b, p, Nm2));
+         {
+            nir_jump(b, nir_jump_break);
+         }
+         nir_pop_if(b, NULL);
+
+         nir_def *out_idx = nir_iadd(b,
+            nir_imul(b, instance_id, output_count),
+            nir_iadd_imm(b, nir_imul_imm(b, p, 3), 2));
+         nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
+         nir_store_global(b, value, addr);
+
+         nir_store_var(b, p_var, nir_iadd_imm(b, p, 1), 0x1);
+      }
+      nir_pop_loop(b, NULL);
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}.
+ *   v%4 == 1: prim v/4 slot 0;  v%4 == 2: prim v/4 slot 1.
+ *   Only complete groups (v/4 < N/4) store; a trailing partial group is dropped.
+ */
+static void
+emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N,
+                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *vmod4 = nir_iand_imm(b, v, 3u);
+   nir_def *prim = nir_ushr_imm(b, v, 2); /* v / 4 */
+   nir_def *complete = nir_ult(b, prim, nir_ushr_imm(b, N, 2)); /* prim < N/4 */
+
+   emit_prim_store(b, buf, output_count, instance_id,
+                   nir_iand(b, complete, nir_ieq_imm(b, vmod4, 1)),
+                   prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
+
+   emit_prim_store(b, buf, output_count, instance_id,
+                   nir_iand(b, complete, nir_ieq_imm(b, vmod4, 2)),
+                   prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
+}
+
+/* LINE_STRIP_WITH_ADJACENCY: prim p emits {p+1, p+2}.
+ *   N vertices form N-3 segments (prims 0..N-4), so v stores to
+ *   prim v-1 slot 0 when 1 <= v <= N-3, and prim v-2 slot 1 when 2 <= v <= N-2.
+ */
+static void
+emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
+                    nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                    nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
+   nir_def *Nm2 = nir_iadd_imm(b, N, -2);
+
+   /* Prim v-1, slot 0: prim <= N-4 ⇔ 1 <= v <= N-3 ⇔ v >= 1 AND v < N-2 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -1);
+      /* v == N-2 would address prim N-3, one past the last segment. */
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 1)),
+         nir_ult(b, v, Nm2));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
+   }
+
+   /* Prim v-2, slot 1: prim <= N-4 ⇔ 2 <= v <= N-2 ⇔ v >= 2 AND v < N-1 */
+   {
+      nir_def *prim = nir_iadd_imm(b, v, -2);
+      nir_def *eligible = nir_iand(b,
+         nir_uge(b, v, nir_imm_int(b, 2)),
+         nir_ult(b, v, Nm1));
+      emit_prim_store(b, buf, output_count, instance_id, eligible,
+                      prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
+   }
+}
+
+/* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}.
+ *   v%6 == 0 / 2 / 4: prim v/6 slot 0 / 1 / 2; v%6 in {1, 3, 5} never stores.
+ *   Only complete groups (v/6 < N/6) store; a trailing partial group is
+ *   dropped so that no store lands at or beyond output_count.
+ */
+static void
+emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N,
+                  nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                  nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   nir_def *vmod6 = nir_umod_imm(b, v, 6);
+   nir_def *prim = nir_udiv_imm(b, v, 6);
+   nir_def *complete = nir_ult(b, prim, nir_udiv_imm(b, N, 6)); /* prim < N/6 */
+
+   for (uint32_t slot = 0; slot < 3; slot++) {
+      emit_prim_store(b, buf, output_count, instance_id,
+                      nir_iand(b, complete, nir_ieq_imm(b, vmod6, slot * 2)),
+                      prim, nir_imm_int(b, slot), 3, value, stride, offset_bytes);
+   }
+}
+
+/* TRIANGLE_STRIP_WITH_ADJACENCY: prim i emits:
+ *   even i: {2i, 2i+2, 2i+4}   (slots 0, 1, 2 ← input indices 2i, 2i+2, 2i+4)
+ *   odd  i: {2i, 2i+4, 2i+2}   (slots 0, 1, 2 ← input indices 2i, 2i+4, 2i+2)
+ *
+ * Only EVEN input vertices contribute (since all output indices are 2*something).
+ * For even input v:
+ *   prim v/2       slot 0  (always, if v/2 < N/2-2)
+ *   prim (v-2)/2   slot 1 if (v-2)/2 even, slot 2 if odd  (when v >= 2)
+ *   prim (v-4)/2   slot 2 if (v-4)/2 even, slot 1 if odd  (when v >= 4)
+ */
+static void
+emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
+                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
+                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
+{
+   /* Bail for odd input vertices — they never contribute. */
+   nir_def *v_is_even = nir_ieq_imm(b, nir_iand_imm(b, v, 1u), 0);
+   nir_push_if(b, v_is_even);
+   {
+      nir_def *N_half = nir_ushr_imm(b, N, 1);
+      nir_def *max_prim = nir_iadd_imm(b, N_half, -2); /* N/2 - 2 */
+      nir_def *v_half = nir_ushr_imm(b, v, 1);
+
+      /* Prim v/2 slot 0: v/2 < N/2 - 2 */
+      emit_prim_store(b, buf, output_count, instance_id,
+                      nir_ult(b, v_half, max_prim),
+                      v_half, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
+
+      /* Prim (v-2)/2 = v/2 - 1: v >= 2 AND prim < N/2-2 */
+      {
+         nir_def *prim = nir_iadd_imm(b, v_half, -1);
+         nir_def *parity = nir_iand_imm(b, prim, 1u);
+         nir_def *slot = nir_iadd_imm(b, parity, 1); /* even→1, odd→2 */
+         nir_def *eligible = nir_iand(b,
+            nir_uge(b, v, nir_imm_int(b, 2)),
+            nir_ult(b, prim, max_prim));
+         emit_prim_store(b, buf, output_count, instance_id, eligible,
+                         prim, slot, 3, value, stride, offset_bytes);
+      }
+
+      /* Prim (v-4)/2 = v/2 - 2: v >= 4 AND prim < N/2-2 */
+      {
+         nir_def *prim = nir_iadd_imm(b, v_half, -2);
+         nir_def *parity = nir_iand_imm(b, prim, 1u);
+         nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity); /* even→2, odd→1 */
+         nir_def *eligible = nir_iand(b,
+            nir_uge(b, v, nir_imm_int(b, 4)),
+            nir_ult(b, prim, max_prim));
+         emit_prim_store(b, buf, output_count, instance_id, eligible,
+                         prim, slot, 3, value, stride, offset_bytes);
+      }
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* ----- Main lowering: emits, for one XFB output slot of a store_output, the
+ * runtime topology dispatch and the global stores that capture its value. */
+static void
+lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr,
+                        unsigned channel_idx, unsigned num_components,
+                        unsigned buffer, unsigned offset_words)
+{
+   assert(buffer < MAX_XFB_BUFFERS);
+   assert(nir_intrinsic_component(intr) == 0);
+
+   uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
+   assert(stride != 0);
+   uint16_t offset_bytes = offset_words * 4;
+
+   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
+   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+
+   nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology);
+   nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count);
+   nir_def *N = nir_load_num_vertices(b);
+   nir_def *v = nir_load_raw_vertex_id_pan(b);
+   nir_def *instance = nir_load_instance_id(b);
+   nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
+
+   nir_def *src = intr->src[0].ssa;
+   nir_component_mask_t mask = nir_component_mask(num_components);
+   nir_def *value = nir_channels(b, src, mask << channel_idx);
+
+   /* Topology dispatch ladder. LIST first (fast path). */
+   nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST));
+   {
+      emit_list_store(b, buf, out_count, instance, v, value,
+                      stride, offset_bytes);
+   }
+   nir_push_else(b, NULL);
+   {
+      /* Gate all non-LIST emission on output_count > 0. For degenerate
+       * input counts (N below the topology's minimum), output_count is 0
+       * and we must emit NO stores — otherwise the N-2 / N-3 / etc.
+       * arithmetic in the eligibility predicates wraps around (unsigned)
+       * and stores fire spuriously. */
+      nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count));
+      {
+         nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP));
+         {
+            emit_tri_strip(b, v, N, buf, out_count, instance, value,
+                           stride, offset_bytes);
+         }
+         nir_push_else(b, NULL);
+         {
+            nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP));
+            {
+               emit_line_strip(b, v, N, buf, out_count, instance, value,
+                               stride, offset_bytes);
+            }
+            nir_push_else(b, NULL);
+            {
+               nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_FAN));
+               {
+                  emit_tri_fan(b, v, N, buf, out_count, instance, value,
+                               stride, offset_bytes);
+               }
+               nir_push_else(b, NULL);
+               {
+                  nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_LIST_ADJ));
+                  {
+                     emit_line_list_adj(b, v, N, buf, out_count, instance, value,
+                                        stride, offset_bytes);
+                  }
+                  nir_push_else(b, NULL);
+                  {
+                     nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP_ADJ));
+                     {
+                        emit_line_strip_adj(b, v, N, buf, out_count, instance, value,
+                                            stride, offset_bytes);
+                     }
+                     nir_push_else(b, NULL);
+                     {
+                        nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_LIST_ADJ));
+                        {
+                           emit_tri_list_adj(b, v, N, buf, out_count, instance, value,
+                                             stride, offset_bytes);
+                        }
+                        nir_push_else(b, NULL);
+                        {
+                           /* TRI_STRIP_ADJ — last case */
+                           emit_tri_strip_adj(b, v, N, buf, out_count, instance, value,
+                                              stride, offset_bytes);
+                        }
+                        nir_pop_if(b, NULL);
+                     }
+                     nir_pop_if(b, NULL);
+                  }
+                  nir_pop_if(b, NULL);
+               }
+               nir_pop_if(b, NULL);
+            }
+            nir_pop_if(b, NULL);
+         }
+         nir_pop_if(b, NULL);
+      }
+      nir_pop_if(b, NULL); /* closes the output_count > 0 guard */
+   }
+   nir_pop_if(b, NULL);
+}
+
+/* Mirror of pan_nir_lower_xfb's lower_xfb: load_vertex_id rewrite +
+ * dispatch store_output through our topology-aware emission. */
+static bool
+lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr,
+                 UNUSED void *data)
+{
+   if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
+      b->cursor = nir_instr_remove(&intr->instr);
+      nir_def *repl = nir_iadd(b, nir_load_raw_vertex_id_pan(b),
+                               nir_load_raw_vertex_offset_pan(b));
+      nir_def_rewrite_uses(&intr->def, repl);
+      return true;
+   }
+
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   bool progress = false;
+   b->cursor = nir_before_instr(&intr->instr);
+
+   /* io_xfb has only out[0,1]; the other 2 channels are in io_xfb2.
+    * Outer loop selects which annotation; inner picks which channel. */
+   for (unsigned i = 0; i < 2; ++i) {
+      nir_io_xfb xfb = i ? nir_intrinsic_io_xfb2(intr)
+                         : nir_intrinsic_io_xfb(intr);
+      for (unsigned j = 0; j < 2; ++j) {
+         if (!xfb.out[j].num_components)
+            continue;
+         lower_xfb_output_iter17(b, intr, i * 2 + j, xfb.out[j].num_components,
+                                 xfb.out[j].buffer, xfb.out[j].offset);
+         progress = true;
+      }
+   }
+
+   if (progress)
+      nir_instr_remove(&intr->instr);
+   return progress;
+}
+
+bool
+panvk_per_arch(nir_lower_xfb)(nir_shader *nir)
+{
+   return nir_shader_intrinsics_pass(
+      nir, lower_xfb_iter17, nir_metadata_none, NULL); /* inserts ifs/loops */
+}
+
+#endif /* PAN_ARCH < 9 */