/* * Copyright © 2026 mfritsche / claude-noether * SPDX-License-Identifier: MIT * * iter17: panvk-specific replacement for pan_nir_lower_xfb that handles * primitive decomposition for transform_feedback on non-LIST topologies * (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY). * * Approach: emit a topology dispatch at the start of each store_output * lowering. The shader reads vs.xfb_topology sysval at runtime and branches * into per-topology emission logic. For each affected topology, the lowered * code emits guarded conditional stores — one per primitive this vertex * contributes to, computing the output buffer position via primitive index * and slot within the decomposed primitive. * * For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that * matches iter13's single-store behavior. * * For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives * as slot 2 — handled via a NIR loop bounded by num_vertices. * * See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context. */ #include "panvk_macros.h" #if PAN_ARCH < 9 #include "panvk_shader.h" #include "compiler/nir/nir_builder.h" #include "pan_nir.h" #include /* ----- Address arithmetic ----- */ static nir_def * xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx, uint16_t stride, uint16_t offset_bytes) { nir_def *byte_off = nir_iadd_imm(b, nir_imul_imm(b, out_idx, stride), offset_bytes); return nir_iadd(b, buf, nir_u2u64(b, byte_off)); } static void emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *raw_vid, nir_def *value, uint16_t stride, uint16_t offset_bytes) { nir_def *out_idx = nir_iadd(b, nir_imul(b, instance_id, output_count), raw_vid); nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes); nir_store_global(b, value, addr); } static void emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *eligible, nir_def *prim_idx, nir_def *slot, uint32_t verts_per_prim, nir_def *value, uint16_t stride, uint16_t offset_bytes) { nir_push_if(b, eligible); { nir_def *out_idx = nir_iadd(b, nir_imul(b, instance_id, output_count), nir_iadd(b, nir_imul_imm(b, prim_idx, verts_per_prim), slot)); nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes); nir_store_global(b, value, addr); } nir_pop_if(b, NULL); } /* ----- Per-topology emission ----- */ /* TRIANGLE_STRIP: vertex v contributes to prims v, v-1, v-2 (per eligibility). */ static void emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *value, uint16_t stride, uint16_t offset_bytes) { nir_def *Nm2 = nir_iadd_imm(b, N, -2); nir_def *Nm1 = nir_iadd_imm(b, N, -1); /* Prim v, slot 0: v < N-2 */ emit_prim_store(b, buf, output_count, instance_id, nir_ult(b, v, Nm2), v, nir_imm_int(b, 0), 3, value, stride, offset_bytes); /* Prim v-1, slot = 1 if prim even else 2: 1 <= v < N-1 */ { nir_def *prim = nir_iadd_imm(b, v, -1); nir_def *parity = nir_iand_imm(b, prim, 1u); nir_def *slot = nir_iadd_imm(b, parity, 1); nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 1)), nir_ult(b, v, Nm1)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, slot, 3, value, stride, offset_bytes); } /* Prim v-2, slot = 2 if prim even else 1: 2 <= v < N */ { nir_def *prim = nir_iadd_imm(b, v, -2); nir_def *parity = nir_iand_imm(b, prim, 1u); nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity); nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 2)), nir_ult(b, v, N)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, slot, 3, value, stride, offset_bytes); } } /* LINE_STRIP: vertex v contributes to prim v slot 0 + prim v-1 slot 1. */ static void emit_line_strip(nir_builder *b, nir_def *v, nir_def *N, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *value, uint16_t stride, uint16_t offset_bytes) { nir_def *Nm1 = nir_iadd_imm(b, N, -1); /* Prim v, slot 0: v < N-1 */ emit_prim_store(b, buf, output_count, instance_id, nir_ult(b, v, Nm1), v, nir_imm_int(b, 0), 2, value, stride, offset_bytes); /* Prim v-1, slot 1: 1 <= v < N */ { nir_def *prim = nir_iadd_imm(b, v, -1); nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 1)), nir_ult(b, v, N)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes); } } /* TRIANGLE_FAN: prim p emits {p+1, p+2, 0}. * vertex v=0: contributes to ALL prims as slot 2 (loop required) * vertex v>=1: contributes to prim v-1 as slot 0 (if 1 <= v <= N-2) * vertex v>=2: contributes to prim v-2 as slot 1 (if 2 <= v <= N-1) */ static void emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *value, uint16_t stride, uint16_t offset_bytes) { nir_def *Nm1 = nir_iadd_imm(b, N, -1); nir_def *Nm2 = nir_iadd_imm(b, N, -2); /* Prim v-1, slot 0: 1 <= v < N-1 */ { nir_def *prim = nir_iadd_imm(b, v, -1); nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 1)), nir_ult(b, v, Nm1)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, nir_imm_int(b, 0), 3, value, stride, offset_bytes); } /* Prim v-2, slot 1: 2 <= v < N */ { nir_def *prim = nir_iadd_imm(b, v, -2); nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 2)), nir_ult(b, v, N)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, nir_imm_int(b, 1), 3, value, stride, offset_bytes); } /* Central vertex (v == 0): loop over all prims, write to slot 2. */ nir_push_if(b, nir_ieq_imm(b, v, 0)); { nir_variable *p_var = nir_local_variable_create(b->impl, glsl_uint_type(), "fan_p"); nir_store_var(b, p_var, nir_imm_int(b, 0), 0x1); nir_push_loop(b); { nir_def *p = nir_load_var(b, p_var); nir_push_if(b, nir_uge(b, p, Nm2)); { nir_jump(b, nir_jump_break); } nir_pop_if(b, NULL); nir_def *out_idx = nir_iadd(b, nir_imul(b, instance_id, output_count), nir_iadd_imm(b, nir_imul_imm(b, p, 3), 2)); nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes); nir_store_global(b, value, addr); nir_store_var(b, p_var, nir_iadd_imm(b, p, 1), 0x1); } nir_pop_loop(b, NULL); } nir_pop_if(b, NULL); } /* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}. * v contributes if v%4 == 1: prim v/4 slot 0 * v contributes if v%4 == 2: prim v/4 slot 1 */ static void emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *value, uint16_t stride, uint16_t offset_bytes) { (void)N; /* eligibility is mod-based, not range-based */ nir_def *vmod4 = nir_iand_imm(b, v, 3u); nir_def *prim = nir_ushr_imm(b, v, 2); /* v / 4 */ emit_prim_store(b, buf, output_count, instance_id, nir_ieq_imm(b, vmod4, 1), prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes); emit_prim_store(b, buf, output_count, instance_id, nir_ieq_imm(b, vmod4, 2), prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes); } /* LINE_STRIP_WITH_ADJACENCY: prim p emits {p+1, p+2}. * v contributes to prim v-1 slot 0 (1 <= v <= N-2) * v contributes to prim v-2 slot 1 (2 <= v <= N-1) */ static void emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *value, uint16_t stride, uint16_t offset_bytes) { nir_def *Nm1 = nir_iadd_imm(b, N, -1); nir_def *Nm2 = nir_iadd_imm(b, N, -2); /* Prim v-1, slot 0: 1 <= v <= N-2 ⇔ v >= 1 AND v <= N-2 ⇔ v >= 1 AND v < N-1 */ { nir_def *prim = nir_iadd_imm(b, v, -1); nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 1)), nir_ult(b, v, Nm1)); (void)Nm2; emit_prim_store(b, buf, output_count, instance_id, eligible, prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes); } /* Prim v-2, slot 1: 2 <= v <= N-1 ⇔ v >= 2 AND v < N */ { nir_def *prim = nir_iadd_imm(b, v, -2); nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 2)), nir_ult(b, v, N)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes); } } /* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}. * v contributes if v%6 == 0: prim v/6 slot 0 * v contributes if v%6 == 2: prim v/6 slot 1 * v contributes if v%6 == 4: prim v/6 slot 2 */ static void emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *value, uint16_t stride, uint16_t offset_bytes) { (void)N; nir_def *vmod6 = nir_umod_imm(b, v, 6); nir_def *prim = nir_udiv_imm(b, v, 6); for (uint32_t slot = 0; slot < 3; slot++) { emit_prim_store(b, buf, output_count, instance_id, nir_ieq_imm(b, vmod6, slot * 2), prim, nir_imm_int(b, slot), 3, value, stride, offset_bytes); } } /* TRIANGLE_STRIP_WITH_ADJACENCY: prim i emits: * even i: {2i, 2i+2, 2i+4} (slots 0, 1, 2 ← input indices 2i, 2i+2, 2i+4) * odd i: {2i, 2i+4, 2i+2} (slots 0, 1, 2 ← input indices 2i, 2i+4, 2i+2) * * Only EVEN input vertices contribute (since all output indices are 2*something). * For even input v: * prim v/2 slot 0 (always, if v/2 < N/2-2) * prim (v-2)/2 slot 1 if (v-2)/2 even, slot 2 if odd (when v >= 2) * prim (v-4)/2 slot 2 if (v-4)/2 even, slot 1 if odd (when v >= 4) */ static void emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N, nir_def *buf, nir_def *output_count, nir_def *instance_id, nir_def *value, uint16_t stride, uint16_t offset_bytes) { /* Bail for odd input vertices — they never contribute. */ nir_def *v_is_even = nir_ieq_imm(b, nir_iand_imm(b, v, 1u), 0); nir_push_if(b, v_is_even); { nir_def *N_half = nir_ushr_imm(b, N, 1); nir_def *max_prim = nir_iadd_imm(b, N_half, -2); /* N/2 - 2 */ nir_def *v_half = nir_ushr_imm(b, v, 1); /* Prim v/2 slot 0: v/2 < N/2 - 2 */ emit_prim_store(b, buf, output_count, instance_id, nir_ult(b, v_half, max_prim), v_half, nir_imm_int(b, 0), 3, value, stride, offset_bytes); /* Prim (v-2)/2 = v/2 - 1: v >= 2 AND prim < N/2-2 */ { nir_def *prim = nir_iadd_imm(b, v_half, -1); nir_def *parity = nir_iand_imm(b, prim, 1u); nir_def *slot = nir_iadd_imm(b, parity, 1); /* even→1, odd→2 */ nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 2)), nir_ult(b, prim, max_prim)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, slot, 3, value, stride, offset_bytes); } /* Prim (v-4)/2 = v/2 - 2: v >= 4 AND prim < N/2-2 */ { nir_def *prim = nir_iadd_imm(b, v_half, -2); nir_def *parity = nir_iand_imm(b, prim, 1u); nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity); /* even→2, odd→1 */ nir_def *eligible = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 4)), nir_ult(b, prim, max_prim)); emit_prim_store(b, buf, output_count, instance_id, eligible, prim, slot, 3, value, stride, offset_bytes); } } nir_pop_if(b, NULL); } /* ----- Main lowering: per store_output XFB channel ----- */ static void lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr, unsigned channel_idx, unsigned num_components, unsigned buffer, unsigned offset_words) { assert(buffer < MAX_XFB_BUFFERS); assert(nir_intrinsic_component(intr) == 0); uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; assert(stride != 0); uint16_t offset_bytes = offset_words * 4; BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology); nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count); nir_def *N = nir_load_num_vertices(b); nir_def *v = nir_load_raw_vertex_id_pan(b); nir_def *instance = nir_load_instance_id(b); nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer); nir_def *src = intr->src[0].ssa; nir_component_mask_t mask = nir_component_mask(num_components); nir_def *value = nir_channels(b, src, mask << channel_idx); /* Topology dispatch ladder. LIST first (fast path). */ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST)); { emit_list_store(b, buf, out_count, instance, v, value, stride, offset_bytes); } nir_push_else(b, NULL); { /* iter17 Janet Finding 3: gate all non-LIST emission on * output_count > 0. For degenerate input counts (N < min required * for the topology), output_count is 0 and we must emit NO stores * — otherwise N-2 / N-3 / etc. arithmetic underflows in the * eligibility predicates and we falsely fire stores. */ nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count)); { nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP)); { emit_tri_strip(b, v, N, buf, out_count, instance, value, stride, offset_bytes); } nir_push_else(b, NULL); { nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP)); { emit_line_strip(b, v, N, buf, out_count, instance, value, stride, offset_bytes); } nir_push_else(b, NULL); { nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_FAN)); { emit_tri_fan(b, v, N, buf, out_count, instance, value, stride, offset_bytes); } nir_push_else(b, NULL); { nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_LIST_ADJ)); { emit_line_list_adj(b, v, N, buf, out_count, instance, value, stride, offset_bytes); } nir_push_else(b, NULL); { nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP_ADJ)); { emit_line_strip_adj(b, v, N, buf, out_count, instance, value, stride, offset_bytes); } nir_push_else(b, NULL); { nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_LIST_ADJ)); { emit_tri_list_adj(b, v, N, buf, out_count, instance, value, stride, offset_bytes); } nir_push_else(b, NULL); { /* TRI_STRIP_ADJ — last case */ emit_tri_strip_adj(b, v, N, buf, out_count, instance, value, stride, offset_bytes); } nir_pop_if(b, NULL); } nir_pop_if(b, NULL); } nir_pop_if(b, NULL); } nir_pop_if(b, NULL); } nir_pop_if(b, NULL); } nir_pop_if(b, NULL); } nir_pop_if(b, NULL); /* Janet Finding 3: close output_count > 0 guard */ } nir_pop_if(b, NULL); } /* Mirror of pan_nir_lower_xfb's lower_xfb: load_vertex_id rewrite + * dispatch store_output through our topology-aware emission. */ static bool lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) { if (intr->intrinsic == nir_intrinsic_load_vertex_id) { b->cursor = nir_instr_remove(&intr->instr); nir_def *repl = nir_iadd(b, nir_load_raw_vertex_id_pan(b), nir_load_raw_vertex_offset_pan(b)); nir_def_rewrite_uses(&intr->def, repl); return true; } if (intr->intrinsic != nir_intrinsic_store_output) return false; bool progress = false; b->cursor = nir_before_instr(&intr->instr); /* io_xfb has only out[0,1]; the other 2 channels are in io_xfb2. * Outer loop selects which annotation; inner picks which channel. */ for (unsigned i = 0; i < 2; ++i) { nir_io_xfb xfb = i ? nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr); for (unsigned j = 0; j < 2; ++j) { if (!xfb.out[j].num_components) continue; lower_xfb_output_iter17(b, intr, i * 2 + j, xfb.out[j].num_components, xfb.out[j].buffer, xfb.out[j].offset); progress = true; } } if (progress) nir_instr_remove(&intr->instr); return progress; } bool panvk_per_arch(nir_lower_xfb)(nir_shader *nir) { return nir_shader_intrinsics_pass( nir, lower_xfb_iter17, nir_metadata_control_flow, NULL); } #endif /* PAN_ARCH < 9 */