forked from marfrit/panvk-bifrost
initial seed: retrofit campaign lineage from local working trees
panvk-bifrost campaigns (r1..r4 Vulkan compositor + r5.video1 Vulkan
video decode) shipped before this repo existed; the deliverable
patches live in marfrit-packages, but the reasoning chain, phase docs,
and source-state evidence lived only in local working trees on the
development host.
This retrofit imports:
- mesa-panvk-bifrost/ — r1..r4 era phase docs (iter1..iter18)
(libmali stub blobs at iter18/blob/ excluded
— 109MB of RE artifacts replaced with a README
pointer)
- mesa-panvk-bifrost-video/ — sibling campaign phase docs + probe
- evidence/ — frozen .tgz source snapshots at each milestone
(basis for the 0005 patch diff generation)
Future iterations should branch off here from day one, so each iter is
a commit rather than a snapshot. See [[feedback-session-local-process-pins]]
for the process drift this retrofit closes.
Total: 1.9 MB across 124 files.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,486 @@
|
||||
/*
|
||||
* Copyright © 2026 mfritsche / claude-noether
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* iter17: panvk-specific replacement for pan_nir_lower_xfb that handles
|
||||
* primitive decomposition for transform_feedback on non-LIST topologies
|
||||
* (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY).
|
||||
*
|
||||
* Approach: emit a topology dispatch at the start of each store_output
|
||||
* lowering. The shader reads vs.xfb_topology sysval at runtime and branches
|
||||
* into per-topology emission logic. For each affected topology, the lowered
|
||||
* code emits guarded conditional stores — one per primitive this vertex
|
||||
* contributes to, computing the output buffer position via primitive index
|
||||
* and slot within the decomposed primitive.
|
||||
*
|
||||
* For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that
|
||||
* matches iter13's single-store behavior.
|
||||
*
|
||||
* For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives
|
||||
* as slot 2 — handled via a NIR loop bounded by num_vertices.
|
||||
*
|
||||
* See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context.
|
||||
*/
|
||||
|
||||
#include "panvk_macros.h"
|
||||
|
||||
#if PAN_ARCH < 9
|
||||
|
||||
#include "panvk_shader.h"
|
||||
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "pan_nir.h"
|
||||
|
||||
#include <vulkan/vulkan_core.h>
|
||||
|
||||
/* ----- Address arithmetic ----- */
|
||||
|
||||
static nir_def *
|
||||
xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx,
|
||||
uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
nir_def *byte_off = nir_iadd_imm(b,
|
||||
nir_imul_imm(b, out_idx, stride), offset_bytes);
|
||||
return nir_iadd(b, buf, nir_u2u64(b, byte_off));
|
||||
}
|
||||
|
||||
static void
|
||||
emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count,
|
||||
nir_def *instance_id, nir_def *raw_vid, nir_def *value,
|
||||
uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
nir_def *out_idx = nir_iadd(b,
|
||||
nir_imul(b, instance_id, output_count), raw_vid);
|
||||
nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
|
||||
nir_store_global(b, value, addr);
|
||||
}
|
||||
|
||||
static void
|
||||
emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count,
|
||||
nir_def *instance_id, nir_def *eligible,
|
||||
nir_def *prim_idx, nir_def *slot,
|
||||
uint32_t verts_per_prim,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
nir_push_if(b, eligible);
|
||||
{
|
||||
nir_def *out_idx = nir_iadd(b,
|
||||
nir_imul(b, instance_id, output_count),
|
||||
nir_iadd(b, nir_imul_imm(b, prim_idx, verts_per_prim), slot));
|
||||
nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
|
||||
nir_store_global(b, value, addr);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
|
||||
/* ----- Per-topology emission ----- */
|
||||
|
||||
/* TRIANGLE_STRIP: vertex v contributes to prims v, v-1, v-2 (per eligibility). */
|
||||
static void
|
||||
emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
nir_def *Nm2 = nir_iadd_imm(b, N, -2);
|
||||
nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||
|
||||
/* Prim v, slot 0: v < N-2 */
|
||||
emit_prim_store(b, buf, output_count, instance_id,
|
||||
nir_ult(b, v, Nm2),
|
||||
v, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
|
||||
|
||||
/* Prim v-1, slot = 1 if prim even else 2: 1 <= v < N-1 */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||
nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||
nir_def *slot = nir_iadd_imm(b, parity, 1);
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 1)),
|
||||
nir_ult(b, v, Nm1));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, slot, 3, value, stride, offset_bytes);
|
||||
}
|
||||
|
||||
/* Prim v-2, slot = 2 if prim even else 1: 2 <= v < N */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -2);
|
||||
nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||
nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity);
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 2)),
|
||||
nir_ult(b, v, N));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, slot, 3, value, stride, offset_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/* LINE_STRIP: vertex v contributes to prim v slot 0 + prim v-1 slot 1. */
|
||||
static void
|
||||
emit_line_strip(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||
|
||||
/* Prim v, slot 0: v < N-1 */
|
||||
emit_prim_store(b, buf, output_count, instance_id,
|
||||
nir_ult(b, v, Nm1),
|
||||
v, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
|
||||
|
||||
/* Prim v-1, slot 1: 1 <= v < N */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 1)),
|
||||
nir_ult(b, v, N));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/* TRIANGLE_FAN: prim p emits {p+1, p+2, 0}.
|
||||
* vertex v=0: contributes to ALL prims as slot 2 (loop required)
|
||||
* vertex v>=1: contributes to prim v-1 as slot 0 (if 1 <= v <= N-2)
|
||||
* vertex v>=2: contributes to prim v-2 as slot 1 (if 2 <= v <= N-1)
|
||||
*/
|
||||
static void
|
||||
emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||
nir_def *Nm2 = nir_iadd_imm(b, N, -2);
|
||||
|
||||
/* Prim v-1, slot 0: 1 <= v < N-1 */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 1)),
|
||||
nir_ult(b, v, Nm1));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
|
||||
}
|
||||
|
||||
/* Prim v-2, slot 1: 2 <= v < N */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -2);
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 2)),
|
||||
nir_ult(b, v, N));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, nir_imm_int(b, 1), 3, value, stride, offset_bytes);
|
||||
}
|
||||
|
||||
/* Central vertex (v == 0): loop over all prims, write to slot 2. */
|
||||
nir_push_if(b, nir_ieq_imm(b, v, 0));
|
||||
{
|
||||
nir_variable *p_var = nir_local_variable_create(b->impl,
|
||||
glsl_uint_type(), "fan_p");
|
||||
nir_store_var(b, p_var, nir_imm_int(b, 0), 0x1);
|
||||
nir_push_loop(b);
|
||||
{
|
||||
nir_def *p = nir_load_var(b, p_var);
|
||||
nir_push_if(b, nir_uge(b, p, Nm2));
|
||||
{
|
||||
nir_jump(b, nir_jump_break);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
|
||||
nir_def *out_idx = nir_iadd(b,
|
||||
nir_imul(b, instance_id, output_count),
|
||||
nir_iadd_imm(b, nir_imul_imm(b, p, 3), 2));
|
||||
nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
|
||||
nir_store_global(b, value, addr);
|
||||
|
||||
nir_store_var(b, p_var, nir_iadd_imm(b, p, 1), 0x1);
|
||||
}
|
||||
nir_pop_loop(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
|
||||
/* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}.
|
||||
* v contributes if v%4 == 1: prim v/4 slot 0
|
||||
* v contributes if v%4 == 2: prim v/4 slot 1
|
||||
*/
|
||||
static void
|
||||
emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
(void)N; /* eligibility is mod-based, not range-based */
|
||||
nir_def *vmod4 = nir_iand_imm(b, v, 3u);
|
||||
nir_def *prim = nir_ushr_imm(b, v, 2); /* v / 4 */
|
||||
|
||||
emit_prim_store(b, buf, output_count, instance_id,
|
||||
nir_ieq_imm(b, vmod4, 1),
|
||||
prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
|
||||
|
||||
emit_prim_store(b, buf, output_count, instance_id,
|
||||
nir_ieq_imm(b, vmod4, 2),
|
||||
prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
|
||||
}
|
||||
|
||||
/* LINE_STRIP_WITH_ADJACENCY: prim p emits {p+1, p+2}.
|
||||
* v contributes to prim v-1 slot 0 (1 <= v <= N-2)
|
||||
* v contributes to prim v-2 slot 1 (2 <= v <= N-1)
|
||||
*/
|
||||
static void
|
||||
emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||
nir_def *Nm2 = nir_iadd_imm(b, N, -2);
|
||||
|
||||
/* Prim v-1, slot 0: 1 <= v <= N-2 ⇔ v >= 1 AND v <= N-2 ⇔ v >= 1 AND v < N-1 */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 1)),
|
||||
nir_ult(b, v, Nm1));
|
||||
(void)Nm2;
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
|
||||
}
|
||||
|
||||
/* Prim v-2, slot 1: 2 <= v <= N-1 ⇔ v >= 2 AND v < N */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -2);
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 2)),
|
||||
nir_ult(b, v, N));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}.
|
||||
* v contributes if v%6 == 0: prim v/6 slot 0
|
||||
* v contributes if v%6 == 2: prim v/6 slot 1
|
||||
* v contributes if v%6 == 4: prim v/6 slot 2
|
||||
*/
|
||||
static void
|
||||
emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
(void)N;
|
||||
nir_def *vmod6 = nir_umod_imm(b, v, 6);
|
||||
nir_def *prim = nir_udiv_imm(b, v, 6);
|
||||
|
||||
for (uint32_t slot = 0; slot < 3; slot++) {
|
||||
emit_prim_store(b, buf, output_count, instance_id,
|
||||
nir_ieq_imm(b, vmod6, slot * 2),
|
||||
prim, nir_imm_int(b, slot), 3, value, stride, offset_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/* TRIANGLE_STRIP_WITH_ADJACENCY: prim i emits:
|
||||
* even i: {2i, 2i+2, 2i+4} (slots 0, 1, 2 ← input indices 2i, 2i+2, 2i+4)
|
||||
* odd i: {2i, 2i+4, 2i+2} (slots 0, 1, 2 ← input indices 2i, 2i+4, 2i+2)
|
||||
*
|
||||
* Only EVEN input vertices contribute (since all output indices are 2*something).
|
||||
* For even input v:
|
||||
* prim v/2 slot 0 (always, if v/2 < N/2-2)
|
||||
* prim (v-2)/2 slot 1 if (v-2)/2 even, slot 2 if odd (when v >= 2)
|
||||
* prim (v-4)/2 slot 2 if (v-4)/2 even, slot 1 if odd (when v >= 4)
|
||||
*/
|
||||
static void
|
||||
emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||
nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||
{
|
||||
/* Bail for odd input vertices — they never contribute. */
|
||||
nir_def *v_is_even = nir_ieq_imm(b, nir_iand_imm(b, v, 1u), 0);
|
||||
nir_push_if(b, v_is_even);
|
||||
{
|
||||
nir_def *N_half = nir_ushr_imm(b, N, 1);
|
||||
nir_def *max_prim = nir_iadd_imm(b, N_half, -2); /* N/2 - 2 */
|
||||
nir_def *v_half = nir_ushr_imm(b, v, 1);
|
||||
|
||||
/* Prim v/2 slot 0: v/2 < N/2 - 2 */
|
||||
emit_prim_store(b, buf, output_count, instance_id,
|
||||
nir_ult(b, v_half, max_prim),
|
||||
v_half, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
|
||||
|
||||
/* Prim (v-2)/2 = v/2 - 1: v >= 2 AND prim < N/2-2 */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v_half, -1);
|
||||
nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||
nir_def *slot = nir_iadd_imm(b, parity, 1); /* even→1, odd→2 */
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 2)),
|
||||
nir_ult(b, prim, max_prim));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, slot, 3, value, stride, offset_bytes);
|
||||
}
|
||||
|
||||
/* Prim (v-4)/2 = v/2 - 2: v >= 4 AND prim < N/2-2 */
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v_half, -2);
|
||||
nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||
nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity); /* even→2, odd→1 */
|
||||
nir_def *eligible = nir_iand(b,
|
||||
nir_uge(b, v, nir_imm_int(b, 4)),
|
||||
nir_ult(b, prim, max_prim));
|
||||
emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||
prim, slot, 3, value, stride, offset_bytes);
|
||||
}
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
|
||||
/* ----- Main lowering: per store_output XFB channel ----- */
|
||||
|
||||
static void
|
||||
lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr,
|
||||
unsigned channel_idx, unsigned num_components,
|
||||
unsigned buffer, unsigned offset_words)
|
||||
{
|
||||
assert(buffer < MAX_XFB_BUFFERS);
|
||||
assert(nir_intrinsic_component(intr) == 0);
|
||||
|
||||
uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
|
||||
assert(stride != 0);
|
||||
uint16_t offset_bytes = offset_words * 4;
|
||||
|
||||
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
|
||||
BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
|
||||
|
||||
nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology);
|
||||
nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count);
|
||||
nir_def *N = nir_load_num_vertices(b);
|
||||
nir_def *v = nir_load_raw_vertex_id_pan(b);
|
||||
nir_def *instance = nir_load_instance_id(b);
|
||||
nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
|
||||
|
||||
nir_def *src = intr->src[0].ssa;
|
||||
nir_component_mask_t mask = nir_component_mask(num_components);
|
||||
nir_def *value = nir_channels(b, src, mask << channel_idx);
|
||||
|
||||
/* Topology dispatch ladder. LIST first (fast path). */
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST));
|
||||
{
|
||||
emit_list_store(b, buf, out_count, instance, v, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
/* iter17 Janet Finding 3: gate all non-LIST emission on
|
||||
* output_count > 0. For degenerate input counts (N < min required
|
||||
* for the topology), output_count is 0 and we must emit NO stores
|
||||
* — otherwise N-2 / N-3 / etc. arithmetic underflows in the
|
||||
* eligibility predicates and we falsely fire stores. */
|
||||
nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count));
|
||||
{
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP));
|
||||
{
|
||||
emit_tri_strip(b, v, N, buf, out_count, instance, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP));
|
||||
{
|
||||
emit_line_strip(b, v, N, buf, out_count, instance, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_FAN));
|
||||
{
|
||||
emit_tri_fan(b, v, N, buf, out_count, instance, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_LIST_ADJ));
|
||||
{
|
||||
emit_line_list_adj(b, v, N, buf, out_count, instance, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP_ADJ));
|
||||
{
|
||||
emit_line_strip_adj(b, v, N, buf, out_count, instance, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_LIST_ADJ));
|
||||
{
|
||||
emit_tri_list_adj(b, v, N, buf, out_count, instance, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
/* TRI_STRIP_ADJ — last case */
|
||||
emit_tri_strip_adj(b, v, N, buf, out_count, instance, value,
|
||||
stride, offset_bytes);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
nir_pop_if(b, NULL); /* Janet Finding 3: close output_count > 0 guard */
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
}
|
||||
|
||||
/* Mirror of pan_nir_lower_xfb's lower_xfb: load_vertex_id rewrite +
|
||||
* dispatch store_output through our topology-aware emission. */
|
||||
static bool
|
||||
lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr,
|
||||
UNUSED void *data)
|
||||
{
|
||||
if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
|
||||
b->cursor = nir_instr_remove(&intr->instr);
|
||||
nir_def *repl = nir_iadd(b, nir_load_raw_vertex_id_pan(b),
|
||||
nir_load_raw_vertex_offset_pan(b));
|
||||
nir_def_rewrite_uses(&intr->def, repl);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (intr->intrinsic != nir_intrinsic_store_output)
|
||||
return false;
|
||||
|
||||
bool progress = false;
|
||||
b->cursor = nir_before_instr(&intr->instr);
|
||||
|
||||
/* io_xfb has only out[0,1]; the other 2 channels are in io_xfb2.
|
||||
* Outer loop selects which annotation; inner picks which channel. */
|
||||
for (unsigned i = 0; i < 2; ++i) {
|
||||
nir_io_xfb xfb = i ? nir_intrinsic_io_xfb2(intr)
|
||||
: nir_intrinsic_io_xfb(intr);
|
||||
for (unsigned j = 0; j < 2; ++j) {
|
||||
if (!xfb.out[j].num_components)
|
||||
continue;
|
||||
lower_xfb_output_iter17(b, intr, i * 2 + j, xfb.out[j].num_components,
|
||||
xfb.out[j].buffer, xfb.out[j].offset);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (progress)
|
||||
nir_instr_remove(&intr->instr);
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
panvk_per_arch(nir_lower_xfb)(nir_shader *nir)
|
||||
{
|
||||
return nir_shader_intrinsics_pass(
|
||||
nir, lower_xfb_iter17, nir_metadata_control_flow, NULL);
|
||||
}
|
||||
|
||||
#endif /* PAN_ARCH < 9 */
|
||||
@@ -0,0 +1,68 @@
|
||||
# Phase 0 — substrate lock for iter17
|
||||
|
||||
**Goal:** close the 162 `winding_*` CTS failures from iter15 via **NIR-pass-level primitive decomposition** in (a panvk-specific replacement of) `pan_nir_lower_xfb`. iter16 attempted dispatch-level decomposition and hit an opaque wall; this iter bypasses that entire surface.
|
||||
|
||||
Operator framing 2026-05-21: "2 it is" — picked Path C from iter16's deferred-close architect consultation.
|
||||
|
||||
## What changed since iter16
|
||||
|
||||
- iter16's WIP patches REVERTED on ohm. Source tree at `/home/mfritsche/mesa-build/mesa-26.0.6/` is back to clean iter13 r3 state (iter8+iter9 sed-applied + iter13 unified-diff applied).
|
||||
- Verification: probe_winding.c against the rebuilt iter13-only lib captures 8 entries for TRIANGLE_STRIP — matches the pre-iter16 baseline.
|
||||
- `panvk_vX_winding.c` left on disk as an orphan (not in meson). May be reused as a reference for the per-topology mapping logic when porting to NIR builder form. Or deleted in Phase 4 if unused.
|
||||
|
||||
## What iter17 needs (NIR-pass approach)
|
||||
|
||||
Currently `pan_nir_lower_xfb` at `src/panfrost/compiler/pan_nir_lower_xfb.c` (80 LoC) emits ONE `nir_store_global` per VS invocation:
|
||||
|
||||
```
|
||||
index = instance_id * num_vertices + raw_vertex_id_pan
|
||||
addr = xfb_address[buffer] + index * stride + offset
|
||||
store_global(addr, captured_value)
|
||||
```
|
||||
|
||||
For strip/fan/adjacency topologies, the spec wants OUTPUT-VERTEX indexing, not INPUT-vertex indexing. iter17's approach: emit MULTIPLE store_globals per VS invocation, one for each primitive this vertex contributes to. For TRIANGLE_STRIP with input vertex v on a strip of N vertices:
|
||||
- Contributes to prim (v−2) if v ≥ 2: slot 2 if (v−2)%2==0 else slot 1
|
||||
- Contributes to prim (v−1) if v ≥ 1 and v+1 < N: slot 1 if (v−1)%2==0 else slot 2
|
||||
- Contributes to prim v if v+2 < N: slot 0
|
||||
|
||||
For each contribution, compute the XFB output position (`prim_idx * verts_per_prim + slot`) and emit a guarded store. All seven affected topologies have similar contribution maps.
|
||||
|
||||
## Topology must be available at NIR-pass time
|
||||
|
||||
Pipeline compilation doesn't currently know the draw topology — that's draw-state. Two options:
|
||||
|
||||
| Approach | Cost | Notes |
|
||||
|---|---|---|
|
||||
| Variant explosion: compile 1 shader per (XFB-bearing × topology) combo | 1+7 = 8 variants per XFB shader, on top of iter13's 1 variant. Modest shader-cache bloat but no runtime overhead. | Pipeline knows topology at draw-bind time → select variant. |
|
||||
| Sysval `vs.xfb_topology` + runtime switch in shader | 1 variant per XFB shader. Single shader with switch on the topology sysval, branches to per-topology contribution logic. | Slight per-VS-invocation overhead from the switch; cleaner cache. |
|
||||
|
||||
**Lean: sysval approach** (Phase 2 will lock it). Variant explosion is wasteful when ANGLE (the only real consumer) pre-decomposes anyway and the workload here is purely for raw-Vulkan-app compliance with CTS.
|
||||
|
||||
## Out-of-scope failure modes
|
||||
|
||||
- `pan_nir_lower_xfb` is **upstream Mesa code shared with Panfrost-Gallium**. Modifying it directly would affect Gallium GL XFB on Bifrost+Valhall — same hardware, different code path consumers. Per [[feedback-no-upstream-proposals]] we won't upstream; per safety we won't disturb the Gallium consumers either.
|
||||
- **Decision (locked here):** instead of modifying `pan_nir_lower_xfb`, write a **panvk-specific replacement pass** in `src/panfrost/vulkan/panvk_vX_xfb_lower.c` (or similar) that does what `pan_nir_lower_xfb` does AND the multi-store decomposition. iter13's call to `pan_nir_lower_xfb` in `panvk_vX_shader.c` is replaced with our new pass. Gallium consumers stay untouched.
|
||||
|
||||
## Time / complexity estimate
|
||||
|
||||
- Phase 1 source map (read pan_nir_lower_xfb.c, understand NIR builders): 1-2h
|
||||
- Phase 2 design lock (sysval format, per-topology contribution logic): 1-2h
|
||||
- Phase 3 probe: already exists (iter16/probe_winding.c) — just reuse
|
||||
- Phase 4 implementation: 1-3 days (write panvk_vX_xfb_lower.c, wire into panvk_vX_shader.c, fix until probe passes)
|
||||
- Phase 5 review: spawn janet/Plan reviewer
|
||||
- Phase 6 CTS rerun: ~2h
|
||||
- Phase 8 PKGBUILD + close: standard
|
||||
|
||||
Total estimate: 3-5 working days for the full cycle, comparable to iter16's plan.
|
||||
|
||||
## Risk
|
||||
|
||||
The iter17 approach trades dispatch-level surface (which broke in iter16) for NIR-pass surface. The NIR-pass is more concentrated and testable in isolation, but Mesa's NIR API is complex. Failure modes for iter17:
|
||||
|
||||
- NIR builders for per-vertex contribution logic might not compose right with iter13's existing pan_nir_lower_xfb structure
|
||||
- Topology sysval threading might run into the same "shader compile doesn't know topology" issue at a slightly different layer
|
||||
- Bifrost compiler might not optimize the multi-store pattern well, causing GPU stalls on register pressure
|
||||
|
||||
If iter17 hits a wall as deep as iter16's, the campaign retreats with TWO documented attempt-and-defer iterations on the winding problem. That's still useful — clear documentation that this corner is hard.
|
||||
|
||||
— claude-noether, 2026-05-21
|
||||
@@ -0,0 +1,144 @@
|
||||
# Phase 1 — source map for iter17
|
||||
|
||||
## `pan_nir_lower_xfb.c` (80 LoC)
|
||||
|
||||
Anatomy:
|
||||
|
||||
| Lines | Function | What it does |
|
||||
|---|---|---|
|
||||
| 9-40 | `lower_xfb_output` | Per (output, channel) → emit ONE `store_global` |
|
||||
| 42-77 | `lower_xfb` | Per intrinsic: handle `load_vertex_id` rewrite + dispatch to `lower_xfb_output` for each non-zero channel in the `nir_io_xfb` annotation |
|
||||
| 79-84 | `pan_nir_lower_xfb` | Top-level wrapper calling `nir_shader_intrinsics_pass` |
|
||||
|
||||
### Core formula (lines 23-34)
|
||||
|
||||
```c
|
||||
nir_def *index = nir_iadd(b,
|
||||
nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)),
|
||||
nir_load_raw_vertex_id_pan(b));
|
||||
nir_def *addr = xfb_address[buffer] + index * stride + offset_bytes;
|
||||
nir_store_global(b, value, addr);
|
||||
```
|
||||
|
||||
**Critical observation:** `nir_load_num_vertices(b)` is a sysval — already in iter13's `panvk_graphics_sysvals.vs.num_vertices`. iter16's design added a second sysval (`xfb.decomposed_count`) for the override case. iter17 doesn't need that one; we keep input_count in `num_vertices` and do the decomposition arithmetic in the shader using a *third* sysval: `vs.xfb_topology`.
|
||||
|
||||
## NIR builder pattern we'll use
|
||||
|
||||
For our panvk-specific replacement pass, the existing single store becomes:
|
||||
|
||||
```c
|
||||
nir_def *topology = load_sysval(b, vs.xfb_topology); /* uint32 */
|
||||
|
||||
/* Branch per topology family. Each branch emits 1-3 (or more for TRI_FAN)
|
||||
* conditional stores per VS invocation. */
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP));
|
||||
{
|
||||
emit_tri_strip_stores(b, /* contribution arithmetic */);
|
||||
}
|
||||
nir_push_else(b, NULL);
|
||||
{
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP));
|
||||
{
|
||||
emit_line_strip_stores(b, ...);
|
||||
}
|
||||
/* ... etc per topology ... */
|
||||
}
|
||||
```
|
||||
|
||||
## Per-vertex contribution map
|
||||
|
||||
For each affected topology, **input vertex v** contributes to a small set of `(primitive_idx, slot)` pairs.
|
||||
|
||||
### TRIANGLE_STRIP (canonical case)
|
||||
|
||||
Decomposition: prim p emits `{p, p+1+p%2, p+2-p%2}` (even/odd winding flip).
|
||||
|
||||
Inverse — for input vertex v on a strip of N vertices, contributes to:
|
||||
|
||||
| Primitive | Eligibility | Slot |
|
||||
|---|---|---|
|
||||
| p = v | 0 ≤ v ≤ N−3 | 0 |
|
||||
| p = v − 1 | 1 ≤ v ≤ N−2 | 1 if (v−1) even, else 2 |
|
||||
| p = v − 2 | 2 ≤ v ≤ N−1 | 2 if (v−2) even, else 1 |
|
||||
|
||||
Up to 3 stores per VS invocation. Each store guarded by the eligibility predicate.
|
||||
|
||||
### LINE_STRIP
|
||||
|
||||
Decomposition: prim p emits `{p, p+1}`. Vertex v contributes to:
|
||||
|
||||
| Primitive | Eligibility | Slot |
|
||||
|---|---|---|
|
||||
| p = v | 0 ≤ v ≤ N−2 | 0 |
|
||||
| p = v − 1 | 1 ≤ v ≤ N−1 | 1 |
|
||||
|
||||
Up to 2 stores.
|
||||
|
||||
### TRIANGLE_FAN — the awkward case
|
||||
|
||||
Decomposition: prim p emits `{p+1, p+2, 0}`. Vertex v contributes to:
|
||||
|
||||
| Primitive | Eligibility | Slot |
|
||||
|---|---|---|
|
||||
| p = v − 1 | 1 ≤ v ≤ N−2 | 0 |
|
||||
| p = v − 2 | 2 ≤ v ≤ N−1 | 1 |
|
||||
| **p = any in [0, N−2)** | **v == 0** | **2** |
|
||||
|
||||
The **central vertex (v=0)** contributes to ALL primitives as slot 2. That's O(N) stores from a single VS invocation, requiring a **NIR loop** bounded by `num_vertices`.
|
||||
|
||||
### Adjacency variants
|
||||
|
||||
- LINE_LIST_WITH_ADJACENCY: prim p emits `{4p+1, 4p+2}`. Vertex v contributes only if (v%4 ∈ {1, 2}) — O(1) stores.
|
||||
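- LINE_STRIP_WITH_ADJACENCY: prim p emits `{p+1, p+2}` for p in [0, N−3) — each segment also needs adjacency vertices p and p+3, so N input vertices yield N−3 segments. Similar to LINE_STRIP shifted by 1. O(1) stores.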
|
||||
- TRIANGLE_LIST_WITH_ADJACENCY: prim p emits `{6p, 6p+2, 6p+4}`. Vertex v contributes only if (v%6 ∈ {0, 2, 4}) — O(1) stores.
|
||||
- TRIANGLE_STRIP_WITH_ADJACENCY: prim p emits `{2p, 2p+2, 2p+4}` for even p; `{2p, 2p+4, 2p+2}` for odd. O(1) stores per vertex.
|
||||
|
||||
## Implications for Phase 2
|
||||
|
||||
- **6 of 7 affected topologies have O(1) contributions per VS invocation** — straightforward `nir_push_if` + emit.
|
||||
- **TRIANGLE_FAN's central vertex needs a NIR loop** — requires `nir_push_loop` and a conditional break (`nir_jump(b, nir_jump_break)` inside a `nir_push_if`) based on `num_vertices`.
|
||||
- **The runtime topology switch** is a 7-way branch on `vs.xfb_topology` sysval (plus a pass-through for LIST topologies). NIR generates clean conditional code; Bifrost backend should optimize it OK.
|
||||
|
||||
## What the sysval `vs.xfb_topology` looks like
|
||||
|
||||
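8-bit integer in graphics_sysvals struct (note: the lowering pass reads it with a 32-bit `load_sysval`, so the field must be laid out such that a 32-bit load yields only the enum value — confirm against `panvk_shader.h`). Enum values: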
|
||||
```c
|
||||
enum panvk_xfb_topology {
|
||||
PANVK_XFB_TOPO_LIST = 0, /* pass-through; current iter13 formula */
|
||||
PANVK_XFB_TOPO_LINE_STRIP = 1,
|
||||
PANVK_XFB_TOPO_TRI_STRIP = 2,
|
||||
PANVK_XFB_TOPO_TRI_FAN = 3,
|
||||
PANVK_XFB_TOPO_LINE_LIST_ADJ = 4,
|
||||
PANVK_XFB_TOPO_LINE_STRIP_ADJ = 5,
|
||||
PANVK_XFB_TOPO_TRI_LIST_ADJ = 6,
|
||||
PANVK_XFB_TOPO_TRI_STRIP_ADJ = 7,
|
||||
};
|
||||
```
|
||||
|
||||
Driver maps `VkPrimitiveTopology` → `panvk_xfb_topology` at draw time, sets the sysval via `set_gfx_sysval(cmdbuf, dirty, vs.xfb_topology, val)`.
|
||||
|
||||
## Risk: shader complexity
|
||||
|
||||
The lowered shader after iter17 will have:
|
||||
- 1 sysval load
|
||||
- 7 conditional branches
|
||||
- 2-3 conditional stores per branch (except TRI_FAN which has a loop)
|
||||
- per-store address arithmetic
|
||||
|
||||
That's a lot for what was a single `store_global`. On Bifrost (in-order architecture), branches are cheap but the increased instruction count + register pressure could hurt throughput.
|
||||
|
||||
Mitigation: most XFB workloads are tiny (per-frame, dozens to thousands of vertices). The throughput cost is irrelevant for the CTS-driven correctness target. Real-world XFB-heavy workloads (rare on Bifrost) might prefer iter13's single-store path, but those aren't impacted by iter17's correctness fix because the LIST topology still uses the fast path (topology == PANVK_XFB_TOPO_LIST → emit single store).
|
||||
|
||||
## What to write in Phase 4
|
||||
|
||||
NEW file: `src/panfrost/vulkan/panvk_vX_xfb_lower.c` — a panvk-specific replacement for `pan_nir_lower_xfb`. Calls into pieces of pan_nir_lower_xfb for the LIST case (or re-implements its minimal logic) and adds the per-topology contribution branches for the others. Exposed as `panvk_per_arch(nir_lower_xfb)(nir_shader *)`.
|
||||
|
||||
MODIFIED: `panvk_vX_shader.c` — replace the `NIR_PASS(_, nir, pan_nir_lower_xfb)` call with `NIR_PASS(_, nir, panvk_per_arch(nir_lower_xfb))`.
|
||||
|
||||
MODIFIED: `panvk_shader.h` — add `vs.xfb_topology` to sysval struct.
|
||||
|
||||
MODIFIED: `panvk_vX_cmd_draw.c::cmd_prepare_draw_sysvals` — at draw time, map current topology to enum + `set_gfx_sysval(..., vs.xfb_topology, mapped)`.
|
||||
|
||||
Phase 4 LoC estimate: ~250 (replacement pass) + 30 (sysval threading + draw-time topology map) ≈ 280 LoC.
|
||||
|
||||
— claude-noether, 2026-05-21
|
||||
@@ -0,0 +1,223 @@
|
||||
# Phase 2 — design lock for iter17
|
||||
|
||||
## Locked decisions
|
||||
|
||||
### D1: Replacement pass, not modification of upstream
|
||||
|
||||
Write `src/panfrost/vulkan/panvk_vX_xfb_lower.c` as a panvk-specific NIR pass. Call it from `panvk_vX_shader.c` instead of `pan_nir_lower_xfb`. Leaves Panfrost-Gallium and any other panfrost compiler consumers untouched. Per [[feedback-no-upstream-proposals]] and Phase 0 safety.
|
||||
|
||||
### D2: Runtime topology dispatch via sysval
|
||||
|
||||
Add a `vs.xfb_topology` sysval (`uint32_t` in `panvk_graphics_sysvals`). Driver maps `VkPrimitiveTopology` → `panvk_xfb_topology` enum at draw time. Shader's lowered XFB code switches on this sysval at runtime.
|
||||
|
||||
Rejected alternative: per-topology shader variants. That means 7 extra variants per XFB shader; combined with iter13's existing variant doubling, that's a lot of shader cache bloat for marginal runtime benefit. The runtime switch is cheap on Bifrost.
|
||||
|
||||
### D3: TRIANGLE_FAN central-vertex handling
|
||||
|
||||
**Decision: implement.** The NIR loop is straightforward — `nir_push_loop`, bounded by `num_vertices`. Estimated ~30 LoC in the new pass. Closes ~23 of the 162 winding fails (TRIANGLE_FAN's share, roughly 1/7 of 162 ≈ 23).
|
||||
|
||||
Alternative considered: skip TRIANGLE_FAN, document as not-yet-implemented. Would leave ~23 fails on the table. Not worth the docs-vs-code tradeoff — the loop isn't that hard.
|
||||
|
||||
### D4: Per-topology contribution emission
|
||||
|
||||
For VS invocation v on topology T, emit conditional stores using `nir_push_if` (eligibility predicate) + `nir_store_global` (address + value).
|
||||
|
||||
Each contribution = `(prim_idx, slot)` pair. Per-topology contribution count:
|
||||
|
||||
| Topology | Stores per VS invocation |
|
||||
|---|---|
|
||||
| TRIANGLE_STRIP | 1-3 (depends on v's position) |
|
||||
| LINE_STRIP | 1-2 |
|
||||
| TRIANGLE_FAN | 1-2 + central vertex (v=0) writes O(N) via loop |
|
||||
| LINE_LIST_WITH_ADJACENCY | 0-1 (only when v%4 ∈ {1, 2}) |
|
||||
| LINE_STRIP_WITH_ADJACENCY | 1-2 |
|
||||
| TRIANGLE_LIST_WITH_ADJACENCY | 0-1 (only when v%6 ∈ {0, 2, 4}) |
|
||||
| TRIANGLE_STRIP_WITH_ADJACENCY | 1-3 |
|
||||
|
||||
All eligibility predicates are O(1) integer comparisons. All address arithmetic is O(1) integer mul/add. No loops except for TRIANGLE_FAN.
|
||||
|
||||
### D5: LIST topologies bypass the new logic
|
||||
|
||||
For POINT_LIST, LINE_LIST, TRIANGLE_LIST: keep iter13's single-store fast path. The topology dispatch ladder starts with `if (topology == PANVK_XFB_TOPO_LIST) { iter13_path() }` — generic optimizer will hoist this nicely.
|
||||
|
||||
### D6: Multiple XFB output channels
|
||||
|
||||
`nir_io_xfb` annotation has up to 4 channels per `store_output`. Current `pan_nir_lower_xfb` loops over them and emits one global store each. Our replacement keeps that outer loop, applies decomposition logic at the inner store level. Each channel writes to a different offset within the same vertex's output slot.
|
||||
|
||||
### D7: Sysval threading
|
||||
|
||||
Add to `panvk_graphics_sysvals` struct (in `panvk_shader.h`):
|
||||
|
||||
```c
|
||||
uint32_t xfb_topology; /* panvk_xfb_topology enum */
|
||||
```
|
||||
|
||||
Enum in same header:
|
||||
```c
|
||||
enum panvk_xfb_topology {
|
||||
PANVK_XFB_TOPO_LIST = 0,
|
||||
PANVK_XFB_TOPO_LINE_STRIP = 1,
|
||||
PANVK_XFB_TOPO_TRI_STRIP = 2,
|
||||
PANVK_XFB_TOPO_TRI_FAN = 3,
|
||||
PANVK_XFB_TOPO_LINE_LIST_ADJ = 4,
|
||||
PANVK_XFB_TOPO_LINE_STRIP_ADJ = 5,
|
||||
PANVK_XFB_TOPO_TRI_LIST_ADJ = 6,
|
||||
PANVK_XFB_TOPO_TRI_STRIP_ADJ = 7,
|
||||
};
|
||||
```
|
||||
|
||||
In `cmd_prepare_draw_sysvals` (around the existing iter13 `vs.num_vertices` line):
|
||||
|
||||
```c
|
||||
uint32_t topo_enum = panvk_topology_to_xfb_enum(
|
||||
cmdbuf->vk.dynamic_graphics_state.ia.primitive_topology);
|
||||
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_topology, topo_enum);
|
||||
```
|
||||
|
||||
Helper `panvk_topology_to_xfb_enum` lives in `panvk_vX_xfb_lower.c` (or a small helper header).
|
||||
|
||||
## Code structure
|
||||
|
||||
```
|
||||
src/panfrost/vulkan/
|
||||
├── panvk_vX_xfb_lower.c NEW — replacement pass + topology mapping helper
|
||||
├── panvk_shader.h MOD — add vs.xfb_topology + enum + load_xfb_topology macro
|
||||
├── panvk_vX_cmd_draw.c MOD — set xfb_topology sysval in cmd_prepare_draw_sysvals
|
||||
└── panvk_vX_shader.c MOD — replace pan_nir_lower_xfb call with panvk_per_arch(nir_lower_xfb)
|
||||
```
|
||||
|
||||
## NIR pseudocode for the replacement pass
|
||||
|
||||
```c
|
||||
static void
|
||||
lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr,
|
||||
unsigned channel_idx, unsigned num_components,
|
||||
unsigned buffer, unsigned offset_words)
|
||||
{
|
||||
uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
|
||||
uint16_t offset_bytes = offset_words * 4;
|
||||
|
||||
nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology);
|
||||
nir_def *v = nir_load_raw_vertex_id_pan(b);
|
||||
nir_def *N = nir_load_num_vertices(b);
|
||||
nir_def *instance = nir_load_instance_id(b);
|
||||
nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
|
||||
nir_def *value = nir_channels(b, intr->src[0].ssa,
|
||||
nir_component_mask(num_components) << channel_idx);
|
||||
|
||||
/* LIST fast path: single store, iter13-compatible formula */
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST));
|
||||
{
|
||||
nir_def *idx = nir_iadd(b, nir_imul(b, instance, N), v);
|
||||
nir_def *addr = compute_addr(b, buf, idx, stride, offset_bytes);
|
||||
nir_store_global(b, value, addr);
|
||||
}
|
||||
nir_push_else(b);
|
||||
{
|
||||
/* TRIANGLE_STRIP */
|
||||
nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP));
|
||||
{
|
||||
emit_tri_strip_stores(b, v, N, instance, buf, stride, offset_bytes, value);
|
||||
}
|
||||
nir_push_else(b);
|
||||
/* ... other topologies ... */
|
||||
nir_pop_if(b);
|
||||
}
|
||||
nir_pop_if(b);
|
||||
}
|
||||
|
||||
static void
|
||||
emit_tri_strip_stores(nir_builder *b, nir_def *v, nir_def *N,
|
||||
nir_def *instance, nir_def *buf,
|
||||
uint16_t stride, uint16_t offset_bytes,
|
||||
nir_def *value)
|
||||
{
|
||||
/* prim p = v, slot 0: when v ≤ N-3 (i.e., v < N-2) */
|
||||
{
|
||||
nir_def *eligible = nir_ilt(b, v, nir_iadd_imm(b, N, -2));
|
||||
nir_push_if(b, eligible);
|
||||
{
|
||||
nir_def *prim = v;
|
||||
nir_def *out_idx_in_prim = nir_iadd(b,
|
||||
nir_imul(b, instance, ceil_3_times_N(b, N)), /* TODO: precompute */
|
||||
nir_iadd(b, nir_imul_imm(b, prim, 3),
|
||||
nir_imm_int(b, 0))); /* slot 0 */
|
||||
nir_def *addr = compute_addr(b, buf, out_idx_in_prim, stride, offset_bytes);
|
||||
nir_store_global(b, value, addr);
|
||||
}
|
||||
nir_pop_if(b);
|
||||
}
|
||||
|
||||
/* prim p = v-1, slot = 1 if (v-1) even else 2: when v >= 1 and v ≤ N-2 */
|
||||
{
|
||||
nir_def *eligible = nir_iand(b, nir_uge_imm(b, v, 1),
|
||||
nir_ilt(b, v, nir_iadd_imm(b, N, -1)));
|
||||
nir_push_if(b, eligible);
|
||||
{
|
||||
nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||
nir_def *parity_even = nir_ieq_imm(b,
|
||||
nir_iand_imm(b, prim, 1), 0);
|
||||
nir_def *slot = nir_bcsel(b, parity_even,
|
||||
nir_imm_int(b, 1), nir_imm_int(b, 2));
|
||||
/* ... store ... */
|
||||
}
|
||||
nir_pop_if(b);
|
||||
}
|
||||
|
||||
/* prim p = v-2: when v >= 2 */
|
||||
{
|
||||
/* analogous */
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
For TRIANGLE_FAN central vertex:
|
||||
|
||||
```c
|
||||
/* Special: v == 0 → write to slot 2 of every primitive */
|
||||
nir_push_if(b, nir_ieq_imm(b, v, 0));
|
||||
{
|
||||
/* Loop p from 0 to N-3 (inclusive), write value to slot 2 of prim p */
|
||||
nir_variable *p_var = nir_local_variable_create(b->impl, glsl_uint_type(), "p");
|
||||
nir_store_var(b, p_var, nir_imm_int(b, 0), 0x1);
|
||||
nir_push_loop(b);
|
||||
{
|
||||
nir_def *p = nir_load_var(b, p_var);
|
||||
        nir_push_if(b, nir_ige(b, p, nir_iadd_imm(b, N, -2))); /* signed: N < 2 must break immediately, not wrap to ~2^32 */
|
||||
{
|
||||
nir_jump(b, nir_jump_break);
|
||||
}
|
||||
nir_pop_if(b);
|
||||
|
||||
nir_def *out_idx = nir_iadd_imm(b, nir_imul_imm(b, p, 3), 2); /* slot 2 */
|
||||
nir_def *addr = compute_addr(b, buf, out_idx, stride, offset_bytes);
|
||||
nir_store_global(b, value, addr);
|
||||
|
||||
nir_store_var(b, p_var, nir_iadd_imm(b, p, 1), 0x1);
|
||||
}
|
||||
nir_pop_loop(b);
|
||||
}
|
||||
nir_pop_if(b);
|
||||
```
|
||||
|
||||
## Edge case: instance stride needs the per-instance output vertex count
|
||||
|
||||
In the XFB index calculation, we need the OUTPUT-SIDE vertex count (`3*(N-2)` for tri_strip, etc.), not the input count that `vs.num_vertices` provides.
|
||||
|
||||
Solution: don't use `nir_load_num_vertices(b)` for the output index calc in non-LIST branches. Instead, the per-primitive store directly computes `prim * verts_per_prim + slot` which is the output buffer position. The `instance * num_vertices` instance-stride multiplier should ALSO use the output count.
|
||||
|
||||
For multi-instance correctness, we need an `output_vertex_count` value that's the DECOMPOSED count per instance. Two ways:
|
||||
1. Pre-compute as another sysval `vs.xfb_output_count = decompose_count(topology, input_count)` — set CPU-side at draw time.
|
||||
2. Compute it in shader: use a switch over topology + math (e.g., for tri_strip: `3*(N-2)`).
|
||||
|
||||
**Lock: option 1.** Pre-compute on CPU, set as `vs.xfb_output_count` sysval. The CPU has trivially cheap arithmetic for this; shader avoids the per-VS-invocation math.
|
||||
|
||||
So total sysval additions:
|
||||
- `vs.xfb_topology` (uint32 / enum)
|
||||
- `vs.xfb_output_count` (uint32) — per-instance output vertex count after decomposition
|
||||
|
||||
## Phase 3 next
|
||||
|
||||
The probe already exists at `iter16/probe_winding.c`. Reuse it. Phase 4 will do the actual implementation.
|
||||
|
||||
— claude-noether, 2026-05-21
|
||||
Reference in New Issue
Block a user