initial seed: retrofit campaign lineage from local working trees

panvk-bifrost campaigns (r1..r4 Vulkan compositor + r5.video1 Vulkan
video decode) shipped before this repo existed; the deliverable
patches live in marfrit-packages, but the reasoning chain, phase docs,
and source-state evidence lived only in local working trees on the
development host.

This retrofit imports:
- mesa-panvk-bifrost/   — r1..r4 era phase docs (iter1..iter18)
                          (libmali stub blobs at iter18/blob/ excluded
                          — 109MB of RE artifacts replaced with a README
                          pointer)
- mesa-panvk-bifrost-video/ — sibling campaign phase docs + probe
- evidence/             — frozen .tgz source snapshots at each milestone
                          (basis for the 0005 patch diff generation)

Future iterations should branch off here from day one, so each iter is
a commit rather than a snapshot. See [[feedback-session-local-process-pins]]
for the process drift this retrofit closes.

Total: 1.9 MB across 124 files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 05:25:37 +02:00
parent 430d0da278
commit a4e7d8ab90
124 changed files with 22551 additions and 1 deletions
@@ -0,0 +1,486 @@
/*
* Copyright © 2026 mfritsche / claude-noether
* SPDX-License-Identifier: MIT
*
* iter17: panvk-specific replacement for pan_nir_lower_xfb that handles
* primitive decomposition for transform_feedback on non-LIST topologies
* (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY).
*
* Approach: emit a topology dispatch at the start of each store_output
* lowering. The shader reads vs.xfb_topology sysval at runtime and branches
* into per-topology emission logic. For each affected topology, the lowered
* code emits guarded conditional stores — one per primitive this vertex
* contributes to, computing the output buffer position via primitive index
* and slot within the decomposed primitive.
*
* For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that
* matches iter13's single-store behavior.
*
* For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives
* as slot 2 — handled via a NIR loop over the primitive count
* (num_vertices - 2).
*
* See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context.
*/
#include "panvk_macros.h"
#if PAN_ARCH < 9
#include "panvk_shader.h"
#include "compiler/nir/nir_builder.h"
#include "pan_nir.h"
#include <vulkan/vulkan_core.h>
/* ----- Address arithmetic ----- */
/* Build the global address of one captured element:
 *
 *    buf + zero_extend_64(out_idx * stride + offset_bytes)
 *
 * The byte offset is computed at out_idx's bit size and only widened to
 * 64 bits right before it is added to buf.
 */
static nir_def *
xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx,
               uint16_t stride, uint16_t offset_bytes)
{
   nir_def *record_start = nir_imul_imm(b, out_idx, stride);
   nir_def *field_start = nir_iadd_imm(b, record_start, offset_bytes);
   nir_def *field_start64 = nir_u2u64(b, field_start);

   return nir_iadd(b, buf, field_start64);
}
/* LIST fast path: one unconditional store per input vertex. The record
 * index is instance_id * output_count + raw_vid. */
static void
emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count,
                nir_def *instance_id, nir_def *raw_vid, nir_def *value,
                uint16_t stride, uint16_t offset_bytes)
{
   nir_def *instance_base = nir_imul(b, instance_id, output_count);
   nir_def *record = nir_iadd(b, instance_base, raw_vid);
   nir_def *dst = xfb_store_addr(b, buf, record, stride, offset_bytes);

   nir_store_global(b, value, dst);
}
/* Conditionally store `value` as vertex `slot` of decomposed primitive
 * `prim_idx`. The whole address computation and the store live inside an
 * if (eligible) block, so nothing is written when the predicate is false.
 *
 * Record index:
 *    instance_id * output_count + prim_idx * verts_per_prim + slot
 */
static void
emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count,
                nir_def *instance_id, nir_def *eligible,
                nir_def *prim_idx, nir_def *slot,
                uint32_t verts_per_prim,
                nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_push_if(b, eligible);

   nir_def *instance_base = nir_imul(b, instance_id, output_count);
   nir_def *prim_base = nir_imul_imm(b, prim_idx, verts_per_prim);
   nir_def *within_instance = nir_iadd(b, prim_base, slot);
   nir_def *record = nir_iadd(b, instance_base, within_instance);
   nir_def *dst = xfb_store_addr(b, buf, record, stride, offset_bytes);
   nir_store_global(b, value, dst);

   nir_pop_if(b, NULL);
}
/* ----- Per-topology emission ----- */
/* TRIANGLE_STRIP: input vertex v can appear in up to three triangles:
 *
 *    triangle v     slot 0                       when v < N-2
 *    triangle v-1   slot 1 (even tri) / 2 (odd)  when 1 <= v < N-1
 *    triangle v-2   slot 2 (even tri) / 1 (odd)  when 2 <= v < N
 *
 * The slot swap on odd triangles is what the parity terms below encode.
 */
static void
emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N,
               nir_def *buf, nir_def *output_count, nir_def *instance_id,
               nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *tri_count = nir_iadd_imm(b, N, -2);
   nir_def *last_vertex = nir_iadd_imm(b, N, -1);

   /* Triangle v: this vertex is always slot 0. */
   nir_def *starts_tri = nir_ult(b, v, tri_count);
   emit_prim_store(b, buf, output_count, instance_id, starts_tri,
                   v, nir_imm_int(b, 0), 3, value, stride, offset_bytes);

   /* Triangle v-1: slot is 1 + (triangle & 1). */
   nir_def *tri_prev = nir_iadd_imm(b, v, -1);
   nir_def *odd_prev = nir_iand_imm(b, tri_prev, 1u);
   nir_def *slot_prev = nir_iadd_imm(b, odd_prev, 1);
   nir_def *in_prev = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 1)),
                               nir_ult(b, v, last_vertex));
   emit_prim_store(b, buf, output_count, instance_id, in_prev,
                   tri_prev, slot_prev, 3, value, stride, offset_bytes);

   /* Triangle v-2: slot is 2 - (triangle & 1). */
   nir_def *tri_prev2 = nir_iadd_imm(b, v, -2);
   nir_def *odd_prev2 = nir_iand_imm(b, tri_prev2, 1u);
   nir_def *slot_prev2 = nir_isub(b, nir_imm_int(b, 2), odd_prev2);
   nir_def *in_prev2 = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 2)),
                                nir_ult(b, v, N));
   emit_prim_store(b, buf, output_count, instance_id, in_prev2,
                   tri_prev2, slot_prev2, 3, value, stride, offset_bytes);
}
/* LINE_STRIP: input vertex v starts line v (slot 0, when v < N-1) and ends
 * line v-1 (slot 1, when 1 <= v < N). */
static void
emit_line_strip(nir_builder *b, nir_def *v, nir_def *N,
                nir_def *buf, nir_def *output_count, nir_def *instance_id,
                nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   /* Start point of line v. */
   nir_def *line_count = nir_iadd_imm(b, N, -1);
   nir_def *starts_line = nir_ult(b, v, line_count);
   emit_prim_store(b, buf, output_count, instance_id, starts_line,
                   v, nir_imm_int(b, 0), 2, value, stride, offset_bytes);

   /* End point of line v-1. */
   nir_def *line_prev = nir_iadd_imm(b, v, -1);
   nir_def *not_first = nir_uge(b, v, nir_imm_int(b, 1));
   nir_def *in_draw = nir_ult(b, v, N);
   nir_def *ends_line = nir_iand(b, not_first, in_draw);
   emit_prim_store(b, buf, output_count, instance_id, ends_line,
                   line_prev, nir_imm_int(b, 1), 2, value, stride,
                   offset_bytes);
}
/* TRIANGLE_FAN: triangle p captures input vertices {p+1, p+2, 0}.
 *
 *    v in [1, N-2]  -> slot 0 of triangle v-1
 *    v in [2, N-1]  -> slot 1 of triangle v-2
 *    v == 0         -> slot 2 of every triangle (N-2 of them)
 *
 * The hub vertex needs a run-time loop because the triangle count is not
 * known when the shader is compiled. The loop counter is a function-local
 * NIR variable named "fan_p".
 */
static void
emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N,
             nir_def *buf, nir_def *output_count, nir_def *instance_id,
             nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *tri_count = nir_iadd_imm(b, N, -2);
   nir_def *last_vertex = nir_iadd_imm(b, N, -1);

   /* Rim vertex as slot 0 of triangle v-1: 1 <= v < N-1. */
   nir_def *tri_prev = nir_iadd_imm(b, v, -1);
   nir_def *is_first_rim = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 1)),
                                    nir_ult(b, v, last_vertex));
   emit_prim_store(b, buf, output_count, instance_id, is_first_rim,
                   tri_prev, nir_imm_int(b, 0), 3, value, stride,
                   offset_bytes);

   /* Rim vertex as slot 1 of triangle v-2: 2 <= v < N. */
   nir_def *tri_prev2 = nir_iadd_imm(b, v, -2);
   nir_def *is_second_rim = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 2)),
                                     nir_ult(b, v, N));
   emit_prim_store(b, buf, output_count, instance_id, is_second_rim,
                   tri_prev2, nir_imm_int(b, 1), 3, value, stride,
                   offset_bytes);

   /* Hub vertex: walk every triangle and fill its slot 2. */
   nir_push_if(b, nir_ieq_imm(b, v, 0));
   {
      nir_variable *counter =
         nir_local_variable_create(b->impl, glsl_uint_type(), "fan_p");
      nir_store_var(b, counter, nir_imm_int(b, 0), 0x1);

      nir_push_loop(b);
      {
         nir_def *tri = nir_load_var(b, counter);

         /* Leave the loop once every triangle has been written. */
         nir_push_if(b, nir_uge(b, tri, tri_count));
         nir_jump(b, nir_jump_break);
         nir_pop_if(b, NULL);

         nir_def *instance_base = nir_imul(b, instance_id, output_count);
         nir_def *hub_slot = nir_iadd_imm(b, nir_imul_imm(b, tri, 3), 2);
         nir_def *record = nir_iadd(b, instance_base, hub_slot);
         nir_def *dst = xfb_store_addr(b, buf, record, stride, offset_bytes);
         nir_store_global(b, value, dst);

         nir_store_var(b, counter, nir_iadd_imm(b, tri, 1), 0x1);
      }
      nir_pop_loop(b, NULL);
   }
   nir_pop_if(b, NULL);
}
/* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}.
 *   v contributes if v%4 == 1: prim v/4 slot 0
 *   v contributes if v%4 == 2: prim v/4 slot 1
 *
 * Only complete groups form a primitive, so the store is additionally
 * gated on v/4 < N/4. Without that bound, a draw whose vertex count is not
 * a multiple of 4 would let the vertices of the trailing partial group
 * write into a primitive slot that does not exist.
 *
 * NOTE(review): this treats N as the draw's input vertex count, the same
 * way the strip emitters in this file do — confirm against the sysval
 * upload.
 */
static void
emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N,
                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *vmod4 = nir_iand_imm(b, v, 3u);
   nir_def *prim = nir_ushr_imm(b, v, 2);       /* v / 4 */
   nir_def *num_prims = nir_ushr_imm(b, N, 2);  /* N / 4, complete groups */
   nir_def *in_range = nir_ult(b, prim, num_prims);

   emit_prim_store(b, buf, output_count, instance_id,
                   nir_iand(b, in_range, nir_ieq_imm(b, vmod4, 1)),
                   prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
   emit_prim_store(b, buf, output_count, instance_id,
                   nir_iand(b, in_range, nir_ieq_imm(b, vmod4, 2)),
                   prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
}
/* LINE_STRIP_WITH_ADJACENCY: N input vertices form N-3 lines; prim p emits
 * {p+1, p+2} (the first and last input vertices are adjacency-only).
 *   v contributes to prim v-1 slot 0 (1 <= v <= N-3)
 *   v contributes to prim v-2 slot 1 (2 <= v <= N-2)
 *
 * The upper bounds come from prim < N-3. Using N-1 / N instead would make
 * vertices N-2 and N-1 write a non-existent prim N-3.
 *
 * N-2 and N-1 are computed with wrapping arithmetic; this function relies
 * on its caller only invoking it when the topology produces output, so the
 * subtractions do not underflow into huge unsigned bounds.
 */
static void
emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
                    nir_def *buf, nir_def *output_count, nir_def *instance_id,
                    nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
   nir_def *Nm2 = nir_iadd_imm(b, N, -2);

   /* Prim v-1, slot 0: v >= 1 AND v-1 < N-3  ⇔  v >= 1 AND v < N-2 */
   {
      nir_def *prim = nir_iadd_imm(b, v, -1);
      nir_def *eligible = nir_iand(b,
                                   nir_uge(b, v, nir_imm_int(b, 1)),
                                   nir_ult(b, v, Nm2));
      emit_prim_store(b, buf, output_count, instance_id, eligible,
                      prim, nir_imm_int(b, 0), 2, value, stride,
                      offset_bytes);
   }

   /* Prim v-2, slot 1: v >= 2 AND v-2 < N-3  ⇔  v >= 2 AND v < N-1 */
   {
      nir_def *prim = nir_iadd_imm(b, v, -2);
      nir_def *eligible = nir_iand(b,
                                   nir_uge(b, v, nir_imm_int(b, 2)),
                                   nir_ult(b, v, Nm1));
      emit_prim_store(b, buf, output_count, instance_id, eligible,
                      prim, nir_imm_int(b, 1), 2, value, stride,
                      offset_bytes);
   }
}
/* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}.
 *   v contributes if v%6 == 0: prim v/6 slot 0
 *   v contributes if v%6 == 2: prim v/6 slot 1
 *   v contributes if v%6 == 4: prim v/6 slot 2
 *
 * Only complete groups form a primitive, so each store is additionally
 * gated on v/6 < N/6. Without that bound, a draw whose vertex count is not
 * a multiple of 6 would let the vertices of the trailing partial group
 * write into a primitive slot that does not exist.
 *
 * NOTE(review): this treats N as the draw's input vertex count, the same
 * way the strip emitters in this file do — confirm against the sysval
 * upload.
 */
static void
emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N,
                  nir_def *buf, nir_def *output_count, nir_def *instance_id,
                  nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *vmod6 = nir_umod_imm(b, v, 6);
   nir_def *prim = nir_udiv_imm(b, v, 6);
   nir_def *num_prims = nir_udiv_imm(b, N, 6);  /* complete groups only */
   nir_def *in_range = nir_ult(b, prim, num_prims);

   /* Captured vertices sit at the even positions 0, 2, 4 of the group. */
   for (uint32_t slot = 0; slot < 3; slot++) {
      nir_def *eligible = nir_iand(b, in_range,
                                   nir_ieq_imm(b, vmod6, slot * 2));
      emit_prim_store(b, buf, output_count, instance_id, eligible,
                      prim, nir_imm_int(b, slot), 3, value, stride,
                      offset_bytes);
   }
}
/* TRIANGLE_STRIP_WITH_ADJACENCY: triangle i captures
 *
 *    even i: {2i, 2i+2, 2i+4}
 *    odd  i: {2i, 2i+4, 2i+2}
 *
 * All captured indices are even, so odd input vertices are skipped up
 * front. With h = v/2 and T = N/2 - 2 triangles, an even vertex feeds:
 *
 *    triangle h     slot 0                       when h < T
 *    triangle h-1   slot 1 (even tri) / 2 (odd)  when v >= 2 and h-1 < T
 *    triangle h-2   slot 2 (even tri) / 1 (odd)  when v >= 4 and h-2 < T
 */
static void
emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   /* Odd input vertices never contribute. */
   nir_def *low_bit = nir_iand_imm(b, v, 1u);
   nir_push_if(b, nir_ieq_imm(b, low_bit, 0));
   {
      nir_def *tri_count = nir_iadd_imm(b, nir_ushr_imm(b, N, 1), -2);
      nir_def *tri_own = nir_ushr_imm(b, v, 1);

      /* Triangle h, slot 0. */
      nir_def *starts_tri = nir_ult(b, tri_own, tri_count);
      emit_prim_store(b, buf, output_count, instance_id, starts_tri,
                      tri_own, nir_imm_int(b, 0), 3, value, stride,
                      offset_bytes);

      /* Triangle h-1, slot 1 + (triangle & 1). */
      nir_def *tri_prev = nir_iadd_imm(b, tri_own, -1);
      nir_def *odd_prev = nir_iand_imm(b, tri_prev, 1u);
      nir_def *slot_prev = nir_iadd_imm(b, odd_prev, 1);
      nir_def *in_prev = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 2)),
                                  nir_ult(b, tri_prev, tri_count));
      emit_prim_store(b, buf, output_count, instance_id, in_prev,
                      tri_prev, slot_prev, 3, value, stride, offset_bytes);

      /* Triangle h-2, slot 2 - (triangle & 1). */
      nir_def *tri_prev2 = nir_iadd_imm(b, tri_own, -2);
      nir_def *odd_prev2 = nir_iand_imm(b, tri_prev2, 1u);
      nir_def *slot_prev2 = nir_isub(b, nir_imm_int(b, 2), odd_prev2);
      nir_def *in_prev2 = nir_iand(b, nir_uge(b, v, nir_imm_int(b, 4)),
                                   nir_ult(b, tri_prev2, tri_count));
      emit_prim_store(b, buf, output_count, instance_id, in_prev2,
                      tri_prev2, slot_prev2, 3, value, stride, offset_bytes);
   }
   nir_pop_if(b, NULL);
}
/* ----- Main lowering: per store_output XFB channel ----- */
/* Lower one XFB output slot of a store_output into topology-aware global
 * stores, inserted at the builder's current cursor.
 *
 * channel_idx    first component of intr's source to capture
 * num_components number of consecutive components to capture
 * buffer         XFB buffer index (must be < MAX_XFB_BUFFERS)
 * offset_words   offset of this output inside a record; multiplied by 4 to
 *                get bytes
 *
 * Emitted shape:
 *
 *    if (topology == LIST)            single unconditional store
 *    else if (output_count > 0)       if/else ladder over the remaining
 *                                     topologies, TRI_STRIP_ADJ being the
 *                                     final else
 *
 * The output_count > 0 guard keeps the N-1 / N-2 / ... arithmetic in the
 * per-topology predicates from underflowing on degenerate draws.
 */
static void
lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr,
                        unsigned channel_idx, unsigned num_components,
                        unsigned buffer, unsigned offset_words)
{
   /* Topologies tested explicitly, in ladder order. Whatever is left over
    * after the last test is handled by emit_tri_strip_adj. */
   static const struct {
      uint32_t topology;
      void (*emit)(nir_builder *, nir_def *, nir_def *, nir_def *, nir_def *,
                   nir_def *, nir_def *, uint16_t, uint16_t);
   } ladder[] = {
      { PANVK_XFB_TOPO_TRI_STRIP, emit_tri_strip },
      { PANVK_XFB_TOPO_LINE_STRIP, emit_line_strip },
      { PANVK_XFB_TOPO_TRI_FAN, emit_tri_fan },
      { PANVK_XFB_TOPO_LINE_LIST_ADJ, emit_line_list_adj },
      { PANVK_XFB_TOPO_LINE_STRIP_ADJ, emit_line_strip_adj },
      { PANVK_XFB_TOPO_TRI_LIST_ADJ, emit_tri_list_adj },
   };
   const unsigned ladder_len = sizeof(ladder) / sizeof(ladder[0]);

   assert(buffer < MAX_XFB_BUFFERS);
   assert(nir_intrinsic_component(intr) == 0);

   /* NOTE(review): xfb_stride is presumably in 32-bit words like
    * offset_words, hence the same * 4 — confirm. */
   uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
   assert(stride != 0);
   uint16_t offset_bytes = offset_words * 4;

   BITSET_SET(b->shader->info.system_values_read,
              SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);

   nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology);
   nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count);
   nir_def *N = nir_load_num_vertices(b);
   nir_def *v = nir_load_raw_vertex_id_pan(b);
   nir_def *instance = nir_load_instance_id(b);
   nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer);

   /* Pick the captured components out of the store's source vector. */
   nir_component_mask_t mask = nir_component_mask(num_components);
   nir_def *value = nir_channels(b, intr->src[0].ssa, mask << channel_idx);

   nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST));
   {
      /* Fast path: one store per input vertex. */
      emit_list_store(b, buf, out_count, instance, v, value,
                      stride, offset_bytes);
   }
   nir_push_else(b, NULL);
   {
      /* Emit nothing at all for draws that produce no primitives. */
      nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count));
      {
         /* Open one if/else level per table entry ... */
         for (unsigned i = 0; i < ladder_len; i++) {
            nir_push_if(b, nir_ieq_imm(b, topology, ladder[i].topology));
            ladder[i].emit(b, v, N, buf, out_count, instance, value,
                           stride, offset_bytes);
            nir_push_else(b, NULL);
         }

         /* ... the innermost else is TRI_STRIP_ADJ ... */
         emit_tri_strip_adj(b, v, N, buf, out_count, instance, value,
                            stride, offset_bytes);

         /* ... and close every level again. */
         for (unsigned i = 0; i < ladder_len; i++)
            nir_pop_if(b, NULL);
      }
      nir_pop_if(b, NULL); /* output_count > 0 */
   }
   nir_pop_if(b, NULL); /* LIST vs. everything else */
}
/* Per-intrinsic callback, modelled on pan_nir_lower_xfb's lower_xfb.
 *
 *  - load_vertex_id is replaced by raw_vertex_id_pan + raw_vertex_offset_pan.
 *  - store_output is expanded, once per XFB slot that has components, through
 *    lower_xfb_output_iter17; the store_output itself is removed if at least
 *    one slot was lowered.
 *
 * Returns true when the instruction was rewritten.
 */
static bool
lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr,
                 UNUSED void *data)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_vertex_id: {
      b->cursor = nir_instr_remove(&intr->instr);
      nir_def *rebuilt_id = nir_iadd(b, nir_load_raw_vertex_id_pan(b),
                                     nir_load_raw_vertex_offset_pan(b));
      nir_def_rewrite_uses(&intr->def, rebuilt_id);
      return true;
   }
   case nir_intrinsic_store_output:
      break;
   default:
      return false;
   }

   b->cursor = nir_before_instr(&intr->instr);

   /* Four XFB slots in total: slots 0-1 come from io_xfb, slots 2-3 from
    * io_xfb2. */
   const nir_io_xfb annotations[2] = {
      nir_intrinsic_io_xfb(intr),
      nir_intrinsic_io_xfb2(intr),
   };

   unsigned lowered = 0;
   for (unsigned slot = 0; slot < 4; slot++) {
      const nir_io_xfb *xfb = &annotations[slot / 2];
      const unsigned half = slot % 2;

      if (xfb->out[half].num_components == 0)
         continue;

      lower_xfb_output_iter17(b, intr, slot, xfb->out[half].num_components,
                              xfb->out[half].buffer, xfb->out[half].offset);
      lowered++;
   }

   if (lowered == 0)
      return false;

   nir_instr_remove(&intr->instr);
   return true;
}
/* Run the topology-aware XFB lowering over every intrinsic in the shader.
 * Returns true if any instruction was changed.
 *
 * No metadata is declared as preserved: the lowering inserts if/else
 * ladders and a loop (nir_push_if / nir_push_loop) in front of each
 * store_output, which adds and splits blocks, so block indices and
 * dominance are both invalidated.
 */
bool
panvk_per_arch(nir_lower_xfb)(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir, lower_xfb_iter17,
                                     nir_metadata_none, NULL);
}
#endif /* PAN_ARCH < 9 */