/*
 * Copyright © 2026 mfritsche / claude-noether
 * SPDX-License-Identifier: MIT
 *
 * iter17: panvk-specific replacement for pan_nir_lower_xfb that handles
 * primitive decomposition for transform_feedback on non-LIST topologies
 * (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY).
 *
 * Approach: emit a topology dispatch at the start of each store_output
 * lowering. The shader reads vs.xfb_topology sysval at runtime and branches
 * into per-topology emission logic. For each affected topology, the lowered
 * code emits guarded conditional stores — one per primitive this vertex
 * contributes to, computing the output buffer position via primitive index
 * and slot within the decomposed primitive.
 *
 * For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that
 * matches iter13's single-store behavior.
 *
 * For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives
 * as slot 2 — handled via a NIR loop over the num_vertices - 2 primitives.
 *
 * See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context.
 */

#include "panvk_macros.h"

#if PAN_ARCH < 9

#include "panvk_shader.h"

#include "compiler/nir/nir_builder.h"
#include "pan_nir.h"

#include <vulkan/vulkan_core.h>

/* ----- Address arithmetic ----- */

/* Build the global address buf + (out_idx * stride + offset_bytes).
 *
 * The byte offset is computed on out_idx as-is and only zero-extended to
 * 64 bits (nir_u2u64) right before it is added to the buffer base. */
static nir_def *
xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx,
               uint16_t stride, uint16_t offset_bytes)
{
   nir_def *record_off = nir_imul_imm(b, out_idx, stride);
   nir_def *field_off = nir_iadd_imm(b, record_off, offset_bytes);
   nir_def *field_off64 = nir_u2u64(b, field_off);

   return nir_iadd(b, buf, field_off64);
}

/* LIST fast path: a single unconditional store per vertex, written at output
 * index instance_id * output_count + raw_vid. */
static void
emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count,
                nir_def *instance_id, nir_def *raw_vid, nir_def *value,
                uint16_t stride, uint16_t offset_bytes)
{
   nir_def *instance_base = nir_imul(b, instance_id, output_count);
   nir_def *record = nir_iadd(b, instance_base, raw_vid);
   nir_def *dst = xfb_store_addr(b, buf, record, stride, offset_bytes);

   nir_store_global(b, value, dst);
}

/* Guarded store of `value` into one slot of one decomposed primitive.
 *
 * The store only executes when `eligible` is true at runtime. The output
 * index is
 *    instance_id * output_count + prim_idx * verts_per_prim + slot
 * and everything, including the address math, is emitted inside the guard. */
static void
emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count,
                nir_def *instance_id, nir_def *eligible,
                nir_def *prim_idx, nir_def *slot,
                uint32_t verts_per_prim,
                nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_push_if(b, eligible);
   {
      nir_def *instance_base = nir_imul(b, instance_id, output_count);
      nir_def *prim_base = nir_imul_imm(b, prim_idx, verts_per_prim);
      nir_def *within_instance = nir_iadd(b, prim_base, slot);
      nir_def *record = nir_iadd(b, instance_base, within_instance);
      nir_def *dst = xfb_store_addr(b, buf, record, stride, offset_bytes);

      nir_store_global(b, value, dst);
   }
   nir_pop_if(b, NULL);
}

/* ----- Per-topology emission ----- */

/* TRIANGLE_STRIP decomposition (3 output vertices per triangle).
 *
 * Input vertex v can land in up to three triangles:
 *   - triangle v   as slot 0                        when v < N-2
 *   - triangle v-1 as slot 1 (even) / slot 2 (odd)  when 1 <= v < N-1
 *   - triangle v-2 as slot 2 (even) / slot 1 (odd)  when 2 <= v < N
 * where even/odd is the parity of the triangle index.
 *
 * All compares are unsigned, so N-2 / N-1 wrap for tiny N; the dispatcher
 * only reaches this code when the output count is non-zero.
 */
static void
emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N,
               nir_def *buf, nir_def *output_count, nir_def *instance_id,
               nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *n_minus_2 = nir_iadd_imm(b, N, -2);
   nir_def *n_minus_1 = nir_iadd_imm(b, N, -1);

   /* Triangle v: this vertex opens it (slot 0). */
   nir_def *opens_tri = nir_ult(b, v, n_minus_2);
   emit_prim_store(b, buf, output_count, instance_id, opens_tri,
                   v, nir_imm_int(b, 0), 3, value, stride, offset_bytes);

   /* Triangle v-1: slot is 1 + parity(triangle). */
   nir_def *tri_prev = nir_iadd_imm(b, v, -1);
   nir_def *parity_prev = nir_iand_imm(b, tri_prev, 1u);
   nir_def *slot_prev = nir_iadd_imm(b, parity_prev, 1);
   nir_def *prev_lo = nir_uge(b, v, nir_imm_int(b, 1));
   nir_def *prev_hi = nir_ult(b, v, n_minus_1);
   emit_prim_store(b, buf, output_count, instance_id,
                   nir_iand(b, prev_lo, prev_hi),
                   tri_prev, slot_prev, 3, value, stride, offset_bytes);

   /* Triangle v-2: slot is 2 - parity(triangle). */
   nir_def *tri_prev2 = nir_iadd_imm(b, v, -2);
   nir_def *parity_prev2 = nir_iand_imm(b, tri_prev2, 1u);
   nir_def *slot_prev2 = nir_isub(b, nir_imm_int(b, 2), parity_prev2);
   nir_def *prev2_lo = nir_uge(b, v, nir_imm_int(b, 2));
   nir_def *prev2_hi = nir_ult(b, v, N);
   emit_prim_store(b, buf, output_count, instance_id,
                   nir_iand(b, prev2_lo, prev2_hi),
                   tri_prev2, slot_prev2, 3, value, stride, offset_bytes);
}

/* LINE_STRIP decomposition (2 output vertices per line).
 *
 * Input vertex v is the start (slot 0) of line v when v < N-1, and the end
 * (slot 1) of line v-1 when 1 <= v < N.
 */
static void
emit_line_strip(nir_builder *b, nir_def *v, nir_def *N,
                nir_def *buf, nir_def *output_count, nir_def *instance_id,
                nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *n_minus_1 = nir_iadd_imm(b, N, -1);

   /* Start point of line v. */
   nir_def *is_start = nir_ult(b, v, n_minus_1);
   emit_prim_store(b, buf, output_count, instance_id, is_start,
                   v, nir_imm_int(b, 0), 2, value, stride, offset_bytes);

   /* End point of line v-1. */
   nir_def *line_prev = nir_iadd_imm(b, v, -1);
   nir_def *not_first = nir_uge(b, v, nir_imm_int(b, 1));
   nir_def *in_range = nir_ult(b, v, N);
   nir_def *is_end = nir_iand(b, not_first, in_range);
   emit_prim_store(b, buf, output_count, instance_id, is_end,
                   line_prev, nir_imm_int(b, 1), 2, value, stride,
                   offset_bytes);
}

/* TRIANGLE_FAN decomposition: triangle p is written as {p+1, p+2, 0}.
 *
 *   v in [1, N-2]  -> triangle v-1, slot 0
 *   v in [2, N-1]  -> triangle v-2, slot 1
 *   v == 0         -> slot 2 of every triangle 0 .. N-3, which needs a
 *                     runtime loop since the count is not known at compile
 *                     time.
 */
static void
emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N,
             nir_def *buf, nir_def *output_count, nir_def *instance_id,
             nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *n_minus_1 = nir_iadd_imm(b, N, -1);
   nir_def *n_minus_2 = nir_iadd_imm(b, N, -2);

   /* Rim vertex as first corner of triangle v-1. */
   nir_def *tri_a = nir_iadd_imm(b, v, -1);
   nir_def *a_lo = nir_uge(b, v, nir_imm_int(b, 1));
   nir_def *a_hi = nir_ult(b, v, n_minus_1);
   emit_prim_store(b, buf, output_count, instance_id,
                   nir_iand(b, a_lo, a_hi),
                   tri_a, nir_imm_int(b, 0), 3, value, stride, offset_bytes);

   /* Rim vertex as second corner of triangle v-2. */
   nir_def *tri_b = nir_iadd_imm(b, v, -2);
   nir_def *b_lo = nir_uge(b, v, nir_imm_int(b, 2));
   nir_def *b_hi = nir_ult(b, v, N);
   emit_prim_store(b, buf, output_count, instance_id,
                   nir_iand(b, b_lo, b_hi),
                   tri_b, nir_imm_int(b, 1), 3, value, stride, offset_bytes);

   /* Hub vertex: replicate into slot 2 of each of the N-2 triangles. */
   nir_push_if(b, nir_ieq_imm(b, v, 0));
   {
      nir_variable *tri_counter =
         nir_local_variable_create(b->impl, glsl_uint_type(), "fan_p");
      nir_store_var(b, tri_counter, nir_imm_int(b, 0), 0x1);

      nir_push_loop(b);
      {
         nir_def *tri = nir_load_var(b, tri_counter);

         /* Exit once tri has reached N-2. */
         nir_push_if(b, nir_uge(b, tri, n_minus_2));
         nir_jump(b, nir_jump_break);
         nir_pop_if(b, NULL);

         nir_def *instance_base = nir_imul(b, instance_id, output_count);
         nir_def *hub_slot = nir_iadd_imm(b, nir_imul_imm(b, tri, 3), 2);
         nir_def *record = nir_iadd(b, instance_base, hub_slot);
         nir_def *dst = xfb_store_addr(b, buf, record, stride, offset_bytes);
         nir_store_global(b, value, dst);

         nir_store_var(b, tri_counter, nir_iadd_imm(b, tri, 1), 0x1);
      }
      nir_pop_loop(b, NULL);
   }
   nir_pop_if(b, NULL);
}

/* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}.
 *   v contributes if v%4 == 1: prim v/4 slot 0
 *   v contributes if v%4 == 2: prim v/4 slot 1
 *
 * Only the N/4 complete groups form a line. Vertices that belong to a
 * trailing partial group (N not a multiple of 4) must not store, since the
 * primitive they would address does not exist.
 */
static void
emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N,
                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *vmod4 = nir_iand_imm(b, v, 3u);
   nir_def *prim = nir_ushr_imm(b, v, 2);  /* v / 4 */

   /* prim < N / 4: the group this vertex sits in is complete. */
   nir_def *num_prims = nir_ushr_imm(b, N, 2);
   nir_def *group_complete = nir_ult(b, prim, num_prims);

   emit_prim_store(b, buf, output_count, instance_id,
      nir_iand(b, group_complete, nir_ieq_imm(b, vmod4, 1)),
      prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);

   emit_prim_store(b, buf, output_count, instance_id,
      nir_iand(b, group_complete, nir_ieq_imm(b, vmod4, 2)),
      prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
}

/* LINE_STRIP_WITH_ADJACENCY: prim p emits {p+1, p+2}; there are N-3 prims
 * (p in [0, N-4]), the first and last input vertices being adjacency-only.
 *   v contributes to prim v-1 slot 0 (1 <= v <= N-3)
 *   v contributes to prim v-2 slot 1 (2 <= v <= N-2)
 *
 * N-2 / N-1 are compared unsigned; the dispatcher only reaches this code
 * when the output count is non-zero, which keeps them from wrapping.
 */
static void
emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
                    nir_def *buf, nir_def *output_count, nir_def *instance_id,
                    nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *Nm1 = nir_iadd_imm(b, N, -1);
   nir_def *Nm2 = nir_iadd_imm(b, N, -2);

   /* Prim v-1, slot 0: prim must be < N-3 ⇔ 1 <= v AND v < N-2 */
   {
      nir_def *prim = nir_iadd_imm(b, v, -1);
      nir_def *eligible = nir_iand(b,
         nir_uge(b, v, nir_imm_int(b, 1)),
         nir_ult(b, v, Nm2));
      emit_prim_store(b, buf, output_count, instance_id, eligible,
                      prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
   }

   /* Prim v-2, slot 1: prim must be < N-3 ⇔ 2 <= v AND v < N-1 */
   {
      nir_def *prim = nir_iadd_imm(b, v, -2);
      nir_def *eligible = nir_iand(b,
         nir_uge(b, v, nir_imm_int(b, 2)),
         nir_ult(b, v, Nm1));
      emit_prim_store(b, buf, output_count, instance_id, eligible,
                      prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
   }
}

/* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}.
 *   v contributes if v%6 == 0: prim v/6 slot 0
 *   v contributes if v%6 == 2: prim v/6 slot 1
 *   v contributes if v%6 == 4: prim v/6 slot 2
 *
 * Only the N/6 complete groups form a triangle. Vertices that belong to a
 * trailing partial group (N not a multiple of 6) must not store, since the
 * primitive they would address does not exist.
 */
static void
emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N,
                  nir_def *buf, nir_def *output_count, nir_def *instance_id,
                  nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   nir_def *vmod6 = nir_umod_imm(b, v, 6);
   nir_def *prim = nir_udiv_imm(b, v, 6);

   /* prim < N / 6: the group this vertex sits in is complete. */
   nir_def *num_prims = nir_udiv_imm(b, N, 6);
   nir_def *group_complete = nir_ult(b, prim, num_prims);

   for (uint32_t slot = 0; slot < 3; slot++) {
      /* Slot s is fed by the vertex at position 2*s inside the group. */
      nir_def *feeds_slot = nir_ieq_imm(b, vmod6, slot * 2);

      emit_prim_store(b, buf, output_count, instance_id,
         nir_iand(b, group_complete, feeds_slot),
         prim, nir_imm_int(b, slot), 3, value, stride, offset_bytes);
   }
}

/* TRIANGLE_STRIP_WITH_ADJACENCY decomposition. Triangle i is written as
 *   even i: {2i, 2i+2, 2i+4}
 *   odd  i: {2i, 2i+4, 2i+2}
 * so only even input vertices are ever stored, and N/2 - 2 triangles exist.
 *
 * With h = v/2, an even vertex v feeds:
 *   triangle h    slot 0
 *   triangle h-1  slot 1 (even triangle) / slot 2 (odd)   needs v >= 2
 *   triangle h-2  slot 2 (even triangle) / slot 1 (odd)   needs v >= 4
 * each additionally requiring triangle < N/2 - 2 (unsigned compare).
 */
static void
emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
                   nir_def *buf, nir_def *output_count, nir_def *instance_id,
                   nir_def *value, uint16_t stride, uint16_t offset_bytes)
{
   /* Odd vertices are adjacency-only: nothing is emitted for them. */
   nir_def *low_bit = nir_iand_imm(b, v, 1u);
   nir_push_if(b, nir_ieq_imm(b, low_bit, 0));
   {
      nir_def *half_n = nir_ushr_imm(b, N, 1);
      nir_def *tri_count = nir_iadd_imm(b, half_n, -2);  /* N/2 - 2 */
      nir_def *half_v = nir_ushr_imm(b, v, 1);

      /* Triangle h, slot 0. */
      nir_def *opens_tri = nir_ult(b, half_v, tri_count);
      emit_prim_store(b, buf, output_count, instance_id, opens_tri,
                      half_v, nir_imm_int(b, 0), 3, value, stride,
                      offset_bytes);

      /* Triangle h-1, slot 1 + parity. */
      nir_def *tri_m1 = nir_iadd_imm(b, half_v, -1);
      nir_def *parity_m1 = nir_iand_imm(b, tri_m1, 1u);
      nir_def *slot_m1 = nir_iadd_imm(b, parity_m1, 1);
      nir_def *m1_lo = nir_uge(b, v, nir_imm_int(b, 2));
      nir_def *m1_hi = nir_ult(b, tri_m1, tri_count);
      emit_prim_store(b, buf, output_count, instance_id,
                      nir_iand(b, m1_lo, m1_hi),
                      tri_m1, slot_m1, 3, value, stride, offset_bytes);

      /* Triangle h-2, slot 2 - parity. */
      nir_def *tri_m2 = nir_iadd_imm(b, half_v, -2);
      nir_def *parity_m2 = nir_iand_imm(b, tri_m2, 1u);
      nir_def *slot_m2 = nir_isub(b, nir_imm_int(b, 2), parity_m2);
      nir_def *m2_lo = nir_uge(b, v, nir_imm_int(b, 4));
      nir_def *m2_hi = nir_ult(b, tri_m2, tri_count);
      emit_prim_store(b, buf, output_count, instance_id,
                      nir_iand(b, m2_lo, m2_hi),
                      tri_m2, slot_m2, 3, value, stride, offset_bytes);
   }
   nir_pop_if(b, NULL);
}

/* ----- Main lowering: per store_output XFB channel ----- */

/* Lower one XFB output (one io_xfb slot of a store_output) into global
 * stores, inserted at the builder's current cursor.
 *
 * channel_idx    first component of the stored value that belongs to this
 *                output
 * num_components number of consecutive components to write
 * buffer         XFB buffer index
 * offset_words   offset inside the record, multiplied by 4 to get bytes
 *
 * The generated code branches at runtime on the vs.xfb_topology sysval:
 * LIST takes a single unconditional store; every other topology goes
 * through its emit_* helper, and only if vs.xfb_output_count is non-zero.
 */
static void
lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr,
                        unsigned channel_idx, unsigned num_components,
                        unsigned buffer, unsigned offset_words)
{
   typedef void (*topo_emit_fn)(nir_builder *b, nir_def *v, nir_def *N,
                                nir_def *buf, nir_def *output_count,
                                nir_def *instance_id, nir_def *value,
                                uint16_t stride, uint16_t offset_bytes);

   /* Topologies that get an explicit equality test, in test order.
    * TRI_STRIP_ADJ is not listed: it is the final else of the chain. */
   const struct {
      uint64_t topo;
      topo_emit_fn emit;
   } tested[] = {
      { PANVK_XFB_TOPO_TRI_STRIP, emit_tri_strip },
      { PANVK_XFB_TOPO_LINE_STRIP, emit_line_strip },
      { PANVK_XFB_TOPO_TRI_FAN, emit_tri_fan },
      { PANVK_XFB_TOPO_LINE_LIST_ADJ, emit_line_list_adj },
      { PANVK_XFB_TOPO_LINE_STRIP_ADJ, emit_line_strip_adj },
      { PANVK_XFB_TOPO_TRI_LIST_ADJ, emit_tri_list_adj },
   };
   const unsigned num_tested = sizeof(tested) / sizeof(tested[0]);

   assert(buffer < MAX_XFB_BUFFERS);
   assert(nir_intrinsic_component(intr) == 0);

   uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
   assert(stride != 0);
   uint16_t offset_bytes = offset_words * 4;

   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);

   /* Runtime inputs shared by every topology branch. */
   nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology);
   nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count);
   nir_def *N = nir_load_num_vertices(b);
   nir_def *v = nir_load_raw_vertex_id_pan(b);
   nir_def *instance = nir_load_instance_id(b);
   nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer);

   /* Pick this output's components out of the stored vector. */
   nir_component_mask_t mask = nir_component_mask(num_components);
   nir_def *value = nir_channels(b, intr->src[0].ssa, mask << channel_idx);

   nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST));
   {
      /* Fast path: one store per vertex. */
      emit_list_store(b, buf, out_count, instance, v, value,
                      stride, offset_bytes);
   }
   nir_push_else(b, NULL);
   {
      /* Emit nothing when the output count is zero (input count below the
       * topology's minimum): the N-1 / N-2 style bounds used by the emit_*
       * helpers would wrap and enable stores that must not happen. */
      nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count));
      {
         /* Open an if/else chain: each listed topology lives in the then
          * side, the else side hosts the next test. */
         for (unsigned i = 0; i < num_tested; i++) {
            nir_push_if(b, nir_ieq_imm(b, topology, tested[i].topo));
            tested[i].emit(b, v, N, buf, out_count, instance, value,
                           stride, offset_bytes);
            nir_push_else(b, NULL);
         }

         /* Innermost else: TRI_STRIP_ADJ is the only topology left. */
         emit_tri_strip_adj(b, v, N, buf, out_count, instance, value,
                            stride, offset_bytes);

         /* Close the chain, innermost first. */
         for (unsigned i = 0; i < num_tested; i++)
            nir_pop_if(b, NULL);
      }
      nir_pop_if(b, NULL);  /* output count guard */
   }
   nir_pop_if(b, NULL);
}

/* Per-intrinsic callback of the pass.
 *
 *  - load_vertex_id is replaced by raw_vertex_id_pan + raw_vertex_offset_pan.
 *  - store_output has each XFB output it carries lowered through
 *    lower_xfb_output_iter17, and is removed if at least one was found.
 *  - anything else is left untouched.
 *
 * Returns true when the instruction was rewritten. */
static bool
lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr,
                 UNUSED void *data)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_vertex_id: {
      b->cursor = nir_instr_remove(&intr->instr);

      nir_def *raw_id = nir_load_raw_vertex_id_pan(b);
      nir_def *raw_offset = nir_load_raw_vertex_offset_pan(b);
      nir_def_rewrite_uses(&intr->def, nir_iadd(b, raw_id, raw_offset));
      return true;
   }

   case nir_intrinsic_store_output:
      break;

   default:
      return false;
   }

   b->cursor = nir_before_instr(&intr->instr);

   /* Four possible outputs: 0-1 are described by io_xfb, 2-3 by io_xfb2,
    * each annotation holding two entries. */
   bool lowered_any = false;
   for (unsigned chan = 0; chan < 4; ++chan) {
      nir_io_xfb xfb = (chan < 2) ? nir_intrinsic_io_xfb(intr)
                                  : nir_intrinsic_io_xfb2(intr);
      unsigned entry = chan % 2;
      unsigned count = xfb.out[entry].num_components;

      if (count == 0)
         continue;

      lower_xfb_output_iter17(b, intr, chan, count,
                              xfb.out[entry].buffer, xfb.out[entry].offset);
      lowered_any = true;
   }

   if (!lowered_any)
      return false;

   nir_instr_remove(&intr->instr);
   return true;
}

/* Run the topology-aware XFB lowering over every intrinsic of the shader.
 *
 * Returns true if any instruction was rewritten.
 *
 * No metadata is declared preserved: lowering a store_output inserts nir_if
 * (and, for triangle fans, nir_loop) nodes, which splits blocks and so
 * invalidates both block indices and dominance information.
 */
bool
panvk_per_arch(nir_lower_xfb)(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(
      nir, lower_xfb_iter17, nir_metadata_none, NULL);
}

#endif /* PAN_ARCH < 9 */