From 4db64917bcbc95030b456381f3597b805ba7e5cf Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Fri, 22 May 2026 09:49:59 +0200 Subject: [PATCH] mesa-panvk-bifrost-video: r1-r4 patches as real files (symlinks broke CI) The original PR #79 used symlinks for 0001..0004 patches (pointing into ../mesa-panvk-bifrost/) to avoid drift between siblings. CI's "cp -r arch/mesa-panvk-bifrost-video /tmp/build-..." preserves the symlinks, but the destination /tmp/build-... has no sibling dir to resolve them against, so makepkg errors with: ==> ERROR: 0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch was not found in the build directory and is not a URL. Each Arch PKGBUILD owns its source files per convention; the duplication risk is low because r1..r4 are closed-release patches. Co-Authored-By: Claude Opus 4.7 --- ...e-robustness2-nullDescriptor-bifrost.patch | 58 +- ...nvk-expose-vulkan-1.1-1.2-on-bifrost.patch | 48 +- ...vk-bifrost-vk-ext-transform-feedback.patch | 329 ++++++++- ...-bifrost-xfb-primitive-decomposition.patch | 630 +++++++++++++++++- 4 files changed, 1061 insertions(+), 4 deletions(-) mode change 120000 => 100644 arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch mode change 120000 => 100644 arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch mode change 120000 => 100644 arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch mode change 120000 => 100644 arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch diff --git a/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch b/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch deleted file mode 120000 index 0dcf8589b4..0000000000 --- a/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch +++ /dev/null @@ -1 +0,0 @@ 
-../mesa-panvk-bifrost/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch \ No newline at end of file diff --git a/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch b/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch new file mode 100644 index 0000000000..8d2a377c5d --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch @@ -0,0 +1,57 @@ +From: claude-noether (on behalf of mfritsche) +Date: 2026-05-19 +Subject: panvk: expose VK_KHR/EXT_robustness2 + nullDescriptor on Bifrost (PAN_ARCH 6/7) + +Without this, Mesa's Zink driver refuses to use PanVk-Bifrost as its Vulkan +backend, falling back silently to llvmpipe (software rasterizer) for all +GL-via-Zink on Bifrost SBCs. That defeats the entire purpose of having a +Vulkan driver on Bifrost — GL acceleration via Zink is the most natural +near-term consumer. + +panvk_vX_nir_lower_descriptors.c:1309 and panvk_vX_shader.c:1355 already +plumb dev->vk.enabled_features.nullDescriptor arch-agnostically — the gate +at panvk_vX_physical_device.c was set conservatively when Bifrost was +unmaintained, not because of hardware incapability. + +iter1–7 of the panvk-bifrost campaign proved fundamental driver functions +on Mali-G52 r1 MC1 (PAN_ARCH=7). This patch is the iter8 follow-up. + +robustBufferAccess2 and robustImageAccess2 are NOT flipped — they're +independent rb2 features Zink doesn't require, gated differently +(robustBufferAccess2 = PAN_ARCH >= 11, robustImageAccess2 = false), and +out of scope for iter8. 
+ +--- + src/panfrost/vulkan/panvk_vX_physical_device.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c +--- a/src/panfrost/vulkan/panvk_vX_physical_device.c ++++ b/src/panfrost/vulkan/panvk_vX_physical_device.c +@@ -91,7 +91,7 @@ get_device_extensions(const struct panvk_physical_device *device, + .KHR_pipeline_binary = true, + .KHR_pipeline_executable_properties = true, + .KHR_pipeline_library = true, +- .KHR_robustness2 = PAN_ARCH >= 10, ++ .KHR_robustness2 = true, + .KHR_sampler_mirror_clamp_to_edge = true, + .KHR_sampler_ycbcr_conversion = true, + .KHR_separate_depth_stencil_layouts = true, +@@ -168,7 +168,7 @@ get_device_extensions(const struct panvk_physical_device *device, + .EXT_queue_family_foreign = true, + .EXT_robustness = pan_arch(device->kmod.dev->props.gpu_id) >= 9, + .EXT_image_robustness = true, +- .EXT_robustness2 = PAN_ARCH >= 10, ++ .EXT_robustness2 = true, + .EXT_sampler_filter_minmax = PAN_ARCH >= 10, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, +@@ -493,7 +493,7 @@ get_device_features(const struct panvk_physical_device *device, + /* VK_KHR_robustness2 */ + .robustBufferAccess2 = PAN_ARCH >= 11, + .robustImageAccess2 = false, +- .nullDescriptor = PAN_ARCH >= 10, ++ .nullDescriptor = true, + + /* VK_KHR_shader_clock */ + .shaderSubgroupClock = device->kmod.dev->props.gpu_can_query_timestamp, diff --git a/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch b/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch deleted file mode 120000 index 1d7a265b90..0000000000 --- a/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch +++ /dev/null @@ -1 +0,0 @@ -../mesa-panvk-bifrost/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch \ No newline at end of file diff --git 
a/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch b/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch new file mode 100644 index 0000000000..f44ffcbb56 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch @@ -0,0 +1,47 @@ +From: claude-noether (on behalf of mfritsche) +Date: 2026-05-20 +Subject: panvk: expose Vulkan 1.1 + 1.2 on Bifrost (PAN_ARCH 6/7) + +ANGLE (Chromium's GL stack) requires apiVersion >= 1.1 to initialize. Without +this, Brave / Chromium's GPU process fails at GL info collection: + + vk_renderer.cpp:2659 (initialize): ANGLE Requires a minimum Vulkan device + version of 1.1 + Display::initialize error 0: Internal Vulkan error (-9): The requested + version of Vulkan is not supported by the driver + +Stack-up with iter8's robustness2 patch enables ANGLE → PanVk-Bifrost → +Skia (via --enable-features=Vulkan) on Bifrost SBCs. + +PanVk-Bifrost already supports the bulk of 1.1-promoted features as extensions +(multiview, maintenance1-3, descriptor update template, 16-bit storage, +sampler ycbcr, variable pointers, etc. — all +visible in iter0 vulkaninfo). The version bump primarily bundles them. + +Risk: Vulkan 1.1 has features beyond what iter1–7 exercised (protected memory, +full subgroup ops). Specific app failures will be characterizable. + +1.2 is also flipped — Brave's Vulkan path may want descriptor indexing, +buffer device address, etc. (all listed in iter0 vulkaninfo as supported +extensions, just gated as 1.0-with-extensions, not 1.2-core). 
+ +--- + src/panfrost/vulkan/panvk_vX_physical_device.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c +--- a/src/panfrost/vulkan/panvk_vX_physical_device.c ++++ b/src/panfrost/vulkan/panvk_vX_physical_device.c +@@ -38,8 +38,8 @@ get_device_extensions(const struct panvk_physical_device *device, + struct vk_device_extension_table *ext) + { + *ext = (struct vk_device_extension_table){ +- .KHR_8bit_storage = true, +- .KHR_16bit_storage = true, +- bool has_vk1_1 = PAN_ARCH >= 10; +- bool has_vk1_2 = PAN_ARCH >= 10; ++ .KHR_8bit_storage = true, ++ .KHR_16bit_storage = true, ++ bool has_vk1_1 = true; ++ bool has_vk1_2 = true; + *ext = (struct vk_device_extension_table){ diff --git a/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch b/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch deleted file mode 120000 index 7aebd6f385..0000000000 --- a/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch +++ /dev/null @@ -1 +0,0 @@ -../mesa-panvk-bifrost/0003-panvk-bifrost-vk-ext-transform-feedback.patch \ No newline at end of file diff --git a/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch b/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch new file mode 100644 index 0000000000..4d162fc2e9 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0003-panvk-bifrost-vk-ext-transform-feedback.patch @@ -0,0 +1,328 @@ +--- a/src/panfrost/vulkan/panvk_shader.h 2026-04-29 22:19:00.000000000 +0200 ++++ b/src/panfrost/vulkan/panvk_shader.h 2026-05-20 18:52:53.312698258 +0200 +@@ -150,6 +150,10 @@ + struct { + #if PAN_ARCH < 9 + int32_t raw_vertex_offset; ++ uint32_t num_vertices; /* iter13: XFB needs per-draw vertex count */ ++ /* aligned_u64 attribute below inserts the 4-byte alignment gap ++ * after num_vertices 
automatically — no explicit pad needed. */ ++ aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */ + #endif + int32_t first_vertex; + int32_t base_instance; +--- a/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-20 19:09:29.711145446 +0200 ++++ b/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-20 18:52:54.832720445 +0200 +@@ -169,6 +169,7 @@ + .EXT_provoking_vertex = true, + .EXT_queue_family_foreign = true, + .EXT_robustness2 = true, ++ .EXT_transform_feedback = PAN_ARCH < 9, /* iter13: JM-class only for now */ + .EXT_sampler_filter_minmax = PAN_ARCH >= 10, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, +@@ -495,6 +496,10 @@ + .robustImageAccess2 = false, + .nullDescriptor = true, + ++ /* VK_EXT_transform_feedback (iter13) */ ++ .transformFeedback = PAN_ARCH < 9, ++ .geometryStreams = false, ++ + /* VK_KHR_shader_clock */ + .shaderSubgroupClock = device->kmod.dev->props.gpu_can_query_timestamp, + .shaderDeviceClock = device->kmod.dev->props.timestamp_device_coherent, +@@ -1020,6 +1025,18 @@ + .robustStorageBufferAccessSizeAlignment = 1, + .robustUniformBufferAccessSizeAlignment = 1, + ++ /* VK_EXT_transform_feedback (iter13) */ ++ .maxTransformFeedbackStreams = 1, ++ .maxTransformFeedbackBuffers = 4, ++ .maxTransformFeedbackBufferSize = UINT32_MAX, ++ .maxTransformFeedbackStreamDataSize = 512, ++ .maxTransformFeedbackBufferDataSize = 512, ++ .maxTransformFeedbackBufferDataStride = 2048, ++ .transformFeedbackQueries = false, ++ .transformFeedbackStreamsLinesTriangles = false, ++ .transformFeedbackRasterizationStreamSelect = false, ++ .transformFeedbackDraw = false, ++ + /* VK_EXT_shader_object */ + /* We do not currently support VK_EXT_shader_object but this is used + * internally by vk_shader +--- a/src/panfrost/vulkan/panvk_vX_shader.c 2026-04-29 22:19:00.000000000 +0200 ++++ b/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-20 18:52:56.556745611 +0200 +@@ -21,6 +21,7 @@ + #include 
"panvk_physical_device.h" + #include "panvk_sampler.h" + #include "panvk_shader.h" ++#include "pan_nir.h" /* iter13: pan_nir_lower_xfb */ + + #include "spirv/nir_spirv.h" + #include "util/memstream.h" +@@ -100,6 +101,20 @@ + case nir_intrinsic_load_raw_vertex_offset_pan: + val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset); + break; ++ case nir_intrinsic_load_num_vertices: /* iter13: XFB index calc */ ++ val = load_sysval(b, graphics, bit_size, vs.num_vertices); ++ break; ++ case nir_intrinsic_load_xfb_address: { /* iter13: XFB buffer N base address */ ++ unsigned idx = nir_intrinsic_base(intr); ++ switch (idx) { ++ case 0: val = load_sysval(b, graphics, bit_size, vs.xfb_address[0]); break; ++ case 1: val = load_sysval(b, graphics, bit_size, vs.xfb_address[1]); break; ++ case 2: val = load_sysval(b, graphics, bit_size, vs.xfb_address[2]); break; ++ case 3: val = load_sysval(b, graphics, bit_size, vs.xfb_address[3]); break; ++ default: return false; ++ } ++ break; ++ } + case nir_intrinsic_load_layer_id: + assert(b->shader->info.stage == MESA_SHADER_FRAGMENT); + val = load_sysval(b, graphics, bit_size, layer_id); +@@ -457,6 +472,7 @@ + core_max_id); + + pan_preprocess_nir(nir, pdev->kmod.dev->props.gpu_id); ++ + } + + static void +@@ -870,6 +886,18 @@ + nir_var_shader_in | nir_var_shader_out, UINT32_MAX); + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + glsl_type_size, nir_lower_io_use_interpolated_input_intrinsics); ++ ++#if PAN_ARCH < 9 ++ /* iter13: VK_EXT_transform_feedback — runs AFTER nir_lower_io so that ++ * shader outputs are now store_output intrinsics that pan_nir_lower_xfb ++ * can rewrite to nir_store_global+nir_load_xfb_address. 
*/ ++ if (nir->info.stage == MESA_SHADER_VERTEX && ++ nir->info.has_transform_feedback_varyings) { ++ NIR_PASS(_, nir, nir_opt_constant_folding); ++ NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info); ++ NIR_PASS(_, nir, pan_nir_lower_xfb); ++ } ++#endif + } + + static VkResult +@@ -1288,6 +1316,9 @@ + .view_mask = (state && state->rp) ? state->rp->view_mask : 0, + .robust2_modes = robust2_modes, + .robust_descriptors = dev->vk.enabled_features.nullDescriptor, ++ /* iter13: XFB shaders must disable IDVS (matches Panfrost-Gallium). */ ++ .no_idvs = (info->stage == MESA_SHADER_VERTEX) && ++ info->nir->info.has_transform_feedback_varyings, + }; + + switch (info->stage) { +--- a/src/panfrost/vulkan/panvk_cmd_draw.h 2026-04-29 22:19:00.000000000 +0200 ++++ b/src/panfrost/vulkan/panvk_cmd_draw.h 2026-05-20 18:52:57.748763011 +0200 +@@ -135,6 +135,19 @@ + struct panvk_graphics_sysvals sysvals; + + #if PAN_ARCH < 9 ++ /* iter13: VK_EXT_transform_feedback state (JM-class only for now). */ ++ struct { ++ bool active; ++ uint32_t buffer_count; ++ struct { ++ uint64_t addr; ++ uint64_t offset; ++ uint64_t size; ++ } buffers[4]; ++ } xfb; ++#endif ++ ++#if PAN_ARCH < 9 + struct panvk_shader_link link; + #endif + +--- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-04-29 22:19:00.000000000 +0200 ++++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-20 19:10:23.031919662 +0200 +@@ -10,6 +10,7 @@ + #include "panvk_entrypoints.h" + + #include "pan_desc.h" ++#include "pan_compiler.h" /* PAN_SHADER_OOB_ADDRESS */ + #include "pan_util.h" + + static void +@@ -722,6 +723,35 @@ + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset, + info->vertex.raw_offset); + set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id); ++ ++ /* iter13: VK_EXT_transform_feedback sysvals — always set (per draw), ++ * reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. 
*/ ++ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count); ++ { ++ const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx; ++ /* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS ++ * (= 1<<63). This is the Panfrost-Gallium memory-sink idiom — the ++ * Bifrost MMU silently discards stores to this address, so a pipeline ++ * with XFB outputs used in a non-XFB draw (or in an XFB draw with ++ * fewer bound buffers than the shader declares) is safe instead of ++ * faulting. See gallium/drivers/panfrost/pan_cmdstream.c PAN_SYSVAL_XFB. */ ++ uint64_t _xa0 = PAN_SHADER_OOB_ADDRESS, _xa1 = PAN_SHADER_OOB_ADDRESS, ++ _xa2 = PAN_SHADER_OOB_ADDRESS, _xa3 = PAN_SHADER_OOB_ADDRESS; ++ if (_gfx->xfb.active) { ++ if (_gfx->xfb.buffer_count > 0 && _gfx->xfb.buffers[0].addr) ++ _xa0 = _gfx->xfb.buffers[0].addr + _gfx->xfb.buffers[0].offset; ++ if (_gfx->xfb.buffer_count > 1 && _gfx->xfb.buffers[1].addr) ++ _xa1 = _gfx->xfb.buffers[1].addr + _gfx->xfb.buffers[1].offset; ++ if (_gfx->xfb.buffer_count > 2 && _gfx->xfb.buffers[2].addr) ++ _xa2 = _gfx->xfb.buffers[2].addr + _gfx->xfb.buffers[2].offset; ++ if (_gfx->xfb.buffer_count > 3 && _gfx->xfb.buffers[3].addr) ++ _xa3 = _gfx->xfb.buffers[3].addr + _gfx->xfb.buffers[3].offset; ++ } ++ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[0], _xa0); ++ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[1], _xa1); ++ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[2], _xa2); ++ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[3], _xa3); ++ } + #endif + + if (dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS)) { +--- a/src/panfrost/vulkan/meson.build 2026-04-29 22:19:00.000000000 +0200 ++++ b/src/panfrost/vulkan/meson.build 2026-05-20 18:53:04.484861338 +0200 +@@ -73,6 +73,7 @@ + jm_inc_dir = ['jm'] + jm_files = [ + 'jm/panvk_vX_bind_queue.c', ++ 'jm/panvk_vX_cmd_xfb.c', # iter13 + 'jm/panvk_vX_cmd_buffer.c', + 'jm/panvk_vX_cmd_dispatch.c', + 'jm/panvk_vX_cmd_draw.c', +--- 
a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c 2026-04-29 22:19:00.000000000 +0200 ++++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c 2026-05-20 19:10:26.163965149 +0200 +@@ -473,5 +473,12 @@ + + vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo); + ++#if PAN_ARCH < 9 ++ /* iter13: clear XFB state on Begin so a reused command buffer does not ++ * inherit stale xfb.buffer_count / xfb.active / xfb.buffers[] from a ++ * prior recording. */ ++ memset(&cmdbuf->state.gfx.xfb, 0, sizeof(cmdbuf->state.gfx.xfb)); ++#endif ++ + return VK_SUCCESS; + } +--- a/src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c 2026-05-18 12:50:53.067999996 +0200 ++++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c 2026-05-20 19:10:27.175979847 +0200 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright © 2026 mfritsche / claude-noether ++ * SPDX-License-Identifier: MIT ++ * ++ * iter13: VK_EXT_transform_feedback command handlers for the JM ++ * architecture path (Bifrost v6/v7 + Valhall-JM v9). ++ * ++ * The runtime contract: ++ * - vkCmdBindTransformFeedbackBuffersEXT: stash (gpu_addr, offset, size) ++ * for each slot into cmdbuf->state.gfx.xfb.buffers[]. ++ * - vkCmdBeginTransformFeedbackEXT: set cmdbuf->state.gfx.xfb.active = true. ++ * No explicit dirtying: the next draw's set_gfx_sysval re-emits vs.xfb_address[]. ++ * - vkCmdEndTransformFeedbackEXT: set active = false. ++ * ++ * Counter buffers (firstCounterBuffer/counterBufferCount/pCounterBuffers/ ++ * pCounterBufferOffsets) are accepted by API but ignored — v1 doesn't ++ * support pause/resume. transformFeedbackDraw is advertised as false. ++ * ++ * Per-draw integration: panvk_vX_cmd_draw.c reads cmdbuf->state.gfx.xfb ++ * and populates vs.xfb_address[i] for shader use. The pan_nir_lower_xfb ++ * pass in panvk_vX_shader.c emits nir_load_xfb_address(i) which lowers ++ * (via panvk_vX_shader.c sysval handler) to a load from the per-draw ++ * sysval push area. 
++ */ ++ ++#include "vk_log.h" ++#include "util/log.h" ++ ++#include "panvk_cmd_buffer.h" ++#include "panvk_cmd_draw.h" ++#include "panvk_buffer.h" ++#include "panvk_entrypoints.h" ++ ++VKAPI_ATTR void VKAPI_CALL ++panvk_per_arch(CmdBindTransformFeedbackBuffersEXT)( ++ VkCommandBuffer commandBuffer, ++ uint32_t firstBinding, ++ uint32_t bindingCount, ++ const VkBuffer *pBuffers, ++ const VkDeviceSize *pOffsets, ++ const VkDeviceSize *pSizes) ++{ ++ VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); ++ struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx; ++ ++ for (uint32_t i = 0; i < bindingCount; i++) { ++ uint32_t slot = firstBinding + i; ++ if (slot >= 4) ++ continue; ++ ++ VK_FROM_HANDLE(panvk_buffer, buf, pBuffers[i]); ++ gfx->xfb.buffers[slot].addr = panvk_buffer_gpu_ptr(buf, 0); ++ gfx->xfb.buffers[slot].offset = pOffsets[i]; ++ gfx->xfb.buffers[slot].size = ++ (pSizes != NULL && pSizes[i] != VK_WHOLE_SIZE) ++ ? pSizes[i] ++ : (buf->vk.size - pOffsets[i]); ++ } ++ ++ if (firstBinding + bindingCount > gfx->xfb.buffer_count) ++ gfx->xfb.buffer_count = firstBinding + bindingCount; ++} ++ ++VKAPI_ATTR void VKAPI_CALL ++panvk_per_arch(CmdBeginTransformFeedbackEXT)( ++ VkCommandBuffer commandBuffer, ++ uint32_t firstCounterBuffer, ++ uint32_t counterBufferCount, ++ const VkBuffer *pCounterBuffers, ++ const VkDeviceSize *pCounterBufferOffsets) ++{ ++ VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); ++ struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx; ++ ++ /* Counter buffers ignored in v1 — see VkPhysicalDeviceTransformFeedback ++ * PropertiesEXT.transformFeedbackDraw = false in panvk_vX_physical_device.c. ++ * App is spec-compliant if it does not pass counter buffers (which our ++ * features advertisement allows), but warn loudly if it does so we do not ++ * silently produce wrong capture state. 
*/ ++ (void)firstCounterBuffer; ++ (void)pCounterBufferOffsets; ++ if (counterBufferCount > 0 && pCounterBuffers != NULL) { ++ mesa_logw("panvk: CmdBeginTransformFeedbackEXT: counter buffers not " ++ "implemented (transformFeedbackDraw=false); XFB resume will " ++ "restart at buffer offset 0"); ++ } ++ ++ gfx->xfb.active = true; ++ /* Per-draw set_gfx_sysval picks up the change automatically — no ++ * explicit dirty marking required (set_gfx_sysval uses memcmp + ++ * BITSET to detect state diffs and re-emit sysvals). */ ++} ++ ++VKAPI_ATTR void VKAPI_CALL ++panvk_per_arch(CmdEndTransformFeedbackEXT)( ++ VkCommandBuffer commandBuffer, ++ uint32_t firstCounterBuffer, ++ uint32_t counterBufferCount, ++ const VkBuffer *pCounterBuffers, ++ const VkDeviceSize *pCounterBufferOffsets) ++{ ++ VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); ++ struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx; ++ ++ (void)firstCounterBuffer; ++ (void)counterBufferCount; ++ (void)pCounterBuffers; ++ (void)pCounterBufferOffsets; ++ ++ gfx->xfb.active = false; ++} diff --git a/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch b/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch deleted file mode 120000 index e9ba2ffbbb..0000000000 --- a/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch +++ /dev/null @@ -1 +0,0 @@ -../mesa-panvk-bifrost/0004-panvk-bifrost-xfb-primitive-decomposition.patch \ No newline at end of file diff --git a/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch b/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch new file mode 100644 index 0000000000..c015062ff6 --- /dev/null +++ b/arch/mesa-panvk-bifrost-video/0004-panvk-bifrost-xfb-primitive-decomposition.patch @@ -0,0 +1,629 @@ +diff -urN a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build +--- a/src/panfrost/vulkan/meson.build 
2026-05-21 14:04:02.529474145 +0200 ++++ b/src/panfrost/vulkan/meson.build 2026-05-21 14:04:04.106755486 +0200 +@@ -123,6 +123,7 @@ + 'panvk_vX_nir_lower_input_attachment_loads.c', + 'panvk_vX_sampler.c', + 'panvk_vX_shader.c', ++ 'panvk_vX_xfb_lower.c', + sha1_h, + ] + +diff -urN a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h +--- a/src/panfrost/vulkan/panvk_shader.h 2026-05-21 14:04:02.525251986 +0200 ++++ b/src/panfrost/vulkan/panvk_shader.h 2026-05-21 14:04:04.084251800 +0200 +@@ -154,6 +154,8 @@ + /* aligned_u64 attribute below inserts the 4-byte alignment gap + * after num_vertices automatically — no explicit pad needed. */ + aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */ ++ uint32_t xfb_topology; /* iter17: panvk_xfb_topology enum value */ ++ uint32_t xfb_output_count; /* iter17: per-instance output verts after decomp */ + #endif + int32_t first_vertex; + int32_t base_instance; +@@ -569,4 +571,76 @@ + struct pan_compute_dim local_size, const void *bin_ptr, size_t bin_size, + struct panvk_shader **shader_out); + ++ ++#if PAN_ARCH < 9 ++/* iter17: encoding for vs.xfb_topology sysval. Maps VkPrimitiveTopology values ++ * we need to distinguish at shader runtime for XFB capture. LIST topologies ++ * use the iter13 single-store fast path; non-LIST need per-vertex decomposition. */ ++enum panvk_xfb_topology { ++ PANVK_XFB_TOPO_LIST = 0, ++ PANVK_XFB_TOPO_LINE_STRIP = 1, ++ PANVK_XFB_TOPO_TRI_STRIP = 2, ++ PANVK_XFB_TOPO_TRI_FAN = 3, ++ PANVK_XFB_TOPO_LINE_LIST_ADJ = 4, ++ PANVK_XFB_TOPO_LINE_STRIP_ADJ = 5, ++ PANVK_XFB_TOPO_TRI_LIST_ADJ = 6, ++ PANVK_XFB_TOPO_TRI_STRIP_ADJ = 7, ++}; ++ ++#include "panvk_macros.h" ++struct nir_shader; ++bool panvk_per_arch(nir_lower_xfb)(struct nir_shader *nir); ++ ++/* Map VkPrimitiveTopology to panvk_xfb_topology enum (driver-side helper). 
*/ ++static inline uint32_t ++panvk_vk_topology_to_xfb_enum(VkPrimitiveTopology topo) ++{ ++ switch (topo) { ++ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: ++ return PANVK_XFB_TOPO_LINE_STRIP; ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: ++ return PANVK_XFB_TOPO_TRI_STRIP; ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: ++ return PANVK_XFB_TOPO_TRI_FAN; ++ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: ++ return PANVK_XFB_TOPO_LINE_LIST_ADJ; ++ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: ++ return PANVK_XFB_TOPO_LINE_STRIP_ADJ; ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: ++ return PANVK_XFB_TOPO_TRI_LIST_ADJ; ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: ++ return PANVK_XFB_TOPO_TRI_STRIP_ADJ; ++ case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: ++ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: ++ default: ++ return PANVK_XFB_TOPO_LIST; ++ } ++} ++ ++/* Compute the per-instance output vertex count for a given (topology, input count). */ ++static inline uint32_t ++panvk_xfb_output_count(VkPrimitiveTopology topo, uint32_t input_count) ++{ ++ switch (topo) { ++ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: ++ return input_count >= 1 ? 2u * (input_count - 1u) : 0u; ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: ++ return input_count >= 2 ? 3u * (input_count - 2u) : 0u; ++ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: ++ return (input_count / 4u) * 2u; ++ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: ++ return input_count >= 3 ? 2u * (input_count - 3u) : 0u; ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: ++ return (input_count / 6u) * 3u; ++ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: ++ return input_count >= 6 ? 
3u * (input_count / 2u - 2u) : 0u; ++ default: ++ return input_count; /* LIST topologies: 1:1 mapping */ ++ } ++} ++#endif ++ ++ + #endif +diff -urN a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c +--- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-21 14:04:02.528576354 +0200 ++++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-21 14:04:04.091357598 +0200 +@@ -727,6 +727,20 @@ + /* iter13: VK_EXT_transform_feedback sysvals — always set (per draw), + * reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */ + set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count); ++ ++ /* iter17: XFB primitive-decomposition sysvals. ++ * xfb_topology = enum value for the current bound topology. ++ * xfb_output_count = per-instance output vertex count after decomposition. ++ * For LIST topologies, output_count == input vertex count and the shader ++ * takes the iter13 single-store fast path. */ ++ { ++ VkPrimitiveTopology vk_topo = ++ cmdbuf->vk.dynamic_graphics_state.ia.primitive_topology; ++ uint32_t topo_enum = panvk_vk_topology_to_xfb_enum(vk_topo); ++ uint32_t out_count = panvk_xfb_output_count(vk_topo, info->vertex.count); ++ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_topology, topo_enum); ++ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_output_count, out_count); ++ } + { + const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx; + /* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS +diff -urN a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c +--- a/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-21 14:04:02.527576494 +0200 ++++ b/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-21 14:04:04.098356619 +0200 +@@ -895,7 +895,10 @@ + nir->info.has_transform_feedback_varyings) { + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info); +- NIR_PASS(_, nir, pan_nir_lower_xfb); ++ /* iter17: panvk-specific replacement 
for pan_nir_lower_xfb that handles ++ * primitive decomposition for non-LIST topologies. Single-store LIST ++ * fast path matches iter13 behavior. */ ++ NIR_PASS(_, nir, panvk_per_arch(nir_lower_xfb)); + } + #endif + } +diff -urN a/src/panfrost/vulkan/panvk_vX_xfb_lower.c b/src/panfrost/vulkan/panvk_vX_xfb_lower.c +--- a/src/panfrost/vulkan/panvk_vX_xfb_lower.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/src/panfrost/vulkan/panvk_vX_xfb_lower.c 2026-05-21 14:04:04.115354242 +0200 +@@ -0,0 +1,486 @@ ++/* ++ * Copyright © 2026 mfritsche / claude-noether ++ * SPDX-License-Identifier: MIT ++ * ++ * iter17: panvk-specific replacement for pan_nir_lower_xfb that handles ++ * primitive decomposition for transform_feedback on non-LIST topologies ++ * (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY). ++ * ++ * Approach: emit a topology dispatch at the start of each store_output ++ * lowering. The shader reads vs.xfb_topology sysval at runtime and branches ++ * into per-topology emission logic. For each affected topology, the lowered ++ * code emits guarded conditional stores — one per primitive this vertex ++ * contributes to, computing the output buffer position via primitive index ++ * and slot within the decomposed primitive. ++ * ++ * For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that ++ * matches iter13's single-store behavior. ++ * ++ * For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives ++ * as slot 2 — handled via a NIR loop bounded by num_vertices. ++ * ++ * See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context. 
++ */ ++ ++#include "panvk_macros.h" ++ ++#if PAN_ARCH < 9 ++ ++#include "panvk_shader.h" ++ ++#include "compiler/nir/nir_builder.h" ++#include "pan_nir.h" ++ ++#include ++ ++/* ----- Address arithmetic ----- */ ++ ++static nir_def * ++xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx, ++ uint16_t stride, uint16_t offset_bytes) ++{ ++ nir_def *byte_off = nir_iadd_imm(b, ++ nir_imul_imm(b, out_idx, stride), offset_bytes); ++ return nir_iadd(b, buf, nir_u2u64(b, byte_off)); ++} ++ ++static void ++emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count, ++ nir_def *instance_id, nir_def *raw_vid, nir_def *value, ++ uint16_t stride, uint16_t offset_bytes) ++{ ++ nir_def *out_idx = nir_iadd(b, ++ nir_imul(b, instance_id, output_count), raw_vid); ++ nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes); ++ nir_store_global(b, value, addr); ++} ++ ++static void ++emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count, ++ nir_def *instance_id, nir_def *eligible, ++ nir_def *prim_idx, nir_def *slot, ++ uint32_t verts_per_prim, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ nir_push_if(b, eligible); ++ { ++ nir_def *out_idx = nir_iadd(b, ++ nir_imul(b, instance_id, output_count), ++ nir_iadd(b, nir_imul_imm(b, prim_idx, verts_per_prim), slot)); ++ nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes); ++ nir_store_global(b, value, addr); ++ } ++ nir_pop_if(b, NULL); ++} ++ ++/* ----- Per-topology emission ----- */ ++ ++/* TRIANGLE_STRIP: vertex v contributes to prims v, v-1, v-2 (per eligibility). 
*/ ++static void ++emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N, ++ nir_def *buf, nir_def *output_count, nir_def *instance_id, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ nir_def *Nm2 = nir_iadd_imm(b, nir_umax(b, N, nir_imm_int(b, 2)), -2); /* clamp at 0: for N < 2 a plain N-2 wraps to ~2^32 and "v < N-2" would pass */ ++ nir_def *Nm1 = nir_iadd_imm(b, N, -1); ++ ++ /* Prim v, slot 0: v < N-2 */ ++ emit_prim_store(b, buf, output_count, instance_id, ++ nir_ult(b, v, Nm2), ++ v, nir_imm_int(b, 0), 3, value, stride, offset_bytes); ++ ++ /* Prim v-1, slot = 1 if prim even else 2: 1 <= v < N-1 */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v, -1); ++ nir_def *parity = nir_iand_imm(b, prim, 1u); ++ nir_def *slot = nir_iadd_imm(b, parity, 1); ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 1)), ++ nir_ult(b, v, Nm1)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, slot, 3, value, stride, offset_bytes); ++ } ++ ++ /* Prim v-2, slot = 2 if prim even else 1: 2 <= v < N */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v, -2); ++ nir_def *parity = nir_iand_imm(b, prim, 1u); ++ nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity); ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 2)), ++ nir_ult(b, v, N)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, slot, 3, value, stride, offset_bytes); ++ } ++} ++ ++/* LINE_STRIP: vertex v contributes to prim v slot 0 + prim v-1 slot 1. 
*/ ++static void ++emit_line_strip(nir_builder *b, nir_def *v, nir_def *N, ++ nir_def *buf, nir_def *output_count, nir_def *instance_id, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ nir_def *Nm1 = nir_iadd_imm(b, N, -1); ++ ++ /* Prim v, slot 0: v < N-1 */ ++ emit_prim_store(b, buf, output_count, instance_id, ++ nir_ult(b, v, Nm1), ++ v, nir_imm_int(b, 0), 2, value, stride, offset_bytes); ++ ++ /* Prim v-1, slot 1: 1 <= v < N */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v, -1); ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 1)), ++ nir_ult(b, v, N)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes); ++ } ++} ++ ++/* TRIANGLE_FAN: prim p emits {p+1, p+2, 0}. ++ * vertex v=0: contributes to ALL prims as slot 2 (loop required) ++ * vertex v>=1: contributes to prim v-1 as slot 0 (if 1 <= v <= N-2) ++ * vertex v>=2: contributes to prim v-2 as slot 1 (if 2 <= v <= N-1) ++ */ ++static void ++emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N, ++ nir_def *buf, nir_def *output_count, nir_def *instance_id, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ nir_def *Nm1 = nir_iadd_imm(b, N, -1); ++ nir_def *Nm2 = nir_iadd_imm(b, N, -2); ++ ++ /* Prim v-1, slot 0: 1 <= v < N-1 */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v, -1); ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 1)), ++ nir_ult(b, v, Nm1)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, nir_imm_int(b, 0), 3, value, stride, offset_bytes); ++ } ++ ++ /* Prim v-2, slot 1: 2 <= v < N */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v, -2); ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 2)), ++ nir_ult(b, v, N)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, nir_imm_int(b, 1), 3, value, stride, offset_bytes); ++ } ++ ++ /* Central vertex (v == 0): loop over all prims, write to slot 2. 
*/ ++ nir_push_if(b, nir_ieq_imm(b, v, 0)); ++ { ++ nir_variable *p_var = nir_local_variable_create(b->impl, ++ glsl_uint_type(), "fan_p"); ++ nir_store_var(b, p_var, nir_imm_int(b, 0), 0x1); ++ nir_push_loop(b); ++ { ++ nir_def *p = nir_load_var(b, p_var); ++ nir_push_if(b, nir_uge(b, p, Nm2)); ++ { ++ nir_jump(b, nir_jump_break); ++ } ++ nir_pop_if(b, NULL); ++ ++ nir_def *out_idx = nir_iadd(b, ++ nir_imul(b, instance_id, output_count), ++ nir_iadd_imm(b, nir_imul_imm(b, p, 3), 2)); ++ nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes); ++ nir_store_global(b, value, addr); ++ ++ nir_store_var(b, p_var, nir_iadd_imm(b, p, 1), 0x1); ++ } ++ nir_pop_loop(b, NULL); ++ } ++ nir_pop_if(b, NULL); ++} ++ ++/* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}. ++ * v contributes if v%4 == 1: prim v/4 slot 0 ++ * v contributes if v%4 == 2: prim v/4 slot 1 ++ */ ++static void ++emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N, ++ nir_def *buf, nir_def *output_count, nir_def *instance_id, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ (void)N; /* eligibility is mod-based, not range-based */ ++ nir_def *vmod4 = nir_iand_imm(b, v, 3u); ++ nir_def *prim = nir_ushr_imm(b, v, 2); /* v / 4 */ ++ ++ emit_prim_store(b, buf, output_count, instance_id, ++ nir_ieq_imm(b, vmod4, 1), ++ prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes); ++ ++ emit_prim_store(b, buf, output_count, instance_id, ++ nir_ieq_imm(b, vmod4, 2), ++ prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes); ++} ++ ++/* LINE_STRIP_WITH_ADJACENCY: prim p emits {p+1, p+2}. 
++ * v contributes to prim v-1 slot 0 (1 <= v <= N-2) ++ * v contributes to prim v-2 slot 1 (2 <= v <= N-1) ++ */ ++static void ++emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N, ++ nir_def *buf, nir_def *output_count, nir_def *instance_id, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ nir_def *Nm1 = nir_iadd_imm(b, N, -1); ++ nir_def *Nm2 = nir_iadd_imm(b, N, -2); ++ ++ /* Prim v-1, slot 0: 1 <= v <= N-2 ⇔ v >= 1 AND v <= N-2 ⇔ v >= 1 AND v < N-1 */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v, -1); ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 1)), ++ nir_ult(b, v, Nm1)); ++ (void)Nm2; ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes); ++ } ++ ++ /* Prim v-2, slot 1: 2 <= v <= N-1 ⇔ v >= 2 AND v < N */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v, -2); ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 2)), ++ nir_ult(b, v, N)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes); ++ } ++} ++ ++/* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}. 
++ * v contributes if v%6 == 0: prim v/6 slot 0 ++ * v contributes if v%6 == 2: prim v/6 slot 1 ++ * v contributes if v%6 == 4: prim v/6 slot 2 ++ */ ++static void ++emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N, ++ nir_def *buf, nir_def *output_count, nir_def *instance_id, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ (void)N; ++ nir_def *vmod6 = nir_umod_imm(b, v, 6); ++ nir_def *prim = nir_udiv_imm(b, v, 6); ++ ++ for (uint32_t slot = 0; slot < 3; slot++) { ++ emit_prim_store(b, buf, output_count, instance_id, ++ nir_ieq_imm(b, vmod6, slot * 2), ++ prim, nir_imm_int(b, slot), 3, value, stride, offset_bytes); ++ } ++} ++ ++/* TRIANGLE_STRIP_WITH_ADJACENCY: prim i emits: ++ * even i: {2i, 2i+2, 2i+4} (slots 0, 1, 2 ← input indices 2i, 2i+2, 2i+4) ++ * odd i: {2i, 2i+4, 2i+2} (slots 0, 1, 2 ← input indices 2i, 2i+4, 2i+2) ++ * ++ * Only EVEN input vertices contribute (since all output indices are 2*something). ++ * For even input v: ++ * prim v/2 slot 0 (always, if v/2 < N/2-2) ++ * prim (v-2)/2 slot 1 if (v-2)/2 even, slot 2 if odd (when v >= 2) ++ * prim (v-4)/2 slot 2 if (v-4)/2 even, slot 1 if odd (when v >= 4) ++ */ ++static void ++emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N, ++ nir_def *buf, nir_def *output_count, nir_def *instance_id, ++ nir_def *value, uint16_t stride, uint16_t offset_bytes) ++{ ++ /* Bail for odd input vertices — they never contribute. 
*/ ++ nir_def *v_is_even = nir_ieq_imm(b, nir_iand_imm(b, v, 1u), 0); ++ nir_push_if(b, v_is_even); ++ { ++ nir_def *N_half = nir_ushr_imm(b, N, 1); ++ nir_def *max_prim = nir_iadd_imm(b, N_half, -2); /* N/2 - 2 */ ++ nir_def *v_half = nir_ushr_imm(b, v, 1); ++ ++ /* Prim v/2 slot 0: v/2 < N/2 - 2 */ ++ emit_prim_store(b, buf, output_count, instance_id, ++ nir_ult(b, v_half, max_prim), ++ v_half, nir_imm_int(b, 0), 3, value, stride, offset_bytes); ++ ++ /* Prim (v-2)/2 = v/2 - 1: v >= 2 AND prim < N/2-2 */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v_half, -1); ++ nir_def *parity = nir_iand_imm(b, prim, 1u); ++ nir_def *slot = nir_iadd_imm(b, parity, 1); /* even→1, odd→2 */ ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 2)), ++ nir_ult(b, prim, max_prim)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, slot, 3, value, stride, offset_bytes); ++ } ++ ++ /* Prim (v-4)/2 = v/2 - 2: v >= 4 AND prim < N/2-2 */ ++ { ++ nir_def *prim = nir_iadd_imm(b, v_half, -2); ++ nir_def *parity = nir_iand_imm(b, prim, 1u); ++ nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity); /* even→2, odd→1 */ ++ nir_def *eligible = nir_iand(b, ++ nir_uge(b, v, nir_imm_int(b, 4)), ++ nir_ult(b, prim, max_prim)); ++ emit_prim_store(b, buf, output_count, instance_id, eligible, ++ prim, slot, 3, value, stride, offset_bytes); ++ } ++ } ++ nir_pop_if(b, NULL); ++} ++ ++/* ----- Main lowering: per store_output XFB channel ----- */ ++ ++static void ++lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr, ++ unsigned channel_idx, unsigned num_components, ++ unsigned buffer, unsigned offset_words) ++{ ++ assert(buffer < MAX_XFB_BUFFERS); ++ assert(nir_intrinsic_component(intr) == 0); ++ ++ uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; ++ assert(stride != 0); ++ uint16_t offset_bytes = offset_words * 4; ++ ++ BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); ++ BITSET_SET(b->shader->info.system_values_read, 
SYSTEM_VALUE_INSTANCE_ID); ++ ++ nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology); ++ nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count); ++ nir_def *N = nir_load_num_vertices(b); ++ nir_def *v = nir_load_raw_vertex_id_pan(b); ++ nir_def *instance = nir_load_instance_id(b); ++ nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer); ++ ++ nir_def *src = intr->src[0].ssa; ++ nir_component_mask_t mask = nir_component_mask(num_components); ++ nir_def *value = nir_channels(b, src, mask << channel_idx); ++ ++ /* Topology dispatch ladder. LIST first (fast path). */ ++ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST)); ++ { ++ emit_list_store(b, buf, out_count, instance, v, value, ++ stride, offset_bytes); ++ } ++ nir_push_else(b, NULL); ++ { ++ /* iter17 Janet Finding 3: gate all non-LIST emission on ++ * output_count > 0. For degenerate input counts (N < min required ++ * for the topology), output_count is 0 and we must emit NO stores ++ * — otherwise N-2 / N-3 / etc. arithmetic underflows in the ++ * eligibility predicates and we falsely fire stores. 
*/ ++ nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count)); ++ { ++ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP)); ++ { ++ emit_tri_strip(b, v, N, buf, out_count, instance, value, ++ stride, offset_bytes); ++ } ++ nir_push_else(b, NULL); ++ { ++ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP)); ++ { ++ emit_line_strip(b, v, N, buf, out_count, instance, value, ++ stride, offset_bytes); ++ } ++ nir_push_else(b, NULL); ++ { ++ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_FAN)); ++ { ++ emit_tri_fan(b, v, N, buf, out_count, instance, value, ++ stride, offset_bytes); ++ } ++ nir_push_else(b, NULL); ++ { ++ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_LIST_ADJ)); ++ { ++ emit_line_list_adj(b, v, N, buf, out_count, instance, value, ++ stride, offset_bytes); ++ } ++ nir_push_else(b, NULL); ++ { ++ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP_ADJ)); ++ { ++ emit_line_strip_adj(b, v, N, buf, out_count, instance, value, ++ stride, offset_bytes); ++ } ++ nir_push_else(b, NULL); ++ { ++ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_LIST_ADJ)); ++ { ++ emit_tri_list_adj(b, v, N, buf, out_count, instance, value, ++ stride, offset_bytes); ++ } ++ nir_push_else(b, NULL); ++ { ++ /* TRI_STRIP_ADJ — last case */ ++ emit_tri_strip_adj(b, v, N, buf, out_count, instance, value, ++ stride, offset_bytes); ++ } ++ nir_pop_if(b, NULL); ++ } ++ nir_pop_if(b, NULL); ++ } ++ nir_pop_if(b, NULL); ++ } ++ nir_pop_if(b, NULL); ++ } ++ nir_pop_if(b, NULL); ++ } ++ nir_pop_if(b, NULL); ++ } ++ nir_pop_if(b, NULL); /* Janet Finding 3: close output_count > 0 guard */ ++ } ++ nir_pop_if(b, NULL); ++} ++ ++/* Mirror of pan_nir_lower_xfb's lower_xfb: load_vertex_id rewrite + ++ * dispatch store_output through our topology-aware emission. 
*/ ++static bool ++lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr, ++ UNUSED void *data) ++{ ++ if (intr->intrinsic == nir_intrinsic_load_vertex_id) { ++ b->cursor = nir_instr_remove(&intr->instr); ++ nir_def *repl = nir_iadd(b, nir_load_raw_vertex_id_pan(b), ++ nir_load_raw_vertex_offset_pan(b)); ++ nir_def_rewrite_uses(&intr->def, repl); ++ return true; ++ } ++ ++ if (intr->intrinsic != nir_intrinsic_store_output) ++ return false; ++ ++ bool progress = false; ++ b->cursor = nir_before_instr(&intr->instr); ++ ++ /* io_xfb has only out[0,1]; the other 2 channels are in io_xfb2. ++ * Outer loop selects which annotation; inner picks which channel. */ ++ for (unsigned i = 0; i < 2; ++i) { ++ nir_io_xfb xfb = i ? nir_intrinsic_io_xfb2(intr) ++ : nir_intrinsic_io_xfb(intr); ++ for (unsigned j = 0; j < 2; ++j) { ++ if (!xfb.out[j].num_components) ++ continue; ++ lower_xfb_output_iter17(b, intr, i * 2 + j, xfb.out[j].num_components, ++ xfb.out[j].buffer, xfb.out[j].offset); ++ progress = true; ++ } ++ } ++ ++ if (progress) ++ nir_instr_remove(&intr->instr); ++ return progress; ++} ++ ++bool ++panvk_per_arch(nir_lower_xfb)(nir_shader *nir) ++{ ++ return nir_shader_intrinsics_pass( ++ nir, lower_xfb_iter17, nir_metadata_control_flow, NULL); ++} ++ ++#endif /* PAN_ARCH < 9 */