initial seed: retrofit campaign lineage from local working trees

panvk-bifrost campaigns (r1..r4 Vulkan compositor + r5.video1 Vulkan
video decode) shipped before this repo existed; the deliverable
patches live in marfrit-packages, but the reasoning chain, phase docs,
and source-state evidence lived only in local working trees on the
development host.

This retrofit imports:
- mesa-panvk-bifrost/   — r1..r4 era phase docs (iter1..iter18)
                          (libmali stub blobs at iter18/blob/ excluded
                          — 109MB of RE artifacts replaced with a README
                          pointer)
- mesa-panvk-bifrost-video/ — sibling campaign phase docs + probe
- evidence/             — frozen .tgz source snapshots at each milestone
                          (basis for the 0005 patch diff generation)

Future iterations should branch off here from day one, so each iter is
a commit rather than a snapshot. See [[feedback-session-local-process-pins]]
for the process drift this retrofit closes.

Total: 1.9 MB across 124 files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 05:25:37 +02:00
parent 430d0da278
commit a4e7d8ab90
124 changed files with 22551 additions and 1 deletions
+39
View File
@@ -0,0 +1,39 @@
# iter13 XFB probe — build glue.
#
# Targets:
#   all               build both probe binaries and compile the vertex shader
#   run               run the draw probe with whatever ICD the loader selects
#   run-patched-mesa  run the draw probe against the panvk-bifrost ICD
#   clean             remove everything `all` produces
CC ?= cc
CFLAGS ?= -O0 -g -Wall -Wextra -std=c11
LDLIBS ?= -lvulkan

PROBE = probe_xfb
NOPROBE = probe_xfb_nodraw
SRC = probe_xfb.c
NOSRC = probe_xfb_nodraw.c
VERT = probe_xfb.vert
VSPV = probe_xfb.vert.spv

all: $(PROBE) $(NOPROBE) $(VSPV)

$(PROBE): $(SRC)
	$(CC) $(CFLAGS) -o $@ $< $(LDLIBS)

$(NOPROBE): $(NOSRC)
	$(CC) $(CFLAGS) -o $@ $< $(LDLIBS)

# glslangValidator + xfb-aware compile. The -V flag enables Vulkan SPIR-V output.
# xfb_buffer / xfb_offset / xfb_stride decorations are honored when the SPIR-V
# is targeted at Vulkan (which is the default for -V).
$(VSPV): $(VERT)
	glslangValidator -V $< -o $@

run: all
	PAN_I_WANT_A_BROKEN_VULKAN_DRIVER=1 ./$(PROBE)

run-patched-mesa: all
	VK_ICD_FILENAMES=/usr/lib/panvk-bifrost/icd.json \
	PAN_I_WANT_A_BROKEN_VULKAN_DRIVER=1 \
	./$(PROBE)

# Remove every artifact built by `all`, including the no-draw probe binary.
clean:
	rm -f $(PROBE) $(NOPROBE) $(VSPV)

.PHONY: all run run-patched-mesa clean
@@ -0,0 +1,484 @@
/*
* Copyright © 2021 Collabora Ltd.
*
* Derived from tu_cmd_buffer.c which is:
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* Copyright © 2015 Intel Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "genxml/gen_macros.h"
#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_draw.h"
#include "panvk_cmd_fb_preload.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"
#include "vk_descriptor_update_template.h"
#include "vk_format.h"
static VkResult
panvk_cmd_prepare_fragment_job(struct panvk_cmd_buffer *cmdbuf, uint64_t fbd)
{
const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct panvk_batch *batch = cmdbuf->cur_batch;
struct pan_ptr job_ptr = panvk_cmd_alloc_desc(cmdbuf, FRAGMENT_JOB);
if (!job_ptr.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
GENX(pan_emit_fragment_job_payload)(fbinfo, fbd, job_ptr.cpu);
pan_section_pack(job_ptr.cpu, FRAGMENT_JOB, HEADER, header) {
header.type = MALI_JOB_TYPE_FRAGMENT;
header.index = 1;
}
pan_jc_add_job(&batch->frag_jc, MALI_JOB_TYPE_FRAGMENT, false, false, 0, 0,
&job_ptr, false);
util_dynarray_append(&batch->jobs, job_ptr.cpu);
return VK_SUCCESS;
}
/* Finalize the batch currently being recorded and detach it from cmdbuf.
 *
 * Does nothing when no batch is open. Otherwise:
 *  - a batch with no framebuffer descriptor and no job in vtc_jc is either
 *    freed (no event ops) or given a single NULL job and queued (event ops
 *    present);
 *  - any other batch is appended to cmdbuf->batches, has its TLS/WLS memory
 *    allocated and its TLS descriptor emitted, and, if a framebuffer
 *    descriptor was allocated, gets one FBD plus one fragment job per enabled
 *    layer.
 *
 * The function returns void, so the allocation/preload failures handled below
 * are not reported to the caller from here.
 */
void
panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_batch *batch = cmdbuf->cur_batch;
if (!batch)
return;
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
/* Always true here: the NULL case returned above. */
assert(batch);
if (!batch->fb.desc.gpu && !batch->vtc_jc.first_job) {
if (util_dynarray_num_elements(&batch->event_ops,
struct panvk_cmd_event_op) == 0) {
/* Content-less batch, let's drop it */
/* NOTE(review): jobs/event_ops are not util_dynarray_fini()'d on this
 * path, unlike the reset/destroy paths; event_ops is empty here — confirm
 * jobs cannot own an allocation at this point. */
vk_free(&cmdbuf->vk.pool->alloc, batch);
} else {
/* Batch has no jobs but is needed for synchronization, let's add a
* NULL job so the SUBMIT ioctl doesn't choke on it.
*/
struct pan_ptr ptr = panvk_cmd_alloc_desc(cmdbuf, JOB_HEADER);
/* If the allocation failed the batch is still queued, just without
 * the NULL job. */
if (ptr.gpu) {
util_dynarray_append(&batch->jobs, ptr.cpu);
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_NULL, false, false, 0,
0, &ptr, false);
}
list_addtail(&batch->node, &cmdbuf->batches);
}
cmdbuf->cur_batch = NULL;
return;
}
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
/* From here on the batch is owned by the cmdbuf->batches list. */
list_addtail(&batch->node, &cmdbuf->batches);
if (batch->tlsinfo.tls.size) {
unsigned thread_tls_alloc =
pan_query_thread_tls_alloc(&phys_dev->kmod.dev->props);
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
unsigned size = pan_get_total_stack_size(batch->tlsinfo.tls.size,
thread_tls_alloc, core_id_range);
/* The .gpu result is stored unchecked; a failed allocation is not
 * detected here. */
batch->tlsinfo.tls.ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
}
if (batch->tlsinfo.wls.size) {
assert(batch->wls_total_size);
batch->tlsinfo.wls.ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, tls, batch->wls_total_size, 4096).gpu;
}
if (batch->tls.cpu)
GENX(pan_emit_tls)(&batch->tlsinfo, batch->tls.cpu);
if (batch->fb.desc.cpu) {
panvk_per_arch(cmd_select_tile_size)(cmdbuf);
/* At this point, we should know sample count and the tile size should have
* been calculated */
assert(fbinfo->nr_samples > 0 && fbinfo->tile_size > 0);
fbinfo->sample_positions =
dev->sample_positions->addr.dev +
pan_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
fbinfo->first_provoking_vertex =
cmdbuf->state.gfx.render.first_provoking_vertex != U_TRISTATE_NO;
VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, fbinfo);
/* NOTE(review): this early return leaves cmdbuf->cur_batch pointing at a
 * batch that is already linked into cmdbuf->batches — confirm callers
 * tolerate that (a second close would list_addtail the same node). */
if (result != VK_SUCCESS)
return;
uint32_t view_mask = cmdbuf->state.gfx.render.view_mask;
assert(view_mask == 0 || util_bitcount(view_mask) <= batch->fb.layer_count);
/* With multiview, one FBD per set bit in view_mask; otherwise one per
 * layer. */
uint32_t enabled_layer_count = view_mask ?
util_bitcount(view_mask) :
batch->fb.layer_count;
for (uint32_t i = 0; i < enabled_layer_count; i++) {
/* u_bit_scan() consumes the lowest set bit of the local view_mask copy. */
uint32_t layer_id = (view_mask != 0) ? u_bit_scan(&view_mask) : i;
/* Shadows the outer 'result'; failures below only stop the loop. */
VkResult result;
uint64_t fbd = batch->fb.desc.gpu + (batch->fb.desc_stride * layer_id);
result = panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, layer_id);
if (result != VK_SUCCESS)
break;
/* The value returned by pan_emit_fbd is OR'ed into the FBD address
 * before it is handed to the fragment job. */
fbd |= GENX(pan_emit_fbd)(
&cmdbuf->state.gfx.render.fb.info, layer_id, &batch->tlsinfo,
&batch->tiler.ctx,
batch->fb.desc.cpu + (batch->fb.desc_stride * layer_id));
result = panvk_cmd_prepare_fragment_job(cmdbuf, fbd);
if (result != VK_SUCCESS)
break;
}
}
cmdbuf->cur_batch = NULL;
}
VkResult
panvk_per_arch(cmd_alloc_fb_desc)(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_batch *batch = cmdbuf->cur_batch;
if (batch->fb.desc.gpu)
return VK_SUCCESS;
const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
bool has_zs_ext = fbinfo->zs.view.zs || fbinfo->zs.view.s;
batch->fb.layer_count = cmdbuf->state.gfx.render.layer_count;
unsigned fbd_size = pan_size(FRAMEBUFFER);
if (has_zs_ext)
fbd_size = ALIGN_POT(fbd_size, pan_alignment(ZS_CRC_EXTENSION)) +
pan_size(ZS_CRC_EXTENSION);
fbd_size = ALIGN_POT(fbd_size, pan_alignment(RENDER_TARGET)) +
(MAX2(fbinfo->rt_count, 1) * pan_size(RENDER_TARGET));
batch->fb.bo_count = cmdbuf->state.gfx.render.fb.bo_count;
memcpy(batch->fb.bos, cmdbuf->state.gfx.render.fb.bos,
batch->fb.bo_count * sizeof(batch->fb.bos[0]));
batch->fb.desc =
panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbd_size * batch->fb.layer_count,
pan_alignment(FRAMEBUFFER));
batch->fb.desc_stride = fbd_size;
memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0,
sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds));
return batch->fb.desc.gpu ? VK_SUCCESS : VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
/* Make sure the current batch owns a LOCAL_STORAGE descriptor, allocating
 * one on first use. The 'gfx' argument is not consulted by this
 * implementation.
 *
 * Returns VK_ERROR_OUT_OF_DEVICE_MEMORY when the allocation fails.
 */
VkResult
panvk_per_arch(cmd_alloc_tls_desc)(struct panvk_cmd_buffer *cmdbuf, bool gfx)
{
   struct panvk_batch *batch = cmdbuf->cur_batch;

   assert(batch);

   /* Already allocated for this batch: nothing to do. */
   if (batch->tls.gpu)
      return VK_SUCCESS;

   batch->tls = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);

   return batch->tls.gpu ? VK_SUCCESS : VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
/* Point batch->tiler.ctx at the tiler context descriptor of layer
 * 'layer_idx'.
 *
 * On the first call for a batch this allocates one TILER_HEAP descriptor and
 * an array of layer_count TILER_CONTEXT descriptors, packs a heap template
 * and a context template, and copies the context template into every array
 * entry. Later calls only recompute the per-layer descriptor address.
 *
 * Returns VK_ERROR_OUT_OF_DEVICE_MEMORY if either allocation fails,
 * VK_SUCCESS otherwise.
 */
VkResult
panvk_per_arch(cmd_prepare_tiler_context)(struct panvk_cmd_buffer *cmdbuf,
uint32_t layer_idx)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_batch *batch = cmdbuf->cur_batch;
uint64_t tiler_desc;
/* Fast path: descriptors already exist, just select the layer's entry. */
if (batch->tiler.ctx_descs.gpu) {
tiler_desc =
batch->tiler.ctx_descs.gpu + (pan_size(TILER_CONTEXT) * layer_idx);
goto out_set_layer_ctx;
}
const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
uint32_t layer_count = cmdbuf->state.gfx.render.layer_count;
batch->tiler.heap_desc = panvk_cmd_alloc_desc(cmdbuf, TILER_HEAP);
batch->tiler.ctx_descs =
panvk_cmd_alloc_desc_array(cmdbuf, layer_count, TILER_CONTEXT);
if (!batch->tiler.heap_desc.gpu || !batch->tiler.ctx_descs.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
tiler_desc =
batch->tiler.ctx_descs.gpu + (pan_size(TILER_CONTEXT) * layer_idx);
/* Heap template: describes the whole device tiler heap BO. */
pan_pack(&batch->tiler.heap_templ, TILER_HEAP, cfg) {
cfg.size = pan_kmod_bo_size(dev->tiler_heap->bo);
cfg.base = dev->tiler_heap->addr.dev;
cfg.bottom = dev->tiler_heap->addr.dev;
cfg.top = cfg.base + cfg.size;
}
/* Context template: shared by every layer, references the heap descriptor
 * allocated above. */
pan_pack(&batch->tiler.ctx_templ, TILER_CONTEXT, cfg) {
cfg.hierarchy_mask = panvk_select_tiler_hierarchy_mask(
phys_dev, &cmdbuf->state.gfx, pan_kmod_bo_size(dev->tiler_heap->bo));
cfg.fb_width = fbinfo->width;
cfg.fb_height = fbinfo->height;
cfg.heap = batch->tiler.heap_desc.gpu;
cfg.sample_pattern = pan_sample_pattern(fbinfo->nr_samples);
}
memcpy(batch->tiler.heap_desc.cpu, &batch->tiler.heap_templ,
sizeof(batch->tiler.heap_templ));
struct mali_tiler_context_packed *ctxs = batch->tiler.ctx_descs.cpu;
assert(layer_count > 0);
for (uint32_t i = 0; i < layer_count; i++) {
/* The descriptor size must be a multiple of its alignment so that array
 * entries addressed as ctxs[i] stay aligned. */
STATIC_ASSERT(
!(pan_size(TILER_CONTEXT) & (pan_alignment(TILER_CONTEXT) - 1)));
memcpy(&ctxs[i], &batch->tiler.ctx_templ, sizeof(*ctxs));
}
out_set_layer_ctx:
/* The descriptor address lives in a different member depending on the
 * architecture generation. */
if (PAN_ARCH >= 9)
batch->tiler.ctx.valhall.desc = tiler_desc;
else
batch->tiler.ctx.bifrost.desc = tiler_desc;
return VK_SUCCESS;
}
struct panvk_batch *
panvk_per_arch(cmd_open_batch)(struct panvk_cmd_buffer *cmdbuf)
{
assert(!cmdbuf->cur_batch);
cmdbuf->cur_batch =
vk_zalloc(&cmdbuf->vk.pool->alloc, sizeof(*cmdbuf->cur_batch), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
cmdbuf->cur_batch->jobs = UTIL_DYNARRAY_INIT;
cmdbuf->cur_batch->event_ops = UTIL_DYNARRAY_INIT;
assert(cmdbuf->cur_batch);
return cmdbuf->cur_batch;
}
/* vkEndCommandBuffer: close the batch still being recorded (if any), flush
 * the descriptor pool mappings, and let the common runtime finish the
 * command buffer. The runtime's result is returned unchanged.
 */
VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   panvk_per_arch(cmd_close_batch)(cmdbuf);
   panvk_pool_flush_maps(&cmdbuf->desc_pool);

   VkResult end_result = vk_command_buffer_end(&cmdbuf->vk);

   return end_result;
}
/* vkCmdPipelineBarrier2: implemented by splitting the current batch and then
 * running the image layout transitions listed in pDependencyInfo.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                    const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   /* Caches are flushed/invalidated at batch boundaries for now, nothing to do
    * for memory barriers assuming we implement barriers with the creation of a
    * new batch.
    * FIXME: We can probably do better with a CacheFlush job that has the
    * barrier flag set to true.
    */
   struct panvk_batch *open_batch = cmdbuf->cur_batch;

   if (open_batch != NULL) {
      /* Sample this before closing: the close may free the batch. */
      const bool preload_fb = open_batch->vtc_jc.first_tiler ? true : false;

      panvk_per_arch(cmd_close_batch)(cmdbuf);

      if (preload_fb)
         panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf);

      panvk_per_arch(cmd_open_batch)(cmdbuf);
   }

   const uint32_t image_barrier_count =
      pDependencyInfo->imageMemoryBarrierCount;
   const VkImageMemoryBarrier2 *image_barriers =
      pDependencyInfo->pImageMemoryBarriers;

   for (uint32_t idx = 0; idx < image_barrier_count; idx++) {
      panvk_per_arch(cmd_transition_image_layout)(commandBuffer,
                                                  &image_barriers[idx]);
   }

   /* If we had any layout transition dispatches, the batch will be closed at
    * this point, therefore establishing the sync between itself and the
    * commands that follow.
    */
}
/* vk_command_buffer_ops::reset hook: reset the common runtime state, free
 * every queued batch, reset the three memory pools and the push-set object
 * list, and zero the recorded state. 'flags' is not consulted.
 */
static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
                   VkCommandBufferResetFlags flags)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);

   vk_command_buffer_reset(&cmdbuf->vk);

   /* Unlink and release each batch together with its dynamic arrays. */
   list_for_each_entry_safe(struct panvk_batch, cur, &cmdbuf->batches, node) {
      list_del(&cur->node);
      util_dynarray_fini(&cur->jobs);
      util_dynarray_fini(&cur->event_ops);
      vk_free(&cmdbuf->vk.pool->alloc, cur);
   }

   panvk_pool_reset(&cmdbuf->desc_pool);
   panvk_pool_reset(&cmdbuf->tls_pool);
   panvk_pool_reset(&cmdbuf->varying_pool);
   panvk_cmd_buffer_obj_list_reset(cmdbuf, push_sets);

   memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
}
/* vk_command_buffer_ops::destroy hook: free every queued batch, tear down
 * the three memory pools and the push-set object list, finish the common
 * runtime object, and release the command buffer allocation itself.
 */
static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_device *owner = to_panvk_device(cmdbuf->vk.base.device);

   /* Unlink and release each batch together with its dynamic arrays. */
   list_for_each_entry_safe(struct panvk_batch, cur, &cmdbuf->batches, node) {
      list_del(&cur->node);
      util_dynarray_fini(&cur->jobs);
      util_dynarray_fini(&cur->event_ops);
      vk_free(&cmdbuf->vk.pool->alloc, cur);
   }

   panvk_pool_cleanup(&cmdbuf->desc_pool);
   panvk_pool_cleanup(&cmdbuf->tls_pool);
   panvk_pool_cleanup(&cmdbuf->varying_pool);
   panvk_cmd_buffer_obj_list_cleanup(cmdbuf, push_sets);

   /* The device pointer was captured above, before the runtime object is
    * finished. */
   vk_command_buffer_finish(&cmdbuf->vk);
   vk_free(&owner->vk.alloc, cmdbuf);
}
/* vk_command_buffer_ops::create hook: allocate and initialize a panvk
 * command buffer belonging to 'vk_pool'.
 *
 * On success *cmdbuf_out receives the embedded vk_command_buffer and
 * VK_SUCCESS is returned. On failure nothing is left allocated and
 * *cmdbuf_out is not written.
 */
static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
                    struct vk_command_buffer **cmdbuf_out)
{
   struct panvk_device *dev =
      container_of(vk_pool->base.device, struct panvk_device, vk);
   struct panvk_cmd_pool *cmd_pool =
      container_of(vk_pool, struct panvk_cmd_pool, vk);

   struct panvk_cmd_buffer *cmdbuf =
      vk_zalloc(&dev->vk.alloc, sizeof(*cmdbuf), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmdbuf == NULL)
      return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult init_result = vk_command_buffer_init(
      &cmd_pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
   if (init_result != VK_SUCCESS) {
      vk_free(&dev->vk.alloc, cmdbuf);
      return init_result;
   }

   panvk_cmd_buffer_obj_list_init(cmdbuf, push_sets);

   /* Point the runtime's dynamic state at storage owned by this command
    * buffer. */
   cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
   cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
      &cmdbuf->state.gfx.dynamic.sl;

   /* Descriptor pool: CPU-mappable, preallocated. */
   struct panvk_pool_properties desc_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_WB_MMAP),
      .slab_size = 64 * 1024,
      .label = "Command buffer descriptor pool",
      .prealloc = true,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->desc_pool, dev, &cmd_pool->desc_bo_pool, NULL,
                   &desc_props);

   /* TLS pool: not CPU-mapped, additionally backed by a big-BO pool. */
   struct panvk_pool_properties tls_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP),
      .slab_size = 64 * 1024,
      .label = "TLS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->tls_pool, dev, &cmd_pool->tls_bo_pool,
                   &cmd_pool->tls_big_bo_pool, &tls_props);

   /* Varying pool: not CPU-mapped. */
   struct panvk_pool_properties varying_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP),
      .slab_size = 64 * 1024,
      .label = "Varying pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->varying_pool, dev, &cmd_pool->varying_bo_pool,
                   NULL, &varying_props);

   list_inithead(&cmdbuf->batches);

   *cmdbuf_out = &cmdbuf->vk;
   return VK_SUCCESS;
}
/* Command buffer lifecycle hooks handed to vk_command_buffer_init() by
 * panvk_create_cmdbuf(). */
const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
   .destroy = panvk_destroy_cmdbuf,
   .reset = panvk_reset_cmdbuf,
   .create = panvk_create_cmdbuf,
};
/* vkBeginCommandBuffer: hand off to the common runtime, then clear the
 * transform feedback state on architectures that have it. Always returns
 * VK_SUCCESS.
 */
VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
                                   const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);

#if PAN_ARCH < 9
   /* iter13: start every recording with clean XFB state so that a reused
    * command buffer cannot see xfb.buffer_count / xfb.active /
    * xfb.buffers[] left over from an earlier recording. */
   struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;

   memset(&gfx->xfb, 0, sizeof(gfx->xfb));
#endif

   return VK_SUCCESS;
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,111 @@
/*
* Copyright © 2026 mfritsche / claude-noether
* SPDX-License-Identifier: MIT
*
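* iter13: VK_EXT_transform_feedback command handlers for the JM
* architecture path. The xfb state these handlers use is only declared for
* PAN_ARCH < 9 and the JM file list is built for v6/v7 (Bifrost), so
* Valhall-JM v9 is not covered by this file yet.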
*
* The runtime contract:
* - vkCmdBindTransformFeedbackBuffersEXT: stash (gpu_addr, offset, size)
* for each slot into cmdbuf->state.gfx.xfb.buffers[].
* - vkCmdBeginTransformFeedbackEXT: set cmdbuf->state.gfx.xfb.active = true.
* No explicit dirty marking is done here; the per-draw set_gfx_sysval
* path detects the changed values and re-emits vs.xfb_address[].
* - vkCmdEndTransformFeedbackEXT: set active = false.
*
* Counter buffers (firstCounterBuffer/counterBufferCount/pCounterBuffers/
* pCounterBufferOffsets) are accepted by API but ignored — v1 doesn't
* support pause/resume. transformFeedbackDraw is advertised as false.
*
* Per-draw integration: jm/panvk_vX_cmd_draw.c reads cmdbuf->state.gfx.xfb
* and populates vs.xfb_address[i] for shader use. The pan_nir_lower_xfb
* pass in panvk_vX_shader.c emits nir_load_xfb_address(i) which lowers
* (via panvk_vX_shader.c sysval handler) to a load from the per-draw
* sysval push area.
*/
#include "vk_log.h"
#include "util/log.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_draw.h"
#include "panvk_buffer.h"
#include "panvk_entrypoints.h"
/* vkCmdBindTransformFeedbackBuffersEXT: record (GPU base address, offset,
 * size) for each bound transform feedback buffer in
 * cmdbuf->state.gfx.xfb.buffers[].
 *
 * Bindings whose slot does not fit in xfb.buffers[] are ignored. A NULL
 * pSizes, or a VK_WHOLE_SIZE entry, means "from the offset to the end of the
 * buffer".
 *
 * xfb.buffer_count is raised to cover the highest slot addressed by this
 * call, but never beyond the capacity of xfb.buffers[]: the original stored
 * firstBinding + bindingCount unclamped, so any consumer indexing
 * buffers[0 .. buffer_count) could run past the array.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBindTransformFeedbackBuffersEXT)(
   VkCommandBuffer commandBuffer,
   uint32_t firstBinding,
   uint32_t bindingCount,
   const VkBuffer *pBuffers,
   const VkDeviceSize *pOffsets,
   const VkDeviceSize *pSizes)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
   const uint64_t max_slots =
      sizeof(gfx->xfb.buffers) / sizeof(gfx->xfb.buffers[0]);

   /* 64-bit arithmetic so firstBinding + bindingCount cannot wrap around and
    * alias a low slot. */
   const uint64_t requested_end = (uint64_t)firstBinding + bindingCount;
   const uint64_t bound_end =
      requested_end < max_slots ? requested_end : max_slots;

   for (uint64_t slot = firstBinding; slot < bound_end; slot++) {
      const uint32_t i = (uint32_t)(slot - firstBinding);

      VK_FROM_HANDLE(panvk_buffer, buf, pBuffers[i]);

      /* The base address and the application offset are kept separately;
       * the address is taken at offset 0 of the buffer. */
      gfx->xfb.buffers[slot].addr = panvk_buffer_gpu_ptr(buf, 0);
      gfx->xfb.buffers[slot].offset = pOffsets[i];
      gfx->xfb.buffers[slot].size =
         (pSizes != NULL && pSizes[i] != VK_WHOLE_SIZE)
            ? pSizes[i]
            : (buf->vk.size - pOffsets[i]);
   }

   if (bound_end > gfx->xfb.buffer_count)
      gfx->xfb.buffer_count = (uint32_t)bound_end;
}
/* vkCmdBeginTransformFeedbackEXT: flag transform feedback as active for
 * subsequent draws. Counter buffers are accepted but not used; passing any
 * only triggers a warning.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBeginTransformFeedbackEXT)(
   VkCommandBuffer commandBuffer,
   uint32_t firstCounterBuffer,
   uint32_t counterBufferCount,
   const VkBuffer *pCounterBuffers,
   const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   (void)firstCounterBuffer;
   (void)pCounterBufferOffsets;

   /* Counter buffers ignored in v1 — see VkPhysicalDeviceTransformFeedback
    * PropertiesEXT.transformFeedbackDraw = false in panvk_vX_physical_device.c.
    * App is spec-compliant if it does not pass counter buffers (which our
    * features advertisement allows), but warn loudly if it does so we do not
    * silently produce wrong capture state. */
   const bool has_counters = pCounterBuffers != NULL && counterBufferCount > 0;

   if (has_counters) {
      mesa_logw("panvk: CmdBeginTransformFeedbackEXT: counter buffers not "
                "implemented (transformFeedbackDraw=false); XFB resume will "
                "restart at buffer offset 0");
   }

   /* Per-draw set_gfx_sysval picks up the change automatically — no
    * explicit dirty marking required (set_gfx_sysval uses memcmp +
    * BITSET to detect state diffs and re-emit sysvals). */
   cmdbuf->state.gfx.xfb.active = true;
}
/* vkCmdEndTransformFeedbackEXT: flag transform feedback as inactive. All
 * counter buffer arguments are ignored.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdEndTransformFeedbackEXT)(
   VkCommandBuffer commandBuffer,
   uint32_t firstCounterBuffer,
   uint32_t counterBufferCount,
   const VkBuffer *pCounterBuffers,
   const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   /* Nothing is written back to counter buffers in this implementation. */
   (void)pCounterBufferOffsets;
   (void)pCounterBuffers;
   (void)counterBufferCount;
   (void)firstCounterBuffer;

   cmdbuf->state.gfx.xfb.active = false;
}
@@ -0,0 +1,275 @@
# Copyright © 2021 Collabora Ltd.
#
# Derived from the freedreno driver which is:
# Copyright © 2017 Intel Corporation
# SPDX-License-Identifier: MIT
# Generate panvk_entrypoints.{h,c} from the Vulkan API XML with the shared
# entrypoint generator script. One --device-prefix is passed for each per-arch
# symbol prefix (panvk_v6 ... panvk_v13) in addition to the common 'panvk'
# prefix; --proto and --weak are forwarded to the generator as-is.
panvk_entrypoints = custom_target(
'panvk_entrypoints.[ch]',
input : [vk_entrypoints_gen, vk_api_xml],
output : ['panvk_entrypoints.h', 'panvk_entrypoints.c'],
command : [
prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak',
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'panvk',
'--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7',
'--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10',
'--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13',
'--beta', with_vulkan_beta.to_string()
],
depend_files : vk_entrypoints_gen_depend_files,
)
# Generate the tracepoint sources from panvk_tracepoints.py. Output order
# matters: @OUTPUT0@ is the utrace header, @OUTPUT1@ the perfetto header and
# @OUTPUT2@ the utrace C source. The script imports helpers from
# src/util/perf/ via --import-path.
panvk_tracepoints = custom_target(
'panvk_tracepoints.[ch]',
input: 'panvk_tracepoints.py',
output: ['panvk_tracepoints.h',
'panvk_tracepoints_perfetto.h',
'panvk_tracepoints.c'],
command: [
prog_python, '@INPUT@',
'--import-path', join_paths(dir_source_root, 'src/util/perf/'),
'--utrace-hdr', '@OUTPUT0@',
'--perfetto-hdr', '@OUTPUT1@',
'--utrace-src', '@OUTPUT2@',
],
depend_files: u_trace_py,
)
# Sources compiled once into the final shared library (as opposed to the
# per-arch lists below, which are compiled once per PAN_ARCH value).
libpanvk_files = files(
'panvk_buffer.c',
'panvk_cmd_pool.c',
'panvk_device_memory.c',
'panvk_host_copy.c',
'panvk_image.c',
'panvk_instance.c',
'panvk_mempool.c',
'panvk_physical_device.c',
'panvk_priv_bo.c',
'panvk_sparse.c',
'panvk_utrace.c',
'panvk_wsi.c',
)
# sha1_h is defined outside this file; it is appended so the library depends
# on it.
libpanvk_files += [sha1_h]
# Accumulators filled in further down: extra dependencies, extra C flags and
# the list of per-arch static libraries.
panvk_deps = []
panvk_flags = []
panvk_per_arch_libs = []
# Hardware generation tables: which PAN_ARCH values belong to each
# generation, the extra include directory they use, and generation-specific
# sources. The per-arch loop below picks at most one generation per arch.
bifrost_archs = [6, 7]
bifrost_inc_dir = ['bifrost']
bifrost_files = [
'bifrost/panvk_vX_meta_desc_copy.c',
]
valhall_archs = [9, 10]
valhall_inc_dir = ['valhall']
valhall_files = []
fifthgen_archs = [12, 13]
fifthgen_inc_dir = ['fifthgen']
fifthgen_files = []
# Submission backend tables, orthogonal to the generation tables above: an
# arch is built with either the 'jm' or the 'csf' source set.
# The transform feedback handlers (panvk_vX_cmd_xfb.c) are only part of the
# 'jm' set.
jm_archs = [6, 7]
jm_inc_dir = ['jm']
jm_files = [
'jm/panvk_vX_bind_queue.c',
'jm/panvk_vX_cmd_xfb.c', # iter13
'jm/panvk_vX_cmd_buffer.c',
'jm/panvk_vX_cmd_dispatch.c',
'jm/panvk_vX_cmd_draw.c',
'jm/panvk_vX_cmd_event.c',
'jm/panvk_vX_cmd_query.c',
'jm/panvk_vX_cmd_precomp.c',
'jm/panvk_vX_event.c',
'jm/panvk_vX_gpu_queue.c',
]
csf_archs = [10, 12, 13]
csf_inc_dir = ['csf']
csf_files = [
'csf/panvk_vX_bind_queue.c',
'csf/panvk_vX_cmd_buffer.c',
'csf/panvk_vX_cmd_dispatch.c',
'csf/panvk_vX_cmd_draw.c',
'csf/panvk_vX_cmd_event.c',
'csf/panvk_vX_cmd_query.c',
'csf/panvk_vX_cmd_precomp.c',
'csf/panvk_vX_event.c',
'csf/panvk_vX_exception_handler.c',
'csf/panvk_vX_gpu_queue.c',
'csf/panvk_vX_instr.c',
'csf/panvk_vX_utrace.c',
]
# Sources compiled for every arch regardless of generation or backend. The
# first two entries are the generated headers (output index 0 of the
# entrypoint and tracepoint targets), listed so each per-arch library
# depends on them.
common_per_arch_files = [
panvk_entrypoints[0],
panvk_tracepoints[0],
'panvk_vX_blend.c',
'panvk_vX_buffer_view.c',
'panvk_vX_cmd_fb_preload.c',
'panvk_vX_cmd_desc_state.c',
'panvk_vX_cmd_dispatch.c',
'panvk_vX_cmd_draw.c',
'panvk_vX_cmd_meta.c',
'panvk_vX_cmd_push_constant.c',
'panvk_vX_descriptor_set.c',
'panvk_vX_descriptor_set_layout.c',
'panvk_vX_device.c',
'panvk_vX_physical_device.c',
'panvk_vX_precomp_cache.c',
'panvk_vX_query_pool.c',
'panvk_vX_image_view.c',
'panvk_vX_nir_lower_descriptors.c',
'panvk_vX_nir_lower_input_attachment_loads.c',
'panvk_vX_sampler.c',
'panvk_vX_shader.c',
sha1_h,
]
# Build one static library per supported arch. Each library gets the common
# sources plus the sources/include dir of its generation (bifrost, valhall or
# fifthgen) and of its backend (jm or csf), and is compiled with
# -DPAN_ARCH=<arch>.
# Note: arch 9 appears in valhall_archs and in the entrypoint prefixes but is
# not in this list, so no panvk_v9 library is built here.
foreach arch : [6, 7, 10, 12, 13]
per_arch_files = common_per_arch_files
inc_panvk_per_arch = []
if arch in bifrost_archs
inc_panvk_per_arch += bifrost_inc_dir
per_arch_files += bifrost_files
elif arch in valhall_archs
inc_panvk_per_arch += valhall_inc_dir
per_arch_files += valhall_files
elif arch in fifthgen_archs
inc_panvk_per_arch += fifthgen_inc_dir
per_arch_files += fifthgen_files
endif
if arch in jm_archs
inc_panvk_per_arch += jm_inc_dir
per_arch_files += jm_files
elif arch in csf_archs
inc_panvk_per_arch += csf_inc_dir
per_arch_files += csf_files
endif
panvk_per_arch_libs += static_library(
'panvk_v@0@'.format(arch),
per_arch_files,
include_directories : [
inc_include,
inc_src,
inc_panfrost,
inc_panvk_per_arch,
],
dependencies : [
idep_nir_headers,
idep_pan_packers,
idep_vulkan_util_headers,
idep_vulkan_runtime_headers,
idep_vulkan_wsi_headers,
idep_mesautil,
dep_libdrm,
dep_valgrind,
idep_libpan_per_arch[arch.to_string()],
],
c_args : [no_override_init_args, panvk_flags, '-DPAN_ARCH=@0@'.format(arch)],
gnu_symbol_visibility : 'hidden',
)
endforeach
# Optional pieces, each gated on a build option defined outside this file:
# perfetto tracing adds a dependency and a C++ source, Wayland adds a
# dependency, Android adds a source file.
if with_perfetto
panvk_deps += dep_perfetto
libpanvk_files += ['panvk_utrace_perfetto.cc']
endif
if with_platform_wayland
panvk_deps += dep_wayland_client
endif
if with_platform_android
libpanvk_files += files('panvk_android.c')
endif
# The installed driver library. The per-arch static libraries are pulled in
# with link_whole, so all of their objects end up in the shared library; the
# panfrost libraries are linked with link_with.
libvulkan_panfrost = shared_library(
'vulkan_panfrost',
[libpanvk_files, panvk_entrypoints, panvk_tracepoints],
include_directories : [
inc_include,
inc_src,
inc_panfrost,
],
link_whole : [panvk_per_arch_libs],
link_with : [
libpanfrost_shared,
libpanfrost_decode,
libpanfrost_lib,
libpanfrost_compiler,
],
dependencies : [
dep_dl,
dep_elf,
dep_libdrm,
dep_m,
dep_thread,
dep_valgrind,
idep_nir,
idep_pan_packers,
panvk_deps,
idep_vulkan_util,
idep_vulkan_runtime,
idep_vulkan_wsi,
idep_mesautil,
],
c_args : [no_override_init_args, panvk_flags],
link_args : [vulkan_icd_link_args, ld_args_bsymbolic, ld_args_gc_sections, ld_args_build_id],
gnu_symbol_visibility : 'hidden',
install : true,
)
# When symbol checking is enabled, register a test in the 'panfrost' suite
# that runs the symbols_check tool on the shared library against the
# vulkan_icd_symbols file.
if with_symbols_check
test(
'panvk symbols check',
symbols_check,
args : [
'--lib', libvulkan_panfrost,
'--symbols-file', vulkan_icd_symbols,
symbols_check_args,
],
suite : ['panfrost'],
)
endif
# File name of the shared library as referenced from the ICD manifests.
icd_file_name = libname_prefix + 'vulkan_panfrost.' + libname_suffix
# Installed ICD manifest: generated with the ICD generator script, declares
# API version 1.4 and points at the library under vulkan_icd_lib_path.
panfrost_icd = custom_target(
'panfrost_icd',
input : [vk_icd_gen, vk_api_xml],
output : 'panfrost_icd.' + vulkan_manifest_suffix,
command : [
prog_python, '@INPUT0@',
'--api-version', '1.4', '--xml', '@INPUT1@',
'--sizeof-pointer', sizeof_pointer,
'--icd-lib-path', vulkan_icd_lib_path,
'--icd-filename', icd_file_name,
'--out', '@OUTPUT@',
],
build_by_default : true,
install_dir : with_vulkan_icd_dir,
install_tag : 'runtime',
install : true,
)
# Development-environment ICD manifest: same generator and API version as the
# installed manifest, but it points at the library in the build directory and
# is not installed. Its path is appended to VK_DRIVER_FILES in the meson
# devenv.
_dev_icdname = 'panfrost_devenv_icd.@0@.json'.format(host_machine.cpu())
_dev_icd = custom_target(
'panfrost_devenv_icd',
input : [vk_icd_gen, vk_api_xml],
output : _dev_icdname,
command : [
prog_python, '@INPUT0@',
'--api-version', '1.4', '--xml', '@INPUT1@',
'--sizeof-pointer', sizeof_pointer,
'--icd-lib-path', meson.current_build_dir(),
'--icd-filename', icd_file_name,
'--out', '@OUTPUT@',
],
build_by_default : true,
)
devenv.append('VK_DRIVER_FILES', _dev_icd.full_path())
@@ -0,0 +1,501 @@
/*
* Copyright © 2024 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_CMD_DRAW_H
#define PANVK_CMD_DRAW_H
#ifndef PAN_ARCH
#error "PAN_ARCH must be defined"
#endif
#include "panvk_blend.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_query.h"
#include "panvk_entrypoints.h"
#include "panvk_image.h"
#include "panvk_image_view.h"
#include "panvk_physical_device.h"
#include "panvk_shader.h"
#include "vk_command_buffer.h"
#include "vk_format.h"
#include "util/u_tristate.h"
#include "pan_props.h"
#define MAX_VBS 16
struct panvk_cmd_buffer;
/* One entry of panvk_cmd_graphics_state::vb.bufs[] (MAX_VBS entries).
 * NOTE(review): presumably a bound vertex buffer range, with 'address' a GPU
 * address and 'size' in bytes — confirm against the code that fills it. */
struct panvk_attrib_buf {
uint64_t address;
unsigned size;
};
/* Resolve information stored per color attachment and for the depth and
 * stencil attachments in panvk_rendering_state: the resolve mode and the
 * image view to resolve into. */
struct panvk_resolve_attachment {
VkResolveModeFlagBits mode;
struct panvk_image_view *dst_iview;
};
/* Render pass state kept in panvk_cmd_graphics_state::render. The layout
 * depends on PAN_ARCH: the BO list inside 'fb' exists only for PAN_ARCH < 9,
 * and the trailing block (fbds, tiler, occlusion query chain, ...) only for
 * PAN_ARCH >= 10. */
struct panvk_rendering_state {
VkRenderingFlags flags;
uint32_t layer_count;
/* Non-zero enables per-view handling: cmd_close_batch() emits one
 * framebuffer descriptor per set bit instead of one per layer. */
uint32_t view_mask;
/* Anything other than U_TRISTATE_NO is treated as "first vertex" when the
 * framebuffer info is finalized in cmd_close_batch(). */
enum u_tristate first_provoking_vertex;
enum vk_rp_attachment_flags bound_attachments;
/* Color attachments, indexed by render target. */
struct {
struct panvk_image_view *iviews[MAX_RTS];
/* If non-null, preload_iviews[i] overrides iviews[i] for preloads. */
struct panvk_image_view *preload_iviews[MAX_RTS];
VkFormat fmts[MAX_RTS];
uint8_t samples[MAX_RTS];
struct panvk_resolve_attachment resolve[MAX_RTS];
} color_attachments;
struct pan_image_view zs_pview;
struct pan_image_view s_pview;
/* Depth and stencil attachments share one anonymous struct type. */
struct {
struct panvk_image_view *iview;
/* If non-null, preload_iview overrides iview for preloads. */
struct panvk_image_view *preload_iview;
VkFormat fmt;
struct panvk_resolve_attachment resolve;
} z_attachment, s_attachment;
struct {
struct pan_fb_info info;
bool crc_valid[MAX_RTS];
/* nr_samples to be used before framebuffer / tiler descriptor are emitted */
uint32_t nr_samples;
#if PAN_ARCH < 9
/* Copied into the batch (batch->fb.bos / bo_count) by
 * cmd_alloc_fb_desc(). */
uint32_t bo_count;
struct pan_kmod_bo *bos[(MAX_RTS * PANVK_MAX_PLANES) + 2];
#endif
} fb;
#if PAN_ARCH >= 10
struct pan_ptr fbds;
uint64_t tiler;
/* When a secondary command buffer has to flush draws, it disturbs the
* inherited context, and the primary command buffer needs to know. */
bool invalidate_inherited_ctx;
/* True if the last render pass was suspended. */
bool suspended;
/* Blocks that can patch to flip the provoking vertex mode if we need to
* emit FBDs/TDs before we know which mode the application is using */
struct cs_maybe *maybe_set_tds_provoking_vertex;
struct cs_maybe *maybe_set_fbds_provoking_vertex;
struct {
/* != 0 if the render pass contains one or more occlusion queries to
* signal. */
uint64_t chain;
/* Point to the syncobj of the last occlusion query that was passed
* to a draw. */
uint64_t last;
} oq;
#endif
};
/* Bit indices into panvk_cmd_graphics_state::dirty. The gfx_state_*dirty()
 * macros paste the suffix after PANVK_CMD_GRAPHICS_DIRTY_ to form these
 * names. PANVK_CMD_GRAPHICS_DIRTY_STATE_COUNT is not a state: it must stay
 * last because it sizes the bitset. */
enum panvk_cmd_graphics_dirty_state {
PANVK_CMD_GRAPHICS_DIRTY_VS,
PANVK_CMD_GRAPHICS_DIRTY_FS,
PANVK_CMD_GRAPHICS_DIRTY_VB,
PANVK_CMD_GRAPHICS_DIRTY_IB,
PANVK_CMD_GRAPHICS_DIRTY_OQ,
PANVK_CMD_GRAPHICS_DIRTY_DESC_STATE,
PANVK_CMD_GRAPHICS_DIRTY_RENDER_STATE,
PANVK_CMD_GRAPHICS_DIRTY_VS_PUSH_UNIFORMS,
PANVK_CMD_GRAPHICS_DIRTY_FS_PUSH_UNIFORMS,
PANVK_CMD_GRAPHICS_DIRTY_STATE_COUNT,
};
/* Graphics state recorded in a command buffer (cmdbuf->state.gfx). Several
 * members exist only for some PAN_ARCH values, so the layout differs per
 * arch build. */
struct panvk_cmd_graphics_state {
struct panvk_descriptor_state desc_state;
/* Storage the runtime's dynamic graphics state points at; wired up when
 * the command buffer is created. */
struct {
struct vk_vertex_input_state vi;
struct vk_sample_locations_state sl;
} dynamic;
struct panvk_occlusion_query_state occlusion_query;
#if PAN_ARCH >= 10
struct panvk_prims_generated_query_state prims_generated_query;
#endif
/* Current sysval values; updated through set_gfx_sysval(). */
struct panvk_graphics_sysvals sysvals;
#if PAN_ARCH < 9
/* iter13: VK_EXT_transform_feedback state (JM-class only for now). */
struct {
/* Set by CmdBeginTransformFeedbackEXT, cleared by
 * CmdEndTransformFeedbackEXT. */
bool active;
/* Raised by CmdBindTransformFeedbackBuffersEXT to cover the highest
 * binding addressed so far. */
uint32_t buffer_count;
/* One entry per binding slot: 'addr' is the GPU address of the start of
 * the buffer, 'offset' the application-supplied offset, 'size' the
 * explicit size or the remainder of the buffer after 'offset'. The bind
 * handler ignores slots that do not fit in this array. */
struct {
uint64_t addr;
uint64_t offset;
uint64_t size;
} buffers[4];
} xfb;
#endif
#if PAN_ARCH < 9
struct panvk_shader_link link;
#endif
/* Fragment shader state. */
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
uint64_t blend_descs[MAX_RTS];
uint64_t push_uniforms;
bool required;
#if PAN_ARCH < 9
uint64_t rsd;
#endif
} fs;
/* Vertex shader state. */
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
uint64_t push_uniforms;
#if PAN_ARCH < 9
uint64_t attribs;
uint64_t attrib_bufs;
uint64_t indirect_attribs_infos;
uint64_t indirect_attrib_bufs_infos;
uint64_t indirect_varying_bufs_infos;
bool previous_draw_was_indirect;
#endif
} vs;
/* NOTE(review): presumably the bound vertex buffers — confirm. */
struct {
struct panvk_attrib_buf bufs[MAX_VBS];
unsigned count;
} vb;
#if PAN_ARCH >= 10
struct {
uint32_t attribs_changing_on_base_instance;
} vi;
#endif
/* Index buffer */
struct {
uint64_t dev_addr;
uint64_t size;
uint8_t index_size;
} ib;
struct {
struct panvk_blend_info info;
} cb;
struct panvk_rendering_state render;
bool vk_meta;
#if PAN_ARCH < 9
uint64_t vpd;
#endif
#if PAN_ARCH >= 10
uint64_t tsd;
#endif
/* One bit per enum panvk_cmd_graphics_dirty_state value. */
BITSET_DECLARE(dirty, PANVK_CMD_GRAPHICS_DIRTY_STATE_COUNT);
};
/* Test a runtime dynamic-state dirty bit; __name is the suffix after
 * MESA_VK_DYNAMIC_. */
#define dyn_gfx_state_dirty(__cmdbuf, __name) \
BITSET_TEST((__cmdbuf)->vk.dynamic_graphics_state.dirty, \
MESA_VK_DYNAMIC_##__name)
/* Test / set one driver dirty bit; __name is the suffix after
 * PANVK_CMD_GRAPHICS_DIRTY_ (e.g. VS, FS, DESC_STATE). */
#define gfx_state_dirty(__cmdbuf, __name) \
BITSET_TEST((__cmdbuf)->state.gfx.dirty, PANVK_CMD_GRAPHICS_DIRTY_##__name)
#define gfx_state_set_dirty(__cmdbuf, __name) \
BITSET_SET((__cmdbuf)->state.gfx.dirty, PANVK_CMD_GRAPHICS_DIRTY_##__name)
/* Clear or set every driver dirty bit at once. */
#define gfx_state_clear_all_dirty(__cmdbuf) \
BITSET_ZERO((__cmdbuf)->state.gfx.dirty)
#define gfx_state_set_all_dirty(__cmdbuf) \
BITSET_ONES((__cmdbuf)->state.gfx.dirty)
/* Update one graphics sysval and flag it for re-upload only if it changed.
 *
 * __val is first stored into a temporary of the sysval's own type, then
 * compared bytewise (memcmp) with the value currently held in
 * (__cmdbuf)->state.gfx.sysvals.__name. If they differ, the new value is
 * committed and the bit range sysval_fau_start(graphics, __name) ..
 * sysval_fau_end(graphics, __name) is set in the __dirty bitset.
 *
 * __name may be a nested member path; __val is evaluated once.
 */
#define set_gfx_sysval(__cmdbuf, __dirty, __name, __val) \
do { \
struct panvk_graphics_sysvals __new_sysval; \
__new_sysval.__name = __val; \
if (memcmp(&(__cmdbuf)->state.gfx.sysvals.__name, &__new_sysval.__name, \
sizeof(__new_sysval.__name))) { \
(__cmdbuf)->state.gfx.sysvals.__name = __new_sysval.__name; \
BITSET_SET_RANGE(__dirty, sysval_fau_start(graphics, __name), \
sysval_fau_end(graphics, __name)); \
} \
} while (0)
#if PAN_ARCH >= 10
/* Device-level draw context, only defined for PAN_ARCH >= 10.
 * NOTE(review): presumably fns_bo holds GPU-side helper functions and
 * fn_set_fbds_provoking_vertex_stride locates one of them — confirm against
 * the code that initializes this struct. */
struct panvk_device_draw_context {
struct panvk_priv_bo *fns_bo;
uint64_t fn_set_fbds_provoking_vertex_stride;
};
#endif
/* Compute the depth range covered by the viewport transform stored in the
 * graphics sysvals.
 *
 * One end of the range is offset.z + scale.z. The other end is offset.z, or
 * offset.z - scale.z when vp->depth_clip_negative_one_to_one is set. The
 * smaller end is written to *z_min and the larger to *z_max.
 */
static inline void
panvk_depth_range(const struct panvk_cmd_graphics_state *state,
                  const struct vk_viewport_state *vp,
                  float *z_min, float *z_max)
{
   float end_a;

   if (vp->depth_clip_negative_one_to_one) {
      end_a = state->sysvals.viewport.offset.z -
              state->sysvals.viewport.scale.z;
   } else {
      end_a = state->sysvals.viewport.offset.z;
   }

   float end_b =
      state->sysvals.viewport.offset.z + state->sysvals.viewport.scale.z;

   /* scale.z may be negative, so order the two ends explicitly. */
   *z_min = MIN2(end_a, end_b);
   *z_max = MAX2(end_a, end_b);
}
static inline uint32_t
panvk_select_tiler_hierarchy_mask(const struct panvk_physical_device *phys_dev,
const struct panvk_cmd_graphics_state *state,
unsigned bin_ptr_mem_budget)
{
struct pan_tiler_features tiler_features =
pan_query_tiler_features(&phys_dev->kmod.dev->props);
uint32_t hierarchy_mask = GENX(pan_select_tiler_hierarchy_mask)(
state->render.fb.info.width, state->render.fb.info.height,
tiler_features.max_levels, state->render.fb.info.tile_size,
bin_ptr_mem_budget);
return hierarchy_mask;
}
/* Tell whether the bound fragment shader has to be executed for the given
 * graphics state and dynamic state.
 *
 * Returns false when no fragment shader is bound or when rasterizer discard
 * is enabled. Otherwise returns true as soon as one of the following holds:
 * the shader has side effects, at least one colour attachment is
 * write-enabled with a non-zero write mask, alpha-to-coverage is enabled,
 * the shader writes the sample mask, or the shader writes depth/stencil.
 */
static inline bool
fs_required(const struct panvk_cmd_graphics_state *state,
            const struct vk_dynamic_graphics_state *dyn_state)
{
   const struct panvk_shader_variant *variant =
      panvk_shader_only_variant(state->fs.shader);

   /* Nothing to run without a shader or when rasterization is discarded. */
   if (dyn_state->rs.rasterizer_discard_enable || !variant)
      return false;

   const struct pan_shader_info *info = &variant->info;

   /* Side effects always have to happen. */
   if (info->fs.sidefx)
      return true;

   /* Colour output matters as soon as one attachment can be written. */
   const struct vk_color_blend_state *blend = &dyn_state->cb;
   for (unsigned rt = 0; rt < blend->attachment_count; ++rt) {
      if (!(blend->color_write_enables & BITFIELD_BIT(rt)))
         continue;

      if (blend->attachments[rt].write_mask)
         return true;
   }

   /* With alpha-to-coverage, the shader must run even without a colour
    * attachment so depth/stencil updates can be dropped when alpha (and thus
    * coverage) is 0. */
   if (dyn_state->ms.alpha_to_coverage_enable)
      return true;

   /* A shader-written sample mask must be honoured, otherwise the
    * fixed-function depth/stencil results would apply to all samples. */
   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK))
      return true;

   /* Shader-written depth/stencil requires execution.
    * TODO: Predicate on Z/S writes being enabled */
   if (info->fs.writes_depth)
      return true;

   return info->fs.writes_stencil;
}
/* Return the cached "fragment shader required" flag.
 *
 * state and dyn_state are only consumed by the assertion, which recomputes
 * fs_required() and checks that the cache agrees with it; they are marked
 * ASSERTED because they are otherwise unused.
 */
static inline bool
cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state,
                   ASSERTED const struct vk_dynamic_graphics_state *dyn_state,
                   bool cached_value)
{
   /* A mismatch means the cached flag was not refreshed after a state
    * change. */
   assert(cached_value == fs_required(state, dyn_state));

   return cached_value;
}
/* Fragment shader to use for the next draw: state.gfx.fs.shader when the
 * cached state.gfx.fs.required flag is set, NULL otherwise. The flag goes
 * through cached_fs_required(), which asserts that it matches a fresh
 * fs_required() evaluation. */
#define get_fs(__cmdbuf) \
(cached_fs_required(&(__cmdbuf)->state.gfx, \
&(__cmdbuf)->vk.dynamic_graphics_state, \
(__cmdbuf)->state.gfx.fs.required) \
? (__cmdbuf)->state.gfx.fs.shader \
: NULL)
/* Anything that might change the value returned by get_fs() makes users of the
 * fragment shader dirty, because not using the fragment shader (when
 * fs_required() returns false) impacts various other things, like VS -> FS
 * linking in the JM backend, or the update of the fragment shader pointer in
 * the CSF backend. Call gfx_state_dirty(cmdbuf, FS) if you only care about
 * fragment shader updates.
 *
 * The macro argument is forwarded to every dirty test, so it works with any
 * command buffer expression, not only with a local variable named `cmdbuf`.
 */
#define fs_user_dirty(__cmdbuf) \
   (gfx_state_dirty(__cmdbuf, FS) || \
    dyn_gfx_state_dirty(__cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) || \
    dyn_gfx_state_dirty(__cmdbuf, CB_ATTACHMENT_COUNT) || \
    dyn_gfx_state_dirty(__cmdbuf, CB_COLOR_WRITE_ENABLES) || \
    dyn_gfx_state_dirty(__cmdbuf, CB_WRITE_MASKS) || \
    dyn_gfx_state_dirty(__cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE))
/* After a draw, all dirty flags are cleared except the FS dirty flag which
 * needs to be set again if the draw didn't use the fragment shader.
 *
 * Both decisions are captured before anything is cleared: FS is re-flagged
 * when get_fs() differs from the bound fs.shader (i.e. the shader was not
 * used), and FS_PUSH_UNIFORMS is re-flagged only if it was dirty going in and
 * FS is being re-flagged. Both the common dynamic-state dirty bits and the
 * driver-private dirty bits are then cleared. */
#define clear_dirty_after_draw(__cmdbuf) \
do { \
bool __set_fs_dirty = \
(__cmdbuf)->state.gfx.fs.shader != get_fs(__cmdbuf); \
bool __set_fs_push_dirty = \
__set_fs_dirty && gfx_state_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \
vk_dynamic_graphics_state_clear_dirty( \
&(__cmdbuf)->vk.dynamic_graphics_state); \
gfx_state_clear_all_dirty(__cmdbuf); \
if (__set_fs_dirty) \
gfx_state_set_dirty(__cmdbuf, FS); \
if (__set_fs_push_dirty) \
gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \
} while (0)
#if PAN_ARCH >= 10
VkResult
panvk_per_arch(device_draw_context_init)(struct panvk_device *dev);
void
panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev);
#endif
void
panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf,
const VkRenderingInfo *pRenderingInfo);
void
panvk_per_arch(cmd_force_fb_preload)(struct panvk_cmd_buffer *cmdbuf,
const VkRenderingInfo *render_info);
void
panvk_per_arch(cmd_preload_render_area_border)(struct panvk_cmd_buffer *cmdbuf,
const VkRenderingInfo *render_info);
void panvk_per_arch(cmd_select_tile_size)(struct panvk_cmd_buffer *cmdbuf);
/* Parameters of one draw, as passed to the per-arch
 * cmd_prepare_draw_sysvals() entry point declared right after this struct.
 * Field meanings below are inferred from names only where marked. */
struct panvk_draw_info {
/* NOTE(review): presumably index size and starting offset of an indexed
 * draw — confirm units against the callers. */
struct {
uint32_t size;
uint32_t offset;
} index;
struct {
#if PAN_ARCH < 9
/* Only present on PAN_ARCH < 9, like the matching raw_vertex_offset
 * sysval. */
int32_t raw_offset;
#endif
int32_t base;
uint32_t count;
} vertex;
struct {
int32_t base;
uint32_t count;
} instance;
/* NOTE(review): presumably describes an indirect draw (argument buffer,
 * optional count buffer, max draw count and per-draw stride) — confirm. */
struct {
uint64_t buffer_dev_addr;
uint64_t count_buffer_dev_addr;
uint32_t draw_count;
uint32_t stride;
} indirect;
#if PAN_ARCH < 9
/* Only present on PAN_ARCH < 9, like the layer_id graphics sysval. */
uint32_t layer_id;
#endif
};
void
panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_draw_info *info);
/* Build the mask of colour attachments written by a fragment shader.
 *
 * The shader's colour outputs are taken from info.outputs_written, starting
 * at FRAG_RESULT_DATA0 (8 bits). Each attachment i whose
 * cal->color_map[i] is not MESA_VK_ATTACHMENT_UNUSED is looked up through
 * that map, and bit i is set in the result when the mapped shader output is
 * written.
 */
static inline uint32_t
color_attachment_written_mask(
   const struct panvk_shader_variant *fs,
   const struct vk_color_attachment_location_state *cal)
{
   const uint32_t shader_outputs =
      (fs->info.outputs_written >> FRAG_RESULT_DATA0) & BITFIELD_MASK(8);
   uint32_t written = 0;

   for (uint32_t att = 0; att < MAX_RTS; att++) {
      /* Short-circuit keeps the shift below away from the UNUSED marker. */
      if (cal->color_map[att] != MESA_VK_ATTACHMENT_UNUSED &&
          (shader_outputs & BITFIELD_BIT((uint32_t)cal->color_map[att])))
         written |= BITFIELD_BIT(att);
   }

   return written;
}
/* Build the mask of colour attachments read as input attachments by a
 * fragment shader.
 *
 * The number of attachments considered is ial->color_attachment_count, or
 * the position of the last bit set in color_attachment_mask when the count
 * is MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN. For each mapped attachment i,
 * bit (color_map[i] + 1) of fs->fs.input_attachment_read tells whether the
 * shader reads it; if so, bit i is set in the result. Every attachment that
 * is read is asserted to be part of color_attachment_mask.
 */
static inline uint32_t
color_attachment_read_mask(const struct panvk_shader_variant *fs,
                           const struct vk_input_attachment_location_state *ial,
                           uint8_t color_attachment_mask)
{
   uint32_t count;

   if (ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN)
      count = util_last_bit(color_attachment_mask);
   else
      count = ial->color_attachment_count;

   uint32_t read_mask = 0;

   for (uint32_t att = 0; att < count; att++) {
      if (ial->color_map[att] == MESA_VK_ATTACHMENT_UNUSED)
         continue;

      /* Input attachment slots are shifted by one in the shader bitmask. */
      const uint32_t ia_slot = ial->color_map[att] + 1;
      if (!(fs->fs.input_attachment_read & BITFIELD_BIT(ia_slot)))
         continue;

      assert(color_attachment_mask & BITFIELD_BIT(att));
      read_mask |= BITFIELD_BIT(att);
   }

   return read_mask;
}
/* Tell whether the fragment shader reads the depth attachment as an input
 * attachment.
 *
 * The bit tested in fs->fs.input_attachment_read is bit 0 when
 * ial->depth_att is MESA_VK_ATTACHMENT_NO_INDEX, no bit at all when it is
 * MESA_VK_ATTACHMENT_UNUSED, and bit (depth_att + 1) otherwise.
 */
static inline bool
z_attachment_read(const struct panvk_shader_variant *fs,
                  const struct vk_input_attachment_location_state *ial)
{
   uint32_t depth_bit;

   if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX)
      depth_bit = BITFIELD_BIT(0);
   else if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED)
      depth_bit = BITFIELD_BIT(ial->depth_att + 1);
   else
      depth_bit = 0;

   return depth_bit & fs->fs.input_attachment_read;
}
/* Tell whether the fragment shader reads the stencil attachment as an input
 * attachment.
 *
 * The bit tested in fs->fs.input_attachment_read is bit 0 when
 * ial->stencil_att is MESA_VK_ATTACHMENT_NO_INDEX, no bit at all when it is
 * MESA_VK_ATTACHMENT_UNUSED, and bit (stencil_att + 1) otherwise.
 */
static inline bool
s_attachment_read(const struct panvk_shader_variant *fs,
                  const struct vk_input_attachment_location_state *ial)
{
   uint32_t stencil_bit;

   if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX)
      stencil_bit = BITFIELD_BIT(0);
   else if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED)
      stencil_bit = BITFIELD_BIT(ial->stencil_att + 1);
   else
      stencil_bit = 0;

   return stencil_bit & fs->fs.input_attachment_read;
}
#endif
@@ -0,0 +1,572 @@
/*
* Copyright © 2021 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_SHADER_H
#define PANVK_SHADER_H
#ifndef PAN_ARCH
#error "PAN_ARCH must be defined"
#endif
#include "compiler/pan_compiler.h"
#include "pan_desc.h"
#include "pan_earlyzs.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_descriptor_set.h"
#include "panvk_macros.h"
#include "panvk_mempool.h"
#include "vk_pipeline_layout.h"
#include "vk_shader.h"
extern const struct vk_device_shader_ops panvk_per_arch(device_shader_ops);
/* Number of render targets; sizes per-RT arrays in this header such as
 * panvk_graphics_sysvals::fs.blend_descs. */
#define MAX_RTS 8
/* NOTE(review): presumably the maximum number of vertex shader attributes —
 * confirm against its users. */
#define MAX_VS_ATTRIBS 16
#if PAN_ARCH < 9
/* We could theoretically use the MAX_PER_SET values here (except for UBOs
 * where we're really limited to 256 on the shader side), but on Bifrost we
 * have to copy some tables around, which comes at an extra memory/processing
 * cost, so let's pick something smaller. */
#define MAX_PER_STAGE_SAMPLED_IMAGES 256
#define MAX_PER_STAGE_SAMPLERS 128
#define MAX_PER_STAGE_UNIFORM_BUFFERS MAX_PER_SET_UNIFORM_BUFFERS
#define MAX_PER_STAGE_STORAGE_BUFFERS 64
#define MAX_PER_STAGE_STORAGE_IMAGES 32
#define MAX_PER_STAGE_INPUT_ATTACHMENTS MAX_PER_SET_INPUT_ATTACHMENTS
#else
/* On PAN_ARCH >= 9 every per-stage limit simply equals the per-set limit. */
#define MAX_PER_STAGE_SAMPLED_IMAGES MAX_PER_SET_SAMPLED_IMAGES
#define MAX_PER_STAGE_SAMPLERS MAX_PER_SET_SAMPLERS
#define MAX_PER_STAGE_UNIFORM_BUFFERS MAX_PER_SET_UNIFORM_BUFFERS
#define MAX_PER_STAGE_STORAGE_BUFFERS MAX_PER_SET_STORAGE_BUFFERS
#define MAX_PER_STAGE_STORAGE_IMAGES MAX_PER_SET_STORAGE_IMAGES
#define MAX_PER_STAGE_INPUT_ATTACHMENTS MAX_PER_SET_INPUT_ATTACHMENTS
#endif
/* Sum of all the per-stage limits above. */
#define MAX_PER_STAGE_RESOURCES ( \
MAX_PER_STAGE_SAMPLED_IMAGES + MAX_PER_STAGE_SAMPLERS + \
MAX_PER_STAGE_UNIFORM_BUFFERS + MAX_PER_STAGE_STORAGE_BUFFERS + \
MAX_PER_STAGE_STORAGE_IMAGES + MAX_PER_STAGE_INPUT_ATTACHMENTS)
struct nir_shader;
struct pan_blend_state;
struct panvk_device;
/* Identifiers of the varying buffers. PANVK_VARY_BUF_MAX is the number of
 * buffers and sizes panvk_shader_link::buf_strides. */
enum panvk_varying_buf_id {
PANVK_VARY_BUF_GENERAL,
PANVK_VARY_BUF_POSITION,
PANVK_VARY_BUF_PSIZ,
/* Keep last */
PANVK_VARY_BUF_MAX,
};
#if PAN_ARCH < 9
/* Descriptor table indices used on PAN_ARCH < 9. User tables start at
 * PANVK_DESC_TABLE_USER and the dynamic-SSBO tables follow at MAX_SETS.
 * Compute and graphics are numbered independently, which is why
 * CS_DYN_SSBOS and VS_DYN_SSBOS share the value MAX_SETS. The *_COUNT values
 * size the desc.sets[] arrays of the compute and graphics sysvals. */
enum panvk_desc_table_id {
PANVK_DESC_TABLE_USER = 0,
PANVK_DESC_TABLE_CS_DYN_SSBOS = MAX_SETS,
PANVK_DESC_TABLE_COMPUTE_COUNT = PANVK_DESC_TABLE_CS_DYN_SSBOS + 1,
PANVK_DESC_TABLE_VS_DYN_SSBOS = MAX_SETS,
PANVK_DESC_TABLE_FS_DYN_SSBOS = MAX_SETS + 1,
PANVK_DESC_TABLE_GFX_COUNT = PANVK_DESC_TABLE_FS_DYN_SSBOS + 1,
};
#endif
/* NOTE(review): these look like the values stored in
 * panvk_input_attachment_info::target (colour attachment index, or 255 for
 * depth/stencil) — confirm against the code filling the map. */
#define PANVK_COLOR_ATTACHMENT(x) (x)
#define PANVK_ZS_ATTACHMENT 255
/* One entry of the input attachment map; an array of
 * INPUT_ATTACHMENT_MAP_SIZE of these is embedded in
 * panvk_graphics_sysvals::iam. */
struct panvk_input_attachment_info {
uint32_t target;
uint32_t conversion;
};
/* One attachment per color, one for depth, one for stencil, and the last one
 * for the attachment without an InputAttachmentIndex attribute, i.e.
 * MAX_RTS (8) + 3. */
#define INPUT_ATTACHMENT_MAP_SIZE 11
/* Size in bytes of one word of the push constant buffer (AKA FAU). */
#define FAU_WORD_SIZE sizeof(uint64_t)

/* 64-bit integer forced to 8-byte alignment, so that 64-bit sysvals always
 * occupy exactly one FAU word regardless of the ABI's default alignment. */
#define aligned_u64 __attribute__((aligned(sizeof(uint64_t)))) uint64_t

/* System values which are common to both graphics and compute. These are
 * always at the same offset in both graphics and compute allowing us to
 * compile the shader without knowing which queue it will be dispatched on.
 */
struct panvk_common_sysvals_inner {
   /* Address of sysval/push constant buffer used for indirect loads */
   aligned_u64 push_uniforms;

   /* Address of the printf buffer */
   aligned_u64 printf_buffer_address;
} __attribute__((aligned(FAU_WORD_SIZE)));

/* Reference layout: 16 bytes of queue-specific data followed by the common
 * block. The graphics and compute sysval structs must place their `common`
 * member at the same offset as this one. */
struct panvk_common_sysvals {
   uint32_t _pad[4];
   struct panvk_common_sysvals_inner common;
} __attribute__((aligned(FAU_WORD_SIZE)));

static_assert((offsetof(struct panvk_common_sysvals, common) %
               FAU_WORD_SIZE) == 0,
              "panvk_common_sysvals::common must be 8-byte aligned");
static_assert((sizeof(struct panvk_common_sysvals_inner) %
               FAU_WORD_SIZE) == 0,
              "sizeof(struct panvk_common_sysvals_inner) must be a multiple "
              "of 8 bytes");

/* FAU word range [START, END) covered by the common sysvals. */
#define SYSVALS_COMMON_START \
   (offsetof(struct panvk_common_sysvals, common) / FAU_WORD_SIZE)
#define SYSVALS_COMMON_COUNT \
   (sizeof(struct panvk_common_sysvals_inner) / FAU_WORD_SIZE)
#define SYSVALS_COMMON_END (SYSVALS_COMMON_START + SYSVALS_COMMON_COUNT)
/* Driver system values for graphics. The static_asserts after the struct pin
 * the layout constraints: blend at offset 0, common at the same offset as in
 * panvk_common_sysvals, and a total size that is a whole number of FAU
 * words. */
struct panvk_graphics_sysvals {
/* Blend constants MUST come first because their position cannot depend on
 * the FAU packing of the fragment shader.
 */
struct {
float constants[4];
} blend;
/* This must be at the same offset for both compute and graphics */
struct panvk_common_sysvals_inner common;
struct {
struct {
float x, y, z;
} scale, offset;
} viewport;
struct {
#if PAN_ARCH < 9
int32_t raw_vertex_offset;
uint32_t num_vertices; /* iter13: XFB needs per-draw vertex count */
/* raw_vertex_offset and num_vertices together fill exactly one 8-byte
 * word, so the aligned_u64 array below starts on an 8-byte boundary
 * without any gap or explicit pad member. */
aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */
#endif
int32_t first_vertex;
int32_t base_instance;
uint32_t noperspective_varyings;
} vs;
struct {
aligned_u64 blend_descs[MAX_RTS];
} fs;
struct panvk_input_attachment_info iam[INPUT_ATTACHMENT_MAP_SIZE];
#if PAN_ARCH < 9
/* gl_Layer on Bifrost is a bit of hack. We have to issue one draw per
 * layer, and filter primitives at the VS level.
 */
int32_t layer_id;
struct {
aligned_u64 sets[PANVK_DESC_TABLE_GFX_COUNT];
} desc;
#endif
} __attribute__((aligned(FAU_WORD_SIZE)));
static_assert(offsetof(struct panvk_graphics_sysvals, blend) == 0,
"panvk_graphics_sysvals::blend must be at the start");
static_assert(offsetof(struct panvk_graphics_sysvals, common) ==
offsetof(struct panvk_common_sysvals, common),
"Common sysvals must be at the same offset everywhere");
static_assert((sizeof(struct panvk_graphics_sysvals) % FAU_WORD_SIZE) == 0,
"struct panvk_graphics_sysvals must be 8-byte aligned");
#if PAN_ARCH < 9
static_assert((offsetof(struct panvk_graphics_sysvals, desc) % FAU_WORD_SIZE) ==
0,
"panvk_graphics_sysvals::desc must be 8-byte aligned");
#endif
/* Driver system values for compute. base (12 bytes) plus _pad (4 bytes)
 * occupy the 16 bytes in front of `common`, matching the _pad[4] of
 * panvk_common_sysvals; the static_assert below enforces that. */
struct panvk_compute_sysvals {
struct {
uint32_t x, y, z;
} base;
uint32_t _pad;
/* This must be at the same offset for both compute and graphics */
struct panvk_common_sysvals_inner common;
struct {
uint32_t x, y, z;
} num_work_groups;
struct {
uint32_t x, y, z;
} local_group_size;
#if PAN_ARCH < 9
/* Descriptor table addresses, one per PANVK_DESC_TABLE_* compute table. */
struct {
aligned_u64 sets[PANVK_DESC_TABLE_COMPUTE_COUNT];
} desc;
#endif
} __attribute__((aligned(FAU_WORD_SIZE)));
static_assert(offsetof(struct panvk_compute_sysvals, common) ==
offsetof(struct panvk_common_sysvals, common),
"Common sysvals must be at the same offset everywhere");
static_assert((sizeof(struct panvk_compute_sysvals) % FAU_WORD_SIZE) == 0,
"struct panvk_compute_sysvals must be 8-byte aligned");
#if PAN_ARCH < 9
static_assert((offsetof(struct panvk_compute_sysvals, desc) % FAU_WORD_SIZE) ==
0,
"panvk_compute_sysvals::desc must be 8-byte aligned");
#endif
/* This is not the final offset in the push constant buffer (AKA FAU), but
 * just a magic offset we use before packing push constants so we can easily
 * identify the type of push constant (driver sysvals vs user push constants).
 */
#define SYSVALS_PUSH_CONST_BASE MAX_PUSH_CONSTANTS_SIZE

/* Size in bytes of sysval member <__name>. __ptype is one of common,
 * graphics or compute and selects the sysval struct. */
#define common_sysval_size(__name) \
   sizeof(((struct panvk_common_sysvals *)NULL)->common.__name)
#define graphics_sysval_size(__name) \
   sizeof(((struct panvk_graphics_sysvals *)NULL)->__name)
#define compute_sysval_size(__name) \
   sizeof(((struct panvk_compute_sysvals *)NULL)->__name)
#define sysval_size(__ptype, __name) __ptype##_sysval_size(__name)

/* Byte offset of sysval member <__name> inside its sysval struct. */
#define common_sysval_offset(__name) \
   offsetof(struct panvk_common_sysvals, common.__name)
#define graphics_sysval_offset(__name) \
   offsetof(struct panvk_graphics_sysvals, __name)
#define compute_sysval_offset(__name) \
   offsetof(struct panvk_compute_sysvals, __name)
#define sysval_offset(__ptype, __name) __ptype##_sysval_offset(__name)

/* Size of one element / byte offset of element __idx of an array sysval.
 * __idx is parenthesized so that expressions such as `i + 1` scale
 * correctly. */
#define sysval_entry_size(__ptype, __name) \
   sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0])
#define sysval_entry_offset(__ptype, __name, __idx) \
   (sysval_offset(__ptype, __name) + \
    (sysval_entry_size(__ptype, __name) * (__idx)))

/* First and last (inclusive) FAU word covered by a sysval. */
#define sysval_fau_start(__ptype, __name) \
   (sysval_offset(__ptype, __name) / FAU_WORD_SIZE)
#define sysval_fau_end(__ptype, __name) \
   ((sysval_offset(__ptype, __name) + sysval_size(__ptype, __name) - 1) / \
    FAU_WORD_SIZE)

/* First and last (inclusive) FAU word covered by element __idx of an array
 * sysval. The last word is the one holding the byte right before the start
 * of element __idx + 1. */
#define sysval_fau_entry_start(__ptype, __name, __idx) \
   (sysval_entry_offset(__ptype, __name, __idx) / FAU_WORD_SIZE)
#define sysval_fau_entry_end(__ptype, __name, __idx) \
   ((sysval_entry_offset(__ptype, __name, (__idx) + 1) - 1) / FAU_WORD_SIZE)
/* Translate a byte offset in the unpacked sysval/push-constant layout into a
 * byte offset in the shader's packed FAU layout: the word index becomes
 * BITSET_PREFIX_SUM() over fau.used_<__kind> up to the word containing
 * __offset, and the byte offset inside the word is preserved.
 * NOTE(review): assumes BITSET_PREFIX_SUM(set, n) counts the bits set below
 * position n — confirm against util/bitset.h. */
#define shader_remapped_fau_offset(__shader, __kind, __offset) \
((FAU_WORD_SIZE * BITSET_PREFIX_SUM((__shader)->fau.used_##__kind, \
(__offset) / FAU_WORD_SIZE)) + \
((__offset) % FAU_WORD_SIZE))
/* Packed offset of a sysval. */
#define shader_remapped_sysval_offset(__shader, __offset) \
shader_remapped_fau_offset(__shader, sysvals, __offset)
/* Packed offset of a user push constant: push constants are placed after the
 * fau.sysval_count sysval words. */
#define shader_remapped_push_const_offset(__shader, __offset) \
(((__shader)->fau.sysval_count * FAU_WORD_SIZE) + \
shader_remapped_fau_offset(__shader, push_consts, __offset))
/* Mark every FAU word covered by sysval <__name> as used by the shader. */
#define shader_use_sysval(__shader, __ptype, __name) \
BITSET_SET_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_start(__ptype, __name), \
sysval_fau_end(__ptype, __name))
/* Test the used_sysvals bits covering sysval <__name>. */
#define shader_uses_sysval(__shader, __ptype, __name) \
BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_start(__ptype, __name), \
sysval_fau_end(__ptype, __name))
/* Same as shader_uses_sysval(), restricted to element __idx of an array
 * sysval. */
#define shader_uses_sysval_entry(__shader, __ptype, __name, __idx) \
BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_entry_start(__ptype, __name, __idx), \
sysval_fau_entry_end(__ptype, __name, __idx))
/* Mark the FAU words covering the byte range [__base, __base + __range) of
 * the sysval area as used. */
#define shader_use_sysval_range(__shader, __base, __range) \
BITSET_SET_RANGE((__shader)->fau.used_sysvals, (__base) / FAU_WORD_SIZE, \
((__base) + (__range) - 1) / FAU_WORD_SIZE)
/* Mark the FAU words covering the byte range [__base, __base + __range) of
 * the user push constants as used. */
#define shader_use_push_const_range(__shader, __base, __range) \
BITSET_SET_RANGE((__shader)->fau.used_push_consts, \
(__base) / FAU_WORD_SIZE, \
((__base) + (__range) - 1) / FAU_WORD_SIZE)
/* Emit a NIR push-constant load of the whole sysval <__name>, split into
 * components of __bitsz bits. The load uses SYSVALS_PUSH_CONST_BASE as its
 * base so that it can later be told apart from user push constants. */
#define load_sysval(__b, __ptype, __bitsz, __name) \
nir_load_push_constant( \
__b, sysval_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \
nir_imm_int(__b, sysval_offset(__ptype, __name)), \
.base = SYSVALS_PUSH_CONST_BASE)
/* Emit a NIR push-constant load of one element of array sysval <__name>,
 * selected by the dynamic index __dyn_idx (scaled by the element size). The
 * base points at the start of the array and the range covers the whole
 * array. */
#define load_sysval_entry(__b, __ptype, __bitsz, __name, __dyn_idx) \
nir_load_push_constant( \
__b, sysval_entry_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \
nir_imul_imm(__b, __dyn_idx, sysval_entry_size(__ptype, __name)), \
.base = SYSVALS_PUSH_CONST_BASE + sysval_offset(__ptype, __name), \
.range = sysval_size(__ptype, __name))
#if PAN_ARCH < 9
/* Kinds of descriptor tables used on PAN_ARCH < 9.
 * PANVK_BIFROST_DESC_TABLE_COUNT sizes per-table arrays such as
 * panvk_shader_desc_info::others.count. */
enum panvk_bifrost_desc_table_type {
PANVK_BIFROST_DESC_TABLE_INVALID = -1,
/* UBO is encoded on 8 bytes */
PANVK_BIFROST_DESC_TABLE_UBO = 0,
/* Images are using a <3DAttributeBuffer,Attribute> pair, each
 * of them being stored in a separate table. */
PANVK_BIFROST_DESC_TABLE_IMG,
/* Texture and sampler are encoded on 32 bytes */
PANVK_BIFROST_DESC_TABLE_TEXTURE,
PANVK_BIFROST_DESC_TABLE_SAMPLER,
PANVK_BIFROST_DESC_TABLE_COUNT,
};
#endif
/* Pack a table id (bits 31:28) and an index (bits 27:0) into one 32-bit
 * handle, and extract them again. Both arguments are parenthesized so that
 * compound expressions are shifted/or'ed as a whole. */
#define COPY_DESC_HANDLE(table, idx) (((table) << 28) | (idx))
#define COPY_DESC_HANDLE_EXTRACT_INDEX(handle) ((handle) & BITFIELD_MASK(28))
#define COPY_DESC_HANDLE_EXTRACT_TABLE(handle) ((handle) >> 28)
/* Number of FAU words needed to hold the whole compute / graphics sysval
 * struct, the larger of the two, and the user push constant area. */
#define MAX_COMPUTE_SYSVAL_FAUS \
(sizeof(struct panvk_compute_sysvals) / FAU_WORD_SIZE)
#define MAX_GFX_SYSVAL_FAUS \
(sizeof(struct panvk_graphics_sysvals) / FAU_WORD_SIZE)
#define MAX_SYSVAL_FAUS MAX2(MAX_COMPUTE_SYSVAL_FAUS, MAX_GFX_SYSVAL_FAUS)
#define MAX_PUSH_CONST_FAUS (MAX_PUSH_CONSTANTS_SIZE / FAU_WORD_SIZE)
/* Per-shader FAU usage. The two bitsets hold one bit per FAU word of the
 * unpacked sysval / push constant layouts and are what the
 * shader_use_* / shader_uses_* / shader_remapped_* macros operate on. */
struct panvk_shader_fau_info {
BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS);
BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS);
/* Number of sysval words; push constants are packed right after them (see
 * shader_remapped_push_const_offset()). */
uint32_t sysval_count;
/* NOTE(review): presumably sysval words plus push constant words —
 * confirm where it is computed. */
uint32_t total_count;
};
/* Descriptor usage information attached to a shader variant. The layout
 * differs between PAN_ARCH < 9 and later architectures. */
struct panvk_shader_desc_info {
/* NOTE(review): presumably one bit per descriptor set used by the shader —
 * confirm. */
uint32_t used_set_mask;
#if PAN_ARCH < 9
/* Dynamic UBO map and number of valid entries. */
struct {
uint32_t map[MAX_DYNAMIC_UNIFORM_BUFFERS];
uint32_t count;
} dyn_ubos;
/* Dynamic SSBO map and number of valid entries. */
struct {
uint32_t map[MAX_DYNAMIC_STORAGE_BUFFERS];
uint32_t count;
} dyn_ssbos;
/* Map for the remaining descriptors, with one count per
 * panvk_bifrost_desc_table_type. */
struct {
struct panvk_priv_mem map;
uint32_t count[PANVK_BIFROST_DESC_TABLE_COUNT];
} others;
#else
/* Dynamic buffer map and number of valid entries. */
struct {
uint32_t map[MAX_DYNAMIC_BUFFERS];
uint32_t count;
} dyn_bufs;
uint32_t fs_varying_attr_desc_count;
#endif
};
/* One compiled variant of a shader: compiler info, stage-specific data,
 * descriptor/FAU usage and the memory holding the code and descriptors. */
struct panvk_shader_variant {
struct pan_shader_info info;
/* Stage-specific data; which member is valid depends on the stage. */
union {
struct {
struct pan_compute_dim local_size;
} cs;
struct {
struct pan_earlyzs_lut earlyzs_lut;
/* Bitmask of input attachments read by the shader, as tested by
 * color_attachment_read_mask() and z/s_attachment_read(): bit 0 is the
 * attachment without an index, bit n + 1 is input attachment n. */
uint32_t input_attachment_read;
} fs;
};
struct panvk_shader_desc_info desc_info;
struct panvk_shader_fau_info fau;
/* NOTE(review): presumably the CPU-side copy of the binary and its size;
 * own_bin presumably tells whether this variant owns that storage —
 * confirm. */
const void *bin_ptr;
uint32_t bin_size;
bool own_bin;
/* Memory holding the code; panvk_shader_variant_get_dev_addr() returns its
 * device address. */
struct panvk_priv_mem code_mem;
#if PAN_ARCH < 9
struct panvk_priv_mem rsd;
#else
union {
struct panvk_priv_mem spd;
struct {
#if PAN_ARCH < 12
struct panvk_priv_mem pos_points;
struct panvk_priv_mem pos_triangles;
struct panvk_priv_mem var;
#else
struct panvk_priv_mem all_points;
struct panvk_priv_mem all_triangles;
#endif
} spds;
};
#endif
/* NOTE(review): presumably textual NIR / assembly dumps kept for
 * debugging — confirm. */
const char *nir_str;
const char *asm_str;
};
/* Vertex shader variants. PANVK_VS_VARIANTS is the variant count returned by
 * panvk_shader_num_variants() for vertex shaders. */
enum panvk_vs_variant {
/* Hardware vertex shader, when next stage is fragment */
PANVK_VS_VARIANT_HW,
PANVK_VS_VARIANTS,
};
/* A shader object: the common vk_shader followed by a flexible array holding
 * panvk_shader_num_variants(vk.stage) variants. */
struct panvk_shader {
struct vk_shader vk;
struct panvk_shader_variant variants[];
};
/* Number of variants stored in panvk_shader::variants for a given stage:
 * PANVK_VS_VARIANTS for vertex shaders, a single variant for every other
 * stage. */
static inline unsigned
panvk_shader_num_variants(mesa_shader_stage stage)
{
   return stage == MESA_SHADER_VERTEX ? PANVK_VS_VARIANTS : 1;
}
/* Names of the vertex shader variants, indexed by enum panvk_vs_variant.
 * The HW variant has no name (NULL). */
static const char *panvk_vs_shader_variant_name[] = {
   [PANVK_VS_VARIANT_HW] = NULL,
};

/* Return the name of a variant of `shader`, or NULL when it has none.
 *
 * `variant` must point into shader->variants; its index is derived from the
 * pointer difference and asserted to be in range. Only vertex shaders have
 * named variants, every other stage has a single unnamed variant.
 *
 * Declared `static inline` like the other helpers of this header so that
 * translation units which include the header without calling it do not get
 * an unused-function warning.
 */
static inline const char *
panvk_shader_variant_name(const struct panvk_shader *shader,
                          const struct panvk_shader_variant *variant)
{
   unsigned i = variant - shader->variants;

   assert(i < panvk_shader_num_variants(shader->vk.stage));

   if (shader->vk.stage == MESA_SHADER_VERTEX) {
      assert(i < ARRAY_SIZE(panvk_vs_shader_variant_name));
      return panvk_vs_shader_variant_name[i];
   }

   assert(panvk_shader_num_variants(shader->vk.stage) == 1);
   return NULL;
}
/* Return the unique variant of a single-variant shader, or NULL when
 * `shader` is NULL. Asserts that the shader's stage has exactly one variant.
 *
 * Declared `static inline` like the other helpers of this header so that
 * translation units which include the header without calling it do not get
 * an unused-function warning.
 */
static inline const struct panvk_shader_variant *
panvk_shader_only_variant(const struct panvk_shader *shader)
{
   if (!shader)
      return NULL;

   assert(panvk_shader_num_variants(shader->vk.stage) == 1);
   return &shader->variants[0];
}
static const struct panvk_shader_variant *
panvk_shader_hw_variant(const struct panvk_shader *shader)
{
if (!shader)
return NULL;
return &shader->variants[0];
}
/* Device address of a variant's code memory, or 0 when `shader` is NULL. */
static inline uint64_t
panvk_shader_variant_get_dev_addr(const struct panvk_shader_variant *shader)
{
   if (shader == NULL)
      return 0;

   return panvk_priv_mem_dev_addr(shader->code_mem);
}
/* Iterate over every variant of __shader. Declares __var as a
 * struct panvk_shader_variant pointer scoped to the loop, walking
 * variants[0 .. panvk_shader_num_variants(stage) - 1]. The variant count is
 * re-evaluated on each iteration. */
#define panvk_shader_foreach_variant(__shader, __var) \
for (struct panvk_shader_variant *__var = (__shader)->variants; \
__var < (__shader)->variants + \
panvk_shader_num_variants((__shader)->vk.stage); \
++__var)
#if PAN_ARCH < 9
/* Result of linking a vertex shader with a fragment shader on PAN_ARCH < 9:
 * one attribute memory allocation per stage plus the stride of each varying
 * buffer (indexed by enum panvk_varying_buf_id). */
struct panvk_shader_link {
struct {
struct panvk_priv_mem attribs;
} vs, fs;
unsigned buf_strides[PANVK_VARY_BUF_MAX];
};
/* Fill `link` for the given VS/FS pair.
 * NOTE(review): presumably allocates the attribs memory from desc_pool —
 * confirm in the definition. */
VkResult panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
const struct panvk_shader_variant *vs,
const struct panvk_shader_variant *fs,
struct panvk_shader_link *link);
/* Release the per-stage attribute memory held by a link. */
static inline void
panvk_shader_link_cleanup(struct panvk_shader_link *link)
{
panvk_pool_free_mem(&link->vs.attribs);
panvk_pool_free_mem(&link->fs.attribs);
}
#endif
bool panvk_per_arch(nir_lower_input_attachment_loads)(
nir_shader *nir,
const struct vk_graphics_pipeline_state *state,
uint32_t *input_attachment_read_out);
void panvk_per_arch(nir_lower_descriptors)(
nir_shader *nir, struct panvk_device *dev,
const struct vk_pipeline_robustness_state *rs, uint32_t set_layout_count,
struct vk_descriptor_set_layout *const *set_layouts,
const struct vk_graphics_pipeline_state *state,
struct panvk_shader_desc_info *desc_info);
/* This is a stripped-down version of panvk_shader for internal shaders that
 * are managed by vk_meta (blend and preload shaders). Those don't need the
 * complexity inherent to user provided shaders as they're not exposed. */
struct panvk_internal_shader {
struct vk_shader vk;
struct pan_shader_info info;
struct panvk_priv_mem code_mem;
#if PAN_ARCH < 9
struct panvk_priv_mem rsd;
#else
struct panvk_priv_mem spd;
#endif
};
/* Handle <-> pointer casts between VkShaderEXT and panvk_internal_shader. */
VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_internal_shader, vk.base, VkShaderEXT,
VK_OBJECT_TYPE_SHADER_EXT)
void panvk_per_arch(compiler_lock)(void);
void panvk_per_arch(compiler_unlock)(void);
VkResult panvk_per_arch(create_internal_shader)(
struct panvk_device *dev, nir_shader *nir,
struct pan_compile_inputs *compiler_inputs,
struct panvk_internal_shader **shader_out);
VkResult panvk_per_arch(create_shader_from_binary)(
struct panvk_device *dev, const struct pan_shader_info *info,
struct pan_compute_dim local_size, const void *bin_ptr, size_t bin_size,
struct panvk_shader **shader_out);
#endif
@@ -0,0 +1,956 @@
/*
* Copyright © 2024 Collabora Ltd.
* Copyright © 2024 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
#include "panvk_buffer.h"
#include "panvk_cmd_buffer.h"
#include "panvk_device_memory.h"
#include "panvk_entrypoints.h"
#include "pan_desc.h"
#include "pan_compiler.h" /* PAN_SHADER_OOB_ADDRESS */
#include "pan_util.h"
/* Derive the clear/preload flags of an attachment from its load/store ops.
 *
 * CLEAR sets *clear, LOAD sets *preload. For NONE and DONT_CARE, *preload is
 * additionally set when storeOp is VK_ATTACHMENT_STORE_OP_NONE. Flags are
 * only ever raised here, never lowered. Any other loadOp is unreachable.
 */
static void
att_set_clear_preload(const VkRenderingAttachmentInfo *att, bool *clear, bool *preload)
{
   if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
      *clear = true;
   } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) {
      *preload = true;
   } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_NONE ||
              att->loadOp == VK_ATTACHMENT_LOAD_OP_DONT_CARE) {
      /* Frustrating corner case. The spec says:
       *
       *    VK_ATTACHMENT_STORE_OP_NONE specifies the contents within the
       *    render area are not accessed by the store operation as long as
       *    no values are written to the attachment during the render pass.
       *
       * So with LOAD_OP_DONT_CARE + STORE_OP_NONE the contents have to
       * survive partial renders. Forcing a preload is the easiest way to get
       * there: partial stores for unused attachments then just write back
       * what was already there.
       *
       * TODO: disable preload when we have clean_pixel_write_enable = false
       * as an optimization
       */
      if (att->storeOp == VK_ATTACHMENT_STORE_OP_NONE)
         *preload = true;
   } else {
      UNREACHABLE("Unsupported loadOp");
   }
}
/* Return the multisampled companion view of a single-sampled image view
 * used for multisampled-render-to-single-sampled.
 *
 * iview must be single-sampled and its image must have been created with
 * VK_IMAGE_CREATE_MULTISAMPLED_RENDER_TO_SINGLE_SAMPLED_BIT_EXT. nr_samples
 * must be 2, 4, 8 or 16; the companion views are stored in iview->ms_views
 * at index 0, 1, 2 and 3 respectively. The selected view is asserted to
 * exist and to have the requested sample count.
 */
static struct panvk_image_view *
get_ms2ss_image_view(struct panvk_image_view *iview, uint32_t nr_samples)
{
   assert(nr_samples >= 2 && nr_samples <= 16);
   assert(iview->pview.nr_samples == 1);
   assert(iview->vk.image->create_flags &
          VK_IMAGE_CREATE_MULTISAMPLED_RENDER_TO_SINGLE_SAMPLED_BIT_EXT);

   /* Slot in ms_views: 2 samples -> 0, 4 -> 1, 8 -> 2, 16 -> 3. */
   uint32_t slot = 0;

   if (nr_samples == VK_SAMPLE_COUNT_2_BIT)
      slot = 0;
   else if (nr_samples == VK_SAMPLE_COUNT_4_BIT)
      slot = 1;
   else if (nr_samples == VK_SAMPLE_COUNT_8_BIT)
      slot = 2;
   else if (nr_samples == VK_SAMPLE_COUNT_16_BIT)
      slot = 3;
   else
      UNREACHABLE("unhandled sample count");

   assert(iview->ms_views[slot] != VK_NULL_HANDLE);

   struct panvk_image_view *ms_view =
      panvk_image_view_from_handle(iview->ms_views[slot]);

   assert(ms_view->pview.nr_samples == nr_samples);
   return ms_view;
}
/* Record colour attachment `index` of a rendering pass into the command
 * buffer's graphics render state and framebuffer info.
 *
 * Stores the image view, format and sample count, points the framebuffer
 * render target at the view, packs the clear colour when the attachment is
 * cleared, derives the clear/preload flags from the load/store ops, and
 * records the resolve destination when a resolve mode is set. */
static void
render_state_set_color_attachment(struct panvk_cmd_buffer *cmdbuf,
const VkRenderingAttachmentInfo *att,
uint32_t index)
{
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
struct pan_fb_info *fbinfo = &state->render.fb.info;
VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
struct panvk_image_view *iview_ss = NULL;
/* Multisampled-render-to-single-sampled: the framebuffer is multisampled
 * but the attachment view is not. fb.nr_samples is read here before this
 * function updates it below, so the caller presumably seeds it beforehand —
 * confirm. */
const bool ms2ss = cmdbuf->state.gfx.render.fb.nr_samples > 1 &&
iview->pview.nr_samples == 1;
if (ms2ss) {
/* Render to the multisampled companion view and remember the
 * single-sampled one for preload/resolve. */
iview_ss = iview;
iview =
get_ms2ss_image_view(iview, cmdbuf->state.gfx.render.fb.nr_samples);
}
struct panvk_image *img =
container_of(iview->vk.image, struct panvk_image, vk);
state->render.bound_attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(index);
state->render.color_attachments.iviews[index] = iview;
state->render.color_attachments.preload_iviews[index] =
ms2ss ? iview_ss : NULL;
state->render.color_attachments.fmts[index] = iview->vk.format;
state->render.color_attachments.samples[index] = img->vk.samples;
#if PAN_ARCH < 9
/* Track the BO of every plane referenced by the view.
 * NOTE(review): bo_count is incremented without a bounds check here —
 * presumably fb.bos is sized for the worst case; confirm. */
for (uint8_t p = 0; p < ARRAY_SIZE(iview->pview.planes); p++) {
struct pan_image_plane_ref pref =
pan_image_view_get_plane(&iview->pview, p);
if (!pref.image)
continue;
assert(pref.plane_idx < ARRAY_SIZE(img->planes));
assert(img->planes[pref.plane_idx].mem->bo != NULL);
state->render.fb.bos[state->render.fb.bo_count++] =
img->planes[pref.plane_idx].mem->bo;
}
#endif
fbinfo->rts[index].view = &iview->pview;
fbinfo->rts[index].crc_valid = &state->render.fb.crc_valid[index];
/* The framebuffer sample count is the maximum over all attachments. */
state->render.fb.nr_samples =
MAX2(state->render.fb.nr_samples,
pan_image_view_get_nr_samples(&iview->pview));
if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
/* Pack the API clear colour into the render target's clear_value. */
enum pipe_format fmt = vk_format_to_pipe_format(iview->vk.format);
union pipe_color_union *col =
(union pipe_color_union *)&att->clearValue.color;
pan_pack_color(phys_dev->formats.blendable,
fbinfo->rts[index].clear_value, col, fmt, false);
}
att_set_clear_preload(att, &fbinfo->rts[index].clear,
&fbinfo->rts[index].preload);
if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
struct panvk_resolve_attachment *resolve_info =
&state->render.color_attachments.resolve[index];
VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView);
/* VUID-VkRenderingAttachmentInfo-imageView-06862 and
 * VUID-VkRenderingAttachmentInfo-imageView-06863:
 * If resolveMode != NONE, then
 * resolveView == NULL iff. multisampledRenderToSingleSampledEnable */
assert(ms2ss == (resolve_iview == NULL));
resolve_info->mode = att->resolveMode;
if (!ms2ss) {
resolve_info->dst_iview = resolve_iview;
} else {
/* With MS2SS the resolve destination is the original single-sampled
 * view. */
assert(iview_ss);
resolve_info->dst_iview = iview_ss;
assert(resolve_info->dst_iview->pview.nr_samples == 1);
}
}
}
/* Record the depth attachment of a rendering pass into the command buffer's
 * graphics render state and framebuffer info.
 *
 * Copies the image view into render.zs_pview (restricted to plane 0 of the
 * image), fixes up its format, records clear value and clear/preload flags,
 * and records the resolve destination when a resolve mode is set. */
static void
render_state_set_z_attachment(struct panvk_cmd_buffer *cmdbuf,
const VkRenderingAttachmentInfo *att)
{
struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
struct pan_fb_info *fbinfo = &state->render.fb.info;
VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
struct panvk_image_view *iview_ss = NULL;
/* Multisampled-render-to-single-sampled: the framebuffer is multisampled
 * but the attachment view is not. */
const bool ms2ss = cmdbuf->state.gfx.render.fb.nr_samples > 1 &&
iview->pview.nr_samples == 1;
if (ms2ss) {
iview_ss = iview;
iview =
get_ms2ss_image_view(iview, cmdbuf->state.gfx.render.fb.nr_samples);
}
struct panvk_image *img =
container_of(iview->vk.image, struct panvk_image, vk);
#if PAN_ARCH < 9
/* Depth plane always comes first. */
state->render.fb.bos[state->render.fb.bo_count++] = img->planes[0].mem->bo;
#endif
state->render.z_attachment.fmt = iview->vk.format;
state->render.bound_attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
state->render.zs_pview = iview->pview;
fbinfo->zs.view.zs = &state->render.zs_pview;
/* Fixup view format when the image is multiplanar.
 * NOTE(review): zs_pview.format is assigned again, unconditionally, by the
 * interleaved/non-interleaved branch further down, so this assignment looks
 * redundant. */
if (panvk_image_is_planar_depth_stencil(img))
state->render.zs_pview.format = panvk_image_depth_only_pfmt(img);
/* The depth view only references plane 0 of the image. */
state->render.zs_pview.planes[0] = (struct pan_image_plane_ref){
.image = &img->planes[0].image,
.plane_idx = 0,
};
state->render.zs_pview.planes[1] = (struct pan_image_plane_ref){0};
/* The framebuffer sample count is the maximum over all attachments. */
state->render.fb.nr_samples =
MAX2(state->render.fb.nr_samples,
pan_image_view_get_nr_samples(&iview->pview));
state->render.z_attachment.iview = iview;
state->render.z_attachment.preload_iview = ms2ss ? iview_ss : NULL;
/* D24S8 is a single plane format where the depth/stencil are interleaved.
 * If we touch the depth component, we need to make sure the stencil
 * component is preserved, hence the preload, and the view format adjustment.
 */
if (panvk_image_is_interleaved_depth_stencil(img)) {
fbinfo->zs.preload.s = true;
cmdbuf->state.gfx.render.zs_pview.format =
img->planes[0].image.props.format;
} else {
state->render.zs_pview.format = panvk_image_depth_only_pfmt(img);
}
if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR)
fbinfo->zs.clear_value.depth = att->clearValue.depthStencil.depth;
att_set_clear_preload(att, &fbinfo->zs.clear.z, &fbinfo->zs.preload.z);
if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
struct panvk_resolve_attachment *resolve_info =
&state->render.z_attachment.resolve;
VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView);
resolve_info->mode = att->resolveMode;
if (!ms2ss) {
resolve_info->dst_iview = resolve_iview;
} else {
/* With MS2SS the resolve destination is the original single-sampled
 * view. */
assert(iview_ss);
resolve_info->dst_iview = iview_ss;
assert(resolve_info->dst_iview->pview.nr_samples == 1);
}
}
}
static void
render_state_set_s_attachment(struct panvk_cmd_buffer *cmdbuf,
const VkRenderingAttachmentInfo *att)
{
struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
struct pan_fb_info *fbinfo = &state->render.fb.info;
VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
struct panvk_image_view *iview_ss = NULL;
const bool ms2ss = cmdbuf->state.gfx.render.fb.nr_samples > 1 &&
iview->pview.nr_samples == 1;
if (ms2ss) {
iview_ss = iview;
iview =
get_ms2ss_image_view(iview, cmdbuf->state.gfx.render.fb.nr_samples);
}
struct panvk_image *img =
container_of(iview->vk.image, struct panvk_image, vk);
#if PAN_ARCH < 9
/* The stencil plane is always last. */
state->render.fb.bos[state->render.fb.bo_count++] =
img->planes[img->plane_count - 1].mem->bo;
#endif
state->render.s_attachment.fmt = iview->vk.format;
state->render.bound_attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
state->render.s_pview = iview->pview;
fbinfo->zs.view.s = &state->render.s_pview;
if (panvk_image_is_planar_depth_stencil(img)) {
state->render.s_pview.format = panvk_image_stencil_only_pfmt(img);
state->render.s_pview.planes[0] = (struct pan_image_plane_ref){0};
state->render.s_pview.planes[1] = (struct pan_image_plane_ref){
.image = &img->planes[1].image,
.plane_idx = 0,
};
} else {
state->render.s_pview.format = panvk_image_stencil_only_pfmt(img);
state->render.s_pview.planes[0] = (struct pan_image_plane_ref){
.image = &img->planes[0].image,
.plane_idx = 0,
};
state->render.s_pview.planes[1] = (struct pan_image_plane_ref){0};
}
state->render.fb.nr_samples =
MAX2(state->render.fb.nr_samples,
pan_image_view_get_nr_samples(&iview->pview));
state->render.s_attachment.iview = iview;
state->render.s_attachment.preload_iview = ms2ss ? iview_ss : NULL;
/* If the depth and stencil attachments point to the same image,
* and the format is D24S8, we can combine them in a single view
* addressing both components.
*/
if (state->render.s_pview.format == PIPE_FORMAT_X24S8_UINT &&
state->render.z_attachment.iview &&
state->render.z_attachment.iview->vk.image == iview->vk.image) {
state->render.zs_pview.format = PIPE_FORMAT_Z24_UNORM_S8_UINT;
fbinfo->zs.preload.s = false;
fbinfo->zs.view.s = NULL;
/* If there was no depth attachment, and the image format is D24S8,
* we use the depth+stencil slot, so we can benefit from AFBC, which
* is not supported on the stencil-only slot on Bifrost.
*/
} else if (img->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
state->render.s_pview.format == PIPE_FORMAT_X24S8_UINT &&
fbinfo->zs.view.zs == NULL) {
fbinfo->zs.view.zs = &state->render.s_pview;
state->render.s_pview.format = PIPE_FORMAT_Z24_UNORM_S8_UINT;
fbinfo->zs.preload.z = true;
fbinfo->zs.view.s = NULL;
}
if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR)
fbinfo->zs.clear_value.stencil = att->clearValue.depthStencil.stencil;
att_set_clear_preload(att, &fbinfo->zs.clear.s, &fbinfo->zs.preload.s);
if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
struct panvk_resolve_attachment *resolve_info =
&state->render.s_attachment.resolve;
VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView);
resolve_info->mode = att->resolveMode;
if (!ms2ss) {
resolve_info->dst_iview = resolve_iview;
} else {
assert(iview_ss);
resolve_info->dst_iview = iview_ss;
assert(resolve_info->dst_iview->pview.nr_samples == 1);
}
}
}
/* Reset the render-pass portion of the command buffer's graphics state and
 * rebuild it from pRenderingInfo: flags, layer count / view mask, the
 * framebuffer info, every non-null color attachment, then the depth and
 * stencil attachments, and finally the draw extent and framebuffer size.
 */
void
panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf,
const VkRenderingInfo *pRenderingInfo)
{
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
struct pan_fb_info *fbinfo = &state->render.fb.info;
/* Start at the maximum so the MIN2() calls below converge on the smallest
 * attachment extent. */
uint32_t att_width = UINT32_MAX, att_height = UINT32_MAX;
state->render.flags = pRenderingInfo->flags;
BITSET_SET(state->dirty, PANVK_CMD_GRAPHICS_DIRTY_RENDER_STATE);
#if PAN_ARCH < 9
/* The BO list is re-populated by the per-attachment setup helpers. */
state->render.fb.bo_count = 0;
memset(state->render.fb.bos, 0, sizeof(state->render.fb.bos));
#endif
state->render.first_provoking_vertex = U_TRISTATE_UNSET;
#if PAN_ARCH >= 10
state->render.maybe_set_tds_provoking_vertex = NULL;
state->render.maybe_set_fbds_provoking_vertex = NULL;
#endif
/* Forget everything about the previously bound attachments. */
memset(state->render.fb.crc_valid, 0, sizeof(state->render.fb.crc_valid));
memset(&state->render.color_attachments, 0,
sizeof(state->render.color_attachments));
memset(&state->render.z_attachment, 0, sizeof(state->render.z_attachment));
memset(&state->render.s_attachment, 0, sizeof(state->render.s_attachment));
state->render.bound_attachments = 0;
/* Multisampled-render-to-single-sampled is opt-in through a pNext struct. */
const VkMultisampledRenderToSingleSampledInfoEXT *ms2ss_info =
vk_find_struct_const(pRenderingInfo,
MULTISAMPLED_RENDER_TO_SINGLE_SAMPLED_INFO_EXT);
const bool ms2ss = ms2ss_info
? ms2ss_info->multisampledRenderToSingleSampledEnable
: VK_FALSE;
/* With a view mask, the layer count is the index of the highest view bit
 * set; otherwise it is the layerCount provided by the application. */
cmdbuf->state.gfx.render.layer_count = pRenderingInfo->viewMask ?
util_last_bit(pRenderingInfo->viewMask) :
pRenderingInfo->layerCount;
cmdbuf->state.gfx.render.view_mask = pRenderingInfo->viewMask;
/* nr_samples is left at zero here; cmd_select_tile_size() treats zero as
 * "not selected yet". */
*fbinfo = (struct pan_fb_info){
.tile_buf_budget = pan_query_optimal_tib_size(PAN_ARCH, phys_dev->model),
.z_tile_buf_budget = pan_query_optimal_z_tib_size(PAN_ARCH, phys_dev->model),
.nr_samples = 0,
.rt_count = pRenderingInfo->colorAttachmentCount,
};
/* In case ms2ss is enabled, use the provided sample count.
* All attachments need to have sample count == 1 or the provided value.
* But, if all attachments have 1, we would end up choosing the wrong value
* if we don't set it here already. */
cmdbuf->state.gfx.render.fb.nr_samples =
ms2ss ? ms2ss_info->rasterizationSamples : 1;
assert(pRenderingInfo->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts));
for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
const VkRenderingAttachmentInfo *att =
&pRenderingInfo->pColorAttachments[i];
VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
/* Color attachments without an image view are skipped. */
if (!iview)
continue;
render_state_set_color_attachment(cmdbuf, att, i);
att_width = MIN2(iview->vk.extent.width, att_width);
att_height = MIN2(iview->vk.extent.height, att_height);
}
/* Depth is set up before stencil: render_state_set_s_attachment() reads
 * state->render.z_attachment.iview to detect a shared D24S8 image. */
if (pRenderingInfo->pDepthAttachment &&
pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE) {
const VkRenderingAttachmentInfo *att = pRenderingInfo->pDepthAttachment;
VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
if (iview) {
assert(iview->vk.image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
render_state_set_z_attachment(cmdbuf, att);
att_width = MIN2(iview->vk.extent.width, att_width);
att_height = MIN2(iview->vk.extent.height, att_height);
}
}
if (pRenderingInfo->pStencilAttachment &&
pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE) {
const VkRenderingAttachmentInfo *att = pRenderingInfo->pStencilAttachment;
VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
if (iview) {
assert(iview->vk.image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
render_state_set_s_attachment(cmdbuf, att);
att_width = MIN2(iview->vk.extent.width, att_width);
att_height = MIN2(iview->vk.extent.height, att_height);
}
}
/* The draw extent is the render area expressed as inclusive min/max. */
fbinfo->draw_extent.minx = pRenderingInfo->renderArea.offset.x;
fbinfo->draw_extent.maxx = pRenderingInfo->renderArea.offset.x +
pRenderingInfo->renderArea.extent.width - 1;
fbinfo->draw_extent.miny = pRenderingInfo->renderArea.offset.y;
fbinfo->draw_extent.maxy = pRenderingInfo->renderArea.offset.y +
pRenderingInfo->renderArea.extent.height - 1;
fbinfo->frame_bounding_box = fbinfo->draw_extent;
/* With at least one attachment bound, the framebuffer takes the smallest
 * attachment extent; with none, it is sized to just contain the render
 * area. */
if (state->render.bound_attachments) {
fbinfo->width = att_width;
fbinfo->height = att_height;
} else {
fbinfo->width = fbinfo->draw_extent.maxx + 1;
fbinfo->height = fbinfo->draw_extent.maxy + 1;
}
assert(fbinfo->width && fbinfo->height);
}
void
panvk_per_arch(cmd_select_tile_size)(struct panvk_cmd_buffer *cmdbuf)
{
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
/* In case we never emitted tiler/framebuffer descriptors, we emit the
* current sample count and compute tile size */
if (fbinfo->nr_samples == 0) {
fbinfo->nr_samples = cmdbuf->state.gfx.render.fb.nr_samples;
GENX(pan_select_tile_size)(fbinfo);
#if PAN_ARCH != 6
if (fbinfo->cbuf_allocation > fbinfo->tile_buf_budget) {
vk_perf(VK_LOG_OBJS(&cmdbuf->vk.base),
"Using too much tile-memory, disabling pipelining");
}
#endif
} else {
/* In case we already emitted tiler/framebuffer descriptors, we ensure
* that the sample count didn't change (this should never happen) */
assert(fbinfo->nr_samples == cmdbuf->state.gfx.render.fb.nr_samples);
}
}
/* Force every active attachment to be preloaded, turning any pending
 * framebuffer clear into an explicit CmdClearAttachments() call when
 * render_info is provided.
 *
 * We force preloading for all active attachments when the render area is
 * unaligned or when a barrier flushes prior draw calls in the middle of a
 * render pass. The two cases can be distinguished by whether a
 * render_info is provided.
 *
 * When the render area is unaligned, we force preloading to preserve
 * contents falling outside of the render area. We also make sure the
 * initial attachment clears are performed.
 */
void
panvk_per_arch(cmd_force_fb_preload)(struct panvk_cmd_buffer *cmdbuf,
                                     const VkRenderingInfo *render_info)
{
   struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
   struct pan_fb_info *fbinfo = &state->render.fb.info;
   VkClearAttachment clear_atts[MAX_RTS + 2];
   uint32_t clear_att_count = 0;

   if (!state->render.bound_attachments)
      return;

   /* Color attachments. */
   for (unsigned rt = 0; rt < fbinfo->rt_count; rt++) {
      if (!fbinfo->rts[rt].view)
         continue;

      fbinfo->rts[rt].preload = true;

      if (fbinfo->rts[rt].clear && render_info) {
         clear_atts[clear_att_count++] = (VkClearAttachment){
            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
            .colorAttachment = rt,
            .clearValue = render_info->pColorAttachments[rt].clearValue,
         };
      }

      /* The clear is either queued above or dropped; never left pending. */
      fbinfo->rts[rt].clear = false;
   }

   /* Depth. */
   if (fbinfo->zs.view.zs) {
      fbinfo->zs.preload.z = true;

      if (fbinfo->zs.clear.z && render_info) {
         clear_atts[clear_att_count++] = (VkClearAttachment){
            .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
            .clearValue = render_info->pDepthAttachment->clearValue,
         };
      }

      fbinfo->zs.clear.z = false;
   }

   /* Stencil, either through its own view or through a combined
    * depth+stencil view. */
   const bool has_stencil =
      fbinfo->zs.view.s ||
      (fbinfo->zs.view.zs &&
       util_format_is_depth_and_stencil(fbinfo->zs.view.zs->format));

   if (has_stencil) {
      fbinfo->zs.preload.s = true;

      if (fbinfo->zs.clear.s && render_info) {
         clear_atts[clear_att_count++] = (VkClearAttachment){
            .aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT,
            .clearValue = render_info->pStencilAttachment->clearValue,
         };
      }

      fbinfo->zs.clear.s = false;
   }

#if PAN_ARCH >= 10
   /* insert a barrier for preload */
   const VkMemoryBarrier2 mem_barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
                      VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
                      VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
      .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
                       VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
   };
   const VkDependencyInfo dep_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &mem_barrier,
   };

   panvk_per_arch(CmdPipelineBarrier2)(panvk_cmd_buffer_to_handle(cmdbuf),
                                       &dep_info);
#endif

   /* Replay the queued clears over the render area, if there are any. */
   if (!clear_att_count || !render_info)
      return;

   VkClearRect clear_rect = {
      .rect = render_info->renderArea,
      .baseArrayLayer = 0,
      .layerCount = render_info->viewMask ? 1 : render_info->layerCount,
   };

   panvk_per_arch(CmdClearAttachments)(panvk_cmd_buffer_to_handle(cmdbuf),
                                       clear_att_count, clear_atts, 1,
                                       &clear_rect);
}
void
panvk_per_arch(cmd_preload_render_area_border)(
struct panvk_cmd_buffer *cmdbuf, const VkRenderingInfo *render_info)
{
const unsigned meta_tile_size = pan_meta_tile_size(PAN_ARCH);
struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
struct pan_fb_info *fbinfo = &state->render.fb.info;
bool render_area_is_aligned =
((fbinfo->draw_extent.minx | fbinfo->draw_extent.miny) %
meta_tile_size) == 0 &&
(fbinfo->draw_extent.maxx + 1 == fbinfo->width ||
(fbinfo->draw_extent.maxx % meta_tile_size) == (meta_tile_size - 1)) &&
(fbinfo->draw_extent.maxy + 1 == fbinfo->height ||
(fbinfo->draw_extent.maxy % meta_tile_size) == (meta_tile_size - 1));
/* If the render area is aligned on the meta tile size, we're good. */
if (!render_area_is_aligned)
panvk_per_arch(cmd_force_fb_preload)(cmdbuf, render_info);
}
/* Build the input attachment map from the dynamic input-attachment-location
 * state and push every entry as a graphics sysval.
 *
 * Entries start out as all-ones bytes. Slot 0 is used for a depth/stencil
 * attachment mapped with MESA_VK_ATTACHMENT_NO_INDEX; every other mapping
 * lands at "input attachment index + 1".
 */
static void
prepare_iam_sysvals(struct panvk_cmd_buffer *cmdbuf, BITSET_WORD *dirty_sysvals)
{
   const struct vk_input_attachment_location_state *ial =
      &cmdbuf->vk.dynamic_graphics_state.ial;
   struct panvk_input_attachment_info iam[INPUT_ATTACHMENT_MAP_SIZE];
   uint32_t catt_count;

   /* An unknown color attachment count means "consider all of them". */
   if (ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN)
      catt_count = MAX_RTS;
   else
      catt_count = ial->color_attachment_count;

   assert(catt_count <= MAX_RTS);
   memset(iam, ~0, sizeof(iam));

   for (uint32_t rt = 0; rt < catt_count; rt++) {
      /* Only color attachments that are both mapped and bound get a slot. */
      if (ial->color_map[rt] == MESA_VK_ATTACHMENT_UNUSED)
         continue;
      if (!(cmdbuf->state.gfx.render.bound_attachments &
            MESA_VK_RP_ATTACHMENT_COLOR_BIT(rt)))
         continue;

      VkFormat fmt = cmdbuf->state.gfx.render.color_attachments.fmts[rt];
      enum pipe_format pfmt = vk_format_to_pipe_format(fmt);
      struct mali_internal_conversion_packed conv;
      uint32_t ia_idx = ial->color_map[rt] + 1;

      assert(ia_idx < ARRAY_SIZE(iam));
      iam[ia_idx].target = PANVK_COLOR_ATTACHMENT(rt);

      pan_pack(&conv, INTERNAL_CONVERSION, cfg) {
         cfg.memory_format =
            GENX(pan_dithered_format_from_pipe_format)(pfmt, false);
#if PAN_ARCH < 9
         cfg.register_format =
            vk_format_is_uint(fmt)   ? MALI_REGISTER_FILE_FORMAT_U32
            : vk_format_is_sint(fmt) ? MALI_REGISTER_FILE_FORMAT_I32
                                     : MALI_REGISTER_FILE_FORMAT_F32;
#endif
      }

      iam[ia_idx].conversion = conv.opaque[0];
   }

   if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
      uint32_t ia_idx = 0;

      if (ial->depth_att != MESA_VK_ATTACHMENT_NO_INDEX)
         ia_idx = ial->depth_att + 1;

      assert(ia_idx < ARRAY_SIZE(iam));
      iam[ia_idx].target = PANVK_ZS_ATTACHMENT;

#if PAN_ARCH < 9
      /* On v7, we need to pass the depth format around. If we use a conversion
       * of zero, like we do on v9+, the GPU reports an INVALID_INSTR_ENC. */
      VkFormat fmt = cmdbuf->state.gfx.render.z_attachment.fmt;
      enum pipe_format pfmt = vk_format_to_pipe_format(fmt);
      struct mali_internal_conversion_packed conv;

      pan_pack(&conv, INTERNAL_CONVERSION, cfg) {
         cfg.register_format = MALI_REGISTER_FILE_FORMAT_F32;
         cfg.memory_format =
            GENX(pan_dithered_format_from_pipe_format)(pfmt, false);
      }

      iam[ia_idx].conversion = conv.opaque[0];
#endif
   }

   if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
      uint32_t ia_idx = 0;

      if (ial->stencil_att != MESA_VK_ATTACHMENT_NO_INDEX)
         ia_idx = ial->stencil_att + 1;

      assert(ia_idx < ARRAY_SIZE(iam));
      iam[ia_idx].target = PANVK_ZS_ATTACHMENT;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(iam); i++)
      set_gfx_sysval(cmdbuf, dirty_sysvals, iam[i], iam[i]);
}
/* This value has been selected to get
* dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero passing.
*/
#define MIN_DEPTH_CLIP_RANGE 37.7E-06f
void
panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_draw_info *info)
{
struct vk_color_blend_state *cb = &cmdbuf->vk.dynamic_graphics_state.cb;
const struct panvk_shader_variant *fs =
panvk_shader_only_variant(get_fs(cmdbuf));
uint32_t noperspective_varyings = fs ? fs->info.varyings.noperspective : 0;
BITSET_DECLARE(dirty_sysvals, MAX_SYSVAL_FAUS) = {0};
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.noperspective_varyings,
noperspective_varyings);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.first_vertex, info->vertex.base);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.base_instance, info->instance.base);
#if PAN_ARCH < 9
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
info->vertex.raw_offset);
set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
/* iter13: VK_EXT_transform_feedback sysvals — always set (per draw),
* reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count);
{
const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx;
/* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS
* (= 1<<63). This is the Panfrost-Gallium memory-sink idiom — the
* Bifrost MMU silently discards stores to this address, so a pipeline
* with XFB outputs used in a non-XFB draw (or in an XFB draw with
* fewer bound buffers than the shader declares) is safe instead of
* faulting. See gallium/drivers/panfrost/pan_cmdstream.c PAN_SYSVAL_XFB. */
uint64_t _xa0 = PAN_SHADER_OOB_ADDRESS, _xa1 = PAN_SHADER_OOB_ADDRESS,
_xa2 = PAN_SHADER_OOB_ADDRESS, _xa3 = PAN_SHADER_OOB_ADDRESS;
if (_gfx->xfb.active) {
if (_gfx->xfb.buffer_count > 0 && _gfx->xfb.buffers[0].addr)
_xa0 = _gfx->xfb.buffers[0].addr + _gfx->xfb.buffers[0].offset;
if (_gfx->xfb.buffer_count > 1 && _gfx->xfb.buffers[1].addr)
_xa1 = _gfx->xfb.buffers[1].addr + _gfx->xfb.buffers[1].offset;
if (_gfx->xfb.buffer_count > 2 && _gfx->xfb.buffers[2].addr)
_xa2 = _gfx->xfb.buffers[2].addr + _gfx->xfb.buffers[2].offset;
if (_gfx->xfb.buffer_count > 3 && _gfx->xfb.buffers[3].addr)
_xa3 = _gfx->xfb.buffers[3].addr + _gfx->xfb.buffers[3].offset;
}
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[0], _xa0);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[1], _xa1);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[2], _xa2);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[3], _xa3);
}
#endif
if (dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS)) {
for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++) {
set_gfx_sysval(cmdbuf, dirty_sysvals, blend.constants[i],
cb->blend_constants[i]);
}
}
for (unsigned i = 0; i < MAX_RTS; i++) {
set_gfx_sysval(cmdbuf, dirty_sysvals, fs.blend_descs[i],
cmdbuf->state.gfx.fs.blend_descs[i]);
}
if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
dyn_gfx_state_dirty(cmdbuf, VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE)) {
const struct vk_rasterization_state *rs =
&cmdbuf->vk.dynamic_graphics_state.rs;
const struct vk_viewport_state *vp =
&cmdbuf->vk.dynamic_graphics_state.vp;
const VkViewport *viewport = &vp->viewports[0];
/* Doing the viewport transform in the vertex shader and then depth
* clipping with the viewport depth range gets a similar result to
* clipping in clip-space, but loses precision when the viewport depth
* range is very small. When minDepth == maxDepth, this completely
* flattens the clip-space depth and results in never clipping.
*
* To work around this, set a lower limit on depth range when clipping is
* enabled. This results in slightly incorrect fragment depth values, and
* doesn't help with the precision loss, but at least clipping isn't
* completely broken.
*/
float z_min = viewport->minDepth;
float z_max = viewport->maxDepth;
if (vk_rasterization_state_depth_clip_enable(rs) &&
fabsf(z_max - z_min) < MIN_DEPTH_CLIP_RANGE) {
float z_sign = z_min <= z_max ? 1.0f : -1.0f;
float z_center = 0.5f * (z_max + z_min);
/* Bump offset off-center if necessary, to not go out of range */
z_center = CLAMP(z_center, 0.5f * MIN_DEPTH_CLIP_RANGE,
1.0f - 0.5f * MIN_DEPTH_CLIP_RANGE);
z_min = z_center - 0.5f * z_sign * MIN_DEPTH_CLIP_RANGE;
z_max = z_center + 0.5f * z_sign * MIN_DEPTH_CLIP_RANGE;
}
/* Upload the viewport scale. Defined as (px/2, py/2, pz) at the start of
* section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the
* end of the section, the spec defines:
*
* px = width
* py = height
* pz = maxDepth - minDepth if negativeOneToOne is false
* pz = (maxDepth - minDepth) / 2 if negativeOneToOne is true
*/
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.x,
0.5f * viewport->width);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.y,
0.5f * viewport->height);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.scale.z,
vp->depth_clip_negative_one_to_one ?
0.5f * (z_max - z_min) : z_max - z_min);
/* Upload the viewport offset. Defined as (ox, oy, oz) at the start of
* section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the
* end of the section, the spec defines:
*
* ox = x + width/2
* oy = y + height/2
* oz = minDepth if negativeOneToOne is false
* oz = (maxDepth + minDepth) / 2 if negativeOneToOne is true
*/
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.x,
(0.5f * viewport->width) + viewport->x);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.y,
(0.5f * viewport->height) + viewport->y);
set_gfx_sysval(cmdbuf, dirty_sysvals, viewport.offset.z,
vp->depth_clip_negative_one_to_one ?
0.5f * (z_min + z_max) : z_min);
}
if (dyn_gfx_state_dirty(cmdbuf, INPUT_ATTACHMENT_MAP))
prepare_iam_sysvals(cmdbuf, dirty_sysvals);
const struct panvk_shader_variant *vs =
panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader);
#if PAN_ARCH < 9
struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS)) {
set_gfx_sysval(cmdbuf, dirty_sysvals,
desc.sets[PANVK_DESC_TABLE_VS_DYN_SSBOS],
vs_desc_state->dyn_ssbos);
}
if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, FS)) {
set_gfx_sysval(cmdbuf, dirty_sysvals,
desc.sets[PANVK_DESC_TABLE_FS_DYN_SSBOS],
fs_desc_state->dyn_ssbos);
}
for (uint32_t i = 0; i < MAX_SETS; i++) {
uint32_t used_set_mask =
vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);
if (used_set_mask & BITFIELD_BIT(i)) {
set_gfx_sysval(cmdbuf, dirty_sysvals, desc.sets[i],
desc_state->sets[i]->descs.dev);
}
}
#endif
/* We mask the dirty sysvals by the shader usage, and only flag
* the push uniforms dirty if those intersect. */
BITSET_DECLARE(dirty_shader_sysvals, MAX_SYSVAL_FAUS);
BITSET_AND(dirty_shader_sysvals, dirty_sysvals, vs->fau.used_sysvals);
if (!BITSET_IS_EMPTY(dirty_shader_sysvals))
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
if (fs) {
BITSET_AND(dirty_shader_sysvals, dirty_sysvals, fs->fau.used_sysvals);
/* If blend constants are not read by the blend shader, we can consider
* they are not read at all, so clear the dirty bits to avoid re-emitting
* FAUs when we can. */
if (!cmdbuf->state.gfx.cb.info.shader_loads_blend_const)
BITSET_CLEAR_COUNT(dirty_shader_sysvals, 0, 4);
if (!BITSET_IS_EMPTY(dirty_shader_sysvals))
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
}
}
/* vkCmdBindVertexBuffers2 entry point: record address and size for each of
 * the bindingCount vertex buffer bindings starting at firstBinding, forward
 * the optional strides to the common dynamic state, and flag the vertex
 * buffer state dirty. A null buffer handle yields a zero address and size.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBindVertexBuffers2)(VkCommandBuffer commandBuffer,
                                      uint32_t firstBinding,
                                      uint32_t bindingCount,
                                      const VkBuffer *pBuffers,
                                      const VkDeviceSize *pOffsets,
                                      const VkDeviceSize *pSizes,
                                      const VkDeviceSize *pStrides)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   const uint32_t end_binding = firstBinding + bindingCount;

   assert(end_binding <= MAX_VBS);

   if (pStrides) {
      vk_cmd_set_vertex_binding_strides(&cmdbuf->vk, firstBinding,
                                        bindingCount, pStrides);
   }

   for (uint32_t i = 0; i < bindingCount; i++) {
      const uint32_t slot = firstBinding + i;
      VK_FROM_HANDLE(panvk_buffer, buffer, pBuffers[i]);

      if (!buffer) {
         cmdbuf->state.gfx.vb.bufs[slot].address = 0;
         cmdbuf->state.gfx.vb.bufs[slot].size = 0;
         continue;
      }

      /* Without an explicit size array, bind up to the end of the buffer. */
      const VkDeviceSize range = pSizes ? pSizes[i] : VK_WHOLE_SIZE;

      cmdbuf->state.gfx.vb.bufs[slot].address =
         panvk_buffer_gpu_ptr(buffer, pOffsets[i]);
      cmdbuf->state.gfx.vb.bufs[slot].size =
         panvk_buffer_range(buffer, pOffsets[i], range);
   }

   /* The binding count only ever grows within a command buffer. */
   cmdbuf->state.gfx.vb.count = MAX2(cmdbuf->state.gfx.vb.count, end_binding);
   gfx_state_set_dirty(cmdbuf, VB);
}
/* vkCmdBindIndexBuffer2 entry point: record the index buffer address, size
 * and index width in the graphics state and flag the index buffer state
 * dirty.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBindIndexBuffer2)(VkCommandBuffer commandBuffer,
                                    VkBuffer buffer, VkDeviceSize offset,
                                    VkDeviceSize size, VkIndexType indexType)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   VK_FROM_HANDLE(panvk_buffer, buf, buffer);

   if (!buf) {
      cmdbuf->state.gfx.ib.size = 0;
      /* In case of NullDescriptors, we need to set a non-NULL address and rely
       * on out-of-bounds behavior against the zero size of the buffer. Note
       * that this only works for v10+, as v9 does not have a way to specify the
       * index buffer size. */
      cmdbuf->state.gfx.ib.dev_addr = PAN_ARCH >= 10 ? 0x1000 : 0;
   } else {
      cmdbuf->state.gfx.ib.size = panvk_buffer_range(buf, offset, size);
      /* The bound range is expected to fit in 32 bits. */
      assert(cmdbuf->state.gfx.ib.size <= UINT32_MAX);
      cmdbuf->state.gfx.ib.dev_addr = panvk_buffer_gpu_ptr(buf, offset);
   }

   cmdbuf->state.gfx.ib.index_size = vk_index_type_to_bytes(indexType);
   gfx_state_set_dirty(cmdbuf, IB);
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,442 @@
#!/usr/bin/env python3
"""
iter13: apply VK_EXT_transform_feedback implementation to Mesa 26.0.6 PanVk.
Run from inside /home/mfritsche/mesa-build/mesa-26.0.6/ on ohm.
Idempotent: checks whether each change is already present and skips it if so.
The implementation is single-variant (Vulkan spec allows undefined behavior
for XFB-output shaders bound outside Begin/EndTransformFeedback, so we
don't need defensive two-variant compilation for v1).
Files modified:
1. src/panfrost/vulkan/panvk_shader.h
2. src/panfrost/vulkan/panvk_vX_physical_device.c
3. src/panfrost/vulkan/panvk_vX_shader.c
4. src/panfrost/vulkan/panvk_cmd_draw.h
5. src/panfrost/vulkan/panvk_vX_cmd_draw.c
6. src/panfrost/vulkan/meson.build
Files created:
7. src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c
"""
import os
import sys
# Resolve the Mesa source root. Precedence:
#   1. the MESA_ROOT environment variable, when set to a non-empty value;
#   2. the current directory, when its name looks like a Mesa tree ("mesa-*");
#   3. the directory containing this script.
# An explicit MESA_ROOT must win over the cwd heuristic, otherwise running the
# script from inside one Mesa tree silently ignores a request to patch another.
if os.environ.get("MESA_ROOT"):
    ROOT = os.environ["MESA_ROOT"]
elif os.path.basename(os.getcwd()).startswith("mesa-"):
    ROOT = os.getcwd()
else:
    ROOT = os.path.abspath(os.path.dirname(__file__))
print(f"[iter13] applying patches under {ROOT}")
def replace_once(path, old, new, marker_in_new=None):
    """Replace the single occurrence of `old` with `new` in ROOT/path.

    Args:
        path: file path relative to ROOT.
        old: exact text expected to appear exactly once in the file.
        new: replacement text.
        marker_in_new: optional text; when it is already present in the file,
            the patch is treated as already applied and nothing is written.

    Exits the process with status 2 when `old` is missing or matches more
    than once, so a half-matching tree is never patched blindly.
    """
    full = os.path.join(ROOT, path)
    # Read and write explicitly as UTF-8 so the result does not depend on the
    # locale of the host running the script.
    with open(full, encoding="utf-8") as f:
        content = f.read()
    if marker_in_new and marker_in_new in content:
        print(f" [skip] {path} — already patched ({marker_in_new!r} present)")
        return
    count = content.count(old)
    if count == 0:
        print(f" [FAIL] {path} — expected pattern not found:\n {old[:100]!r}")
        sys.exit(2)
    if count > 1:
        print(f" [FAIL] {path} — pattern matches {count} times, need exactly 1")
        sys.exit(2)
    with open(full, "w", encoding="utf-8") as f:
        f.write(content.replace(old, new, 1))
    print(f" [ok] {path}")
def create_file(path, content, skip_if_exists=True):
    """Write `content` to ROOT/path, creating parent directories as needed.

    Args:
        path: file path relative to ROOT.
        content: full text of the file to create.
        skip_if_exists: when true (the default), an existing file is left
            untouched; when false, it is overwritten.
    """
    full = os.path.join(ROOT, path)
    if skip_if_exists and os.path.exists(full):
        print(f" [skip] {path} — exists")
        return
    parent = os.path.dirname(full)
    # os.makedirs("") raises, so only create a parent when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Write explicitly as UTF-8 so the result does not depend on the locale.
    with open(full, "w", encoding="utf-8") as f:
        f.write(content)
    print(f" [ok] {path} (created)")
# ============================================================
# 1. panvk_shader.h — extend vs sysval struct (PAN_ARCH < 9)
# ============================================================
# Adds num_vertices, an explicit padding word and xfb_address[4] to the
# PAN_ARCH < 9 part of the `vs` sysval struct. Skipped when "xfb_address[4]"
# already appears in the header.
# NOTE(review): assuming aligned_u64 is 8-byte aligned, the two 4-byte fields
# ahead of _pad_xfb already end on an 8-byte boundary, so _pad_xfb sits at
# offset 8 and xfb_address needs a further 4 bytes of implicit padding.
# Confirm that this layout is the intended one.
print("\n[1/7] panvk_shader.h — add num_vertices + xfb_address[4] to vs sysvals")
replace_once(
"src/panfrost/vulkan/panvk_shader.h",
""" struct {
#if PAN_ARCH < 9
int32_t raw_vertex_offset;
#endif
int32_t first_vertex;
int32_t base_instance;
uint32_t noperspective_varyings;
} vs;""",
""" struct {
#if PAN_ARCH < 9
int32_t raw_vertex_offset;
uint32_t num_vertices; /* iter13: XFB needs per-draw vertex count */
uint32_t _pad_xfb; /* keep 8-byte alignment before u64 array */
aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */
#endif
int32_t first_vertex;
int32_t base_instance;
uint32_t noperspective_varyings;
} vs;""",
marker_in_new="xfb_address[4]",
)
# ============================================================
# 2. panvk_vX_physical_device.c — expose ext + features + properties
# ============================================================
# Three independent single-match replacements on the same file, applied in
# order: extension table entry (A), feature bits (B), property limits (C).
print("\n[2/7] panvk_vX_physical_device.c — expose VK_EXT_transform_feedback")
# A. Add the extension to the extension list, anchored on the
# EXT_robustness2 entry.
# NOTE(review): the marker "EXT_transform_feedback" is also a substring of
# the comments inserted by B and C below; this is harmless as long as A always
# runs first.
replace_once(
"src/panfrost/vulkan/panvk_vX_physical_device.c",
" .EXT_robustness2 = true,",
""" .EXT_robustness2 = true,
.EXT_transform_feedback = PAN_ARCH < 9, /* iter13: JM-class only for now */""",
marker_in_new="EXT_transform_feedback",
)
# B. Add the feature bits, anchored on the /* VK_KHR_robustness2 */ feature
# block.
replace_once(
"src/panfrost/vulkan/panvk_vX_physical_device.c",
""" /* VK_KHR_robustness2 */
.robustBufferAccess2 = PAN_ARCH >= 11,
.robustImageAccess2 = false,
.nullDescriptor = true,""",
""" /* VK_KHR_robustness2 */
.robustBufferAccess2 = PAN_ARCH >= 11,
.robustImageAccess2 = false,
.nullDescriptor = true,
/* VK_EXT_transform_feedback (iter13) */
.transformFeedback = PAN_ARCH < 9,
.geometryStreams = false,""",
marker_in_new=".transformFeedback = PAN_ARCH < 9",
)
# C. Add the property limits right after the existing
# /* VK_KHR_robustness2 */ properties block (near line 1019 of the target
# file at the time of writing).
replace_once(
"src/panfrost/vulkan/panvk_vX_physical_device.c",
""" /* VK_KHR_robustness2 */
.robustStorageBufferAccessSizeAlignment = 1,
.robustUniformBufferAccessSizeAlignment = 1,""",
""" /* VK_KHR_robustness2 */
.robustStorageBufferAccessSizeAlignment = 1,
.robustUniformBufferAccessSizeAlignment = 1,
/* VK_EXT_transform_feedback (iter13) */
.maxTransformFeedbackStreams = 1,
.maxTransformFeedbackBuffers = 4,
.maxTransformFeedbackBufferSize = UINT32_MAX,
.maxTransformFeedbackStreamDataSize = 512,
.maxTransformFeedbackBufferDataSize = 512,
.maxTransformFeedbackBufferDataStride = 2048,
.transformFeedbackQueries = false,
.transformFeedbackStreamsLinesTriangles = false,
.transformFeedbackRasterizationStreamSelect = false,
.transformFeedbackDraw = false,""",
marker_in_new="maxTransformFeedbackStreams",
)
# ============================================================
# 3. panvk_vX_shader.c — intrinsic lowering + NIR pass wiring
# ============================================================
print("\n[3/7] panvk_vX_shader.c — intrinsic lowering + pan_nir_lower_xfb wiring")
# A. Add the load_num_vertices / load_xfb_address intrinsic cases inside the
# PAN_ARCH < 9 block, anchored on the existing load_raw_vertex_offset_pan
# case. The xfb_address index comes from the intrinsic base; indices outside
# 0..3 make the inserted code return false.
replace_once(
"src/panfrost/vulkan/panvk_vX_shader.c",
"""#if PAN_ARCH < 9
case nir_intrinsic_load_raw_vertex_offset_pan:
val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
break;""",
"""#if PAN_ARCH < 9
case nir_intrinsic_load_raw_vertex_offset_pan:
val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
break;
case nir_intrinsic_load_num_vertices: /* iter13: XFB index calc */
val = load_sysval(b, graphics, bit_size, vs.num_vertices);
break;
case nir_intrinsic_load_xfb_address: { /* iter13: XFB buffer N base address */
unsigned idx = nir_intrinsic_base(intr);
switch (idx) {
case 0: val = load_sysval(b, graphics, bit_size, vs.xfb_address[0]); break;
case 1: val = load_sysval(b, graphics, bit_size, vs.xfb_address[1]); break;
case 2: val = load_sysval(b, graphics, bit_size, vs.xfb_address[2]); break;
case 3: val = load_sysval(b, graphics, bit_size, vs.xfb_address[3]); break;
default: return false;
}
break;
}""",
marker_in_new="load_num_vertices",
)
# B. Wire pan_nir_lower_xfb into the lowering chain, immediately after the
# existing nir_lower_system_values call. The inserted code only runs the
# pass for vertex shaders that carry xfb_info.
replace_once(
"src/panfrost/vulkan/panvk_vX_shader.c",
""" NIR_PASS(_, nir, nir_lower_system_values);
nir_lower_compute_system_values_options options = {""",
""" NIR_PASS(_, nir, nir_lower_system_values);
#if PAN_ARCH < 9
/* iter13: VK_EXT_transform_feedback if the shader has XFB output
* decorations, run the Mesa standard XFB-info NIR pass + Panfrost's
* own NIR lowering that turns store_output into nir_store_global
* to the per-buffer base address (the panvk lowering above wires
* nir_load_xfb_address to vs.xfb_address[N]). Single-variant: if
* an app binds an XFB pipeline outside vkCmdBeginTransformFeedback,
* the writes go to address 0 undefined behavior per spec. */
if (nir->info.stage == MESA_SHADER_VERTEX &&
nir->xfb_info != NULL) {
NIR_PASS(_, nir, pan_nir_lower_xfb);
}
#endif
nir_lower_compute_system_values_options options = {""",
marker_in_new="pan_nir_lower_xfb",
)
# C. Include pan_nir.h (expected to declare pan_nir_lower_xfb) right after
# the existing panvk_shader.h include.
replace_once(
"src/panfrost/vulkan/panvk_vX_shader.c",
'#include "panvk_shader.h"',
'#include "panvk_shader.h"\n#include "pan_nir.h" /* iter13: pan_nir_lower_xfb */',
marker_in_new='/* iter13: pan_nir_lower_xfb */',
)
# ============================================================
# 4. panvk_cmd_draw.h — add XFB state struct + pipeline state member
# ============================================================
print("\n[4/7] panvk_cmd_draw.h — add panvk_xfb_state to cmd buffer state")
# Inject an anonymous `xfb` struct directly after the `sysvals` member, so the
# XFB state lives in the same struct as the graphics sysvals. It holds an
# `active` flag, a buffer count and up to four {addr, offset, size} entries,
# and is guarded by PAN_ARCH < 9.
# Alternatives that were considered and dropped: a separate sibling struct
# pulled in through an include at the top of the header, and a forward
# declaration with the definition living in jm/panvk_vX_cmd_xfb.c.
replace_once(
"src/panfrost/vulkan/panvk_cmd_draw.h",
" struct panvk_graphics_sysvals sysvals;",
""" struct panvk_graphics_sysvals sysvals;
#if PAN_ARCH < 9
/* iter13: VK_EXT_transform_feedback state (JM-class only for now). */
struct {
bool active;
uint32_t buffer_count;
struct {
uint64_t addr;
uint64_t offset;
uint64_t size;
} buffers[4];
} xfb;
#endif""",
marker_in_new="iter13: VK_EXT_transform_feedback state",
)
# ============================================================
# 5. panvk_vX_cmd_draw.c (arch-templated, NOT jm/) — populate XFB sysvals
# ============================================================
print("\n[5/7] panvk_vX_cmd_draw.c — populate vs.num_vertices + vs.xfb_address[] inside the PAN_ARCH<9 block")
# Insert just inside the existing `#if PAN_ARCH < 9` block where
# raw_vertex_offset is set. info->vertex.count is available in scope.
# The replacement keeps the two existing set_gfx_sysval calls and appends:
#   - vs.num_vertices, taken from info->vertex.count
#   - vs.xfb_address[0..3], each computed as buffers[i].addr + buffers[i].offset
#     when xfb.active is set and slot i is below xfb.buffer_count
# NOTE(review): when xfb.active is false (or a slot is unbound) the address is
# emitted as 0. probe_xfb_nodraw.c from this same iteration expects inactive
# XFB stores to land in a PAN_SHADER_OOB_ADDRESS memory sink instead -- confirm
# whether a later patch replaces the zero default.
replace_once(
"src/panfrost/vulkan/panvk_vX_cmd_draw.c",
"""#if PAN_ARCH < 9
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
info->vertex.raw_offset);
set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
#endif""",
"""#if PAN_ARCH < 9
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
info->vertex.raw_offset);
set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
/* iter13: VK_EXT_transform_feedback sysvals always set (per draw),
* reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count);
{
const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx;
uint64_t _xa0 = 0, _xa1 = 0, _xa2 = 0, _xa3 = 0;
if (_gfx->xfb.active) {
if (_gfx->xfb.buffer_count > 0)
_xa0 = _gfx->xfb.buffers[0].addr + _gfx->xfb.buffers[0].offset;
if (_gfx->xfb.buffer_count > 1)
_xa1 = _gfx->xfb.buffers[1].addr + _gfx->xfb.buffers[1].offset;
if (_gfx->xfb.buffer_count > 2)
_xa2 = _gfx->xfb.buffers[2].addr + _gfx->xfb.buffers[2].offset;
if (_gfx->xfb.buffer_count > 3)
_xa3 = _gfx->xfb.buffers[3].addr + _gfx->xfb.buffers[3].offset;
}
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[0], _xa0);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[1], _xa1);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[2], _xa2);
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[3], _xa3);
}
#endif""",
marker_in_new="iter13: VK_EXT_transform_feedback sysvals",
)
# ============================================================
# 6. NEW: jm/panvk_vX_cmd_xfb.c — Vulkan command handlers
# ============================================================
print("\n[6/7] jm/panvk_vX_cmd_xfb.c — XFB Vulkan command handlers (NEW FILE)")
# xfb_c holds the complete text of the new C source file. It is a raw string,
# so backslashes inside it reach the generated file unchanged.
# The generated file defines three entry points:
#   - CmdBindTransformFeedbackBuffersEXT: stores addr/offset/size per slot,
#     silently skipping slots >= 4, and raises xfb.buffer_count to
#     firstBinding + bindingCount when that is larger.
#   - CmdBeginTransformFeedbackEXT: sets xfb.active = true; the counter-buffer
#     arguments are cast to void and otherwise unused.
#   - CmdEndTransformFeedbackEXT: sets xfb.active = false; counter-buffer
#     arguments are likewise unused.
# NOTE(review): the header comment inside the string says Begin marks sysvals
# dirty, while the comment in the Begin handler body says no explicit dirty
# marking is required; the handler body does no dirty marking.
# NOTE(review): buffer_count can exceed 4 when a caller passes slots >= 4; the
# reader patched in step 5 only tests fixed indices 0..3, so this looks
# harmless there -- confirm no other consumer indexes buffers[] by count.
xfb_c = r'''/*
* Copyright © 2026 mfritsche / claude-noether
* SPDX-License-Identifier: MIT
*
* iter13: VK_EXT_transform_feedback command handlers for the JM
* architecture path (Bifrost v6/v7 + Valhall-JM v9).
*
* The runtime contract:
* - vkCmdBindTransformFeedbackBuffersEXT: stash (gpu_addr, offset, size)
* for each slot into cmdbuf->state.gfx.xfb.buffers[].
* - vkCmdBeginTransformFeedbackEXT: set cmdbuf->state.gfx.xfb.active = true.
* Mark sysvals dirty so the next draw re-emits vs.xfb_address[].
* - vkCmdEndTransformFeedbackEXT: set active = false.
*
* Counter buffers (firstCounterBuffer/counterBufferCount/pCounterBuffers/
* pCounterBufferOffsets) are accepted by API but ignored v1 doesn't
* support pause/resume. transformFeedbackDraw is advertised as false.
*
* Per-draw integration: jm/panvk_vX_cmd_draw.c reads cmdbuf->state.gfx.xfb
* and populates vs.xfb_address[i] for shader use. The pan_nir_lower_xfb
* pass in panvk_vX_shader.c emits nir_load_xfb_address(i) which lowers
* (via panvk_vX_shader.c sysval handler) to a load from the per-draw
* sysval push area.
*/
#include "vk_log.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_draw.h"
#include "panvk_buffer.h"
#include "panvk_entrypoints.h"
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBindTransformFeedbackBuffersEXT)(
VkCommandBuffer commandBuffer,
uint32_t firstBinding,
uint32_t bindingCount,
const VkBuffer *pBuffers,
const VkDeviceSize *pOffsets,
const VkDeviceSize *pSizes)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
for (uint32_t i = 0; i < bindingCount; i++) {
uint32_t slot = firstBinding + i;
if (slot >= 4)
continue;
VK_FROM_HANDLE(panvk_buffer, buf, pBuffers[i]);
gfx->xfb.buffers[slot].addr = panvk_buffer_gpu_ptr(buf, 0);
gfx->xfb.buffers[slot].offset = pOffsets[i];
gfx->xfb.buffers[slot].size =
(pSizes != NULL && pSizes[i] != VK_WHOLE_SIZE)
? pSizes[i]
: (buf->vk.size - pOffsets[i]);
}
if (firstBinding + bindingCount > gfx->xfb.buffer_count)
gfx->xfb.buffer_count = firstBinding + bindingCount;
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBeginTransformFeedbackEXT)(
VkCommandBuffer commandBuffer,
uint32_t firstCounterBuffer,
uint32_t counterBufferCount,
const VkBuffer *pCounterBuffers,
const VkDeviceSize *pCounterBufferOffsets)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
/* Counter buffers ignored in v1 see VkPhysicalDeviceTransformFeedback
* PropertiesEXT.transformFeedbackDraw = false in panvk_vX_physical_device.c.
*/
(void)firstCounterBuffer;
(void)counterBufferCount;
(void)pCounterBuffers;
(void)pCounterBufferOffsets;
gfx->xfb.active = true;
/* Per-draw set_gfx_sysval picks up the change automatically no
* explicit dirty marking required (set_gfx_sysval uses memcmp +
* BITSET to detect state diffs and re-emit sysvals). */
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdEndTransformFeedbackEXT)(
VkCommandBuffer commandBuffer,
uint32_t firstCounterBuffer,
uint32_t counterBufferCount,
const VkBuffer *pCounterBuffers,
const VkDeviceSize *pCounterBufferOffsets)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
(void)firstCounterBuffer;
(void)counterBufferCount;
(void)pCounterBuffers;
(void)pCounterBufferOffsets;
gfx->xfb.active = false;
}
'''
# Hand the generated text to create_file together with its tree-relative path.
# create_file is defined earlier in this script; its overwrite behaviour is not
# visible here.
create_file("src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c", xfb_c)
# ============================================================
# 7. meson.build — register the new file in the jm_files array
# ============================================================
print("\n[7/7] meson.build — register jm/panvk_vX_cmd_xfb.c")
# The anchor is the opening of the jm_files list together with its first entry;
# the new source file is inserted as the second entry, tagged "# iter13".
# NOTE(review): the anchor embeds the exact newline + indentation of
# meson.build, so it only matches while jm/panvk_vX_bind_queue.c remains the
# first list entry with that spacing.
# NOTE(review): marker_in_new is the bare word "iter13" -- if replace_once uses
# it to detect an already-applied patch, any unrelated "iter13" text in
# meson.build would also match; confirm against replace_once.
replace_once(
"src/panfrost/vulkan/meson.build",
"jm_files = [\n 'jm/panvk_vX_bind_queue.c',",
"jm_files = [\n 'jm/panvk_vX_bind_queue.c',\n 'jm/panvk_vX_cmd_xfb.c', # iter13",
marker_in_new="iter13",
)
# Final status line; the build itself is left to the operator.
print("\n[iter13] all patches applied — run incremental ninja build next")
+438
View File
@@ -0,0 +1,438 @@
/*
* iter13 minimal Vulkan transform feedback probe.
*
* Goal: drive a single-stream, single-buffer VK_EXT_transform_feedback
* capture end-to-end on (patched) PanVk-Bifrost: 3 vertices, each emitting
* one vec4 with a known pattern, captured into a host-visible buffer, read
* back and verified byte-exactly.
*
* Uses VK_EXT_transform_feedback. If the extension isn't exposed by the
* driver, the probe exits with an error before doing any GPU work.
*
* Pipeline shape:
* - vertex shader (probe_xfb.vert) writes a vec4 per vertex
* - no fragment shader needed (rasterizerDiscardEnable=VK_TRUE)
* - dynamic rendering with 0 color attachments
* - vkCmdBindTransformFeedbackBuffersEXT + vkCmdBeginTransformFeedbackEXT
* wrap a vkCmdDraw(3, 1, 0, 0)
* - readback buffer is 3*16 = 48 bytes
*
* Pure Vulkan 1.0 core + VK_KHR_dynamic_rendering + VK_EXT_transform_feedback.
*/
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <vulkan/vulkan.h>
/* Number of vertices drawn, and therefore the number of vec4s captured. */
#define VERTEX_COUNT 3
#define XFB_BUFFER_BYTES (VERTEX_COUNT * 16) /* 3 vec4s = 48 bytes */
/* Compiled vertex shader, opened relative to the current working directory. */
#define VSPV_PATH "probe_xfb.vert.spv"
/* Progress marker on stderr, flushed immediately. `name` is pasted between
 * string literals, so it must itself be a string literal. */
#define STEP(name) do { fprintf(stderr, "[step] " name "\n"); fflush(stderr); } while (0)
/* Evaluate a Vulkan call once; on any result other than VK_SUCCESS print the
 * stringified call, the numeric VkResult and the source location, then
 * terminate the process with exit code 2. */
#define VK_CHECK(call) do { \
VkResult _r = (call); \
if (_r != VK_SUCCESS) { \
fprintf(stderr, "[fail] " #call " => %d at %s:%d\n", \
(int)_r, __FILE__, __LINE__); \
exit(2); \
} \
} while (0)
/*
 * Load a SPIR-V binary from `path` into a freshly malloc'd buffer.
 *
 * On success returns the buffer (caller frees it) and stores its size in
 * bytes in *out_bytes. Any failure -- the file cannot be opened, sized or
 * read in full, memory cannot be allocated, or the size is zero or not a
 * multiple of 4 (SPIR-V is a stream of 32-bit words) -- prints a "[fail]"
 * line to stderr and terminates the process with exit code 3.
 */
static uint32_t *read_spv(const char *path, size_t *out_bytes)
{
    FILE *f = fopen(path, "rb");
    if (!f) { fprintf(stderr, "[fail] open %s: %s\n", path, strerror(errno)); exit(3); }

    /* Determine the file size; every step can fail on special files. */
    long n = -1;
    if (fseek(f, 0, SEEK_END) == 0)
        n = ftell(f);
    if (n < 0 || fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr, "[fail] size %s: %s\n", path, strerror(errno));
        fclose(f);
        exit(3);
    }
    if (n == 0 || (n % 4) != 0) {
        fprintf(stderr, "[fail] %s: %ld bytes is not a valid SPIR-V size\n", path, n);
        fclose(f);
        exit(3);
    }

    uint32_t *buf = malloc((size_t)n);
    if (!buf) {
        fprintf(stderr, "[fail] alloc %ld bytes for %s\n", n, path);
        fclose(f);
        exit(3);
    }

    /* A short read would hand truncated SPIR-V to the driver. */
    if (fread(buf, 1, (size_t)n, f) != (size_t)n) {
        fprintf(stderr, "[fail] short read on %s\n", path);
        free(buf);
        fclose(f);
        exit(3);
    }

    fclose(f);
    *out_bytes = (size_t)n;
    return buf;
}
/*
 * Return the lowest memory type index that is permitted by `type_bits`
 * (bit i set means type i is allowed) and whose property flags contain
 * every bit of `want`. Prints "[fail] no memtype" and exits with code 4
 * when no type qualifies.
 *
 * NOTE(review): no caller is visible in this probe; kept as-is.
 */
static uint32_t pick_memtype(const VkPhysicalDeviceMemoryProperties *mp,
                             uint32_t type_bits, VkMemoryPropertyFlags want)
{
    uint32_t idx = 0;
    while (idx < mp->memoryTypeCount) {
        const VkMemoryPropertyFlags have = mp->memoryTypes[idx].propertyFlags;
        const int allowed = (type_bits & (1u << idx)) != 0;
        if (allowed && (have & want) == want) {
            return idx;
        }
        idx++;
    }
    fprintf(stderr, "[fail] no memtype\n");
    exit(4);
}
/*
 * Choose a host-mappable memory type among those permitted by `type_bits`.
 *
 * Two passes, first match wins within each pass:
 *   1. DEVICE_LOCAL + HOST_VISIBLE + HOST_COHERENT all present
 *   2. HOST_VISIBLE present (coherency not guaranteed)
 * Prints "[fail] no HOST_VISIBLE" and exits with code 4 if neither pass
 * finds a type.
 */
static uint32_t pick_host_visible(const VkPhysicalDeviceMemoryProperties *mp,
                                  uint32_t type_bits)
{
    const VkMemoryPropertyFlags tiers[2] = {
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
            VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
    };

    for (size_t tier = 0; tier < sizeof tiers / sizeof tiers[0]; tier++) {
        const VkMemoryPropertyFlags need = tiers[tier];
        for (uint32_t idx = 0; idx < mp->memoryTypeCount; idx++) {
            if (!(type_bits & (1u << idx))) {
                continue;
            }
            if ((mp->memoryTypes[idx].propertyFlags & need) == need) {
                return idx;
            }
        }
    }

    fprintf(stderr, "[fail] no HOST_VISIBLE\n");
    exit(4);
}
/*
 * Probe entry point: capture VERTEX_COUNT vec4s through
 * VK_EXT_transform_feedback on the first enumerated physical device and
 * compare them with the pattern written by probe_xfb.vert.
 *
 * Exit codes:
 *   0  all captured values match
 *   1  at least one captured component differs
 *   2  a Vulkan call wrapped in VK_CHECK failed
 *   3  the SPIR-V file could not be loaded (read_spv)
 *   4  no host-visible memory type (pick_host_visible)
 *   5  vkGetPhysicalDeviceFeatures2KHR is unavailable
 *   6  no physical device, or host allocation failure
 *   7  fence wait failed or timed out
 *   8  no graphics-capable queue family
 *   9  VK_EXT_transform_feedback is not exposed
 *   10 the transformFeedback feature bit is false
 *   11 a required device entry point is missing
 */
int main(void)
{
    STEP("vkCreateInstance");
    const char *inst_exts[] = { "VK_KHR_get_physical_device_properties2" };
    VkApplicationInfo app = {
        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
        .pApplicationName = "panvk-bifrost iter13 XFB probe",
        .apiVersion = VK_API_VERSION_1_0,
    };
    VkInstanceCreateInfo ici = {
        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
        .pApplicationInfo = &app,
        .enabledExtensionCount = 1,
        .ppEnabledExtensionNames = inst_exts,
    };
    VkInstance inst;
    VK_CHECK(vkCreateInstance(&ici, NULL, &inst));

    uint32_t n_phys = 0;
    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, NULL));
    if (n_phys == 0) {
        /* Without this guard phys[0] below would read past an empty array. */
        fprintf(stderr, "[fail] no Vulkan physical devices found\n");
        return 6;
    }
    VkPhysicalDevice *phys = calloc(n_phys, sizeof(*phys));
    if (!phys) { fprintf(stderr, "[fail] out of memory\n"); return 6; }
    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, phys));
    VkPhysicalDevice gpu = phys[0];

    /* Check VK_EXT_transform_feedback is exposed before we proceed. */
    uint32_t ext_count = 0;
    VK_CHECK(vkEnumerateDeviceExtensionProperties(gpu, NULL, &ext_count, NULL));
    VkExtensionProperties *exts = calloc(ext_count, sizeof(*exts));
    if (ext_count != 0 && !exts) { fprintf(stderr, "[fail] out of memory\n"); return 6; }
    VK_CHECK(vkEnumerateDeviceExtensionProperties(gpu, NULL, &ext_count, exts));
    int has_xfb = 0;
    for (uint32_t i = 0; i < ext_count; i++) {
        if (!strcmp(exts[i].extensionName, "VK_EXT_transform_feedback"))
            has_xfb = 1;
    }
    free(exts);
    if (!has_xfb) {
        fprintf(stderr, "[fail] VK_EXT_transform_feedback NOT exposed by driver "
                        "(this is the iter13 implementation gap — re-run on a Mesa "
                        "build with the iter13 patches applied)\n");
        return 9;
    }
    fprintf(stderr, "[info] VK_EXT_transform_feedback present on device\n");

    VkPhysicalDeviceMemoryProperties mp;
    vkGetPhysicalDeviceMemoryProperties(gpu, &mp);

    /* Query the transform feedback features struct via vkGetPhysicalDeviceFeatures2. */
    PFN_vkGetPhysicalDeviceFeatures2KHR pGetFeats2 =
        (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(
            inst, "vkGetPhysicalDeviceFeatures2KHR");
    if (!pGetFeats2) { fprintf(stderr, "[fail] no vkGetPhysicalDeviceFeatures2KHR\n"); return 5; }
    VkPhysicalDeviceTransformFeedbackFeaturesEXT xfb_feats = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT,
    };
    VkPhysicalDeviceFeatures2 feats2 = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
        .pNext = &xfb_feats,
    };
    pGetFeats2(gpu, &feats2);
    fprintf(stderr, "[info] transformFeedback=%u geometryStreams=%u\n",
            xfb_feats.transformFeedback, xfb_feats.geometryStreams);
    if (!xfb_feats.transformFeedback) {
        fprintf(stderr, "[fail] transformFeedback feature is FALSE — driver exposes ext but not feature\n");
        return 10;
    }

    /* ---- queue family ---- */
    uint32_t n_qf = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, NULL);
    VkQueueFamilyProperties *qfp = calloc(n_qf, sizeof(*qfp));
    if (n_qf != 0 && !qfp) { fprintf(stderr, "[fail] out of memory\n"); return 6; }
    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, qfp);
    uint32_t qfam = UINT32_MAX;
    for (uint32_t i = 0; i < n_qf; i++) {
        if (qfp[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { qfam = i; break; }
    }
    if (qfam == UINT32_MAX) {
        /* UINT32_MAX is not a valid queueFamilyIndex for vkCreateDevice. */
        fprintf(stderr, "[fail] no graphics-capable queue family\n");
        return 8;
    }

    /* ---- device with XFB + dynamic_rendering enabled ---- */
    STEP("vkCreateDevice (+VK_EXT_transform_feedback, +dynamic_rendering chain)");
    const char *dev_exts[] = {
        "VK_KHR_multiview", "VK_KHR_maintenance2",
        "VK_KHR_create_renderpass2", "VK_KHR_depth_stencil_resolve",
        "VK_KHR_dynamic_rendering",
        "VK_EXT_transform_feedback",
    };
    VkPhysicalDeviceTransformFeedbackFeaturesEXT enable_xfb = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT,
        .transformFeedback = VK_TRUE,
        .geometryStreams = VK_FALSE,
    };
    VkPhysicalDeviceDynamicRenderingFeaturesKHR dyn_feat = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES_KHR,
        .pNext = &enable_xfb,
        .dynamicRendering = VK_TRUE,
    };
    float qprio = 1.0f;
    VkDeviceQueueCreateInfo qci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
        .queueFamilyIndex = qfam, .queueCount = 1, .pQueuePriorities = &qprio,
    };
    VkDeviceCreateInfo dci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
        .pNext = &dyn_feat,
        .queueCreateInfoCount = 1, .pQueueCreateInfos = &qci,
        .enabledExtensionCount = sizeof(dev_exts)/sizeof(dev_exts[0]),
        .ppEnabledExtensionNames = dev_exts,
    };
    VkDevice dev;
    VK_CHECK(vkCreateDevice(gpu, &dci, NULL, &dev));
    VkQueue queue;
    vkGetDeviceQueue(dev, qfam, 0, &queue);

    /* ---- XFB function pointers ---- */
    PFN_vkCmdBindTransformFeedbackBuffersEXT pBindXfb =
        (PFN_vkCmdBindTransformFeedbackBuffersEXT)vkGetDeviceProcAddr(
            dev, "vkCmdBindTransformFeedbackBuffersEXT");
    PFN_vkCmdBeginTransformFeedbackEXT pBeginXfb =
        (PFN_vkCmdBeginTransformFeedbackEXT)vkGetDeviceProcAddr(
            dev, "vkCmdBeginTransformFeedbackEXT");
    PFN_vkCmdEndTransformFeedbackEXT pEndXfb =
        (PFN_vkCmdEndTransformFeedbackEXT)vkGetDeviceProcAddr(
            dev, "vkCmdEndTransformFeedbackEXT");
    PFN_vkCmdBeginRenderingKHR pBeginRendering =
        (PFN_vkCmdBeginRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdBeginRenderingKHR");
    PFN_vkCmdEndRenderingKHR pEndRendering =
        (PFN_vkCmdEndRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdEndRenderingKHR");
    if (!pBindXfb || !pBeginXfb || !pEndXfb || !pBeginRendering || !pEndRendering) {
        fprintf(stderr, "[fail] one or more XFB / dynamic_rendering entry points missing\n");
        return 11;
    }

    /* ---- XFB capture buffer (host-visible) ---- */
    STEP("vkCreateBuffer XFB capture (host-visible)");
    VkBufferCreateInfo xfb_bci = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .size = XFB_BUFFER_BYTES,
        .usage = VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT |
                 VK_BUFFER_USAGE_TRANSFER_DST_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
    };
    VkBuffer xfb_buf;
    VK_CHECK(vkCreateBuffer(dev, &xfb_bci, NULL, &xfb_buf));
    VkMemoryRequirements xfb_mr;
    vkGetBufferMemoryRequirements(dev, xfb_buf, &xfb_mr);
    VkMemoryAllocateInfo xfb_mai = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = xfb_mr.size,
        .memoryTypeIndex = pick_host_visible(&mp, xfb_mr.memoryTypeBits),
    };
    VkDeviceMemory xfb_mem;
    VK_CHECK(vkAllocateMemory(dev, &xfb_mai, NULL, &xfb_mem));
    VK_CHECK(vkBindBufferMemory(dev, xfb_buf, xfb_mem, 0));

    /* Pre-fill with sentinel so we can detect "GPU never wrote" vs "wrong write". */
    const uint32_t sentinel = 0xDEADBEEFu;
    void *mapped = NULL;
    VK_CHECK(vkMapMemory(dev, xfb_mem, 0, VK_WHOLE_SIZE, 0, &mapped));
    uint32_t *u32 = (uint32_t *)mapped;
    for (uint32_t i = 0; i < XFB_BUFFER_BYTES / 4; i++) u32[i] = sentinel;

    /* pick_host_visible may fall back to a non-coherent type, so make the
     * sentinel visible to the device explicitly. Flushing the whole mapping
     * is also valid on coherent memory. The same range is reused for the
     * invalidate before readback. */
    VkMappedMemoryRange whole_range = {
        .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
        .memory = xfb_mem, .offset = 0, .size = VK_WHOLE_SIZE,
    };
    VK_CHECK(vkFlushMappedMemoryRanges(dev, 1, &whole_range));

    /* ---- pipeline (vertex stage only, raster-discard, no color attachment) ---- */
    STEP("vkCreatePipelineLayout + vert shader");
    VkPipelineLayoutCreateInfo plci = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
    };
    VkPipelineLayout pl;
    VK_CHECK(vkCreatePipelineLayout(dev, &plci, NULL, &pl));
    size_t spv_bytes = 0;
    uint32_t *spv = read_spv(VSPV_PATH, &spv_bytes);
    VkShaderModuleCreateInfo smci = {
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .codeSize = spv_bytes, .pCode = spv,
    };
    VkShaderModule vsm;
    VK_CHECK(vkCreateShaderModule(dev, &smci, NULL, &vsm));
    free(spv);
    VkPipelineShaderStageCreateInfo stages[1] = {
        { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
          .stage = VK_SHADER_STAGE_VERTEX_BIT, .module = vsm, .pName = "main" },
    };
    VkPipelineVertexInputStateCreateInfo vi = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
    };
    VkPipelineInputAssemblyStateCreateInfo ia = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
        .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
    };
    VkViewport vp_dummy = { 0, 0, 1, 1, 0.0f, 1.0f };
    VkRect2D sc_dummy = {{0,0}, {1,1}};
    VkPipelineViewportStateCreateInfo vp = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
        .viewportCount = 1, .pViewports = &vp_dummy,
        .scissorCount = 1, .pScissors = &sc_dummy,
    };
    VkPipelineRasterizationStateCreateInfo rs = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
        .rasterizerDiscardEnable = VK_TRUE, /* THE point — no rasterization */
        .polygonMode = VK_POLYGON_MODE_FILL,
        .cullMode = VK_CULL_MODE_NONE,
        .lineWidth = 1.0f,
    };
    VkPipelineMultisampleStateCreateInfo ms = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
        .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
    };
    VkPipelineRenderingCreateInfoKHR pri = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO_KHR,
        .colorAttachmentCount = 0, /* No color attachment with raster discard. */
    };
    VkGraphicsPipelineCreateInfo gpci = {
        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
        .pNext = &pri,
        .stageCount = 1, .pStages = stages,
        .pVertexInputState = &vi,
        .pInputAssemblyState = &ia,
        .pViewportState = &vp,
        .pRasterizationState = &rs,
        .pMultisampleState = &ms,
        .layout = pl,
    };
    STEP("vkCreateGraphicsPipelines (raster-discard + XFB-output VS)");
    VkPipeline pipe;
    VK_CHECK(vkCreateGraphicsPipelines(dev, VK_NULL_HANDLE, 1, &gpci, NULL, &pipe));

    /* ---- command buffer ---- */
    VkCommandPoolCreateInfo cpoolci = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
        .queueFamilyIndex = qfam,
    };
    VkCommandPool cpool;
    VK_CHECK(vkCreateCommandPool(dev, &cpoolci, NULL, &cpool));
    VkCommandBufferAllocateInfo cbai = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = cpool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };
    VkCommandBuffer cb;
    VK_CHECK(vkAllocateCommandBuffers(dev, &cbai, &cb));

    STEP("record (bind XFB buffer + begin XFB + draw + end XFB)");
    VkCommandBufferBeginInfo cbbi = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    };
    VK_CHECK(vkBeginCommandBuffer(cb, &cbbi));

    /* Bind XFB buffer to slot 0 */
    VkDeviceSize xfb_offset = 0, xfb_size = XFB_BUFFER_BYTES;
    pBindXfb(cb, 0, 1, &xfb_buf, &xfb_offset, &xfb_size);

    /* Dynamic rendering with NO color attachments (raster-discard).
     * Render-area is required by the spec to be > 0 even if discarded;
     * use 1x1. */
    VkRenderingInfoKHR ri = {
        .sType = VK_STRUCTURE_TYPE_RENDERING_INFO_KHR,
        .renderArea = {{0,0}, {1,1}},
        .layerCount = 1,
        .colorAttachmentCount = 0,
    };
    pBeginRendering(cb, &ri);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, pipe);
    pBeginXfb(cb, 0, 0, NULL, NULL);
    vkCmdDraw(cb, VERTEX_COUNT, 1, 0, 0);
    pEndXfb(cb, 0, 0, NULL, NULL);
    pEndRendering(cb);

    /* Sync XFB writes for host read. */
    VkBufferMemoryBarrier bb = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT,
        .dstAccessMask = VK_ACCESS_HOST_READ_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = xfb_buf, .offset = 0, .size = VK_WHOLE_SIZE,
    };
    vkCmdPipelineBarrier(cb,
                         VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT,
                         VK_PIPELINE_STAGE_HOST_BIT,
                         0, 0, NULL, 1, &bb, 0, NULL);
    VK_CHECK(vkEndCommandBuffer(cb));

    /* ---- submit ---- */
    VkFenceCreateInfo fci = { .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO };
    VkFence fence;
    VK_CHECK(vkCreateFence(dev, &fci, NULL, &fence));
    VkSubmitInfo si = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .commandBufferCount = 1, .pCommandBuffers = &cb,
    };
    STEP("submit + wait (10s)");
    VK_CHECK(vkQueueSubmit(queue, 1, &si, fence));
    VkResult wr = vkWaitForFences(dev, 1, &fence, VK_TRUE, 10ULL * 1000 * 1000 * 1000);
    if (wr != VK_SUCCESS) {
        fprintf(stderr, "[fail] vkWaitForFences => %d\n", wr); return 7;
    }

    /* ---- verify ---- */
    STEP("readback + verify");
    VK_CHECK(vkInvalidateMappedMemoryRanges(dev, 1, &whole_range));

    /* Expected: each vec4 = (vertex_id, 0, 4660.0, 51966.0) as float32 */
    int mismatches = 0;
    for (uint32_t v = 0; v < VERTEX_COUNT; v++) {
        /* Copy out of the mapping instead of reading through a float pointer:
         * the same bytes were last written by the host as uint32_t. */
        const unsigned char *slot = (const unsigned char *)mapped + (size_t)v * 16;
        uint32_t raw[4];
        float got[4];
        memcpy(raw, slot, sizeof raw);
        memcpy(got, slot, sizeof got);
        const float want[4] = { (float)v, 0.0f, (float)0x1234, (float)0xcafe };

        /* An untouched slot means the GPU never wrote it, which is a
         * different failure from writing the wrong values. */
        if (raw[0] == sentinel && raw[1] == sentinel &&
            raw[2] == sentinel && raw[3] == sentinel) {
            fprintf(stderr, "[diff] vertex %u: sentinel intact, GPU never wrote this slot\n", v);
        }
        for (int c = 0; c < 4; c++) {
            if (got[c] != want[c]) {
                fprintf(stderr, "[diff] vertex %u comp %d: got=%f want=%f\n",
                        v, c, got[c], want[c]);
                mismatches++;
            }
        }
        fprintf(stderr, "[info] vertex %u: (%f, %f, %f, %f)\n",
                v, got[0], got[1], got[2], got[3]);
    }

    /* ---- teardown ---- */
    vkUnmapMemory(dev, xfb_mem);
    vkDestroyFence(dev, fence, NULL);
    vkDestroyCommandPool(dev, cpool, NULL);
    vkDestroyPipeline(dev, pipe, NULL);
    vkDestroyShaderModule(dev, vsm, NULL);
    vkDestroyPipelineLayout(dev, pl, NULL);
    vkDestroyBuffer(dev, xfb_buf, NULL);
    vkFreeMemory(dev, xfb_mem, NULL);
    vkDestroyDevice(dev, NULL);
    vkDestroyInstance(inst, NULL);
    free(phys); free(qfp);

    if (mismatches == 0) {
        fprintf(stderr, "[PASS] PanVk-Bifrost transform feedback: %d vertices captured correctly.\n",
                VERTEX_COUNT);
        return 0;
    } else {
        fprintf(stderr, "[FAIL] %d mismatches across %d vertices.\n", mismatches, VERTEX_COUNT);
        return 1;
    }
}
+24
View File
@@ -0,0 +1,24 @@
#version 450

// iter13 XFB probe vertex shader.
//
// Every invocation writes one vec4 into transform feedback buffer 0:
//     (vertex index, instance index, 0x1234, 0xcafe), all converted to float.
//
// For a 3-vertex, single-instance draw captured at buffer offset 0 the
// buffer therefore holds, as consecutive float32 vec4s:
//     vertex 0: 0.0, 0.0, 4660.0, 51966.0
//     vertex 1: 1.0, 0.0, 4660.0, 51966.0
//     vertex 2: 2.0, 0.0, 4660.0, 51966.0

// One 16-byte record per vertex, starting at byte 0 of XFB buffer 0.
layout(location = 0, xfb_buffer = 0, xfb_offset = 0, xfb_stride = 16) out vec4 captured;

void main() {
    // The host pipeline sets rasterizerDiscardEnable=VK_TRUE, so this
    // position never reaches the rasterizer; a fixed value is written.
    gl_Position = vec4(0.0, 0.0, 0.0, 1.0);

    float vertex_id   = float(gl_VertexIndex);
    float instance_id = float(gl_InstanceIndex);
    float marker_a    = float(0x1234);
    float marker_b    = float(0xcafe);

    captured = vec4(vertex_id, instance_id, marker_a, marker_b);
}
@@ -0,0 +1,266 @@
/*
* iter13 Janet-CRITICAL regression: XFB-capable pipeline used WITHOUT
* vkCmdBeginTransformFeedback must NOT fault the GPU.
*
* Same pipeline shape as probe_xfb.c, but the draw is not wrapped in
* Begin/End XFB and no XFB buffer is bound. The vertex shader still
* emits a store_global instruction (xfb_address[0] is read from sysval).
*
* With the memory-sink fix (xfb_address defaults to PAN_SHADER_OOB_ADDRESS
* = 0x8000_0000_0000_0000), the store is silently discarded by the MMU.
* Without that fix, the store goes to address 0 -> page fault -> GPU job
* failure.
*
* Pass criterion: vkQueueSubmit + vkWaitForFences returns VK_SUCCESS
* (no DEVICE_LOST). No buffer to read back; we only care that the GPU
* survives the draw.
*/
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <vulkan/vulkan.h>
/* Compiled vertex shader shared with probe_xfb.c, opened relative to the
 * current working directory. */
#define VSPV_PATH "probe_xfb.vert.spv"
/* Progress marker on stderr, flushed immediately. `name` is pasted between
 * string literals, so it must itself be a string literal. */
#define STEP(name) do { fprintf(stderr, "[step] " name "\n"); fflush(stderr); } while (0)
/* Evaluate a Vulkan call once; on any result other than VK_SUCCESS print the
 * stringified call, the numeric VkResult and the source location, then
 * terminate the process with exit code 2. */
#define VK_CHECK(call) do { \
VkResult _r = (call); \
if (_r != VK_SUCCESS) { \
fprintf(stderr, "[fail] " #call " => %d at %s:%d\n", \
(int)_r, __FILE__, __LINE__); \
exit(2); \
} \
} while (0)
/*
 * Load a SPIR-V binary from `path` into a freshly malloc'd buffer.
 *
 * On success returns the buffer (caller frees it) and stores its size in
 * bytes in *out_bytes. Any failure -- the file cannot be opened, sized or
 * read in full, memory cannot be allocated, or the size is zero or not a
 * multiple of 4 (SPIR-V is a stream of 32-bit words) -- prints a "[fail]"
 * line to stderr and terminates the process with exit code 3.
 */
static uint32_t *read_spv(const char *path, size_t *out_bytes)
{
    FILE *f = fopen(path, "rb");
    if (!f) { fprintf(stderr, "[fail] open %s: %s\n", path, strerror(errno)); exit(3); }

    /* Determine the file size; every step can fail on special files. */
    long n = -1;
    if (fseek(f, 0, SEEK_END) == 0)
        n = ftell(f);
    if (n < 0 || fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr, "[fail] size %s: %s\n", path, strerror(errno));
        fclose(f);
        exit(3);
    }
    if (n == 0 || (n % 4) != 0) {
        fprintf(stderr, "[fail] %s: %ld bytes is not a valid SPIR-V size\n", path, n);
        fclose(f);
        exit(3);
    }

    uint32_t *buf = malloc((size_t)n);
    if (!buf) {
        fprintf(stderr, "[fail] alloc %ld bytes for %s\n", n, path);
        fclose(f);
        exit(3);
    }

    /* A short read would hand truncated SPIR-V to the driver. */
    if (fread(buf, 1, (size_t)n, f) != (size_t)n) {
        fprintf(stderr, "[fail] short read on %s\n", path);
        free(buf);
        fclose(f);
        exit(3);
    }

    fclose(f);
    *out_bytes = (size_t)n;
    return buf;
}
/*
 * Regression entry point: build the same XFB-capable pipeline as probe_xfb.c
 * but draw without binding an XFB buffer and without Begin/End transform
 * feedback, then check that the submission completes.
 *
 * Exit codes:
 *   0  fence signalled with VK_SUCCESS
 *   1  vkWaitForFences reported VK_ERROR_DEVICE_LOST
 *   2  a Vulkan call wrapped in VK_CHECK failed, or the fence wait returned
 *      another non-success result (e.g. timeout)
 *   3  the SPIR-V file could not be loaded (read_spv)
 *   6  no physical device, or host allocation failure
 *   8  no graphics-capable queue family
 *   11 a dynamic-rendering entry point is missing
 */
int main(void)
{
    STEP("vkCreateInstance");
    VkApplicationInfo app = {
        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
        .pApplicationName = "panvk-bifrost iter13 XFB no-draw probe",
        .apiVersion = VK_API_VERSION_1_0,
    };
    const char *inst_exts[] = { "VK_KHR_get_physical_device_properties2" };
    VkInstanceCreateInfo ici = {
        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
        .pApplicationInfo = &app,
        .enabledExtensionCount = 1,
        .ppEnabledExtensionNames = inst_exts,
    };
    VkInstance inst;
    VK_CHECK(vkCreateInstance(&ici, NULL, &inst));

    uint32_t n_phys = 0;
    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, NULL));
    if (n_phys == 0) {
        /* Without this guard phys[0] below would read past an empty array. */
        fprintf(stderr, "[fail] no Vulkan physical devices found\n");
        return 6;
    }
    VkPhysicalDevice *phys = calloc(n_phys, sizeof(*phys));
    if (!phys) { fprintf(stderr, "[fail] out of memory\n"); return 6; }
    VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, phys));
    VkPhysicalDevice gpu = phys[0];

    uint32_t n_qf = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, NULL);
    VkQueueFamilyProperties *qfp = calloc(n_qf, sizeof(*qfp));
    if (n_qf != 0 && !qfp) { fprintf(stderr, "[fail] out of memory\n"); return 6; }
    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, qfp);
    uint32_t qfam = UINT32_MAX;
    for (uint32_t i = 0; i < n_qf; i++) {
        if (qfp[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { qfam = i; break; }
    }
    if (qfam == UINT32_MAX) {
        /* UINT32_MAX is not a valid queueFamilyIndex for vkCreateDevice. */
        fprintf(stderr, "[fail] no graphics-capable queue family\n");
        return 8;
    }

    STEP("vkCreateDevice (+XFB feature enabled + dynamic_rendering)");
    const char *dev_exts[] = {
        "VK_KHR_multiview", "VK_KHR_maintenance2",
        "VK_KHR_create_renderpass2", "VK_KHR_depth_stencil_resolve",
        "VK_KHR_dynamic_rendering",
        "VK_EXT_transform_feedback",
    };
    VkPhysicalDeviceTransformFeedbackFeaturesEXT enable_xfb = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT,
        .transformFeedback = VK_TRUE,
        .geometryStreams = VK_FALSE,
    };
    VkPhysicalDeviceDynamicRenderingFeaturesKHR dyn_feat = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES_KHR,
        .pNext = &enable_xfb,
        .dynamicRendering = VK_TRUE,
    };
    float qprio = 1.0f;
    VkDeviceQueueCreateInfo qci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
        .queueFamilyIndex = qfam, .queueCount = 1, .pQueuePriorities = &qprio,
    };
    VkDeviceCreateInfo dci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
        .pNext = &dyn_feat,
        .queueCreateInfoCount = 1, .pQueueCreateInfos = &qci,
        .enabledExtensionCount = sizeof(dev_exts)/sizeof(dev_exts[0]),
        .ppEnabledExtensionNames = dev_exts,
    };
    VkDevice dev;
    VK_CHECK(vkCreateDevice(gpu, &dci, NULL, &dev));
    VkQueue queue;
    vkGetDeviceQueue(dev, qfam, 0, &queue);

    PFN_vkCmdBeginRenderingKHR pBeginRendering =
        (PFN_vkCmdBeginRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdBeginRenderingKHR");
    PFN_vkCmdEndRenderingKHR pEndRendering =
        (PFN_vkCmdEndRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdEndRenderingKHR");
    if (!pBeginRendering || !pEndRendering) {
        /* Calling through a NULL pointer would crash the probe and be
         * mistaken for a driver fault. */
        fprintf(stderr, "[fail] dynamic_rendering entry points missing\n");
        return 11;
    }

    /* Same XFB-bearing vertex shader as probe_xfb — its SPIR-V has the
     * xfb_buffer / xfb_offset decorations on `captured`. PanVk's driver
     * will run pan_nir_lower_xfb on it, producing nir_store_global to
     * vs.xfb_address[0]. We rely on the driver setting that sysval to
     * PAN_SHADER_OOB_ADDRESS when xfb is inactive. */
    STEP("vkCreateGraphicsPipelines (XFB-capable VS, no XFB buffer bound)");
    VkPipelineLayoutCreateInfo plci = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
    };
    VkPipelineLayout pl;
    VK_CHECK(vkCreatePipelineLayout(dev, &plci, NULL, &pl));
    size_t spv_bytes = 0;
    uint32_t *spv = read_spv(VSPV_PATH, &spv_bytes);
    VkShaderModuleCreateInfo smci = {
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .codeSize = spv_bytes, .pCode = spv,
    };
    VkShaderModule vsm;
    VK_CHECK(vkCreateShaderModule(dev, &smci, NULL, &vsm));
    free(spv);
    VkPipelineShaderStageCreateInfo stages[1] = {
        { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
          .stage = VK_SHADER_STAGE_VERTEX_BIT, .module = vsm, .pName = "main" },
    };
    VkPipelineVertexInputStateCreateInfo vi = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
    };
    VkPipelineInputAssemblyStateCreateInfo ia = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
        .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
    };
    VkViewport vp_dummy = { 0, 0, 1, 1, 0.0f, 1.0f };
    VkRect2D sc_dummy = {{0,0}, {1,1}};
    VkPipelineViewportStateCreateInfo vp = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
        .viewportCount = 1, .pViewports = &vp_dummy,
        .scissorCount = 1, .pScissors = &sc_dummy,
    };
    VkPipelineRasterizationStateCreateInfo rs = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
        .rasterizerDiscardEnable = VK_TRUE,
        .polygonMode = VK_POLYGON_MODE_FILL,
        .cullMode = VK_CULL_MODE_NONE,
        .lineWidth = 1.0f,
    };
    VkPipelineMultisampleStateCreateInfo ms = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
        .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
    };
    VkPipelineRenderingCreateInfoKHR pri = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO_KHR,
        .colorAttachmentCount = 0,
    };
    VkGraphicsPipelineCreateInfo gpci = {
        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
        .pNext = &pri,
        .stageCount = 1, .pStages = stages,
        .pVertexInputState = &vi,
        .pInputAssemblyState = &ia,
        .pViewportState = &vp,
        .pRasterizationState = &rs,
        .pMultisampleState = &ms,
        .layout = pl,
    };
    VkPipeline pipe;
    VK_CHECK(vkCreateGraphicsPipelines(dev, VK_NULL_HANDLE, 1, &gpci, NULL, &pipe));

    VkCommandPoolCreateInfo cpoolci = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
        .queueFamilyIndex = qfam,
    };
    VkCommandPool cpool;
    VK_CHECK(vkCreateCommandPool(dev, &cpoolci, NULL, &cpool));
    VkCommandBufferAllocateInfo cbai = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = cpool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };
    VkCommandBuffer cb;
    VK_CHECK(vkAllocateCommandBuffers(dev, &cbai, &cb));

    STEP("record (draw WITHOUT XFB Begin/End; no buffer bound)");
    VkCommandBufferBeginInfo cbbi = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    };
    VK_CHECK(vkBeginCommandBuffer(cb, &cbbi));
    VkRenderingInfoKHR ri = {
        .sType = VK_STRUCTURE_TYPE_RENDERING_INFO_KHR,
        .renderArea = {{0,0}, {1,1}},
        .layerCount = 1,
        .colorAttachmentCount = 0,
    };
    pBeginRendering(cb, &ri);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, pipe);
    /* No vkCmdBindTransformFeedbackBuffersEXT.
     * No vkCmdBeginTransformFeedbackEXT.
     * Just draw: the XFB store in the shader must be silently discarded. */
    vkCmdDraw(cb, 3, 1, 0, 0);
    pEndRendering(cb);
    VK_CHECK(vkEndCommandBuffer(cb));

    VkFenceCreateInfo fci = { .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO };
    VkFence fence;
    VK_CHECK(vkCreateFence(dev, &fci, NULL, &fence));
    VkSubmitInfo si = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .commandBufferCount = 1, .pCommandBuffers = &cb,
    };
    STEP("submit + wait (10s) — expect VK_SUCCESS, not DEVICE_LOST");
    VK_CHECK(vkQueueSubmit(queue, 1, &si, fence));
    VkResult wr = vkWaitForFences(dev, 1, &fence, VK_TRUE, 10ULL * 1000 * 1000 * 1000);
    if (wr == VK_ERROR_DEVICE_LOST) {
        fprintf(stderr, "[FAIL] DEVICE_LOST — the XFB store-global probably faulted "
                        "(memory-sink sentinel not applied).\n");
        return 1;
    }
    if (wr != VK_SUCCESS) {
        fprintf(stderr, "[FAIL] vkWaitForFences => %d\n", wr);
        return 2;
    }

    vkDestroyFence(dev, fence, NULL);
    vkDestroyCommandPool(dev, cpool, NULL);
    vkDestroyPipeline(dev, pipe, NULL);
    vkDestroyShaderModule(dev, vsm, NULL);
    vkDestroyPipelineLayout(dev, pl, NULL);
    vkDestroyDevice(dev, NULL);
    vkDestroyInstance(inst, NULL);
    free(phys); free(qfp);
    fprintf(stderr, "[PASS] XFB-capable pipeline survives non-XFB draw — memory-sink active.\n");
    return 0;
}