Files
panvk-bifrost/mesa-panvk-bifrost/iter13/applied_state/panvk_shader.h
T
marfrit a4e7d8ab90 initial seed: retrofit campaign lineage from local working trees
panvk-bifrost campaigns (r1..r4 Vulkan compositor + r5.video1 Vulkan
video decode) shipped before this repo existed; the deliverable
patches live in marfrit-packages, but the reasoning chain, phase docs,
and source-state evidence lived only in local working trees on the
development host.

This retrofit imports:
- mesa-panvk-bifrost/   — r1..r4 era phase docs (iter1..iter18)
                          (libmali stub blobs at iter18/blob/ excluded
                          — 109MB of RE artifacts replaced with a README
                          pointer)
- mesa-panvk-bifrost-video/ — sibling campaign phase docs + probe
- evidence/             — frozen .tgz source snapshots at each milestone
                          (basis for the 0005 patch diff generation)

Future iterations should branch off here from day one, so each iter is
a commit rather than a snapshot. See [[feedback-session-local-process-pins]]
for the process drift this retrofit closes.

Total: 1.9 MB across 124 files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 05:25:37 +02:00

573 lines
19 KiB
C

/*
* Copyright © 2021 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_SHADER_H
#define PANVK_SHADER_H
#ifndef PAN_ARCH
#error "PAN_ARCH must be defined"
#endif
#include "compiler/pan_compiler.h"
#include "pan_desc.h"
#include "pan_earlyzs.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_descriptor_set.h"
#include "panvk_macros.h"
#include "panvk_mempool.h"
#include "vk_pipeline_layout.h"
#include "vk_shader.h"
extern const struct vk_device_shader_ops panvk_per_arch(device_shader_ops);
#define MAX_RTS 8
#define MAX_VS_ATTRIBS 16
#if PAN_ARCH < 9
/* We could theoretically use the MAX_PER_SET values here (except for UBOs
* where we're really limited to 256 on the shader side), but on Bifrost we
* have to copy some tables around, which comes at an extra memory/processing
* cost, so let's pick something smaller. */
#define MAX_PER_STAGE_SAMPLED_IMAGES 256
#define MAX_PER_STAGE_SAMPLERS 128
#define MAX_PER_STAGE_UNIFORM_BUFFERS MAX_PER_SET_UNIFORM_BUFFERS
#define MAX_PER_STAGE_STORAGE_BUFFERS 64
#define MAX_PER_STAGE_STORAGE_IMAGES 32
#define MAX_PER_STAGE_INPUT_ATTACHMENTS MAX_PER_SET_INPUT_ATTACHMENTS
#else
#define MAX_PER_STAGE_SAMPLED_IMAGES MAX_PER_SET_SAMPLED_IMAGES
#define MAX_PER_STAGE_SAMPLERS MAX_PER_SET_SAMPLERS
#define MAX_PER_STAGE_UNIFORM_BUFFERS MAX_PER_SET_UNIFORM_BUFFERS
#define MAX_PER_STAGE_STORAGE_BUFFERS MAX_PER_SET_STORAGE_BUFFERS
#define MAX_PER_STAGE_STORAGE_IMAGES MAX_PER_SET_STORAGE_IMAGES
#define MAX_PER_STAGE_INPUT_ATTACHMENTS MAX_PER_SET_INPUT_ATTACHMENTS
#endif
#define MAX_PER_STAGE_RESOURCES ( \
MAX_PER_STAGE_SAMPLED_IMAGES + MAX_PER_STAGE_SAMPLERS + \
MAX_PER_STAGE_UNIFORM_BUFFERS + MAX_PER_STAGE_STORAGE_BUFFERS + \
MAX_PER_STAGE_STORAGE_IMAGES + MAX_PER_STAGE_INPUT_ATTACHMENTS)
struct nir_shader;
struct pan_blend_state;
struct panvk_device;
enum panvk_varying_buf_id {
PANVK_VARY_BUF_GENERAL,
PANVK_VARY_BUF_POSITION,
PANVK_VARY_BUF_PSIZ,
/* Keep last */
PANVK_VARY_BUF_MAX,
};
#if PAN_ARCH < 9
enum panvk_desc_table_id {
PANVK_DESC_TABLE_USER = 0,
PANVK_DESC_TABLE_CS_DYN_SSBOS = MAX_SETS,
PANVK_DESC_TABLE_COMPUTE_COUNT = PANVK_DESC_TABLE_CS_DYN_SSBOS + 1,
PANVK_DESC_TABLE_VS_DYN_SSBOS = MAX_SETS,
PANVK_DESC_TABLE_FS_DYN_SSBOS = MAX_SETS + 1,
PANVK_DESC_TABLE_GFX_COUNT = PANVK_DESC_TABLE_FS_DYN_SSBOS + 1,
};
#endif
#define PANVK_COLOR_ATTACHMENT(x) (x)
#define PANVK_ZS_ATTACHMENT 255
struct panvk_input_attachment_info {
uint32_t target;
uint32_t conversion;
};
/* One attachment per color, one for depth, one for stencil, and the last one
* for the attachment without an InputAttachmentIndex attribute. */
#define INPUT_ATTACHMENT_MAP_SIZE 11
#define FAU_WORD_SIZE sizeof(uint64_t)
#define aligned_u64 __attribute__((aligned(sizeof(uint64_t)))) uint64_t
/* System values which are common to both graphics and compute. These are
* always at the same offset in both graphics and compute allowing us to
* compile the shader without knowing which queue it will be dispatched on.
*/
struct panvk_common_sysvals_inner {
/* Address of sysval/push constant buffer used for indirect loads */
aligned_u64 push_uniforms;
/* Address of the printf buffer */
aligned_u64 printf_buffer_address;
} __attribute__((aligned(FAU_WORD_SIZE)));
struct panvk_common_sysvals {
uint32_t _pad[4];
struct panvk_common_sysvals_inner common;
} __attribute__((aligned(FAU_WORD_SIZE)));
static_assert((offsetof(struct panvk_common_sysvals, common) %
FAU_WORD_SIZE) == 0,
"struct panvk_graphics_sysvals_inner must be 8-byte aligned");
static_assert((sizeof(struct panvk_common_sysvals_inner) %
FAU_WORD_SIZE) == 0,
"struct panvk_graphics_sysvals_inner must be 8-byte aligned");
#define SYSVALS_COMMON_START \
(offsetof(struct panvk_common_sysvals, common) / FAU_WORD_SIZE)
#define SYSVALS_COMMON_COUNT \
(sizeof(struct panvk_common_sysvals_inner) / FAU_WORD_SIZE)
#define SYSVALS_COMMON_END (SYSVALS_COMMON_START + SYSVALS_COMMON_COUNT)
struct panvk_graphics_sysvals {
/* Blend constants MUST come first because their position cannot depend on
* the FAU packing of the fragment shader.
*/
struct {
float constants[4];
} blend;
/* This must be at the same offset for both compute and graphics */
struct panvk_common_sysvals_inner common;
struct {
struct {
float x, y, z;
} scale, offset;
} viewport;
struct {
#if PAN_ARCH < 9
int32_t raw_vertex_offset;
uint32_t num_vertices; /* iter13: XFB needs per-draw vertex count */
/* aligned_u64 attribute below inserts the 4-byte alignment gap
* after num_vertices automatically — no explicit pad needed. */
aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */
#endif
int32_t first_vertex;
int32_t base_instance;
uint32_t noperspective_varyings;
} vs;
struct {
aligned_u64 blend_descs[MAX_RTS];
} fs;
struct panvk_input_attachment_info iam[INPUT_ATTACHMENT_MAP_SIZE];
#if PAN_ARCH < 9
/* gl_Layer on Bifrost is a bit of hack. We have to issue one draw per
* layer, and filter primitives at the VS level.
*/
int32_t layer_id;
struct {
aligned_u64 sets[PANVK_DESC_TABLE_GFX_COUNT];
} desc;
#endif
} __attribute__((aligned(FAU_WORD_SIZE)));
static_assert(offsetof(struct panvk_graphics_sysvals, blend) == 0,
"panvk_graphics_sysvals::blend must be at the start");
static_assert(offsetof(struct panvk_graphics_sysvals, common) ==
offsetof(struct panvk_common_sysvals, common),
"Common sysvals must be at the same offset everywhere");
static_assert((sizeof(struct panvk_graphics_sysvals) % FAU_WORD_SIZE) == 0,
"struct panvk_graphics_sysvals must be 8-byte aligned");
#if PAN_ARCH < 9
static_assert((offsetof(struct panvk_graphics_sysvals, desc) % FAU_WORD_SIZE) ==
0,
"panvk_graphics_sysvals::desc must be 8-byte aligned");
#endif
struct panvk_compute_sysvals {
struct {
uint32_t x, y, z;
} base;
uint32_t _pad;
/* This must be at the same offset for both compute and graphics */
struct panvk_common_sysvals_inner common;
struct {
uint32_t x, y, z;
} num_work_groups;
struct {
uint32_t x, y, z;
} local_group_size;
#if PAN_ARCH < 9
struct {
aligned_u64 sets[PANVK_DESC_TABLE_COMPUTE_COUNT];
} desc;
#endif
} __attribute__((aligned(FAU_WORD_SIZE)));
static_assert(offsetof(struct panvk_compute_sysvals, common) ==
offsetof(struct panvk_common_sysvals, common),
"Common sysvals must be at the same offset everywhere");
static_assert((sizeof(struct panvk_compute_sysvals) % FAU_WORD_SIZE) == 0,
"struct panvk_compute_sysvals must be 8-byte aligned");
#if PAN_ARCH < 9
static_assert((offsetof(struct panvk_compute_sysvals, desc) % FAU_WORD_SIZE) ==
0,
"panvk_compute_sysvals::desc must be 8-byte aligned");
#endif
/* This is not the final offset in the push constant buffer (AKA FAU), but
* just a magic offset we use before packing push constants so we can easily
* identify the type of push constant (driver sysvals vs user push constants).
*/
#define SYSVALS_PUSH_CONST_BASE MAX_PUSH_CONSTANTS_SIZE
#define common_sysval_size(__name) \
sizeof(((struct panvk_common_sysvals *)NULL)->common.__name)
#define graphics_sysval_size(__name) \
sizeof(((struct panvk_graphics_sysvals *)NULL)->__name)
#define compute_sysval_size(__name) \
sizeof(((struct panvk_compute_sysvals *)NULL)->__name)
#define sysval_size(__ptype, __name) __ptype##_sysval_size(__name)
#define common_sysval_offset(__name) \
offsetof(struct panvk_common_sysvals, common.__name)
#define graphics_sysval_offset(__name) \
offsetof(struct panvk_graphics_sysvals, __name)
#define compute_sysval_offset(__name) \
offsetof(struct panvk_compute_sysvals, __name)
#define sysval_offset(__ptype, __name) __ptype##_sysval_offset(__name)
#define sysval_entry_size(__ptype, __name) \
sizeof(((struct panvk_##__ptype##_sysvals *)NULL)->__name[0])
#define sysval_entry_offset(__ptype, __name, __idx) \
(sysval_offset(__ptype, __name) + \
(sysval_entry_size(__ptype, __name) * __idx))
#define sysval_fau_start(__ptype, __name) \
(sysval_offset(__ptype, __name) / FAU_WORD_SIZE)
#define sysval_fau_end(__ptype, __name) \
((sysval_offset(__ptype, __name) + sysval_size(__ptype, __name) - 1) / \
FAU_WORD_SIZE)
#define sysval_fau_entry_start(__ptype, __name, __idx) \
(sysval_entry_offset(__ptype, __name, __idx) / FAU_WORD_SIZE)
#define sysval_fau_entry_end(__ptype, __name, __idx) \
((sysval_entry_offset(__ptype, __name, __idx + 1) - 1) / FAU_WORD_SIZE)
#define shader_remapped_fau_offset(__shader, __kind, __offset) \
((FAU_WORD_SIZE * BITSET_PREFIX_SUM((__shader)->fau.used_##__kind, \
(__offset) / FAU_WORD_SIZE)) + \
((__offset) % FAU_WORD_SIZE))
#define shader_remapped_sysval_offset(__shader, __offset) \
shader_remapped_fau_offset(__shader, sysvals, __offset)
#define shader_remapped_push_const_offset(__shader, __offset) \
(((__shader)->fau.sysval_count * FAU_WORD_SIZE) + \
shader_remapped_fau_offset(__shader, push_consts, __offset))
#define shader_use_sysval(__shader, __ptype, __name) \
BITSET_SET_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_start(__ptype, __name), \
sysval_fau_end(__ptype, __name))
#define shader_uses_sysval(__shader, __ptype, __name) \
BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_start(__ptype, __name), \
sysval_fau_end(__ptype, __name))
#define shader_uses_sysval_entry(__shader, __ptype, __name, __idx) \
BITSET_TEST_RANGE((__shader)->fau.used_sysvals, \
sysval_fau_entry_start(__ptype, __name, __idx), \
sysval_fau_entry_end(__ptype, __name, __idx))
#define shader_use_sysval_range(__shader, __base, __range) \
BITSET_SET_RANGE((__shader)->fau.used_sysvals, (__base) / FAU_WORD_SIZE, \
((__base) + (__range) - 1) / FAU_WORD_SIZE)
#define shader_use_push_const_range(__shader, __base, __range) \
BITSET_SET_RANGE((__shader)->fau.used_push_consts, \
(__base) / FAU_WORD_SIZE, \
((__base) + (__range) - 1) / FAU_WORD_SIZE)
#define load_sysval(__b, __ptype, __bitsz, __name) \
nir_load_push_constant( \
__b, sysval_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \
nir_imm_int(__b, sysval_offset(__ptype, __name)), \
.base = SYSVALS_PUSH_CONST_BASE)
#define load_sysval_entry(__b, __ptype, __bitsz, __name, __dyn_idx) \
nir_load_push_constant( \
__b, sysval_entry_size(__ptype, __name) / ((__bitsz) / 8), __bitsz, \
nir_imul_imm(__b, __dyn_idx, sysval_entry_size(__ptype, __name)), \
.base = SYSVALS_PUSH_CONST_BASE + sysval_offset(__ptype, __name), \
.range = sysval_size(__ptype, __name))
#if PAN_ARCH < 9
enum panvk_bifrost_desc_table_type {
PANVK_BIFROST_DESC_TABLE_INVALID = -1,
/* UBO is encoded on 8 bytes */
PANVK_BIFROST_DESC_TABLE_UBO = 0,
/* Images are using a <3DAttributeBuffer,Attribute> pair, each
* of them being stored in a separate table. */
PANVK_BIFROST_DESC_TABLE_IMG,
/* Texture and sampler are encoded on 32 bytes */
PANVK_BIFROST_DESC_TABLE_TEXTURE,
PANVK_BIFROST_DESC_TABLE_SAMPLER,
PANVK_BIFROST_DESC_TABLE_COUNT,
};
#endif
#define COPY_DESC_HANDLE(table, idx) ((table << 28) | (idx))
#define COPY_DESC_HANDLE_EXTRACT_INDEX(handle) ((handle) & BITFIELD_MASK(28))
#define COPY_DESC_HANDLE_EXTRACT_TABLE(handle) ((handle) >> 28)
#define MAX_COMPUTE_SYSVAL_FAUS \
(sizeof(struct panvk_compute_sysvals) / FAU_WORD_SIZE)
#define MAX_GFX_SYSVAL_FAUS \
(sizeof(struct panvk_graphics_sysvals) / FAU_WORD_SIZE)
#define MAX_SYSVAL_FAUS MAX2(MAX_COMPUTE_SYSVAL_FAUS, MAX_GFX_SYSVAL_FAUS)
#define MAX_PUSH_CONST_FAUS (MAX_PUSH_CONSTANTS_SIZE / FAU_WORD_SIZE)
struct panvk_shader_fau_info {
BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS);
BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS);
uint32_t sysval_count;
uint32_t total_count;
};
struct panvk_shader_desc_info {
uint32_t used_set_mask;
#if PAN_ARCH < 9
struct {
uint32_t map[MAX_DYNAMIC_UNIFORM_BUFFERS];
uint32_t count;
} dyn_ubos;
struct {
uint32_t map[MAX_DYNAMIC_STORAGE_BUFFERS];
uint32_t count;
} dyn_ssbos;
struct {
struct panvk_priv_mem map;
uint32_t count[PANVK_BIFROST_DESC_TABLE_COUNT];
} others;
#else
struct {
uint32_t map[MAX_DYNAMIC_BUFFERS];
uint32_t count;
} dyn_bufs;
uint32_t fs_varying_attr_desc_count;
#endif
};
struct panvk_shader_variant {
struct pan_shader_info info;
union {
struct {
struct pan_compute_dim local_size;
} cs;
struct {
struct pan_earlyzs_lut earlyzs_lut;
uint32_t input_attachment_read;
} fs;
};
struct panvk_shader_desc_info desc_info;
struct panvk_shader_fau_info fau;
const void *bin_ptr;
uint32_t bin_size;
bool own_bin;
struct panvk_priv_mem code_mem;
#if PAN_ARCH < 9
struct panvk_priv_mem rsd;
#else
union {
struct panvk_priv_mem spd;
struct {
#if PAN_ARCH < 12
struct panvk_priv_mem pos_points;
struct panvk_priv_mem pos_triangles;
struct panvk_priv_mem var;
#else
struct panvk_priv_mem all_points;
struct panvk_priv_mem all_triangles;
#endif
} spds;
};
#endif
const char *nir_str;
const char *asm_str;
};
enum panvk_vs_variant {
/* Hardware vertex shader, when next stage is fragment */
PANVK_VS_VARIANT_HW,
PANVK_VS_VARIANTS,
};
struct panvk_shader {
struct vk_shader vk;
struct panvk_shader_variant variants[];
};
static inline unsigned
panvk_shader_num_variants(mesa_shader_stage stage)
{
if (stage == MESA_SHADER_VERTEX)
return PANVK_VS_VARIANTS;
return 1;
}
static const char *panvk_vs_shader_variant_name[] = {
[PANVK_VS_VARIANT_HW] = NULL,
};
static const char *
panvk_shader_variant_name(const struct panvk_shader *shader,
struct panvk_shader_variant *variant)
{
unsigned i = variant - shader->variants;
assert(i < panvk_shader_num_variants(shader->vk.stage));
if (shader->vk.stage == MESA_SHADER_VERTEX) {
assert(i < ARRAY_SIZE(panvk_vs_shader_variant_name));
return panvk_vs_shader_variant_name[i];
}
assert(panvk_shader_num_variants(shader->vk.stage) == 1);
return NULL;
}
static const struct panvk_shader_variant *
panvk_shader_only_variant(const struct panvk_shader *shader)
{
if (!shader)
return NULL;
assert(panvk_shader_num_variants(shader->vk.stage) == 1);
return &shader->variants[0];
}
static const struct panvk_shader_variant *
panvk_shader_hw_variant(const struct panvk_shader *shader)
{
if (!shader)
return NULL;
return &shader->variants[0];
}
static inline uint64_t
panvk_shader_variant_get_dev_addr(const struct panvk_shader_variant *shader)
{
return shader != NULL ? panvk_priv_mem_dev_addr(shader->code_mem) : 0;
}
#define panvk_shader_foreach_variant(__shader, __var) \
for (struct panvk_shader_variant *__var = (__shader)->variants; \
__var < (__shader)->variants + \
panvk_shader_num_variants((__shader)->vk.stage); \
++__var)
#if PAN_ARCH < 9
struct panvk_shader_link {
struct {
struct panvk_priv_mem attribs;
} vs, fs;
unsigned buf_strides[PANVK_VARY_BUF_MAX];
};
VkResult panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
const struct panvk_shader_variant *vs,
const struct panvk_shader_variant *fs,
struct panvk_shader_link *link);
static inline void
panvk_shader_link_cleanup(struct panvk_shader_link *link)
{
panvk_pool_free_mem(&link->vs.attribs);
panvk_pool_free_mem(&link->fs.attribs);
}
#endif
bool panvk_per_arch(nir_lower_input_attachment_loads)(
nir_shader *nir,
const struct vk_graphics_pipeline_state *state,
uint32_t *input_attachment_read_out);
void panvk_per_arch(nir_lower_descriptors)(
nir_shader *nir, struct panvk_device *dev,
const struct vk_pipeline_robustness_state *rs, uint32_t set_layout_count,
struct vk_descriptor_set_layout *const *set_layouts,
const struct vk_graphics_pipeline_state *state,
struct panvk_shader_desc_info *desc_info);
/* This a stripped-down version of panvk_shader for internal shaders that
* are managed by vk_meta (blend and preload shaders). Those don't need the
* complexity inherent to user provided shaders as they're not exposed. */
struct panvk_internal_shader {
struct vk_shader vk;
struct pan_shader_info info;
struct panvk_priv_mem code_mem;
#if PAN_ARCH < 9
struct panvk_priv_mem rsd;
#else
struct panvk_priv_mem spd;
#endif
};
VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_internal_shader, vk.base, VkShaderEXT,
VK_OBJECT_TYPE_SHADER_EXT)
void panvk_per_arch(compiler_lock)(void);
void panvk_per_arch(compiler_unlock)(void);
VkResult panvk_per_arch(create_internal_shader)(
struct panvk_device *dev, nir_shader *nir,
struct pan_compile_inputs *compiler_inputs,
struct panvk_internal_shader **shader_out);
VkResult panvk_per_arch(create_shader_from_binary)(
struct panvk_device *dev, const struct pan_shader_info *info,
struct pan_compute_dim local_size, const void *bin_ptr, size_t bin_size,
struct panvk_shader **shader_out);
#endif