a4e7d8ab90
panvk-bifrost campaigns (r1..r4 Vulkan compositor + r5.video1 Vulkan
video decode) shipped before this repo existed; the deliverable
patches live in marfrit-packages, but the reasoning chain, phase docs,
and source-state evidence lived only in local working trees on the
development host.
This retrofit imports:
- mesa-panvk-bifrost/ — r1..r4 era phase docs (iter1..iter18)
(libmali stub blobs at iter18/blob/ excluded
— 109MB of RE artifacts replaced with a README
pointer)
- mesa-panvk-bifrost-video/ — sibling campaign phase docs + probe
- evidence/ — frozen .tgz source snapshots at each milestone
(basis for the 0005 patch diff generation)
Future iterations should branch off here from day one, so each iter is
a commit rather than a snapshot. See [[feedback-session-local-process-pins]]
for the process drift this retrofit closes.
Total: 1.9 MB across 124 files.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2415 lines
82 KiB
C
2415 lines
82 KiB
C
/*
|
|
* Copyright © 2025 Arm Ltd.
|
|
* Copyright © 2021 Collabora Ltd.
|
|
*
|
|
* Derived from tu_shader.c which is:
|
|
* Copyright © 2019 Google LLC
|
|
*
|
|
* Also derived from anv_pipeline.c which is
|
|
* Copyright © 2015 Intel Corporation
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#include "genxml/gen_macros.h"
|
|
|
|
#include "panvk_cmd_buffer.h"
|
|
#include "panvk_descriptor_set_layout.h"
|
|
#include "panvk_device.h"
|
|
#include "panvk_instance.h"
|
|
#include "panvk_mempool.h"
|
|
#include "panvk_physical_device.h"
|
|
#include "panvk_sampler.h"
|
|
#include "panvk_shader.h"
|
|
#include "pan_nir.h" /* iter13: pan_nir_lower_xfb */
|
|
|
|
#include "spirv/nir_spirv.h"
|
|
#include "util/memstream.h"
|
|
#include "util/mesa-sha1.h"
|
|
#include "util/shader_stats.h"
|
|
#include "util/u_dynarray.h"
|
|
#include "nir_builder.h"
|
|
#include "nir_conversion_builder.h"
|
|
#include "nir_deref.h"
|
|
|
|
#include "shader_enums.h"
|
|
#include "vk_graphics_state.h"
|
|
#include "vk_nir_convert_ycbcr.h"
|
|
#include "vk_shader_module.h"
|
|
#include "vk_ycbcr_conversion.h"
|
|
|
|
#include "compiler/bifrost/bifrost_nir.h"
|
|
#include "compiler/pan_compiler.h"
|
|
#include "compiler/pan_nir.h"
|
|
#include "pan_shader.h"
|
|
|
|
#include "vk_log.h"
|
|
#include "vk_pipeline.h"
|
|
#include "vk_pipeline_layout.h"
|
|
#include "vk_shader.h"
|
|
#include "vk_util.h"
|
|
|
|
#define FAU_WORD_COUNT 64
|
|
|
|
struct panvk_lower_sysvals_context {
|
|
struct panvk_shader_variant *shader;
|
|
const struct vk_graphics_pipeline_state *state;
|
|
};
|
|
|
|
static bool
|
|
panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
|
|
{
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
return false;
|
|
|
|
const struct panvk_lower_sysvals_context *ctx = data;
|
|
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
|
unsigned bit_size = intr->def.bit_size;
|
|
nir_def *val = NULL;
|
|
b->cursor = nir_before_instr(instr);
|
|
|
|
switch (intr->intrinsic) {
|
|
case nir_intrinsic_load_base_workgroup_id:
|
|
val = load_sysval(b, compute, bit_size, base);
|
|
break;
|
|
case nir_intrinsic_load_num_workgroups:
|
|
val = load_sysval(b, compute, bit_size, num_work_groups);
|
|
break;
|
|
case nir_intrinsic_load_workgroup_size:
|
|
val = load_sysval(b, compute, bit_size, local_group_size);
|
|
break;
|
|
case nir_intrinsic_load_viewport_scale:
|
|
val = load_sysval(b, graphics, bit_size, viewport.scale);
|
|
break;
|
|
case nir_intrinsic_load_viewport_offset:
|
|
val = load_sysval(b, graphics, bit_size, viewport.offset);
|
|
break;
|
|
case nir_intrinsic_load_first_vertex:
|
|
val = load_sysval(b, graphics, bit_size, vs.first_vertex);
|
|
break;
|
|
case nir_intrinsic_load_base_instance:
|
|
val = load_sysval(b, graphics, bit_size, vs.base_instance);
|
|
break;
|
|
case nir_intrinsic_load_noperspective_varyings_pan:
|
|
/* TODO: use a VS epilog specialized on constant noperspective_varyings
|
|
* with VK_EXT_graphics_pipeline_libraries and VK_EXT_shader_object */
|
|
assert(b->shader->info.stage == MESA_SHADER_VERTEX);
|
|
val = load_sysval(b, graphics, bit_size, vs.noperspective_varyings);
|
|
break;
|
|
|
|
#if PAN_ARCH < 9
|
|
case nir_intrinsic_load_raw_vertex_offset_pan:
|
|
val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
|
|
break;
|
|
case nir_intrinsic_load_num_vertices: /* iter13: XFB index calc */
|
|
val = load_sysval(b, graphics, bit_size, vs.num_vertices);
|
|
break;
|
|
case nir_intrinsic_load_xfb_address: { /* iter13: XFB buffer N base address */
|
|
unsigned idx = nir_intrinsic_base(intr);
|
|
switch (idx) {
|
|
case 0: val = load_sysval(b, graphics, bit_size, vs.xfb_address[0]); break;
|
|
case 1: val = load_sysval(b, graphics, bit_size, vs.xfb_address[1]); break;
|
|
case 2: val = load_sysval(b, graphics, bit_size, vs.xfb_address[2]); break;
|
|
case 3: val = load_sysval(b, graphics, bit_size, vs.xfb_address[3]); break;
|
|
default: return false;
|
|
}
|
|
break;
|
|
}
|
|
case nir_intrinsic_load_layer_id:
|
|
assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
|
|
val = load_sysval(b, graphics, bit_size, layer_id);
|
|
break;
|
|
case nir_intrinsic_load_view_index:
|
|
assert(b->shader->info.stage != MESA_SHADER_COMPUTE);
|
|
if (ctx->state->rp->view_mask == 0)
|
|
val = nir_imm_zero(b, 1, 32);
|
|
else
|
|
val = load_sysval(b, graphics, bit_size, layer_id);
|
|
break;
|
|
#endif
|
|
|
|
case nir_intrinsic_load_draw_id:
|
|
/* Multidraw is supported on v10. */
|
|
if (PAN_ARCH >= 10)
|
|
return false;
|
|
|
|
/* TODO: We only implement single-draw direct and indirect draws, so this
|
|
* is sufficient. We'll revisit this when we get around to implementing
|
|
* multidraw. */
|
|
assert(b->shader->info.stage == MESA_SHADER_VERTEX);
|
|
val = nir_imm_int(b, 0);
|
|
break;
|
|
|
|
case nir_intrinsic_load_printf_buffer_address:
|
|
val = load_sysval(b, common, bit_size, printf_buffer_address);
|
|
break;
|
|
|
|
case nir_intrinsic_load_blend_descriptor_pan: {
|
|
uint32_t loc = nir_intrinsic_base(intr);
|
|
val = load_sysval(b, graphics, bit_size, fs.blend_descs[loc]);
|
|
break;
|
|
}
|
|
|
|
case nir_intrinsic_load_input_attachment_target_pan: {
|
|
const struct vk_input_attachment_location_state *ial =
|
|
ctx->state ? ctx->state->ial : NULL;
|
|
|
|
if (ial && nir_src_is_const(intr->src[0])) {
|
|
uint32_t index = nir_src_as_uint(intr->src[0]);
|
|
uint32_t depth_idx = ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX
|
|
? 0
|
|
: ial->depth_att + 1;
|
|
uint32_t stencil_idx = ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX
|
|
? 0
|
|
: ial->stencil_att + 1;
|
|
uint32_t target = ~0;
|
|
|
|
if (depth_idx == index || stencil_idx == index) {
|
|
target = PANVK_ZS_ATTACHMENT;
|
|
} else {
|
|
for (unsigned i = 0; i < ial->color_attachment_count; i++) {
|
|
if (ial->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
|
|
continue;
|
|
|
|
if (ial->color_map[i] + 1 == index) {
|
|
target = PANVK_COLOR_ATTACHMENT(i);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
val = nir_imm_int(b, target);
|
|
} else {
|
|
nir_def *ia_info =
|
|
load_sysval_entry(b, graphics, bit_size, iam, intr->src[0].ssa);
|
|
|
|
val = nir_channel(b, ia_info, 0);
|
|
}
|
|
break;
|
|
}
|
|
|
|
case nir_intrinsic_load_input_attachment_conv_pan: {
|
|
nir_def *ia_info =
|
|
load_sysval_entry(b, graphics, bit_size, iam, intr->src[0].ssa);
|
|
|
|
val = nir_channel(b, ia_info, 1);
|
|
break;
|
|
}
|
|
|
|
case nir_intrinsic_load_ro_sink_address_poly:
|
|
val = nir_imm_int64(b, PAN_SHADER_OOB_ADDRESS);
|
|
break;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
assert(val->num_components == intr->def.num_components);
|
|
|
|
b->cursor = nir_after_instr(instr);
|
|
nir_def_rewrite_uses(&intr->def, val);
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
panvk_lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin,
|
|
UNUSED void *data)
|
|
{
|
|
if (intrin->intrinsic != nir_intrinsic_load_input)
|
|
return false;
|
|
|
|
b->cursor = nir_before_instr(&intrin->instr);
|
|
nir_def *ld_attr = nir_load_attribute_pan(
|
|
b, intrin->def.num_components, intrin->def.bit_size,
|
|
PAN_ARCH < 9 ?
|
|
nir_load_raw_vertex_id_pan(b) :
|
|
nir_load_vertex_id(b),
|
|
nir_load_instance_id(b),
|
|
nir_get_io_offset_src(intrin)->ssa,
|
|
.base = nir_intrinsic_base(intrin),
|
|
.component = nir_intrinsic_component(intrin),
|
|
.dest_type = nir_intrinsic_dest_type(intrin));
|
|
nir_def_replace(&intrin->def, ld_attr);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
panvk_lower_load_fs_input(nir_builder *b, nir_intrinsic_instr *intrin,
|
|
UNUSED void *data)
|
|
{
|
|
if (intrin->intrinsic != nir_intrinsic_load_input)
|
|
return false;
|
|
|
|
/* Lower PrimitiveID varying loads to the equivalent intrinsic. This only
|
|
* works since v6 and will require additional changes if PrimitiveID is
|
|
* explicitly written to (for example by a geometry shader). */
|
|
if (nir_intrinsic_io_semantics(intrin).location ==
|
|
VARYING_SLOT_PRIMITIVE_ID) {
|
|
b->cursor = nir_before_instr(&intrin->instr);
|
|
nir_def_replace(&intrin->def, nir_load_primitive_id(b));
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
#if PAN_ARCH < 9
|
|
static bool
|
|
lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data)
|
|
{
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
return false;
|
|
|
|
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
|
|
|
if (intr->intrinsic != nir_intrinsic_copy_deref)
|
|
return false;
|
|
|
|
nir_variable *dst_var = nir_intrinsic_get_var(intr, 0);
|
|
nir_variable *src_var = nir_intrinsic_get_var(intr, 1);
|
|
|
|
if (!dst_var || dst_var->data.mode != nir_var_shader_out || !src_var ||
|
|
src_var->data.mode != nir_var_shader_temp)
|
|
return false;
|
|
|
|
if (dst_var->data.location == VARYING_SLOT_LAYER) {
|
|
/* We don't really write the layer, we just make sure primitives are
|
|
* discarded if gl_Layer doesn't match the layer passed to the draw.
|
|
*/
|
|
b->cursor = nir_instr_remove(instr);
|
|
return true;
|
|
}
|
|
|
|
if (dst_var->data.location == VARYING_SLOT_POS) {
|
|
nir_variable *temp_layer_var = data;
|
|
nir_variable *temp_pos_var = src_var;
|
|
|
|
b->cursor = nir_before_instr(instr);
|
|
nir_def *layer = nir_load_var(b, temp_layer_var);
|
|
nir_def *pos = nir_load_var(b, temp_pos_var);
|
|
nir_def *inf_pos = nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, 1.0f);
|
|
nir_def *ref_layer = load_sysval(b, graphics, 32, layer_id);
|
|
|
|
nir_store_var(b, temp_pos_var,
|
|
nir_bcsel(b, nir_ieq(b, layer, ref_layer), pos, inf_pos),
|
|
0xf);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool
|
|
lower_layer_writes(nir_shader *nir)
|
|
{
|
|
if (nir->info.stage == MESA_SHADER_FRAGMENT)
|
|
return false;
|
|
|
|
nir_variable *temp_layer_var = NULL;
|
|
bool has_layer_var = false;
|
|
|
|
nir_foreach_variable_with_modes(var, nir,
|
|
nir_var_shader_out | nir_var_shader_temp) {
|
|
if (var->data.mode == nir_var_shader_out &&
|
|
var->data.location == VARYING_SLOT_LAYER)
|
|
has_layer_var = true;
|
|
|
|
if (var->data.mode == nir_var_shader_temp &&
|
|
var->data.location == VARYING_SLOT_LAYER)
|
|
temp_layer_var = var;
|
|
}
|
|
|
|
if (!has_layer_var)
|
|
return false;
|
|
|
|
assert(temp_layer_var);
|
|
|
|
return nir_shader_instructions_pass(nir, lower_gl_pos_layer_writes,
|
|
nir_metadata_control_flow,
|
|
temp_layer_var);
|
|
}
|
|
#endif
|
|
|
|
static void
|
|
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
|
|
{
|
|
assert(glsl_type_is_vector_or_scalar(type));
|
|
|
|
uint32_t comp_size =
|
|
glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
|
|
unsigned length = glsl_get_vector_elements(type);
|
|
*size = comp_size * length, *align = comp_size * (length == 3 ? 4 : length);
|
|
}
|
|
|
|
static inline nir_address_format
|
|
panvk_buffer_ubo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
|
|
{
|
|
switch (robustness) {
|
|
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
|
|
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
|
|
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
|
|
return PAN_ARCH < 9 ? nir_address_format_32bit_index_offset
|
|
: nir_address_format_vec2_index_32bit_offset;
|
|
default:
|
|
UNREACHABLE("Invalid robust buffer access behavior");
|
|
}
|
|
}
|
|
|
|
static inline nir_address_format
|
|
panvk_buffer_ssbo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
|
|
{
|
|
switch (robustness) {
|
|
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
|
|
return PAN_ARCH < 9 ? nir_address_format_64bit_global_32bit_offset
|
|
: nir_address_format_vec2_index_32bit_offset;
|
|
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
|
|
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
|
|
return PAN_ARCH < 9 ? nir_address_format_64bit_bounded_global
|
|
: nir_address_format_vec2_index_32bit_offset;
|
|
default:
|
|
UNREACHABLE("Invalid robust buffer access behavior");
|
|
}
|
|
}
|
|
|
|
static const nir_shader_compiler_options *
|
|
panvk_get_nir_options(UNUSED struct vk_physical_device *vk_pdev,
|
|
UNUSED mesa_shader_stage stage,
|
|
UNUSED const struct vk_pipeline_robustness_state *rs)
|
|
{
|
|
struct panvk_physical_device *phys_dev = to_panvk_physical_device(vk_pdev);
|
|
return pan_get_nir_shader_compiler_options(
|
|
pan_arch(phys_dev->kmod.dev->props.gpu_id));
|
|
}
|
|
|
|
static struct spirv_to_nir_options
|
|
panvk_get_spirv_options(UNUSED struct vk_physical_device *vk_pdev,
|
|
UNUSED mesa_shader_stage stage,
|
|
const struct vk_pipeline_robustness_state *rs)
|
|
{
|
|
return (struct spirv_to_nir_options){
|
|
.ubo_addr_format = panvk_buffer_ubo_addr_format(rs->uniform_buffers),
|
|
.ssbo_addr_format = panvk_buffer_ssbo_addr_format(rs->storage_buffers),
|
|
.phys_ssbo_addr_format = nir_address_format_64bit_global,
|
|
.shared_addr_format = nir_address_format_32bit_offset,
|
|
};
|
|
}
|
|
|
|
static void
|
|
panvk_preprocess_nir(struct vk_physical_device *vk_pdev,
|
|
nir_shader *nir,
|
|
UNUSED const struct vk_pipeline_robustness_state *rs)
|
|
{
|
|
struct panvk_physical_device *pdev = to_panvk_physical_device(vk_pdev);
|
|
|
|
/* Ensure to regroup output variables at the same location */
|
|
if (nir->info.stage == MESA_SHADER_FRAGMENT)
|
|
NIR_PASS(_, nir, nir_opt_vectorize_io_vars, nir_var_shader_out);
|
|
|
|
NIR_PASS(_, nir, nir_lower_io_vars_to_temporaries,
|
|
nir_shader_get_entrypoint(nir), nir_var_shader_out);
|
|
|
|
#if PAN_ARCH < 9
|
|
/* This needs to be done just after the io_to_temporaries pass, because we
|
|
* rely on out temporaries to collect the final layer_id value.
|
|
*/
|
|
NIR_PASS(_, nir, lower_layer_writes);
|
|
#endif
|
|
|
|
NIR_PASS(_, nir, nir_lower_global_vars_to_local);
|
|
NIR_PASS(_, nir, nir_split_var_copies);
|
|
|
|
NIR_PASS(_, nir, nir_opt_copy_prop_vars);
|
|
NIR_PASS(_, nir, nir_opt_combine_stores, nir_var_all);
|
|
NIR_PASS(_, nir, nir_opt_loop);
|
|
|
|
NIR_PASS(_, nir, nir_opt_barrier_modes);
|
|
NIR_PASS(_, nir, nir_opt_acquire_release_barriers, SCOPE_DEVICE);
|
|
|
|
/* Do texture lowering here. We need to lower texture stuff
|
|
* now, before we call panvk_per_arch(nir_lower_descriptors)() because some
|
|
* of the texture lowering generates nir_texop_txs which we handle as part
|
|
* of descriptor lowering.
|
|
*
|
|
* TODO: We really should be doing this in common code, not duplicated in
|
|
* panvk. In order to do that, we need to rework the panfrost compile
|
|
* flow to look more like the Intel flow:
|
|
*
|
|
* 1. Compile SPIR-V to NIR and maybe do a tiny bit of lowering that needs
|
|
* to be done really early.
|
|
*
|
|
* 2. pan_preprocess_nir: Does common lowering and runs the optimization
|
|
* loop. Nothing here should be API-specific.
|
|
*
|
|
* 3. Do additional lowering in panvk
|
|
*
|
|
* 4. pan_postprocess_nir: Does final lowering and runs the optimization
|
|
* loop again. This can happen as part of the final compile.
|
|
*
|
|
* This would give us a better place to do panvk-specific lowering.
|
|
*/
|
|
pan_nir_lower_texture_early(nir, pdev->kmod.dev->props.gpu_id);
|
|
NIR_PASS(_, nir, nir_lower_system_values);
|
|
|
|
nir_lower_compute_system_values_options options = {
|
|
.has_base_workgroup_id = true,
|
|
};
|
|
|
|
NIR_PASS(_, nir, nir_lower_compute_system_values, &options);
|
|
|
|
if (nir->info.stage == MESA_SHADER_FRAGMENT)
|
|
NIR_PASS(_, nir, nir_lower_wpos_center);
|
|
|
|
pan_optimize_nir(nir, pdev->kmod.dev->props.gpu_id);
|
|
|
|
NIR_PASS(_, nir, nir_split_var_copies);
|
|
NIR_PASS(_, nir, nir_lower_var_copies);
|
|
|
|
assert(pdev->kmod.dev->props.shader_present != 0);
|
|
uint64_t core_max_id =
|
|
util_last_bit(pdev->kmod.dev->props.shader_present) - 1;
|
|
NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_core_max_id_arm,
|
|
core_max_id);
|
|
|
|
pan_preprocess_nir(nir, pdev->kmod.dev->props.gpu_id);
|
|
|
|
}
|
|
|
|
static void
|
|
panvk_hash_state(struct vk_physical_device *device,
|
|
const struct vk_graphics_pipeline_state *state,
|
|
const struct vk_features *enabled_features,
|
|
VkShaderStageFlags stages, blake3_hash blake3_out)
|
|
{
|
|
struct mesa_blake3 blake3_ctx;
|
|
_mesa_blake3_init(&blake3_ctx);
|
|
|
|
if (state != NULL) {
|
|
/* This doesn't impact the shader compile but it does go in the
|
|
* panvk_shader and gets [de]serialized along with the binary so
|
|
* we need to hash it.
|
|
*/
|
|
bool sample_shading_enable =
|
|
state->ms && state->ms->sample_shading_enable;
|
|
_mesa_blake3_update(&blake3_ctx, &sample_shading_enable,
|
|
sizeof(sample_shading_enable));
|
|
|
|
_mesa_blake3_update(&blake3_ctx, &state->rp->view_mask,
|
|
sizeof(state->rp->view_mask));
|
|
|
|
if (state->ial)
|
|
_mesa_blake3_update(&blake3_ctx, state->ial, sizeof(*state->ial));
|
|
}
|
|
|
|
_mesa_blake3_final(&blake3_ctx, blake3_out);
|
|
}
|
|
|
|
#if PAN_ARCH >= 9
|
|
static bool
|
|
valhall_pack_buf_idx(nir_builder *b, nir_instr *instr, UNUSED void *data)
|
|
{
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
return false;
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
unsigned index_src;
|
|
|
|
switch (intrin->intrinsic) {
|
|
case nir_intrinsic_load_ubo:
|
|
case nir_intrinsic_load_ssbo:
|
|
case nir_intrinsic_ssbo_atomic:
|
|
case nir_intrinsic_ssbo_atomic_swap:
|
|
index_src = 0;
|
|
break;
|
|
|
|
case nir_intrinsic_store_ssbo:
|
|
index_src = 1;
|
|
break;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
nir_def *index = intrin->src[index_src].ssa;
|
|
|
|
/* The descriptor lowering pass can add UBO loads, and those already have the
|
|
* right index format. */
|
|
if (index->num_components == 1)
|
|
return false;
|
|
|
|
b->cursor = nir_before_instr(&intrin->instr);
|
|
|
|
/* The valhall backend expects nir_address_format_32bit_index_offset,
|
|
* but address mode is nir_address_format_vec2_index_32bit_offset to allow
|
|
* us to store the array size, set and index without losing information
|
|
* while walking the descriptor deref chain (needed to do a bound check on
|
|
* the array index when we reach the end of the chain).
|
|
* Turn it back to nir_address_format_32bit_index_offset after IOs
|
|
* have been lowered. */
|
|
nir_def *packed_index =
|
|
nir_iadd(b, nir_channel(b, index, 0), nir_channel(b, index, 1));
|
|
nir_src_rewrite(&intrin->src[index_src], packed_index);
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
static bool
|
|
valhall_lower_get_ssbo_size(struct nir_builder *b,
|
|
nir_intrinsic_instr *intr, void *data)
|
|
{
|
|
if (intr->intrinsic != nir_intrinsic_get_ssbo_size)
|
|
return false;
|
|
|
|
b->cursor = nir_before_instr(&intr->instr);
|
|
|
|
nir_def *res_handle = nir_channel(b, intr->src[0].ssa, 0);
|
|
nir_def *table_idx = nir_ushr_imm(b, res_handle, 24);
|
|
nir_def *res_idx = nir_iand_imm(b, res_handle, BITFIELD_MASK(24));
|
|
nir_def *res_table = nir_ior_imm(b, table_idx, pan_res_handle(62, 0));
|
|
nir_def *buf_idx = nir_iadd(b, res_idx, nir_channel(b, intr->src[0].ssa, 1));
|
|
nir_def *desc_offset = nir_imul_imm(b, buf_idx, PANVK_DESCRIPTOR_SIZE);
|
|
nir_def *size = nir_load_ubo(
|
|
b, 1, 32, res_table, nir_iadd_imm(b, desc_offset, 4), .range = ~0u,
|
|
.align_mul = PANVK_DESCRIPTOR_SIZE, .align_offset = 4);
|
|
|
|
nir_def_replace(&intr->def, size);
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
collect_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr,
|
|
void *data)
|
|
{
|
|
if (intr->intrinsic != nir_intrinsic_load_push_constant)
|
|
return false;
|
|
|
|
struct panvk_shader_variant *shader = data;
|
|
uint32_t base = nir_intrinsic_base(intr);
|
|
bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
|
|
uint32_t offset, size;
|
|
|
|
if (is_sysval)
|
|
base -= SYSVALS_PUSH_CONST_BASE;
|
|
|
|
/* If the offset is dynamic, we need to flag [base:base+range] as used, to
|
|
* allow global mem access. */
|
|
if (!nir_src_is_const(intr->src[0])) {
|
|
offset = base;
|
|
size = nir_intrinsic_range(intr);
|
|
|
|
/* Flag the push_uniforms sysval as needed if we have an indirect offset.
|
|
*/
|
|
shader_use_sysval(shader, common, push_uniforms);
|
|
} else {
|
|
offset = base + nir_src_as_uint(intr->src[0]);
|
|
size = (intr->def.bit_size / 8) * intr->def.num_components;
|
|
}
|
|
|
|
if (is_sysval)
|
|
shader_use_sysval_range(shader, offset, size);
|
|
else
|
|
shader_use_push_const_range(shader, offset, size);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
move_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
|
{
|
|
if (intr->intrinsic != nir_intrinsic_load_push_constant)
|
|
return false;
|
|
|
|
struct panvk_shader_variant *shader = data;
|
|
unsigned base = nir_intrinsic_base(intr);
|
|
bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
|
|
|
|
if (is_sysval)
|
|
base -= SYSVALS_PUSH_CONST_BASE;
|
|
|
|
b->cursor = nir_before_instr(&intr->instr);
|
|
|
|
if (nir_src_is_const(intr->src[0])) {
|
|
unsigned offset = base + nir_src_as_uint(intr->src[0]);
|
|
|
|
/* We place the sysvals first, and then comes the user push constants.
|
|
* We do that so we always have the blend constants at offset 0 for
|
|
* blend shaders. */
|
|
if (is_sysval)
|
|
offset = shader_remapped_sysval_offset(shader, offset);
|
|
else
|
|
offset = shader_remapped_push_const_offset(shader, offset);
|
|
|
|
nir_src_rewrite(&intr->src[0], nir_imm_int(b, offset));
|
|
|
|
/* We always set the range/base to zero, to make sure no pass is using it
|
|
* after that point. */
|
|
nir_intrinsic_set_base(intr, 0);
|
|
nir_intrinsic_set_range(intr, 0);
|
|
} else {
|
|
/* We don't use load_sysval() on purpose, because it would set
|
|
* .base=SYSVALS_PUSH_CONST_BASE, and we're supposed to force a base of
|
|
* zero in this pass. */
|
|
unsigned push_const_buf_offset = shader_remapped_sysval_offset(
|
|
shader, sysval_offset(common, push_uniforms));
|
|
nir_def *push_const_buf = nir_load_push_constant(
|
|
b, 1, 64, nir_imm_int(b, push_const_buf_offset));
|
|
unsigned push_const_offset = is_sysval ?
|
|
shader_remapped_sysval_offset(shader, base) :
|
|
shader_remapped_push_const_offset(shader, base);
|
|
nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, push_const_offset);
|
|
unsigned align = nir_combined_align(nir_intrinsic_align_mul(intr),
|
|
nir_intrinsic_align_offset(intr));
|
|
|
|
/* We assume an alignment of 64-bit max for packed push-constants. */
|
|
align = MIN2(align, FAU_WORD_SIZE);
|
|
nir_def *value = nir_load_global(
|
|
b, intr->def.num_components, intr->def.bit_size,
|
|
nir_iadd(b, push_const_buf, nir_u2u64(b, offset)), .align_mul = align);
|
|
|
|
nir_def_replace(&intr->def, value);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader)
|
|
{
|
|
/* Before we lower load_push_constant()s with a dynamic offset to global
|
|
* loads, we want to run a few optimization passes to get rid of offset
|
|
* calculation involving only constant values. */
|
|
bool progress = false;
|
|
do {
|
|
progress = false;
|
|
NIR_PASS(progress, nir, nir_opt_copy_prop);
|
|
NIR_PASS(progress, nir, nir_opt_remove_phis);
|
|
NIR_PASS(progress, nir, nir_opt_dce);
|
|
NIR_PASS(progress, nir, nir_opt_dead_cf);
|
|
NIR_PASS(progress, nir, nir_opt_cse);
|
|
|
|
nir_opt_peephole_select_options peephole_select_options = {
|
|
.limit = 64,
|
|
.expensive_alu_ok = true,
|
|
};
|
|
NIR_PASS(progress, nir, nir_opt_peephole_select, &peephole_select_options);
|
|
NIR_PASS(progress, nir, nir_opt_algebraic);
|
|
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
|
} while (progress);
|
|
|
|
/* We always reserve the 4 blend constant words for fragment shaders,
|
|
* because we don't know the blend configuration at this point, and
|
|
* we might end up with a blend shader reading those blend constants. */
|
|
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
|
|
/* We rely on blend constants being placed first and covering 4 words. */
|
|
STATIC_ASSERT(
|
|
offsetof(struct panvk_graphics_sysvals, blend.constants) == 0 &&
|
|
sizeof(((struct panvk_graphics_sysvals *)NULL)->blend.constants) ==
|
|
16);
|
|
|
|
shader_use_sysval(shader, graphics, blend.constants);
|
|
}
|
|
|
|
progress = false;
|
|
NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant,
|
|
nir_metadata_all, shader);
|
|
|
|
/* Some load_push_constant instructions might be eliminated after
|
|
* scalarization+dead-code-elimination. Since these pass happen in
|
|
* bifrost_compile(), we can't run the push_constant packing after the
|
|
* optimization took place, so let's just have our own FAU count instead
|
|
* of using info.push.count to make it consistent with the
|
|
* used_{sysvals,push_consts} bitmaps, even if it sometimes implies loading
|
|
* more than we really need. Doing that also takes into account the fact
|
|
* blend constants are never loaded from the fragment shader, but might be
|
|
* needed in the blend shader. */
|
|
shader->fau.sysval_count = BITSET_COUNT(shader->fau.used_sysvals);
|
|
/* 32 FAUs (256 bytes) are reserved for API push constants */
|
|
assert(shader->fau.sysval_count <= FAU_WORD_COUNT - 32 &&
|
|
"too many sysval FAUs");
|
|
shader->fau.total_count =
|
|
shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts);
|
|
assert(shader->fau.total_count <= FAU_WORD_COUNT &&
|
|
"asking for more FAUs than the hardware has to offer");
|
|
|
|
if (!progress)
|
|
return;
|
|
|
|
NIR_PASS(_, nir, nir_shader_intrinsics_pass, move_push_constant,
|
|
nir_metadata_control_flow, shader);
|
|
}
|
|
|
|
struct lower_ycbcr_state {
|
|
uint32_t set_layout_count;
|
|
struct vk_descriptor_set_layout *const *set_layouts;
|
|
};
|
|
|
|
static const struct vk_ycbcr_conversion_state *
|
|
lookup_ycbcr_conversion(const void *_state, uint32_t set,
|
|
uint32_t binding, uint32_t array_index)
|
|
{
|
|
const struct lower_ycbcr_state *state = _state;
|
|
assert(set < state->set_layout_count);
|
|
assert(state->set_layouts[set] != NULL);
|
|
const struct panvk_descriptor_set_layout *set_layout =
|
|
to_panvk_descriptor_set_layout(state->set_layouts[set]);
|
|
assert(binding < set_layout->binding_count);
|
|
|
|
const struct panvk_descriptor_set_binding_layout *bind_layout =
|
|
&set_layout->bindings[binding];
|
|
|
|
if (bind_layout->immutable_samplers == NULL)
|
|
return NULL;
|
|
|
|
array_index = MIN2(array_index, bind_layout->desc_count - 1);
|
|
|
|
const struct panvk_sampler *sampler =
|
|
bind_layout->immutable_samplers[array_index];
|
|
|
|
return sampler && sampler->vk.ycbcr_conversion ?
|
|
&sampler->vk.ycbcr_conversion->state : NULL;
|
|
}
|
|
|
|
static int
|
|
glsl_type_size(const struct glsl_type *type, bool bindless)
|
|
{
|
|
return glsl_count_attribute_slots(type, false);
|
|
}
|
|
|
|
static void
|
|
panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
|
|
uint32_t set_layout_count,
|
|
struct vk_descriptor_set_layout *const *set_layouts,
|
|
const struct vk_pipeline_robustness_state *rs,
|
|
const struct vk_graphics_pipeline_state *state,
|
|
struct panvk_shader_desc_info *desc_info)
|
|
{
|
|
mesa_shader_stage stage = nir->info.stage;
|
|
|
|
const nir_opt_access_options access_options = {
|
|
.is_vulkan = true,
|
|
};
|
|
NIR_PASS(_, nir, nir_opt_access, &access_options);
|
|
|
|
const struct lower_ycbcr_state ycbcr_state = {
|
|
.set_layout_count = set_layout_count,
|
|
.set_layouts = set_layouts,
|
|
};
|
|
NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion,
|
|
&ycbcr_state);
|
|
|
|
panvk_per_arch(nir_lower_descriptors)(nir, dev, rs, set_layout_count,
|
|
set_layouts, state, desc_info);
|
|
|
|
NIR_PASS(_, nir, nir_split_var_copies);
|
|
NIR_PASS(_, nir, nir_lower_var_copies);
|
|
|
|
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
|
|
panvk_buffer_ubo_addr_format(rs->uniform_buffers));
|
|
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
|
|
panvk_buffer_ssbo_addr_format(rs->storage_buffers));
|
|
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
|
|
nir_address_format_32bit_offset);
|
|
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
|
|
nir_address_format_64bit_global);
|
|
|
|
/* nir_lower_non_uniform_access needs to run after lowering UBO and SSBO
|
|
* IO. This means we run it after nir_lower_descriptors, which reads the
|
|
* array indices, but it's okay because lower_descriptors treats all
|
|
* dynamic indices the same. */
|
|
enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
|
|
nir_lower_non_uniform_ubo_access |
|
|
nir_lower_non_uniform_ssbo_access |
|
|
nir_lower_non_uniform_texture_access |
|
|
nir_lower_non_uniform_image_access |
|
|
nir_lower_non_uniform_get_ssbo_size;
|
|
#if PAN_ARCH < 9
|
|
lower_non_uniform_access_types |=
|
|
nir_lower_non_uniform_texture_offset_access;
|
|
#endif
|
|
|
|
/* In practice, most shaders do not have non-uniform-qualified accesses
|
|
* thus a cheaper and likely to fail check is run first. */
|
|
if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
|
|
NIR_PASS(_, nir, nir_opt_non_uniform_access);
|
|
struct nir_lower_non_uniform_access_options opts = {
|
|
.types = lower_non_uniform_access_types,
|
|
};
|
|
NIR_PASS(_, nir, nir_lower_non_uniform_access, &opts);
|
|
}
|
|
|
|
#if PAN_ARCH >= 9
|
|
NIR_PASS(_, nir, nir_shader_intrinsics_pass, valhall_lower_get_ssbo_size,
|
|
nir_metadata_control_flow, NULL);
|
|
NIR_PASS(_, nir, nir_shader_instructions_pass, valhall_pack_buf_idx,
|
|
nir_metadata_control_flow, NULL);
|
|
#endif
|
|
|
|
if (mesa_shader_stage_uses_workgroup(stage)) {
|
|
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
|
|
shared_type_info);
|
|
|
|
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
|
|
nir_address_format_32bit_offset);
|
|
}
|
|
|
|
if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
|
|
/* Align everything up to 16 bytes to take advantage of load store
|
|
* vectorization. */
|
|
nir->info.shared_size = align(nir->info.shared_size, 16);
|
|
NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size,
|
|
16);
|
|
|
|
/* We need to call lower_compute_system_values again because
|
|
* nir_zero_initialize_shared_memory generates load_invocation_id which
|
|
* has to be lowered to load_invocation_index.
|
|
*/
|
|
NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
|
|
}
|
|
|
|
/* Needed to turn shader_temp into function_temp since the backend only
|
|
* handles the latter for now.
|
|
*/
|
|
NIR_PASS(_, nir, nir_lower_global_vars_to_local);
|
|
|
|
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
|
|
if (PANVK_DEBUG(NIR)) {
|
|
mesa_logi("translated nir:");
|
|
nir_log_shaderi(nir);
|
|
}
|
|
}
|
|
|
|
static void
|
|
panvk_lower_nir_io(nir_shader *nir)
|
|
{
|
|
NIR_PASS(_, nir, nir_lower_var_copies);
|
|
NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees,
|
|
nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
|
|
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
|
|
glsl_type_size, nir_lower_io_use_interpolated_input_intrinsics);
|
|
|
|
#if PAN_ARCH < 9
|
|
/* iter13: VK_EXT_transform_feedback — runs AFTER nir_lower_io so that
|
|
* shader outputs are now store_output intrinsics that pan_nir_lower_xfb
|
|
* can rewrite to nir_store_global+nir_load_xfb_address. */
|
|
if (nir->info.stage == MESA_SHADER_VERTEX &&
|
|
nir->info.has_transform_feedback_varyings) {
|
|
NIR_PASS(_, nir, nir_opt_constant_folding);
|
|
NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info);
|
|
NIR_PASS(_, nir, pan_nir_lower_xfb);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
static VkResult
|
|
panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
|
|
VkShaderCreateFlagsEXT shader_flags,
|
|
const struct pan_compile_inputs *compile_input,
|
|
const struct vk_graphics_pipeline_state *state,
|
|
const uint32_t *noperspective_varyings,
|
|
struct panvk_shader_variant *shader)
|
|
{
|
|
const bool dump_asm =
|
|
shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;
|
|
|
|
/* We're going to modify this so make our own copy to be nicer to callers */
|
|
struct pan_compile_inputs input = *compile_input;
|
|
|
|
pan_postprocess_nir(nir, input.gpu_id);
|
|
|
|
if (nir->info.stage == MESA_SHADER_VERTEX)
|
|
NIR_PASS(_, nir, nir_shader_intrinsics_pass, panvk_lower_load_vs_input,
|
|
nir_metadata_control_flow, NULL);
|
|
else if (nir->info.stage == MESA_SHADER_FRAGMENT)
|
|
NIR_PASS(_, nir, nir_shader_intrinsics_pass, panvk_lower_load_fs_input,
|
|
nir_metadata_control_flow, NULL);
|
|
|
|
/* since valhall, panvk_per_arch(nir_lower_descriptors) separates the
|
|
* driver set and the user sets, and does not need pan_nir_lower_image_index
|
|
*/
|
|
if (PAN_ARCH < 9 && nir->info.stage == MESA_SHADER_VERTEX) {
|
|
NIR_PASS(_, nir, pan_nir_lower_image_index, MAX_VS_ATTRIBS);
|
|
NIR_PASS(_, nir, pan_nir_lower_texel_buffer_fetch_index, MAX_VS_ATTRIBS);
|
|
}
|
|
pan_nir_lower_texture_late(nir, input.gpu_id);
|
|
|
|
if (noperspective_varyings && nir->info.stage == MESA_SHADER_VERTEX) {
|
|
NIR_PASS(_, nir, nir_inline_sysval,
|
|
nir_intrinsic_load_noperspective_varyings_pan,
|
|
*noperspective_varyings);
|
|
}
|
|
|
|
struct panvk_lower_sysvals_context lower_sysvals_ctx = {
|
|
.shader = shader,
|
|
.state = state,
|
|
};
|
|
|
|
NIR_PASS(_, nir, nir_shader_instructions_pass, panvk_lower_sysvals,
|
|
nir_metadata_control_flow, &lower_sysvals_ctx);
|
|
|
|
lower_load_push_consts(nir, shader);
|
|
|
|
/* Allow the remaining FAU space to be filled with constants. */
|
|
input.fau_consts.max_amount =
|
|
2 * (FAU_WORD_COUNT - shader->fau.total_count);
|
|
input.fau_consts.offset = shader->fau.total_count * 2;
|
|
input.fau_consts.values = &shader->info.fau_consts[0];
|
|
assert(input.fau_consts.max_amount <= ARRAY_SIZE(shader->info.fau_consts));
|
|
|
|
struct util_dynarray binary = UTIL_DYNARRAY_INIT;
|
|
pan_shader_compile(nir, &input, &binary, &shader->info);
|
|
|
|
/* Propagate potential additional FAU values into the panvk info struct. */
|
|
/* FAU consts are pushed as 32bit values, but total_count is for 64bit
|
|
* ones. */
|
|
shader->fau.total_count += DIV_ROUND_UP(shader->info.fau_consts_count, 2);
|
|
|
|
void *bin_ptr = util_dynarray_element(&binary, uint8_t, 0);
|
|
unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t);
|
|
|
|
shader->bin_size = 0;
|
|
shader->bin_ptr = NULL;
|
|
|
|
if (bin_size) {
|
|
void *data = malloc(bin_size);
|
|
|
|
if (data == NULL)
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
memcpy(data, bin_ptr, bin_size);
|
|
shader->bin_size = bin_size;
|
|
shader->bin_ptr = data;
|
|
}
|
|
util_dynarray_fini(&binary);
|
|
|
|
if (dump_asm) {
|
|
shader->nir_str = nir_shader_as_str(nir, NULL);
|
|
|
|
char *data = NULL;
|
|
size_t disasm_size = 0;
|
|
|
|
if (shader->bin_size) {
|
|
struct u_memstream mem;
|
|
if (u_memstream_open(&mem, &data, &disasm_size)) {
|
|
FILE *const stream = u_memstream_get(&mem);
|
|
pan_disassemble(stream, shader->bin_ptr, shader->bin_size,
|
|
compile_input->gpu_id, false);
|
|
u_memstream_close(&mem);
|
|
}
|
|
}
|
|
|
|
char *asm_str = malloc(disasm_size + 1);
|
|
memcpy(asm_str, data, disasm_size);
|
|
asm_str[disasm_size] = '\0';
|
|
free(data);
|
|
|
|
shader->asm_str = asm_str;
|
|
}
|
|
|
|
/* We need to update info.push.count because it's used to initialize the
|
|
* RSD in pan_shader_prepare_rsd().
|
|
*/
|
|
shader->info.push.count = shader->fau.total_count * 2;
|
|
|
|
#if PAN_ARCH < 9
|
|
/* Patch the descriptor count */
|
|
shader->info.ubo_count =
|
|
shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_UBO] +
|
|
shader->desc_info.dyn_ubos.count;
|
|
shader->info.texture_count =
|
|
shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_TEXTURE];
|
|
shader->info.sampler_count =
|
|
shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_SAMPLER];
|
|
|
|
/* Dummy sampler. */
|
|
if (!shader->info.sampler_count && shader->info.texture_count)
|
|
shader->info.sampler_count++;
|
|
|
|
if (nir->info.stage == MESA_SHADER_VERTEX) {
|
|
/* We leave holes in the attribute locations, but pan_shader.c assumes the
|
|
* opposite. Patch attribute_count accordingly, so
|
|
* pan_shader_prepare_rsd() does what we expect.
|
|
*/
|
|
uint32_t gen_attribs =
|
|
(shader->info.attributes_read & VERT_BIT_GENERIC_ALL) >>
|
|
VERT_ATTRIB_GENERIC0;
|
|
|
|
shader->info.attribute_count = util_last_bit(gen_attribs);
|
|
|
|
/* NULL IDVS shaders are not allowed. */
|
|
if (!bin_size)
|
|
shader->info.vs.idvs = false;
|
|
}
|
|
|
|
/* Image attributes start at MAX_VS_ATTRIBS in the VS attribute table,
|
|
* and zero in other stages.
|
|
*/
|
|
if (shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] > 0)
|
|
shader->info.attribute_count =
|
|
shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] +
|
|
(nir->info.stage == MESA_SHADER_VERTEX ? MAX_VS_ATTRIBS : 0);
|
|
#endif
|
|
|
|
switch (nir->info.stage) {
|
|
case MESA_SHADER_COMPUTE:
|
|
case MESA_SHADER_KERNEL:
|
|
shader->cs.local_size.x = nir->info.workgroup_size[0];
|
|
shader->cs.local_size.y = nir->info.workgroup_size[1];
|
|
shader->cs.local_size.z = nir->info.workgroup_size[2];
|
|
break;
|
|
|
|
case MESA_SHADER_FRAGMENT:
|
|
shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info, PAN_ARCH);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
#if PAN_ARCH >= 9
|
|
static enum mali_flush_to_zero_mode
|
|
shader_ftz_mode(struct panvk_shader_variant *shader)
|
|
{
|
|
if (shader->info.ftz_fp32) {
|
|
if (shader->info.ftz_fp16)
|
|
return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
|
|
else
|
|
return MALI_FLUSH_TO_ZERO_MODE_DX11;
|
|
} else {
|
|
/* We don't have a "flush FP16, preserve FP32" mode, but APIs
|
|
* should not be able to generate that.
|
|
*/
|
|
assert(!shader->info.ftz_fp16 && !shader->info.ftz_fp32);
|
|
return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
static VkResult
|
|
panvk_shader_upload(struct panvk_device *dev,
|
|
struct panvk_shader_variant *shader,
|
|
const VkAllocationCallbacks *pAllocator)
|
|
{
|
|
shader->code_mem = (struct panvk_priv_mem){0};
|
|
|
|
#if PAN_ARCH < 9
|
|
shader->rsd = (struct panvk_priv_mem){0};
|
|
#else
|
|
shader->spd = (struct panvk_priv_mem){0};
|
|
#endif
|
|
|
|
if (!shader->bin_size)
|
|
return VK_SUCCESS;
|
|
|
|
shader->code_mem = panvk_pool_upload_aligned(
|
|
&dev->mempools.exec, shader->bin_ptr, shader->bin_size, 128);
|
|
if (!panvk_priv_mem_check_alloc(shader->code_mem))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
#if PAN_ARCH < 9
|
|
if (shader->info.stage == MESA_SHADER_FRAGMENT)
|
|
return VK_SUCCESS;
|
|
|
|
shader->rsd = panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);
|
|
if (!panvk_priv_mem_check_alloc(shader->rsd))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_desc(shader->rsd, 0, RENDERER_STATE, cfg) {
|
|
pan_shader_prepare_rsd(&shader->info,
|
|
panvk_shader_variant_get_dev_addr(shader), &cfg);
|
|
}
|
|
#else
|
|
if (shader->info.stage != MESA_SHADER_VERTEX) {
|
|
shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
|
|
if (!panvk_priv_mem_check_alloc(shader->spd))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) {
|
|
cfg.stage = pan_shader_stage(&shader->info);
|
|
|
|
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
|
|
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
|
|
#if PAN_ARCH < 12
|
|
else if (cfg.stage == MALI_SHADER_STAGE_VERTEX)
|
|
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
|
|
#endif
|
|
|
|
cfg.register_allocation =
|
|
pan_register_allocation(shader->info.work_reg_count);
|
|
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
|
|
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
|
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
|
|
|
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
|
|
cfg.requires_helper_threads = shader->info.contains_barrier;
|
|
}
|
|
} else {
|
|
#if PAN_ARCH >= 12
|
|
shader->spds.all_points =
|
|
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
|
|
if (!panvk_priv_mem_check_alloc(shader->spds.all_points))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM,
|
|
cfg) {
|
|
cfg.stage = pan_shader_stage(&shader->info);
|
|
cfg.register_allocation =
|
|
pan_register_allocation(shader->info.work_reg_count);
|
|
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
|
|
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
|
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
|
}
|
|
|
|
shader->spds.all_triangles =
|
|
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
|
|
if (!panvk_priv_mem_check_alloc(shader->spds.all_triangles))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM,
|
|
cfg) {
|
|
cfg.stage = pan_shader_stage(&shader->info);
|
|
cfg.register_allocation =
|
|
pan_register_allocation(shader->info.work_reg_count);
|
|
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
|
|
shader->info.vs.no_psiz_offset;
|
|
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
|
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
|
}
|
|
#else
|
|
shader->spds.pos_points =
|
|
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
|
|
if (!panvk_priv_mem_check_alloc(shader->spds.pos_points))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_desc(shader->spds.pos_points, 0, SHADER_PROGRAM,
|
|
cfg) {
|
|
cfg.stage = pan_shader_stage(&shader->info);
|
|
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
|
|
cfg.register_allocation =
|
|
pan_register_allocation(shader->info.work_reg_count);
|
|
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
|
|
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
|
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
|
}
|
|
|
|
shader->spds.pos_triangles =
|
|
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
|
|
if (!panvk_priv_mem_check_alloc(shader->spds.pos_triangles))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_desc(shader->spds.pos_triangles, 0, SHADER_PROGRAM,
|
|
cfg) {
|
|
cfg.stage = pan_shader_stage(&shader->info);
|
|
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
|
|
cfg.register_allocation =
|
|
pan_register_allocation(shader->info.work_reg_count);
|
|
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
|
|
shader->info.vs.no_psiz_offset;
|
|
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
|
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
|
}
|
|
|
|
if (shader->info.vs.secondary_enable) {
|
|
shader->spds.var =
|
|
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
|
|
if (!panvk_priv_mem_check_alloc(shader->spds.var))
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_desc(shader->spds.var, 0, SHADER_PROGRAM, cfg) {
|
|
unsigned work_count = shader->info.vs.secondary_work_reg_count;
|
|
|
|
cfg.stage = pan_shader_stage(&shader->info);
|
|
cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
|
|
cfg.register_allocation = pan_register_allocation(work_count);
|
|
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
|
|
shader->info.vs.secondary_offset;
|
|
cfg.preload.r48_r63 = (shader->info.vs.secondary_preload >> 48);
|
|
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static void
|
|
panvk_shader_variant_destroy(struct panvk_shader_variant *shader)
|
|
{
|
|
free((void *)shader->asm_str);
|
|
ralloc_free((void *)shader->nir_str);
|
|
|
|
panvk_pool_free_mem(&shader->code_mem);
|
|
|
|
#if PAN_ARCH < 9
|
|
panvk_pool_free_mem(&shader->rsd);
|
|
panvk_pool_free_mem(&shader->desc_info.others.map);
|
|
#else
|
|
if (shader->info.stage != MESA_SHADER_VERTEX) {
|
|
panvk_pool_free_mem(&shader->spd);
|
|
} else {
|
|
#if PAN_ARCH >= 12
|
|
panvk_pool_free_mem(&shader->spds.all_points);
|
|
panvk_pool_free_mem(&shader->spds.all_triangles);
|
|
#else
|
|
panvk_pool_free_mem(&shader->spds.var);
|
|
panvk_pool_free_mem(&shader->spds.pos_points);
|
|
panvk_pool_free_mem(&shader->spds.pos_triangles);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
if (shader->own_bin)
|
|
free((void *)shader->bin_ptr);
|
|
}
|
|
|
|
static void
|
|
panvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
|
|
const VkAllocationCallbacks *pAllocator)
|
|
{
|
|
struct panvk_shader *shader =
|
|
container_of(vk_shader, struct panvk_shader, vk);
|
|
|
|
panvk_shader_foreach_variant(shader, variant) {
|
|
panvk_shader_variant_destroy(variant);
|
|
}
|
|
|
|
vk_shader_free(vk_dev, pAllocator, &shader->vk);
|
|
}
|
|
|
|
static const struct vk_shader_ops panvk_shader_ops;
|
|
|
|
static VkResult
|
|
panvk_compile_shader(struct panvk_device *dev,
|
|
struct vk_shader_compile_info *info,
|
|
const struct vk_graphics_pipeline_state *state,
|
|
const uint32_t *noperspective_varyings,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
struct vk_shader **shader_out)
|
|
{
|
|
struct panvk_physical_device *phys_dev =
|
|
to_panvk_physical_device(dev->vk.physical);
|
|
|
|
struct panvk_shader *shader;
|
|
VkResult result;
|
|
|
|
size_t size =
|
|
sizeof(struct panvk_shader) + sizeof(struct panvk_shader_variant) *
|
|
panvk_shader_num_variants(info->stage);
|
|
shader = vk_shader_zalloc(&dev->vk, &panvk_shader_ops, info->stage,
|
|
pAllocator, size);
|
|
if (shader == NULL)
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
nir_variable_mode robust2_modes = 0;
|
|
if (info->robustness->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
|
|
robust2_modes |= nir_var_mem_ubo;
|
|
if (info->robustness->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
|
|
robust2_modes |= nir_var_mem_ssbo;
|
|
|
|
struct pan_compile_inputs inputs = {
|
|
.gpu_id = phys_dev->kmod.dev->props.gpu_id,
|
|
.gpu_variant = phys_dev->kmod.dev->props.gpu_variant,
|
|
.view_mask = (state && state->rp) ? state->rp->view_mask : 0,
|
|
.robust2_modes = robust2_modes,
|
|
.robust_descriptors = dev->vk.enabled_features.nullDescriptor,
|
|
/* iter13: XFB shaders must disable IDVS (matches Panfrost-Gallium). */
|
|
.no_idvs = (info->stage == MESA_SHADER_VERTEX) &&
|
|
info->nir->info.has_transform_feedback_varyings,
|
|
};
|
|
|
|
switch (info->stage) {
|
|
case MESA_SHADER_VERTEX: {
|
|
const enum panvk_vs_variant last_variant = PANVK_VS_VARIANT_HW;
|
|
for (enum panvk_vs_variant v = 0; v <= last_variant; v++) {
|
|
struct panvk_shader_variant *variant = &shader->variants[v];
|
|
|
|
/* Each variant gets its own NIR. To save an extra clone, we use the
|
|
* original NIR for the last stage.
|
|
*/
|
|
const bool clone_nir = (v != last_variant);
|
|
nir_shader *nir =
|
|
clone_nir ? nir_shader_clone(NULL, info->nir) : info->nir;
|
|
|
|
#if PAN_ARCH >= 10
|
|
if (inputs.view_mask) {
|
|
nir_lower_multiview_options options = {
|
|
.view_mask = inputs.view_mask,
|
|
.allowed_per_view_outputs = ~0
|
|
};
|
|
|
|
/* The only case where this should fail is with memory/image
|
|
* writes, which we don't support in vertex shaders
|
|
*/
|
|
assert(nir_can_lower_multiview(nir, options));
|
|
NIR_PASS(_, nir, nir_lower_multiview, options);
|
|
|
|
/* Pull output writes out of the loop and give them constant
|
|
* offsets for pan_lower_store_components
|
|
*/
|
|
NIR_PASS(_, nir, nir_lower_io_vars_to_temporaries,
|
|
nir_shader_get_entrypoint(nir), nir_var_shader_out);
|
|
}
|
|
#endif
|
|
|
|
panvk_lower_nir(dev, nir, info->set_layout_count,
|
|
info->set_layouts, info->robustness,
|
|
state, &variant->desc_info);
|
|
|
|
/* We need the driver_location to match the vertex attribute
|
|
* location, so we can use the attribute layout described by
|
|
* vk_vertex_input_state where there are holes in the attribute
|
|
* locations.
|
|
*/
|
|
nir_foreach_shader_in_variable(var, nir) {
|
|
assert(var->data.location >= VERT_ATTRIB_GENERIC0 &&
|
|
var->data.location <= VERT_ATTRIB_GENERIC15);
|
|
var->data.driver_location =
|
|
var->data.location - VERT_ATTRIB_GENERIC0;
|
|
}
|
|
nir_assign_io_var_locations(nir, nir_var_shader_out);
|
|
panvk_lower_nir_io(nir);
|
|
|
|
variant->own_bin = true;
|
|
|
|
result = panvk_compile_nir(dev, nir, info->flags, &inputs, state,
|
|
noperspective_varyings, variant);
|
|
|
|
/* If we cloned, it's our job to clean up */
|
|
if (clone_nir)
|
|
ralloc_free(nir);
|
|
|
|
if (result != VK_SUCCESS) {
|
|
panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
|
|
return result;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
case MESA_SHADER_FRAGMENT: {
|
|
struct panvk_shader_variant *variant =
|
|
(struct panvk_shader_variant *)panvk_shader_only_variant(shader);
|
|
|
|
nir_shader *nir = info->nir;
|
|
|
|
if (state && state->ms && state->ms->sample_shading_enable)
|
|
nir->info.fs.uses_sample_shading = true;
|
|
|
|
/* We need to lower input attachments before we lower descriptors */
|
|
NIR_PASS(_, nir, panvk_per_arch(nir_lower_input_attachment_loads),
|
|
state, &variant->fs.input_attachment_read);
|
|
|
|
/* Lower input intrinsics for fragment shaders early to get the max
|
|
* number of varying loads, as this number is required during descriptor
|
|
* lowering for v9+.
|
|
*/
|
|
nir_assign_io_var_locations(nir, nir_var_shader_in);
|
|
|
|
#if PAN_ARCH >= 9
|
|
/* LD_VAR_BUF[_IMM] takes an 8-bit offset, limiting its use to 16 or
|
|
* less varyings, assuming highp vec4.
|
|
*/
|
|
inputs.valhall.use_ld_var_buf = nir->num_inputs <= 16;
|
|
variant->desc_info.fs_varying_attr_desc_count =
|
|
inputs.valhall.use_ld_var_buf ? 0 : nir->num_inputs;
|
|
#endif
|
|
|
|
panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,
|
|
info->robustness, state, &variant->desc_info);
|
|
|
|
nir_assign_io_var_locations(nir, nir_var_shader_out);
|
|
panvk_lower_nir_io(nir);
|
|
|
|
/* Lower FS outputs now so that we can lower load_blend_descriptor_pan
|
|
* to a driver-provided FAU instead of using the blend descriptors
|
|
* uploaded by the hardware. See panvk_vX_blend.c for details.
|
|
*/
|
|
NIR_PASS(_, nir, nir_opt_constant_folding);
|
|
NIR_PASS(_, nir, pan_nir_lower_fs_outputs, false);
|
|
|
|
variant->own_bin = true;
|
|
|
|
result = panvk_compile_nir(dev, nir, info->flags, &inputs, state,
|
|
noperspective_varyings, variant);
|
|
if (result != VK_SUCCESS) {
|
|
panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
|
|
return result;
|
|
}
|
|
break;
|
|
}
|
|
|
|
case MESA_SHADER_COMPUTE: {
|
|
struct panvk_shader_variant *variant =
|
|
(struct panvk_shader_variant *)panvk_shader_only_variant(shader);
|
|
|
|
nir_shader *nir = info->nir;
|
|
|
|
panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,
|
|
info->robustness, state, &variant->desc_info);
|
|
|
|
variant->own_bin = true;
|
|
|
|
result = panvk_compile_nir(dev, nir, info->flags, &inputs, state,
|
|
noperspective_varyings, variant);
|
|
if (result != VK_SUCCESS) {
|
|
panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
|
|
return result;
|
|
}
|
|
break;
|
|
}
|
|
|
|
default:
|
|
UNREACHABLE("Unknown shader stage");
|
|
}
|
|
|
|
panvk_shader_foreach_variant(shader, variant) {
|
|
result = panvk_shader_upload(dev, variant, pAllocator);
|
|
if (result != VK_SUCCESS) {
|
|
panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
|
|
return result;
|
|
}
|
|
}
|
|
|
|
*shader_out = &shader->vk;
|
|
|
|
return result;
|
|
}
|
|
|
|
VkResult
|
|
panvk_per_arch(create_shader_from_binary)(struct panvk_device *dev,
|
|
const struct pan_shader_info *info,
|
|
struct pan_compute_dim local_size,
|
|
const void *bin_ptr, size_t bin_size,
|
|
struct panvk_shader **shader_out)
|
|
{
|
|
struct panvk_shader *shader;
|
|
VkResult result;
|
|
|
|
size_t size =
|
|
sizeof(struct panvk_shader) + sizeof(struct panvk_shader_variant) *
|
|
panvk_shader_num_variants(info->stage);
|
|
shader = vk_shader_zalloc(&dev->vk, &panvk_shader_ops, info->stage,
|
|
&dev->vk.alloc, size);
|
|
if (shader == NULL)
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
assert(panvk_shader_num_variants(info->stage) == 1);
|
|
|
|
struct panvk_shader_variant *variant = &shader->variants[0];
|
|
variant->info = *info;
|
|
variant->cs.local_size = local_size;
|
|
variant->bin_ptr = bin_ptr;
|
|
variant->bin_size = bin_size;
|
|
variant->own_bin = false;
|
|
variant->nir_str = NULL;
|
|
variant->asm_str = NULL;
|
|
|
|
result = panvk_shader_upload(dev, variant, &dev->vk.alloc);
|
|
|
|
if (result != VK_SUCCESS) {
|
|
panvk_shader_destroy(&dev->vk, &shader->vk, &dev->vk.alloc);
|
|
return result;
|
|
}
|
|
|
|
*shader_out = shader;
|
|
|
|
return result;
|
|
}
|
|
|
|
static VkResult
|
|
compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
|
|
struct vk_shader_compile_info *infos,
|
|
const struct vk_graphics_pipeline_state *state,
|
|
const struct vk_features *enabled_features,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
struct vk_shader **shaders_out)
|
|
{
|
|
struct panvk_device *dev = to_panvk_device(vk_dev);
|
|
bool use_static_noperspective = false;
|
|
uint32_t noperspective_varyings = 0;
|
|
VkResult result;
|
|
int32_t i;
|
|
|
|
/* Vulkan runtime passes us shaders in stage order, so the FS will always
|
|
* be last if it exists. Iterate shaders in reverse order to ensure FS is
|
|
* processed before VS. */
|
|
for (i = shader_count - 1; i >= 0; i--) {
|
|
uint32_t *noperspective_varyings_ptr =
|
|
use_static_noperspective ? &noperspective_varyings : NULL;
|
|
result =
|
|
panvk_compile_shader(dev, &infos[i], state, noperspective_varyings_ptr,
|
|
pAllocator, &shaders_out[i]);
|
|
|
|
if (result != VK_SUCCESS)
|
|
goto err_cleanup;
|
|
|
|
/* If we are linking VS and FS, we can use the static interpolation
|
|
* qualifiers from the FS in the VS. */
|
|
if (infos[i].nir->info.stage == MESA_SHADER_FRAGMENT) {
|
|
struct panvk_shader *shader =
|
|
container_of(shaders_out[i], struct panvk_shader, vk);
|
|
const struct panvk_shader_variant *variant =
|
|
panvk_shader_only_variant(shader);
|
|
|
|
use_static_noperspective = true;
|
|
noperspective_varyings = variant->info.varyings.noperspective;
|
|
}
|
|
|
|
/* Clean up NIR for the current shader */
|
|
ralloc_free(infos[i].nir);
|
|
}
|
|
|
|
/* TODO: If we get multiple shaders here, we can perform part of the link
|
|
* logic at compile time. */
|
|
|
|
return VK_SUCCESS;
|
|
|
|
err_cleanup:
|
|
/* Clean up all the shaders before this point */
|
|
for (int32_t j = shader_count - 1; j > i; j--)
|
|
panvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
|
|
|
|
/* Clean up all the NIR from this point */
|
|
for (int32_t j = i; j >= 0; j--)
|
|
ralloc_free(infos[j].nir);
|
|
|
|
/* Memset the output array */
|
|
memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
|
|
|
|
return result;
|
|
}
|
|
|
|
static simple_mtx_t compiler_mutex = SIMPLE_MTX_INITIALIZER;
|
|
|
|
void
|
|
panvk_per_arch(compiler_lock)(void)
|
|
{
|
|
if (pan_will_dump_shaders(PAN_ARCH) || PANVK_DEBUG(NIR))
|
|
simple_mtx_lock(&compiler_mutex);
|
|
}
|
|
|
|
void
|
|
panvk_per_arch(compiler_unlock)(void)
|
|
{
|
|
if (pan_will_dump_shaders(PAN_ARCH) || PANVK_DEBUG(NIR))
|
|
simple_mtx_unlock(&compiler_mutex);
|
|
}
|
|
|
|
static VkResult
|
|
panvk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
|
|
struct vk_shader_compile_info *infos,
|
|
const struct vk_graphics_pipeline_state *state,
|
|
const struct vk_features *enabled_features,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
struct vk_shader **shaders_out)
|
|
{
|
|
panvk_per_arch(compiler_lock)();
|
|
|
|
VkResult result = compile_shaders(vk_dev, shader_count, infos, state,
|
|
enabled_features, pAllocator,
|
|
shaders_out);
|
|
|
|
panvk_per_arch(compiler_unlock)();
|
|
|
|
return result;
|
|
}
|
|
|
|
static VkResult
|
|
shader_desc_info_deserialize(struct panvk_device *dev,
|
|
struct blob_reader *blob,
|
|
struct panvk_shader_variant *shader)
|
|
{
|
|
shader->desc_info.used_set_mask = blob_read_uint32(blob);
|
|
|
|
#if PAN_ARCH < 9
|
|
shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
|
|
blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
|
|
sizeof(*shader->desc_info.dyn_ubos.map) *
|
|
shader->desc_info.dyn_ubos.count);
|
|
shader->desc_info.dyn_ssbos.count = blob_read_uint32(blob);
|
|
blob_copy_bytes(blob, shader->desc_info.dyn_ssbos.map,
|
|
sizeof(*shader->desc_info.dyn_ssbos.map) *
|
|
shader->desc_info.dyn_ssbos.count);
|
|
|
|
uint32_t others_count = 0;
|
|
for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
|
|
shader->desc_info.others.count[i] = blob_read_uint32(blob);
|
|
others_count += shader->desc_info.others.count[i];
|
|
}
|
|
|
|
if (others_count) {
|
|
struct panvk_pool_alloc_info alloc_info = {
|
|
.size = others_count * sizeof(uint32_t),
|
|
.alignment = sizeof(uint32_t),
|
|
};
|
|
shader->desc_info.others.map =
|
|
panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
|
|
if (!panvk_priv_mem_check_alloc(shader->desc_info.others.map))
|
|
return panvk_error(shader, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
|
|
panvk_priv_mem_write_array(shader->desc_info.others.map, 0, uint32_t,
|
|
others_count, copy_table) {
|
|
blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
|
|
}
|
|
}
|
|
#else
|
|
shader->desc_info.dyn_bufs.count = blob_read_uint32(blob);
|
|
blob_copy_bytes(blob, shader->desc_info.dyn_bufs.map,
|
|
sizeof(*shader->desc_info.dyn_bufs.map) *
|
|
shader->desc_info.dyn_bufs.count);
|
|
shader->desc_info.fs_varying_attr_desc_count = blob_read_uint32(blob);
|
|
#endif
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult
|
|
panvk_deserialize_shader_variant(struct vk_device *vk_dev,
|
|
struct blob_reader *blob,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
struct panvk_shader_variant *shader)
|
|
{
|
|
struct panvk_device *device = to_panvk_device(vk_dev);
|
|
struct pan_shader_info info;
|
|
VkResult result;
|
|
|
|
blob_copy_bytes(blob, &info, sizeof(info));
|
|
if (blob->overrun)
|
|
return panvk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
|
|
|
|
shader->info = info;
|
|
blob_copy_bytes(blob, &shader->fau, sizeof(shader->fau));
|
|
|
|
switch (shader->info.stage) {
|
|
case MESA_SHADER_COMPUTE:
|
|
case MESA_SHADER_KERNEL:
|
|
blob_copy_bytes(blob, &shader->cs.local_size,
|
|
sizeof(shader->cs.local_size));
|
|
break;
|
|
|
|
case MESA_SHADER_FRAGMENT:
|
|
shader->fs.earlyzs_lut = pan_earlyzs_analyze(&shader->info, PAN_ARCH);
|
|
blob_copy_bytes(blob, &shader->fs.input_attachment_read,
|
|
sizeof(shader->fs.input_attachment_read));
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
shader->bin_size = blob_read_uint32(blob);
|
|
|
|
if (blob->overrun)
|
|
return panvk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
|
|
|
|
shader->bin_ptr = malloc(shader->bin_size);
|
|
if (shader->bin_ptr == NULL)
|
|
return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
shader->own_bin = true;
|
|
blob_copy_bytes(blob, (void *)shader->bin_ptr, shader->bin_size);
|
|
|
|
result = shader_desc_info_deserialize(device, blob, shader);
|
|
|
|
if (result != VK_SUCCESS)
|
|
return panvk_error(device, result);
|
|
|
|
uint32_t nir_str_size = blob_read_uint32(blob);
|
|
uint32_t asm_str_size = blob_read_uint32(blob);
|
|
const char *nir_str = blob_read_bytes(blob, nir_str_size);
|
|
const char *asm_str = blob_read_bytes(blob, asm_str_size);
|
|
|
|
if (blob->overrun)
|
|
return panvk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
|
|
|
|
if (nir_str_size > 0) {
|
|
shader->nir_str = ralloc_strndup(NULL, nir_str, nir_str_size);
|
|
if (shader->nir_str == NULL)
|
|
return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
}
|
|
|
|
if (asm_str_size > 0) {
|
|
shader->asm_str = strndup(asm_str, asm_str_size);
|
|
if (shader->asm_str == NULL)
|
|
return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
}
|
|
|
|
result = panvk_shader_upload(device, shader, pAllocator);
|
|
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
|
|
return result;
|
|
}
|
|
|
|
static VkResult
|
|
panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
|
|
uint32_t binary_version,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
struct vk_shader **shader_out)
|
|
{
|
|
struct panvk_device *device = to_panvk_device(vk_dev);
|
|
struct panvk_shader *shader;
|
|
|
|
mesa_shader_stage stage = blob_read_uint8(blob);
|
|
if (blob->overrun)
|
|
return vk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
|
|
|
|
size_t size =
|
|
sizeof(struct panvk_shader) +
|
|
sizeof(struct panvk_shader_variant) * panvk_shader_num_variants(stage);
|
|
shader =
|
|
vk_shader_zalloc(vk_dev, &panvk_shader_ops, stage, pAllocator, size);
|
|
if (shader == NULL)
|
|
return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
panvk_shader_foreach_variant(shader, variant) {
|
|
VkResult result =
|
|
panvk_deserialize_shader_variant(vk_dev, blob, pAllocator, variant);
|
|
|
|
if (result != VK_SUCCESS) {
|
|
panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
|
|
return result;
|
|
}
|
|
}
|
|
|
|
*shader_out = &shader->vk;
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static void
|
|
shader_desc_info_serialize(struct blob *blob,
|
|
const struct panvk_shader_variant *shader)
|
|
{
|
|
blob_write_uint32(blob, shader->desc_info.used_set_mask);
|
|
|
|
#if PAN_ARCH < 9
|
|
blob_write_uint32(blob, shader->desc_info.dyn_ubos.count);
|
|
blob_write_bytes(blob, shader->desc_info.dyn_ubos.map,
|
|
sizeof(*shader->desc_info.dyn_ubos.map) *
|
|
shader->desc_info.dyn_ubos.count);
|
|
blob_write_uint32(blob, shader->desc_info.dyn_ssbos.count);
|
|
blob_write_bytes(blob, shader->desc_info.dyn_ssbos.map,
|
|
sizeof(*shader->desc_info.dyn_ssbos.map) *
|
|
shader->desc_info.dyn_ssbos.count);
|
|
|
|
unsigned others_count = 0;
|
|
for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
|
|
blob_write_uint32(blob, shader->desc_info.others.count[i]);
|
|
others_count += shader->desc_info.others.count[i];
|
|
}
|
|
|
|
/* No need to wrap this one in panvk_priv_mem_readback(), because the
|
|
* GPU is not supposed to touch it. */
|
|
blob_write_bytes(blob,
|
|
panvk_priv_mem_host_addr(shader->desc_info.others.map),
|
|
sizeof(uint32_t) * others_count);
|
|
#else
|
|
blob_write_uint32(blob, shader->desc_info.dyn_bufs.count);
|
|
blob_write_bytes(blob, shader->desc_info.dyn_bufs.map,
|
|
sizeof(*shader->desc_info.dyn_bufs.map) *
|
|
shader->desc_info.dyn_bufs.count);
|
|
blob_write_uint32(blob, shader->desc_info.fs_varying_attr_desc_count);
|
|
#endif
|
|
}
|
|
|
|
static bool
|
|
panvk_shader_serialize_variant(struct vk_device *vk_dev,
|
|
const struct panvk_shader_variant *shader,
|
|
struct blob *blob)
|
|
{
|
|
blob_write_bytes(blob, &shader->info, sizeof(shader->info));
|
|
blob_write_bytes(blob, &shader->fau, sizeof(shader->fau));
|
|
|
|
switch (shader->info.stage) {
|
|
case MESA_SHADER_COMPUTE:
|
|
case MESA_SHADER_KERNEL:
|
|
blob_write_bytes(blob, &shader->cs.local_size,
|
|
sizeof(shader->cs.local_size));
|
|
break;
|
|
|
|
case MESA_SHADER_FRAGMENT:
|
|
blob_write_bytes(blob, &shader->fs.input_attachment_read,
|
|
sizeof(shader->fs.input_attachment_read));
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
blob_write_uint32(blob, shader->bin_size);
|
|
blob_write_bytes(blob, shader->bin_ptr, shader->bin_size);
|
|
shader_desc_info_serialize(blob, shader);
|
|
|
|
/* Include the terminating NULL in the serialization */
|
|
uint32_t nir_str_size = shader->nir_str ? strlen(shader->nir_str) + 1 : 0;
|
|
uint32_t asm_str_size = shader->asm_str ? strlen(shader->asm_str) + 1 : 0;
|
|
blob_write_uint32(blob, nir_str_size);
|
|
blob_write_uint32(blob, asm_str_size);
|
|
blob_write_bytes(blob, shader->nir_str, nir_str_size);
|
|
blob_write_bytes(blob, shader->asm_str, asm_str_size);
|
|
|
|
return !blob->out_of_memory;
|
|
}
|
|
|
|
static bool
|
|
panvk_shader_serialize(struct vk_device *vk_dev,
|
|
const struct vk_shader *vk_shader, struct blob *blob)
|
|
{
|
|
struct panvk_shader *shader =
|
|
container_of(vk_shader, struct panvk_shader, vk);
|
|
|
|
blob_write_uint8(blob, vk_shader->stage);
|
|
|
|
panvk_shader_foreach_variant(shader, variant) {
|
|
panvk_shader_serialize_variant(vk_dev, variant, blob);
|
|
}
|
|
|
|
return !blob->out_of_memory;
|
|
}
|
|
|
|
static VkResult
|
|
panvk_shader_get_executable_properties(
|
|
UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
|
|
uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties)
|
|
{
|
|
struct panvk_shader *shader =
|
|
container_of(vk_shader, struct panvk_shader, vk);
|
|
|
|
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties,
|
|
executable_count);
|
|
|
|
panvk_shader_foreach_variant(shader, variant) {
|
|
/* Ignore absent variants but always add vertex on IDVS */
|
|
if (variant->bin_size == 0 &&
|
|
(variant->info.stage != MESA_SHADER_VERTEX || !variant->info.vs.idvs))
|
|
continue;
|
|
|
|
const char *variant_name = panvk_shader_variant_name(shader, variant);
|
|
const char *stage_name = _mesa_shader_stage_to_string(shader->vk.stage);
|
|
|
|
vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
|
|
{
|
|
props->stages = mesa_to_vk_shader_stage(shader->vk.stage);
|
|
props->subgroupSize = pan_subgroup_size(PAN_ARCH);
|
|
|
|
if (variant_name != NULL) {
|
|
VK_PRINT_STR(props->name, "%s %s", variant_name, stage_name);
|
|
VK_PRINT_STR(props->description, "%s %s shader", variant_name,
|
|
stage_name);
|
|
} else {
|
|
VK_COPY_STR(props->name, stage_name);
|
|
VK_PRINT_STR(props->description, "%s shader", stage_name);
|
|
}
|
|
}
|
|
|
|
if (variant->info.stage == MESA_SHADER_VERTEX && variant->info.vs.idvs) {
|
|
vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out,
|
|
props)
|
|
{
|
|
props->stages = mesa_to_vk_shader_stage(shader->vk.stage);
|
|
props->subgroupSize = pan_subgroup_size(PAN_ARCH);
|
|
VK_COPY_STR(props->name, "varying");
|
|
VK_COPY_STR(props->description, "varying shader");
|
|
}
|
|
}
|
|
}
|
|
|
|
return vk_outarray_status(&out);
|
|
}
|
|
|
|
static const struct panvk_shader_variant *
|
|
get_variant_from_executable_index(struct panvk_shader *shader,
|
|
uint32_t executable_index)
|
|
{
|
|
uint32_t i = 0;
|
|
|
|
panvk_shader_foreach_variant(shader, variant) {
|
|
/* Ignore absent variants but always add vertex on IDVS */
|
|
if (variant->bin_size == 0 &&
|
|
(variant->info.stage != MESA_SHADER_VERTEX || !variant->info.vs.idvs))
|
|
continue;
|
|
|
|
if (i == executable_index)
|
|
return variant;
|
|
|
|
i++;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static VkResult
|
|
panvk_shader_get_executable_statistics(
|
|
UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
|
|
uint32_t executable_index, uint32_t *statistic_count,
|
|
VkPipelineExecutableStatisticKHR *statistics)
|
|
{
|
|
struct panvk_shader *shader =
|
|
container_of(vk_shader, struct panvk_shader, vk);
|
|
|
|
bool needs_vary = false;
|
|
if (shader->vk.stage == MESA_SHADER_VERTEX) {
|
|
assert(executable_index == 0 || executable_index == 1);
|
|
|
|
needs_vary = executable_index == 1;
|
|
|
|
/* Readjust index to skip embedded varying variant */
|
|
if (executable_index >= 1)
|
|
executable_index--;
|
|
}
|
|
|
|
assert(executable_index < panvk_shader_num_variants(shader->vk.stage));
|
|
const struct panvk_shader_variant *variant =
|
|
get_variant_from_executable_index(shader, executable_index);
|
|
assert(variant != NULL);
|
|
|
|
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
|
|
statistic_count);
|
|
|
|
assert(executable_index == 0 || executable_index == 1);
|
|
const struct pan_stats *stats =
|
|
needs_vary ? &variant->info.stats_idvs_varying : &variant->info.stats;
|
|
|
|
vk_add_pan_stats(out, stats);
|
|
return vk_outarray_status(&out);
|
|
}
|
|
|
|
static bool
|
|
write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
|
|
const char *data)
|
|
{
|
|
ir->isText = VK_TRUE;
|
|
|
|
size_t data_len = strlen(data) + 1;
|
|
|
|
if (ir->pData == NULL) {
|
|
ir->dataSize = data_len;
|
|
return true;
|
|
}
|
|
|
|
strncpy(ir->pData, data, ir->dataSize);
|
|
if (ir->dataSize < data_len)
|
|
return false;
|
|
|
|
ir->dataSize = data_len;
|
|
return true;
|
|
}
|
|
|
|
static VkResult
|
|
panvk_shader_get_executable_internal_representations(
|
|
UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
|
|
uint32_t executable_index, uint32_t *internal_representation_count,
|
|
VkPipelineExecutableInternalRepresentationKHR *internal_representations)
|
|
{
|
|
struct panvk_shader *shader =
|
|
container_of(vk_shader, struct panvk_shader, vk);
|
|
|
|
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
|
|
internal_representations,
|
|
internal_representation_count);
|
|
|
|
bool needs_vary = false;
|
|
if (shader->vk.stage == MESA_SHADER_VERTEX) {
|
|
assert(executable_index == 0 || executable_index == 1);
|
|
|
|
needs_vary = executable_index == 1;
|
|
|
|
/* Readjust index to skip embedded varying variant */
|
|
if (executable_index >= 1)
|
|
executable_index--;
|
|
}
|
|
|
|
/* XXX: Varying shader assembly */
|
|
if (needs_vary)
|
|
return vk_outarray_status(&out);
|
|
|
|
assert(executable_index < panvk_shader_num_variants(shader->vk.stage));
|
|
const struct panvk_shader_variant *variant =
|
|
get_variant_from_executable_index(shader, executable_index);
|
|
assert(variant != NULL);
|
|
|
|
bool incomplete_text = false;
|
|
|
|
if (variant->nir_str != NULL) {
|
|
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
|
|
&out, ir)
|
|
{
|
|
VK_COPY_STR(ir->name, "NIR shader");
|
|
VK_COPY_STR(ir->description,
|
|
"NIR shader before sending to the back-end compiler");
|
|
if (!write_ir_text(ir, variant->nir_str))
|
|
incomplete_text = true;
|
|
}
|
|
}
|
|
|
|
if (variant->asm_str != NULL) {
|
|
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
|
|
&out, ir)
|
|
{
|
|
VK_COPY_STR(ir->name, "Assembly");
|
|
VK_COPY_STR(ir->description, "Final Assembly");
|
|
if (!write_ir_text(ir, variant->asm_str))
|
|
incomplete_text = true;
|
|
}
|
|
}
|
|
|
|
return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
|
|
}
|
|
|
|
#if PAN_ARCH < 9
|
|
static mali_pixel_format
|
|
get_varying_format(mesa_shader_stage stage, gl_varying_slot loc,
|
|
enum pipe_format pfmt)
|
|
{
|
|
switch (loc) {
|
|
case VARYING_SLOT_PNTC:
|
|
case VARYING_SLOT_PSIZ:
|
|
#if PAN_ARCH <= 6
|
|
return (MALI_R16F << 12) | pan_get_default_swizzle(1);
|
|
#else
|
|
return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
|
|
#endif
|
|
case VARYING_SLOT_POS:
|
|
#if PAN_ARCH <= 6
|
|
return (MALI_SNAP_4 << 12) | pan_get_default_swizzle(4);
|
|
#else
|
|
return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
|
|
#endif
|
|
default:
|
|
assert(pfmt != PIPE_FORMAT_NONE);
|
|
return GENX(pan_format_from_pipe_format)(pfmt)->hw;
|
|
}
|
|
}
|
|
|
|
struct varyings_info {
|
|
enum pipe_format fmts[VARYING_SLOT_MAX];
|
|
BITSET_DECLARE(active, VARYING_SLOT_MAX);
|
|
};
|
|
|
|
static void
|
|
collect_varyings_info(const struct pan_shader_varying *varyings,
|
|
unsigned varying_count, struct varyings_info *info)
|
|
{
|
|
for (unsigned i = 0; i < varying_count; i++) {
|
|
gl_varying_slot loc = varyings[i].location;
|
|
|
|
if (varyings[i].format == PIPE_FORMAT_NONE)
|
|
continue;
|
|
|
|
info->fmts[loc] = varyings[i].format;
|
|
BITSET_SET(info->active, loc);
|
|
}
|
|
}
|
|
|
|
static inline enum panvk_varying_buf_id
|
|
varying_buf_id(gl_varying_slot loc)
|
|
{
|
|
switch (loc) {
|
|
case VARYING_SLOT_POS:
|
|
return PANVK_VARY_BUF_POSITION;
|
|
case VARYING_SLOT_PSIZ:
|
|
return PANVK_VARY_BUF_PSIZ;
|
|
default:
|
|
return PANVK_VARY_BUF_GENERAL;
|
|
}
|
|
}
|
|
|
|
static mali_pixel_format
|
|
varying_format(gl_varying_slot loc, enum pipe_format pfmt)
|
|
{
|
|
switch (loc) {
|
|
case VARYING_SLOT_PNTC:
|
|
case VARYING_SLOT_PSIZ:
|
|
#if PAN_ARCH <= 6
|
|
return (MALI_R16F << 12) | pan_get_default_swizzle(1);
|
|
#else
|
|
return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
|
|
#endif
|
|
case VARYING_SLOT_POS:
|
|
#if PAN_ARCH <= 6
|
|
return (MALI_SNAP_4 << 12) | pan_get_default_swizzle(4);
|
|
#else
|
|
return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
|
|
#endif
|
|
default:
|
|
return GENX(pan_format_from_pipe_format)(pfmt)->hw;
|
|
}
|
|
}
|
|
|
|
static VkResult
|
|
emit_varying_attrs(struct panvk_pool *desc_pool,
|
|
const struct pan_shader_varying *varyings,
|
|
unsigned varying_count, const struct varyings_info *info,
|
|
unsigned *buf_offsets, struct panvk_priv_mem *mem)
|
|
{
|
|
if (!varying_count) {
|
|
*mem = (struct panvk_priv_mem){0};
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
*mem = panvk_pool_alloc_desc_array(desc_pool, varying_count, ATTRIBUTE);
|
|
if (!panvk_priv_mem_check_alloc(*mem))
|
|
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
|
|
|
panvk_priv_mem_write_array(*mem, 0, struct mali_attribute_packed,
|
|
varying_count, attrs) {
|
|
unsigned attr_idx = 0;
|
|
|
|
for (unsigned i = 0; i < varying_count; i++) {
|
|
pan_pack(&attrs[attr_idx++], ATTRIBUTE, cfg) {
|
|
gl_varying_slot loc = varyings[i].location;
|
|
enum pipe_format pfmt = varyings[i].format != PIPE_FORMAT_NONE
|
|
? info->fmts[loc]
|
|
: PIPE_FORMAT_NONE;
|
|
|
|
if (pfmt == PIPE_FORMAT_NONE) {
|
|
#if PAN_ARCH >= 7
|
|
cfg.format =
|
|
(MALI_CONSTANT << 12) | MALI_RGB_COMPONENT_ORDER_0000;
|
|
#else
|
|
cfg.format = (MALI_CONSTANT << 12) | PAN_V6_SWIZZLE(0, 0, 0, 0);
|
|
#endif
|
|
} else {
|
|
cfg.buffer_index = varying_buf_id(loc);
|
|
cfg.offset = buf_offsets[loc];
|
|
cfg.format = varying_format(loc, info->fmts[loc]);
|
|
}
|
|
cfg.offset_enable = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
VkResult
|
|
panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
|
|
const struct panvk_shader_variant *vs,
|
|
const struct panvk_shader_variant *fs,
|
|
struct panvk_shader_link *link)
|
|
{
|
|
BITSET_DECLARE(active_attrs, VARYING_SLOT_MAX) = {0};
|
|
unsigned buf_strides[PANVK_VARY_BUF_MAX] = {0};
|
|
unsigned buf_offsets[VARYING_SLOT_MAX] = {0};
|
|
struct varyings_info out_vars = {0};
|
|
struct varyings_info in_vars = {0};
|
|
unsigned loc;
|
|
|
|
assert(vs);
|
|
assert(vs->info.stage == MESA_SHADER_VERTEX);
|
|
|
|
collect_varyings_info(vs->info.varyings.output,
|
|
vs->info.varyings.output_count, &out_vars);
|
|
|
|
if (fs) {
|
|
assert(fs->info.stage == MESA_SHADER_FRAGMENT);
|
|
collect_varyings_info(fs->info.varyings.input,
|
|
fs->info.varyings.input_count, &in_vars);
|
|
}
|
|
|
|
BITSET_OR(active_attrs, in_vars.active, out_vars.active);
|
|
|
|
/* Handle the position and point size buffers explicitly, as they are
|
|
* passed through separate buffer pointers to the tiler job.
|
|
*/
|
|
if (BITSET_TEST(out_vars.active, VARYING_SLOT_POS)) {
|
|
buf_strides[PANVK_VARY_BUF_POSITION] = sizeof(float) * 4;
|
|
BITSET_CLEAR(active_attrs, VARYING_SLOT_POS);
|
|
}
|
|
|
|
if (BITSET_TEST(out_vars.active, VARYING_SLOT_PSIZ)) {
|
|
buf_strides[PANVK_VARY_BUF_PSIZ] = sizeof(uint16_t);
|
|
BITSET_CLEAR(active_attrs, VARYING_SLOT_PSIZ);
|
|
}
|
|
|
|
BITSET_FOREACH_SET(loc, active_attrs, VARYING_SLOT_MAX) {
|
|
/* We expect the VS to write to all inputs read by the FS, and the
|
|
* FS to read all inputs written by the VS. If that's not the
|
|
* case, we keep PIPE_FORMAT_NONE to reflect the fact we should use a
|
|
* sink attribute (writes are discarded, reads return zeros).
|
|
*/
|
|
if (in_vars.fmts[loc] == PIPE_FORMAT_NONE ||
|
|
out_vars.fmts[loc] == PIPE_FORMAT_NONE) {
|
|
in_vars.fmts[loc] = PIPE_FORMAT_NONE;
|
|
out_vars.fmts[loc] = PIPE_FORMAT_NONE;
|
|
continue;
|
|
}
|
|
|
|
unsigned out_size = util_format_get_blocksize(out_vars.fmts[loc]);
|
|
unsigned buf_idx = varying_buf_id(loc);
|
|
|
|
/* Always trust the VS input format, so we can:
|
|
* - discard components that are never read
|
|
* - use float types for interpolated fragment shader inputs
|
|
* - use fp16 for floats with mediump
|
|
* - make sure components that are not written by the FS are set to zero
|
|
*/
|
|
out_vars.fmts[loc] = in_vars.fmts[loc];
|
|
|
|
/* Special buffers are handled explicitly before this loop, everything
|
|
* else should be laid out in the general varying buffer.
|
|
*/
|
|
assert(buf_idx == PANVK_VARY_BUF_GENERAL);
|
|
|
|
/* Keep things aligned a 32-bit component. */
|
|
buf_offsets[loc] = buf_strides[buf_idx];
|
|
buf_strides[buf_idx] += ALIGN_POT(out_size, 4);
|
|
}
|
|
|
|
VkResult result = emit_varying_attrs(
|
|
desc_pool, vs->info.varyings.output, vs->info.varyings.output_count,
|
|
&out_vars, buf_offsets, &link->vs.attribs);
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
|
|
if (fs) {
|
|
result = emit_varying_attrs(desc_pool, fs->info.varyings.input,
|
|
fs->info.varyings.input_count, &in_vars,
|
|
buf_offsets, &link->fs.attribs);
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
}
|
|
|
|
memcpy(link->buf_strides, buf_strides, sizeof(link->buf_strides));
|
|
return VK_SUCCESS;
|
|
}
|
|
#endif
|
|
|
|
static const struct vk_shader_ops panvk_shader_ops = {
|
|
.destroy = panvk_shader_destroy,
|
|
.serialize = panvk_shader_serialize,
|
|
.get_executable_properties = panvk_shader_get_executable_properties,
|
|
.get_executable_statistics = panvk_shader_get_executable_statistics,
|
|
.get_executable_internal_representations =
|
|
panvk_shader_get_executable_internal_representations,
|
|
};
|
|
|
|
static void
|
|
panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const mesa_shader_stage stage,
|
|
struct panvk_shader *shader)
|
|
{
|
|
switch (stage) {
|
|
case MESA_SHADER_COMPUTE:
|
|
if (cmd->state.compute.shader != shader) {
|
|
cmd->state.compute.shader = shader;
|
|
compute_state_set_dirty(cmd, CS);
|
|
compute_state_set_dirty(cmd, PUSH_UNIFORMS);
|
|
}
|
|
break;
|
|
case MESA_SHADER_VERTEX:
|
|
if (cmd->state.gfx.vs.shader != shader) {
|
|
cmd->state.gfx.vs.shader = shader;
|
|
gfx_state_set_dirty(cmd, VS);
|
|
gfx_state_set_dirty(cmd, VS_PUSH_UNIFORMS);
|
|
}
|
|
break;
|
|
case MESA_SHADER_FRAGMENT:
|
|
if (cmd->state.gfx.fs.shader != shader) {
|
|
cmd->state.gfx.fs.shader = shader;
|
|
gfx_state_set_dirty(cmd, FS);
|
|
gfx_state_set_dirty(cmd, FS_PUSH_UNIFORMS);
|
|
}
|
|
break;
|
|
default:
|
|
assert(!"Unsupported stage");
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void
|
|
panvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
|
|
const mesa_shader_stage *stages,
|
|
struct vk_shader **const shaders)
|
|
{
|
|
struct panvk_cmd_buffer *cmd =
|
|
container_of(vk_cmd, struct panvk_cmd_buffer, vk);
|
|
|
|
for (uint32_t i = 0; i < stage_count; i++) {
|
|
struct panvk_shader *shader =
|
|
container_of(shaders[i], struct panvk_shader, vk);
|
|
|
|
panvk_cmd_bind_shader(cmd, stages[i], shader);
|
|
}
|
|
}
|
|
|
|
const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
|
|
.get_nir_options = panvk_get_nir_options,
|
|
.get_spirv_options = panvk_get_spirv_options,
|
|
.preprocess_nir = panvk_preprocess_nir,
|
|
.hash_state = panvk_hash_state,
|
|
.compile = panvk_compile_shaders,
|
|
.deserialize = panvk_deserialize_shader,
|
|
.cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
|
|
.cmd_bind_shaders = panvk_cmd_bind_shaders,
|
|
};
|
|
|
|
static void
|
|
panvk_internal_shader_destroy(struct vk_device *vk_dev,
|
|
struct vk_shader *vk_shader,
|
|
const VkAllocationCallbacks *pAllocator)
|
|
{
|
|
struct panvk_device *dev = to_panvk_device(vk_dev);
|
|
struct panvk_internal_shader *shader =
|
|
container_of(vk_shader, struct panvk_internal_shader, vk);
|
|
|
|
panvk_pool_free_mem(&shader->code_mem);
|
|
|
|
#if PAN_ARCH < 9
|
|
panvk_pool_free_mem(&shader->rsd);
|
|
#else
|
|
panvk_pool_free_mem(&shader->spd);
|
|
#endif
|
|
|
|
vk_shader_free(&dev->vk, pAllocator, &shader->vk);
|
|
}
|
|
|
|
static const struct vk_shader_ops panvk_internal_shader_ops = {
|
|
.destroy = panvk_internal_shader_destroy,
|
|
};
|
|
|
|
VkResult
|
|
panvk_per_arch(create_internal_shader)(
|
|
struct panvk_device *dev, nir_shader *nir,
|
|
struct pan_compile_inputs *compiler_inputs,
|
|
struct panvk_internal_shader **shader_out)
|
|
{
|
|
struct panvk_internal_shader *shader =
|
|
vk_shader_zalloc(&dev->vk, &panvk_internal_shader_ops, nir->info.stage,
|
|
NULL, sizeof(*shader));
|
|
if (shader == NULL)
|
|
return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
VkResult result;
|
|
struct util_dynarray binary;
|
|
|
|
panvk_per_arch(compiler_lock)();
|
|
|
|
util_dynarray_init(&binary, nir);
|
|
pan_shader_compile(nir, compiler_inputs, &binary, &shader->info);
|
|
|
|
panvk_per_arch(compiler_unlock)();
|
|
|
|
unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t);
|
|
if (bin_size) {
|
|
shader->code_mem = panvk_pool_upload_aligned(&dev->mempools.exec,
|
|
binary.data, bin_size, 128);
|
|
if (!panvk_priv_mem_check_alloc(shader->code_mem)) {
|
|
result = panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
goto err_free_shader;
|
|
}
|
|
}
|
|
|
|
*shader_out = shader;
|
|
return VK_SUCCESS;
|
|
|
|
err_free_shader:
|
|
vk_shader_free(&dev->vk, NULL, &shader->vk);
|
|
return result;
|
|
}
|