/* * Copyright © 2024 Collabora Ltd. * * Derived from tu_cmd_buffer.c which is: * Copyright © 2016 Red Hat. * Copyright © 2016 Bas Nieuwenhuizen * Copyright © 2015 Intel Corporation * * SPDX-License-Identifier: MIT */ #include "genxml/gen_macros.h" #include "panvk_buffer.h" #include "panvk_cmd_alloc.h" #include "panvk_cmd_buffer.h" #include "panvk_cmd_desc_state.h" #include "panvk_cmd_draw.h" #include "panvk_cmd_meta.h" #include "panvk_cmd_precomp.h" #include "panvk_device.h" #include "panvk_entrypoints.h" #include "panvk_image.h" #include "panvk_image_view.h" #include "panvk_instance.h" #include "panvk_meta.h" #include "panvk_priv_bo.h" #include "panvk_shader.h" #include "draw_helper.h" #include "pan_desc.h" #include "pan_earlyzs.h" #include "pan_encoder.h" #include "pan_format.h" #include "pan_jc.h" #include "pan_props.h" #include "pan_shader.h" #include "vk_format.h" #include "vk_meta.h" #include "vk_pipeline_layout.h" struct panvk_draw_data { struct panvk_draw_info info; unsigned vertex_range; unsigned padded_vertex_count; struct mali_invocation_packed invocation; struct { uint64_t varyings; uint64_t attributes; uint64_t attribute_bufs; } vs; struct { uint64_t rsd; uint64_t varyings; } fs; uint64_t varying_bufs; uint64_t position; union { uint64_t psiz; float line_width; }; uint64_t tls; uint64_t fb; const struct pan_tiler_context *tiler_ctx; uint64_t viewport; struct { struct pan_ptr vertex_copy_desc; struct pan_ptr frag_copy_desc; union { struct { struct pan_ptr vertex; struct pan_ptr tiler; }; struct pan_ptr idvs; }; } jobs; struct { uint64_t attribs; uint64_t attrib_bufs; uint64_t varying_bufs; } indirect_info; }; static bool is_indirect_draw(const struct panvk_draw_data *draw) { return draw->info.indirect.buffer_dev_addr != 0 || draw->info.index.size != 0; } static bool has_depth_att(struct panvk_cmd_buffer *cmdbuf) { return (cmdbuf->state.gfx.render.bound_attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) != 0; } static bool has_stencil_att(struct panvk_cmd_buffer *cmdbuf) { return (cmdbuf->state.gfx.render.bound_attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) != 0; } static bool writes_depth(struct panvk_cmd_buffer *cmdbuf) { const struct vk_depth_stencil_state *ds = &cmdbuf->vk.dynamic_graphics_state.ds; return has_depth_att(cmdbuf) && ds->depth.test_enable && ds->depth.write_enable && ds->depth.compare_op != VK_COMPARE_OP_NEVER; } static bool writes_stencil(struct panvk_cmd_buffer *cmdbuf) { const struct vk_depth_stencil_state *ds = &cmdbuf->vk.dynamic_graphics_state.ds; return has_stencil_att(cmdbuf) && ds->stencil.test_enable && ((ds->stencil.front.write_mask && (ds->stencil.front.op.fail != VK_STENCIL_OP_KEEP || ds->stencil.front.op.pass != VK_STENCIL_OP_KEEP || ds->stencil.front.op.depth_fail != VK_STENCIL_OP_KEEP)) || (ds->stencil.back.write_mask && (ds->stencil.back.op.fail != VK_STENCIL_OP_KEEP || ds->stencil.back.op.pass != VK_STENCIL_OP_KEEP || ds->stencil.back.op.depth_fail != VK_STENCIL_OP_KEEP))); } static bool ds_test_always_passes(struct panvk_cmd_buffer *cmdbuf) { const struct vk_depth_stencil_state *ds = &cmdbuf->vk.dynamic_graphics_state.ds; if (!has_depth_att(cmdbuf)) return true; if (ds->depth.test_enable && ds->depth.compare_op != VK_COMPARE_OP_ALWAYS) return false; if (ds->stencil.test_enable && (ds->stencil.front.op.compare != VK_COMPARE_OP_ALWAYS || ds->stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) return false; return true; } static inline enum mali_func translate_compare_func(VkCompareOp comp) { STATIC_ASSERT(VK_COMPARE_OP_NEVER == (VkCompareOp)MALI_FUNC_NEVER); STATIC_ASSERT(VK_COMPARE_OP_LESS == (VkCompareOp)MALI_FUNC_LESS); STATIC_ASSERT(VK_COMPARE_OP_EQUAL == (VkCompareOp)MALI_FUNC_EQUAL); STATIC_ASSERT(VK_COMPARE_OP_LESS_OR_EQUAL == (VkCompareOp)MALI_FUNC_LEQUAL); STATIC_ASSERT(VK_COMPARE_OP_GREATER == (VkCompareOp)MALI_FUNC_GREATER); STATIC_ASSERT(VK_COMPARE_OP_NOT_EQUAL == (VkCompareOp)MALI_FUNC_NOT_EQUAL); STATIC_ASSERT(VK_COMPARE_OP_GREATER_OR_EQUAL == (VkCompareOp)MALI_FUNC_GEQUAL); STATIC_ASSERT(VK_COMPARE_OP_ALWAYS == (VkCompareOp)MALI_FUNC_ALWAYS); return (enum mali_func)comp; } static enum mali_stencil_op translate_stencil_op(VkStencilOp in) { switch (in) { case VK_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP; case VK_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO; case VK_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE; case VK_STENCIL_OP_INCREMENT_AND_CLAMP: return MALI_STENCIL_OP_INCR_SAT; case VK_STENCIL_OP_DECREMENT_AND_CLAMP: return MALI_STENCIL_OP_DECR_SAT; case VK_STENCIL_OP_INCREMENT_AND_WRAP: return MALI_STENCIL_OP_INCR_WRAP; case VK_STENCIL_OP_DECREMENT_AND_WRAP: return MALI_STENCIL_OP_DECR_WRAP; case VK_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT; default: UNREACHABLE("Invalid stencil op"); } } static VkResult panvk_draw_prepare_fs_rsd(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { bool dirty = dyn_gfx_state_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) || dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) || dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) || dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_ENABLE) || dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_FACTORS) || dyn_gfx_state_dirty(cmdbuf, RS_LINE_MODE) || /* line mode needs primitive topology */ dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) || dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP_ENABLE) || dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP) || dyn_gfx_state_dirty(cmdbuf, CB_ATTACHMENT_COUNT) || dyn_gfx_state_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) || dyn_gfx_state_dirty(cmdbuf, CB_BLEND_ENABLES) || dyn_gfx_state_dirty(cmdbuf, CB_BLEND_EQUATIONS) || dyn_gfx_state_dirty(cmdbuf, CB_WRITE_MASKS) || dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS) || dyn_gfx_state_dirty(cmdbuf, COLOR_ATTACHMENT_MAP) || dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) || dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) || dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) || dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) || dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) || dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) || dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_COMPARE_MASK) || dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) || dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_REFERENCE) || dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) || dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) || dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) || dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) || gfx_state_dirty(cmdbuf, FS) || gfx_state_dirty(cmdbuf, OQ) || gfx_state_dirty(cmdbuf, RENDER_STATE); if (!dirty) { draw->fs.rsd = cmdbuf->state.gfx.fs.rsd; return VK_SUCCESS; } const struct vk_dynamic_graphics_state *dyns = &cmdbuf->vk.dynamic_graphics_state; const struct vk_rasterization_state *rs = &dyns->rs; const struct vk_depth_stencil_state *ds = &dyns->ds; const struct vk_input_assembly_state *ia = &dyns->ia; const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf)); const struct pan_shader_info *fs_info = fs ? &fs->info : NULL; uint32_t bd_count = MAX2(cmdbuf->state.gfx.render.fb.info.rt_count, 1); bool test_s = has_stencil_att(cmdbuf) && ds->stencil.test_enable; bool test_z = has_depth_att(cmdbuf) && ds->depth.test_enable; bool writes_z = writes_depth(cmdbuf); bool writes_s = writes_stencil(cmdbuf); bool msaa = dyns->ms.rasterization_samples > 1; if ((ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST || ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP) && rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM) { /* we need to disable MSAA when rendering bresenham lines. * * From the Vulkan spec: * "When Bresenham lines are being rasterized, sample locations may * all be treated as being at the pixel center (this may affect * attribute and depth interpolation)."" */ msaa = false; } struct pan_ptr ptr = panvk_cmd_alloc_desc_aggregate( cmdbuf, PAN_DESC(RENDERER_STATE), PAN_DESC_ARRAY(bd_count, BLEND)); if (!ptr.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; struct mali_renderer_state_packed *rsd = ptr.cpu; struct mali_blend_packed *bds = ptr.cpu + pan_size(RENDERER_STATE); struct panvk_blend_info *binfo = &cmdbuf->state.gfx.cb.info; uint64_t fs_code = panvk_shader_variant_get_dev_addr(fs); if (fs_info != NULL) { panvk_per_arch(blend_emit_descs)(cmdbuf, bds); } else { for (unsigned i = 0; i < bd_count; i++) { pan_pack(&bds[i], BLEND, cfg) { cfg.enable = false; cfg.internal.mode = MALI_BLEND_MODE_OFF; } } } pan_pack(rsd, RENDERER_STATE, cfg) { bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable; if (fs) { pan_shader_prepare_rsd(fs_info, fs_code, &cfg); uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments & MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS; uint8_t rt_written = color_attachment_written_mask( fs, &cmdbuf->vk.dynamic_graphics_state.cal); uint8_t rt_read = color_attachment_read_mask(fs, &dyns->ial, rt_mask); enum pan_earlyzs_zs_tilebuf_read zs_read = (z_attachment_read(fs, &dyns->ial) || s_attachment_read(fs, &dyns->ial)) ? PAN_EARLYZS_ZS_TILEBUF_READ_NO_OPT : PAN_EARLYZS_ZS_TILEBUF_NOT_READ; cfg.properties.allow_forward_pixel_to_kill = fs_info->fs.can_fpk && !(rt_mask & ~rt_written) && !(rt_read & rt_written) && !alpha_to_coverage && !binfo->any_dest_read; bool writes_zs = writes_z || writes_s; bool zs_always_passes = ds_test_always_passes(cmdbuf); bool oq = cmdbuf->state.gfx.occlusion_query.mode != MALI_OCCLUSION_MODE_DISABLED; struct pan_earlyzs_state earlyzs = pan_earlyzs_get(fs->fs.earlyzs_lut, writes_zs || oq, alpha_to_coverage, zs_always_passes, zs_read); /* early ZS check for FPK is performed by HW on v7+ */ cfg.properties.allow_forward_pixel_to_be_killed = !fs->info.writes_global && ((PAN_ARCH > 6) || earlyzs.kill != MALI_PIXEL_KILL_FORCE_LATE); cfg.properties.pixel_kill_operation = earlyzs.kill; cfg.properties.zs_update_operation = earlyzs.update; cfg.multisample_misc.evaluate_per_sample = (fs->info.fs.sample_shading && dyns->ms.rasterization_samples > 1); } else { cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION; cfg.properties.allow_forward_pixel_to_kill = true; cfg.properties.allow_forward_pixel_to_be_killed = true; cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY; } cfg.multisample_misc.multisample_enable = msaa; cfg.multisample_misc.sample_mask = msaa ? dyns->ms.sample_mask : UINT16_MAX; cfg.multisample_misc.depth_function = test_z ? translate_compare_func(ds->depth.compare_op) : MALI_FUNC_ALWAYS; cfg.multisample_misc.depth_write_mask = writes_z; cfg.multisample_misc.fixed_function_near_discard = cfg.multisample_misc.fixed_function_far_discard = vk_rasterization_state_depth_clip_enable(rs); cfg.multisample_misc.fixed_function_depth_range_fixed = !rs->depth_clamp_enable; cfg.multisample_misc.shader_depth_range_fixed = true; cfg.stencil_mask_misc.stencil_enable = test_s; cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage; cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS; cfg.stencil_mask_misc.front_facing_depth_bias = rs->depth_bias.enable; cfg.stencil_mask_misc.back_facing_depth_bias = rs->depth_bias.enable; if (rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM) cfg.stencil_mask_misc.aligned_line_ends = true; cfg.depth_units = rs->depth_bias.constant_factor; cfg.depth_factor = rs->depth_bias.slope_factor; cfg.depth_bias_clamp = rs->depth_bias.clamp; cfg.stencil_front.mask = ds->stencil.front.compare_mask; cfg.stencil_back.mask = ds->stencil.back.compare_mask; cfg.stencil_mask_misc.stencil_mask_front = ds->stencil.front.write_mask; cfg.stencil_mask_misc.stencil_mask_back = ds->stencil.back.write_mask; cfg.stencil_front.reference_value = ds->stencil.front.reference; cfg.stencil_back.reference_value = ds->stencil.back.reference; if (test_s) { cfg.stencil_front.compare_function = translate_compare_func(ds->stencil.front.op.compare); cfg.stencil_front.stencil_fail = translate_stencil_op(ds->stencil.front.op.fail); cfg.stencil_front.depth_fail = translate_stencil_op(ds->stencil.front.op.depth_fail); cfg.stencil_front.depth_pass = translate_stencil_op(ds->stencil.front.op.pass); cfg.stencil_back.compare_function = translate_compare_func(ds->stencil.back.op.compare); cfg.stencil_back.stencil_fail = translate_stencil_op(ds->stencil.back.op.fail); cfg.stencil_back.depth_fail = translate_stencil_op(ds->stencil.back.op.depth_fail); cfg.stencil_back.depth_pass = translate_stencil_op(ds->stencil.back.op.pass); } } cmdbuf->state.gfx.fs.rsd = ptr.gpu; draw->fs.rsd = cmdbuf->state.gfx.fs.rsd; return VK_SUCCESS; } static VkResult panvk_draw_prepare_tiler_context(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { struct panvk_batch *batch = cmdbuf->cur_batch; VkResult result = panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf, draw->info.layer_id); if (result != VK_SUCCESS) return result; draw->tiler_ctx = &batch->tiler.ctx; return VK_SUCCESS; } static VkResult panvk_draw_prepare_varyings(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); const struct panvk_shader_link *link = &cmdbuf->state.gfx.link; struct pan_ptr bufs = panvk_cmd_alloc_desc_array( cmdbuf, PANVK_VARY_BUF_MAX + 1, ATTRIBUTE_BUFFER); if (!bufs.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; struct mali_attribute_buffer_packed *buf_descs = bufs.cpu; const struct vk_input_assembly_state *ia = &cmdbuf->vk.dynamic_graphics_state.ia; bool writes_point_size = vs->info.vs.writes_point_size && ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST; uint64_t psiz_buf = 0; if (is_indirect_draw(draw) && !cmdbuf->state.gfx.vs.indirect_varying_bufs_infos) { struct pan_ptr bufs_info_storage = panvk_cmd_alloc_dev_mem( cmdbuf, desc, sizeof(struct libpan_draw_helper_varying_buf_info), 8); if (!bufs_info_storage.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; cmdbuf->state.gfx.vs.indirect_varying_bufs_infos = bufs_info_storage.gpu; struct libpan_draw_helper_varying_buf_info *vary_bufs_info = bufs_info_storage.cpu; vary_bufs_info->address = dev->indirect_varying_buffer->addr.dev; vary_bufs_info->size = PANVK_JM_MAX_PER_VTX_ATTRIBUTES_INDIRECT_SIZE * PANVK_JM_MAX_VERTICES_INDIRECT; vary_bufs_info->offset = 0; } for (unsigned i = 0; i < PANVK_VARY_BUF_MAX; i++) { uint32_t buf_size; uint64_t buf_addr; if (is_indirect_draw(draw)) { buf_addr = dev->indirect_varying_buffer->addr.dev; buf_size = 0; } else { buf_size = draw->padded_vertex_count * draw->info.instance.count * link->buf_strides[i]; buf_addr = buf_size ? panvk_cmd_alloc_dev_mem(cmdbuf, varying, buf_size, 64).gpu : 0; if (buf_size && !buf_addr) return VK_ERROR_OUT_OF_DEVICE_MEMORY; } pan_pack(&buf_descs[i], ATTRIBUTE_BUFFER, cfg) { cfg.stride = link->buf_strides[i]; cfg.size = buf_size; cfg.pointer = buf_addr; } if (i == PANVK_VARY_BUF_POSITION) draw->position = buf_addr; if (i == PANVK_VARY_BUF_PSIZ) psiz_buf = buf_addr; } /* We need an empty entry to stop prefetching on Bifrost */ memset(bufs.cpu + (pan_size(ATTRIBUTE_BUFFER) * PANVK_VARY_BUF_MAX), 0, pan_size(ATTRIBUTE_BUFFER)); if (writes_point_size) draw->psiz = psiz_buf; else if (ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST || ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP) draw->line_width = cmdbuf->vk.dynamic_graphics_state.rs.line.width; else draw->line_width = 1.0f; draw->varying_bufs = bufs.gpu; draw->indirect_info.varying_bufs = cmdbuf->state.gfx.vs.indirect_varying_bufs_infos; draw->vs.varyings = panvk_priv_mem_dev_addr(link->vs.attribs); draw->fs.varyings = panvk_priv_mem_dev_addr(link->fs.attribs); return VK_SUCCESS; } static void panvk_draw_emit_attrib_buf( const struct panvk_draw_data *draw, const struct vk_vertex_binding_state *buf_info, uint32_t stride, const struct panvk_attrib_buf *buf, struct mali_attribute_buffer_packed *desc, struct libpan_draw_helper_attrib_buf_info *helper_buf_info) { uint64_t addr = buf->address & ~63ULL; unsigned size = buf->size + (buf->address & 63); unsigned divisor = draw->padded_vertex_count * buf_info->divisor; bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE; struct mali_attribute_buffer_packed *buf_ext = &desc[1]; /* In case of indirect draw, the descriptor will be patched at runtime */ if (helper_buf_info != NULL) { pan_pack(desc, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D; cfg.pointer = addr; cfg.size = size; } helper_buf_info->divisor = buf_info->divisor; helper_buf_info->stride = stride; helper_buf_info->per_instance = per_instance; } else if (draw->info.instance.count <= 1) { pan_pack(desc, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D; cfg.stride = per_instance ? 0 : stride; cfg.pointer = addr; cfg.size = size; } } else if (!per_instance) { pan_pack(desc, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS; cfg.divisor = draw->padded_vertex_count; cfg.stride = stride; cfg.pointer = addr; cfg.size = size; } } else if (!divisor) { /* instance_divisor == 0 means all instances share the same value. * Make it a 1D array with a zero stride. */ pan_pack(desc, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D; cfg.stride = 0; cfg.pointer = addr; cfg.size = size; } } else if (util_is_power_of_two_or_zero(divisor)) { pan_pack(desc, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; cfg.stride = stride; cfg.pointer = addr; cfg.size = size; cfg.divisor_r = __builtin_ctz(divisor); } } else { unsigned divisor_r = 0, divisor_e = 0; unsigned divisor_d = pan_compute_npot_divisor(divisor, &divisor_r, &divisor_e); pan_pack(desc, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; cfg.stride = stride; cfg.pointer = addr; cfg.size = size; cfg.divisor_r = divisor_r; cfg.divisor_e = divisor_e; } pan_cast_and_pack(buf_ext, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) { cfg.divisor_numerator = divisor_d; cfg.divisor = buf_info->divisor; } buf_ext = NULL; } /* If the buffer extension wasn't used, memset(0) */ if (buf_ext) memset(buf_ext, 0, pan_size(ATTRIBUTE_BUFFER)); } static void panvk_draw_emit_attrib(const struct panvk_draw_data *draw, const struct vk_vertex_attribute_state *attrib_info, const struct vk_vertex_binding_state *buf_info, const struct panvk_attrib_buf *buf, struct mali_attribute_packed *desc, struct libpan_draw_helper_attrib_info *helper_attrib_info) { bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE; enum pipe_format f = vk_format_to_pipe_format(attrib_info->format); unsigned buf_idx = attrib_info->binding; pan_pack(desc, ATTRIBUTE, cfg) { cfg.buffer_index = buf_idx * 2; cfg.offset_enable = true; cfg.format = GENX(pan_format_from_pipe_format)(f)->hw; uint32_t offset = attrib_info->offset + (buf->address & 63); /* In case of indirect draw, the descriptor will be patched at runtime */ if (helper_attrib_info != NULL) { helper_attrib_info->base_offset = offset; helper_attrib_info->stride = per_instance ? buf_info->stride : 0; } else { cfg.offset = offset; if (per_instance) cfg.offset += draw->info.instance.base * buf_info->stride; } } } static VkResult panvk_draw_prepare_vs_attribs(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); const struct vk_dynamic_graphics_state *dyns = &cmdbuf->vk.dynamic_graphics_state; const struct vk_vertex_input_state *vi = dyns->vi; unsigned num_imgs = vs->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG]; unsigned num_vs_attribs = util_last_bit(vi->attributes_valid); unsigned num_vbs = util_last_bit(vi->bindings_valid); unsigned attrib_count = num_imgs ? MAX_VS_ATTRIBS + num_imgs : num_vs_attribs; bool dirty = dyn_gfx_state_dirty(cmdbuf, VI) || dyn_gfx_state_dirty(cmdbuf, VI_BINDINGS_VALID) || dyn_gfx_state_dirty(cmdbuf, VI_BINDING_STRIDES) || gfx_state_dirty(cmdbuf, VB) || gfx_state_dirty(cmdbuf, DESC_STATE) || is_indirect_draw(draw) != cmdbuf->state.gfx.vs.previous_draw_was_indirect; if (!dirty) return VK_SUCCESS; unsigned attrib_buf_count = (num_vbs + num_imgs) * 2; struct pan_ptr bufs = panvk_cmd_alloc_desc_array( cmdbuf, attrib_buf_count + 1, ATTRIBUTE_BUFFER); struct mali_attribute_buffer_packed *attrib_buf_descs = bufs.cpu; struct pan_ptr attribs = panvk_cmd_alloc_desc_array(cmdbuf, attrib_count, ATTRIBUTE); struct mali_attribute_packed *attrib_descs = attribs.cpu; if (!bufs.gpu || (attrib_count && !attribs.gpu)) return VK_ERROR_OUT_OF_DEVICE_MEMORY; struct libpan_draw_helper_attrib_buf_info *bufs_infos = NULL; struct libpan_draw_helper_attrib_info *attribs_infos = NULL; if (is_indirect_draw(draw)) { struct pan_ptr bufs_infos_storage = panvk_cmd_alloc_dev_mem( cmdbuf, desc, num_vbs * sizeof(struct libpan_draw_helper_attrib_buf_info), 8); struct pan_ptr attribs_infos_storage = panvk_cmd_alloc_dev_mem( cmdbuf, desc, num_vs_attribs * sizeof(struct libpan_draw_helper_attrib_info), 8); if (!bufs_infos_storage.gpu || (num_vs_attribs && !attribs_infos_storage.gpu)) return VK_ERROR_OUT_OF_DEVICE_MEMORY; cmdbuf->state.gfx.vs.indirect_attrib_bufs_infos = bufs_infos_storage.gpu; cmdbuf->state.gfx.vs.indirect_attribs_infos = attribs_infos_storage.gpu; bufs_infos = bufs_infos_storage.cpu; attribs_infos = attribs_infos_storage.cpu; } for (unsigned i = 0; i < num_vbs; i++) { if (vi->bindings_valid & BITFIELD_BIT(i)) { struct libpan_draw_helper_attrib_buf_info *helper_buf_info = bufs_infos ? &bufs_infos[i] : NULL; panvk_draw_emit_attrib_buf(draw, &vi->bindings[i], dyns->vi_binding_strides[i], &cmdbuf->state.gfx.vb.bufs[i], &attrib_buf_descs[i * 2], helper_buf_info); } else { memset(&attrib_buf_descs[i * 2], 0, sizeof(*attrib_buf_descs) * 2); } } for (unsigned i = 0; i < num_vs_attribs; i++) { if (vi->attributes_valid & BITFIELD_BIT(i)) { unsigned buf_idx = vi->attributes[i].binding; struct libpan_draw_helper_attrib_info *helper_attrib_info = attribs_infos ? &attribs_infos[i] : NULL; panvk_draw_emit_attrib(draw, &vi->attributes[i], &vi->bindings[buf_idx], &cmdbuf->state.gfx.vb.bufs[buf_idx], &attrib_descs[i], helper_attrib_info); } else { memset(&attrib_descs[i], 0, sizeof(attrib_descs[0])); } } /* A NULL entry is needed to stop prefecting on Bifrost */ memset(bufs.cpu + (pan_size(ATTRIBUTE_BUFFER) * attrib_buf_count), 0, pan_size(ATTRIBUTE_BUFFER)); cmdbuf->state.gfx.vs.attrib_bufs = bufs.gpu; cmdbuf->state.gfx.vs.attribs = attribs.gpu; if (num_imgs) { cmdbuf->state.gfx.vs.desc.img_attrib_table = attribs.gpu + (MAX_VS_ATTRIBS * pan_size(ATTRIBUTE)); cmdbuf->state.gfx.vs.desc.tables[PANVK_BIFROST_DESC_TABLE_IMG] = bufs.gpu + (num_vbs * pan_size(ATTRIBUTE_BUFFER) * 2); } return VK_SUCCESS; } static void panvk_draw_prepare_attributes(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { panvk_draw_prepare_vs_attribs(cmdbuf, draw); draw->vs.attributes = cmdbuf->state.gfx.vs.attribs; draw->vs.attribute_bufs = cmdbuf->state.gfx.vs.attrib_bufs; draw->indirect_info.attribs = cmdbuf->state.gfx.vs.indirect_attribs_infos; draw->indirect_info.attrib_bufs = cmdbuf->state.gfx.vs.indirect_attrib_bufs_infos; } static void panvk_emit_viewport(struct panvk_cmd_buffer *cmdbuf, struct mali_viewport_packed *vpd) { const struct vk_viewport_state *vp = &cmdbuf->vk.dynamic_graphics_state.vp; if (vp->viewport_count < 1) return; const VkViewport *viewport = &vp->viewports[0]; const VkRect2D *scissor = &vp->scissors[0]; float minz, maxz; panvk_depth_range(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state.vp, &minz, &maxz); /* The spec says "width must be greater than 0.0" */ assert(viewport->width >= 0); int minx = (int)viewport->x; int maxx = (int)(viewport->x + viewport->width); /* Viewport height can be negative */ int miny = MIN2((int)viewport->y, (int)(viewport->y + viewport->height)); int maxy = MAX2((int)viewport->y, (int)(viewport->y + viewport->height)); assert(scissor->offset.x >= 0 && scissor->offset.y >= 0); minx = MAX2(scissor->offset.x, minx); miny = MAX2(scissor->offset.y, miny); maxx = MIN2(scissor->offset.x + scissor->extent.width, maxx); maxy = MIN2(scissor->offset.y + scissor->extent.height, maxy); /* Make sure we don't end up with a max < min when width/height is 0 */ maxx = maxx > minx ? maxx - 1 : maxx; maxy = maxy > miny ? maxy - 1 : maxy; /* Clamp viewport scissor to valid range */ minx = CLAMP(minx, 0, UINT16_MAX); maxx = CLAMP(maxx, 0, UINT16_MAX); miny = CLAMP(miny, 0, UINT16_MAX); maxy = CLAMP(maxy, 0, UINT16_MAX); pan_pack(vpd, VIEWPORT, cfg) { cfg.scissor_minimum_x = minx; cfg.scissor_minimum_y = miny; cfg.scissor_maximum_x = maxx; cfg.scissor_maximum_y = maxy; cfg.minimum_z = minz; cfg.maximum_z = maxz; } } static VkResult panvk_draw_prepare_viewport(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { /* When rasterizerDiscardEnable is active, it is allowed to have viewport and * scissor disabled. * As a result, we define an empty one. */ if (!cmdbuf->state.gfx.vpd || dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) || dyn_gfx_state_dirty(cmdbuf, VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) || dyn_gfx_state_dirty(cmdbuf, VP_SCISSORS) || dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) || dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE)) { struct pan_ptr vp = panvk_cmd_alloc_desc(cmdbuf, VIEWPORT); if (!vp.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; panvk_emit_viewport(cmdbuf, vp.cpu); cmdbuf->state.gfx.vpd = vp.gpu; } draw->viewport = cmdbuf->state.gfx.vpd; return VK_SUCCESS; } static void panvk_emit_vertex_dcd(struct panvk_cmd_buffer *cmdbuf, const struct panvk_draw_data *draw, struct mali_draw_packed *dcd) { const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); const struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; pan_pack(dcd, DRAW, cfg) { cfg.state = panvk_priv_mem_dev_addr(vs->rsd); cfg.attributes = draw->vs.attributes; cfg.attribute_buffers = draw->vs.attribute_bufs; cfg.varyings = draw->vs.varyings; cfg.varying_buffers = draw->varying_bufs; cfg.thread_storage = draw->tls; /* In case of indirect draw, the descriptor will be patched at runtime */ if (!is_indirect_draw(draw)) { cfg.offset_start = draw->info.vertex.raw_offset; cfg.instance_size = draw->info.instance.count > 1 ? draw->padded_vertex_count : 1; } cfg.uniform_buffers = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO]; cfg.push_uniforms = cmdbuf->state.gfx.vs.push_uniforms; cfg.textures = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]; cfg.samplers = vs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]; } } static VkResult panvk_draw_prepare_vertex_job(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { struct panvk_batch *batch = cmdbuf->cur_batch; struct pan_ptr ptr = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB); if (!ptr.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; util_dynarray_append(&batch->jobs, ptr.cpu); draw->jobs.vertex = ptr; memcpy(pan_section_ptr(ptr.cpu, COMPUTE_JOB, INVOCATION), &draw->invocation, pan_size(INVOCATION)); pan_section_pack(ptr.cpu, COMPUTE_JOB, PARAMETERS, cfg) { cfg.job_task_split = 5; } panvk_emit_vertex_dcd(cmdbuf, draw, pan_section_ptr(ptr.cpu, COMPUTE_JOB, DRAW)); return VK_SUCCESS; } static enum mali_draw_mode translate_prim_topology(VkPrimitiveTopology in) { /* Test VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA separately, as it's not * part of the VkPrimitiveTopology enum. */ if (in == VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA) return MALI_DRAW_MODE_TRIANGLES; switch (in) { case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: return MALI_DRAW_MODE_POINTS; case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: return MALI_DRAW_MODE_LINES; case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: return MALI_DRAW_MODE_LINE_STRIP; case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: return MALI_DRAW_MODE_TRIANGLES; case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: return MALI_DRAW_MODE_TRIANGLE_STRIP; case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: return MALI_DRAW_MODE_TRIANGLE_FAN; case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: default: UNREACHABLE("Invalid primitive type"); } } static void panvk_emit_tiler_primitive(struct panvk_cmd_buffer *cmdbuf, const struct panvk_draw_data *draw, struct mali_primitive_packed *prim) { const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf)); const struct vk_dynamic_graphics_state *dyns = &cmdbuf->vk.dynamic_graphics_state; const struct vk_input_assembly_state *ia = &dyns->ia; const struct vk_rasterization_state *rs = &dyns->rs; bool writes_point_size = vs->info.vs.writes_point_size && ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST; bool secondary_shader = vs->info.vs.secondary_enable && fs != NULL; assert(!(vs->info.outputs_written & VARYING_BIT_PRIMITIVE_ID)); bool fs_reads_primitive_id = fs ? fs->info.fs.reads_primitive_id : false; pan_pack(prim, PRIMITIVE, cfg) { cfg.draw_mode = translate_prim_topology(ia->primitive_topology); if (writes_point_size) cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; cfg.primitive_index_enable = fs_reads_primitive_id; cfg.primitive_index_writeback = fs_reads_primitive_id; cfg.first_provoking_vertex = cmdbuf->state.gfx.render.first_provoking_vertex != U_TRISTATE_NO; if (ia->primitive_restart_enable) cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT; cfg.job_task_split = 6; if (draw->info.index.size) { switch (draw->info.index.size) { case 4: cfg.index_type = MALI_INDEX_TYPE_UINT32; break; case 2: cfg.index_type = MALI_INDEX_TYPE_UINT16; break; case 1: cfg.index_type = MALI_INDEX_TYPE_UINT8; break; default: UNREACHABLE("Invalid index size"); } } /* In case of indirect draw, the descriptor will be patched at runtime */ cfg.index_count = is_indirect_draw(draw) ? 1 : draw->info.vertex.count; cfg.low_depth_cull = cfg.high_depth_cull = vk_rasterization_state_depth_clip_enable(rs); cfg.secondary_shader = secondary_shader; } } static void panvk_emit_tiler_primitive_size(struct panvk_cmd_buffer *cmdbuf, const struct panvk_draw_data *draw, struct mali_primitive_size_packed *primsz) { const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); const struct vk_input_assembly_state *ia = &cmdbuf->vk.dynamic_graphics_state.ia; bool writes_point_size = vs->info.vs.writes_point_size && ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST; pan_pack(primsz, PRIMITIVE_SIZE, cfg) { if (writes_point_size) { cfg.size_array = draw->psiz; } else { cfg.fixed_sized = draw->line_width; } } } static uint32_t primitive_vertex_count(enum mali_draw_mode in) { switch (in) { case MALI_DRAW_MODE_POINTS: return 1; case MALI_DRAW_MODE_LINES: case MALI_DRAW_MODE_LINE_STRIP: return 2; case MALI_DRAW_MODE_TRIANGLES: case MALI_DRAW_MODE_TRIANGLE_STRIP: case MALI_DRAW_MODE_TRIANGLE_FAN: return 3; default: UNREACHABLE("Invalid draw mode"); } } static void panvk_emit_tiler_dcd(struct panvk_cmd_buffer *cmdbuf, const struct panvk_draw_data *draw, struct mali_draw_packed *dcd) { struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; const struct vk_rasterization_state *rs = &cmdbuf->vk.dynamic_graphics_state.rs; const struct vk_input_assembly_state *ia = &cmdbuf->vk.dynamic_graphics_state.ia; pan_pack(dcd, DRAW, cfg) { cfg.front_face_ccw = rs->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE; cfg.cull_front_face = (rs->cull_mode & VK_CULL_MODE_FRONT_BIT) != 0; cfg.cull_back_face = (rs->cull_mode & VK_CULL_MODE_BACK_BIT) != 0; cfg.position = draw->position; cfg.state = draw->fs.rsd; cfg.attributes = fs_desc_state->img_attrib_table; cfg.attribute_buffers = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_IMG]; cfg.viewport = draw->viewport; cfg.varyings = draw->fs.varyings; cfg.varying_buffers = cfg.varyings ? draw->varying_bufs : 0; cfg.thread_storage = draw->tls; /* For all primitives but lines DRAW.flat_shading_vertex must * be set to 0 and the provoking vertex is selected with the * PRIMITIVE.first_provoking_vertex field. */ if (ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST || ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP) cfg.flat_shading_vertex = true; /* In case of indirect draw, the descriptor will be patched at runtime */ if (!is_indirect_draw(draw)) { cfg.offset_start = draw->info.vertex.raw_offset; cfg.instance_size = draw->info.instance.count > 1 ? draw->padded_vertex_count : 1; uint32_t primitives_per_instance = DIV_ROUND_UP(draw->padded_vertex_count, primitive_vertex_count( translate_prim_topology(ia->primitive_topology))); /* instance_primitive_size has the same restrictions as * padded_vertex_count, so we can use pan_padded_vertex_count here. */ cfg.instance_primitive_size = pan_padded_vertex_count(primitives_per_instance); } cfg.uniform_buffers = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO]; cfg.push_uniforms = cmdbuf->state.gfx.fs.push_uniforms; cfg.textures = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]; cfg.samplers = fs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]; cfg.occlusion_query = cmdbuf->state.gfx.occlusion_query.mode; cfg.occlusion = cmdbuf->state.gfx.occlusion_query.ptr; } } static void set_provoking_vertex_mode(struct panvk_cmd_buffer *cmdbuf, enum u_tristate first_provoking_vertex) { struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx; if (first_provoking_vertex != U_TRISTATE_UNSET) { /* If this is not the first draw, first_provoking_vertex should match * the one from the previous draws. Unfortunately, we can't check it * when the render pass is inherited. */ assert(state->render.first_provoking_vertex == U_TRISTATE_UNSET || state->render.first_provoking_vertex == first_provoking_vertex); state->render.first_provoking_vertex = first_provoking_vertex; } /* Once we emit the first FBDs/TDs, we need to commit to a state. If we * choose the wrong one, we will fail the assert when the next application * draw happens (with a different state). Use PROVOKING_VERTEX_MODE_FIRST * because it's the vulkan default, and so likely to be right more often. * * TODO: handle this case better */ if (state->render.first_provoking_vertex == U_TRISTATE_UNSET) state->render.first_provoking_vertex = U_TRISTATE_YES; } static VkResult panvk_draw_prepare_tiler_job(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { struct panvk_batch *batch = cmdbuf->cur_batch; const struct panvk_shader_variant *fs = panvk_shader_only_variant(cmdbuf->state.gfx.fs.shader); struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; struct pan_ptr ptr; VkResult result = panvk_per_arch(meta_get_copy_desc_job)( cmdbuf, fs, &cmdbuf->state.gfx.desc_state, fs_desc_state, 0, &ptr); if (result != VK_SUCCESS) return result; if (ptr.cpu) util_dynarray_append(&batch->jobs, ptr.cpu); draw->jobs.frag_copy_desc = ptr; ptr = panvk_cmd_alloc_desc(cmdbuf, TILER_JOB); util_dynarray_append(&batch->jobs, ptr.cpu); draw->jobs.tiler = ptr; memcpy(pan_section_ptr(ptr.cpu, TILER_JOB, INVOCATION), &draw->invocation, pan_size(INVOCATION)); panvk_emit_tiler_primitive(cmdbuf, draw, pan_section_ptr(ptr.cpu, TILER_JOB, PRIMITIVE)); panvk_emit_tiler_primitive_size( cmdbuf, draw, pan_section_ptr(ptr.cpu, TILER_JOB, PRIMITIVE_SIZE)); panvk_emit_tiler_dcd(cmdbuf, draw, pan_section_ptr(ptr.cpu, TILER_JOB, DRAW)); pan_section_pack(ptr.cpu, TILER_JOB, TILER, cfg) { cfg.address = PAN_ARCH >= 9 ? draw->tiler_ctx->valhall.desc : draw->tiler_ctx->bifrost.desc; } pan_section_pack(ptr.cpu, TILER_JOB, PADDING, padding) ; return VK_SUCCESS; } static VkResult panvk_draw_prepare_idvs_job(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { struct panvk_batch *batch = cmdbuf->cur_batch; struct pan_ptr ptr = panvk_cmd_alloc_desc(cmdbuf, INDEXED_VERTEX_JOB); if (!ptr.gpu) return VK_ERROR_OUT_OF_DEVICE_MEMORY; util_dynarray_append(&batch->jobs, ptr.cpu); draw->jobs.idvs = ptr; memcpy(pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, INVOCATION), &draw->invocation, pan_size(INVOCATION)); panvk_emit_tiler_primitive( cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, PRIMITIVE)); panvk_emit_tiler_primitive_size( cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, PRIMITIVE_SIZE)); pan_section_pack(ptr.cpu, INDEXED_VERTEX_JOB, TILER, cfg) { cfg.address = PAN_ARCH >= 9 ? draw->tiler_ctx->valhall.desc : draw->tiler_ctx->bifrost.desc; } pan_section_pack(ptr.cpu, INDEXED_VERTEX_JOB, PADDING, _) { } panvk_emit_tiler_dcd( cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, FRAGMENT_DRAW)); panvk_emit_vertex_dcd( cmdbuf, draw, pan_section_ptr(ptr.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW)); return VK_SUCCESS; } static VkResult panvk_draw_prepare_vs_copy_desc_job(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { struct panvk_batch *batch = cmdbuf->cur_batch; const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); const struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; const struct vk_vertex_input_state *vi = cmdbuf->vk.dynamic_graphics_state.vi; unsigned num_vbs = util_last_bit(vi->bindings_valid); struct pan_ptr ptr; VkResult result = panvk_per_arch(meta_get_copy_desc_job)( cmdbuf, vs, &cmdbuf->state.gfx.desc_state, vs_desc_state, num_vbs * pan_size(ATTRIBUTE_BUFFER) * 2, &ptr); if (result != VK_SUCCESS) return result; if (ptr.cpu) { util_dynarray_append(&batch->jobs, ptr.cpu); } draw->jobs.vertex_copy_desc = ptr; return VK_SUCCESS; } static VkResult panvk_draw_prepare_fs_copy_desc_job(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { const struct panvk_shader_variant *fs = panvk_shader_only_variant(cmdbuf->state.gfx.fs.shader); struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; struct panvk_batch *batch = cmdbuf->cur_batch; struct pan_ptr ptr; VkResult result = panvk_per_arch(meta_get_copy_desc_job)( cmdbuf, fs, &cmdbuf->state.gfx.desc_state, fs_desc_state, 0, &ptr); if (result != VK_SUCCESS) return result; if (ptr.cpu) { util_dynarray_append(&batch->jobs, ptr.cpu); } draw->jobs.frag_copy_desc = ptr; return VK_SUCCESS; } void panvk_per_arch(cmd_preload_fb_after_batch_split)(struct panvk_cmd_buffer *cmdbuf) { for (unsigned i = 0; i < cmdbuf->state.gfx.render.fb.info.rt_count; i++) { if (cmdbuf->state.gfx.render.fb.info.rts[i].view) { cmdbuf->state.gfx.render.fb.info.rts[i].clear = false; cmdbuf->state.gfx.render.fb.info.rts[i].preload = true; } } if (cmdbuf->state.gfx.render.fb.info.zs.view.zs) { cmdbuf->state.gfx.render.fb.info.zs.clear.z = false; cmdbuf->state.gfx.render.fb.info.zs.preload.z = true; } if (cmdbuf->state.gfx.render.fb.info.zs.view.s || (cmdbuf->state.gfx.render.fb.info.zs.view.zs && util_format_is_depth_and_stencil( cmdbuf->state.gfx.render.fb.info.zs.view.zs->format))) { cmdbuf->state.gfx.render.fb.info.zs.clear.s = false; cmdbuf->state.gfx.render.fb.info.zs.preload.s = true; } } static VkResult panvk_cmd_prepare_draw_link_shaders(struct panvk_cmd_buffer *cmd) { struct panvk_cmd_graphics_state *gfx = &cmd->state.gfx; if (!gfx_state_dirty(cmd, VS) && !gfx_state_dirty(cmd, FS)) return VK_SUCCESS; const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmd->state.gfx.vs.shader); const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmd)); VkResult result = panvk_per_arch(link_shaders)(&cmd->desc_pool, vs, fs, &gfx->link); if (result != VK_SUCCESS) { vk_command_buffer_set_error(&cmd->vk, result); return result; } return VK_SUCCESS; } static VkResult prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { struct panvk_batch *batch = cmdbuf->cur_batch; const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc; struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc; struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state; const struct vk_rasterization_state *rs = &cmdbuf->vk.dynamic_graphics_state.rs; VkResult result; const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf)); /* There are only 16 bits in the descriptor for the job ID. Each job has a * pilot shader dealing with descriptor copies, and we need one * pair per draw. */ if (batch->vtc_jc.job_index + (4 * cmdbuf->state.gfx.render.layer_count) >= UINT16_MAX) { panvk_per_arch(cmd_close_batch)(cmdbuf); panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf); batch = panvk_per_arch(cmd_open_batch)(cmdbuf); } if (fs_user_dirty(cmdbuf)) { result = panvk_cmd_prepare_draw_link_shaders(cmdbuf); if (result != VK_SUCCESS) return result; } if (cmdbuf->state.gfx.vk_meta) { /* vk_meta doesn't care about the provoking vertex mode, we should use * the same mode that the application uses. */ set_provoking_vertex_mode(cmdbuf, U_TRISTATE_UNSET); } else { enum u_tristate first_provoking_vertex = u_tristate_make( cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT); set_provoking_vertex_mode(cmdbuf, first_provoking_vertex); } if (!rs->rasterizer_discard_enable) { const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; uint32_t *nr_samples = &cmdbuf->state.gfx.render.fb.nr_samples; uint32_t rasterization_samples = cmdbuf->vk.dynamic_graphics_state.ms.rasterization_samples; /* If there's no attachment, and the FB descriptor hasn't been allocated * yet, we patch nr_samples to match rasterization_samples, otherwise, we * make sure those two numbers match. */ if (!batch->fb.desc.gpu && !cmdbuf->state.gfx.render.bound_attachments) { assert(rasterization_samples > 0); *nr_samples = rasterization_samples; } else { assert(rasterization_samples == *nr_samples); } /* In case we already emitted tiler/framebuffer descriptors, we ensure * that the sample count didn't change * XXX: This currently can happen in case we resume a render pass with no * attachements and without any draw as the FBD is emitted when suspending. */ assert(fbinfo->nr_samples == 0 || fbinfo->nr_samples == cmdbuf->state.gfx.render.fb.nr_samples); result = panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf); if (result != VK_SUCCESS) return result; } panvk_per_arch(cmd_select_tile_size)(cmdbuf); result = panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true); if (result != VK_SUCCESS) return result; uint32_t used_set_mask = vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0); if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS) || gfx_state_dirty(cmdbuf, FS)) { result = panvk_per_arch(cmd_prepare_push_descs)(cmdbuf, desc_state, used_set_mask); if (result != VK_SUCCESS) return result; } if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS)) { result = panvk_per_arch(cmd_prepare_shader_desc_tables)( cmdbuf, desc_state, vs, vs_desc_state); if (result != VK_SUCCESS) return result; } /* No need to setup the FS desc tables if the FS is not executed. */ if (fs && (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, FS))) { result = panvk_per_arch(cmd_prepare_shader_desc_tables)( cmdbuf, desc_state, fs, fs_desc_state); if (result != VK_SUCCESS) return result; result = panvk_draw_prepare_fs_copy_desc_job(cmdbuf, draw); if (result != VK_SUCCESS) return result; } panvk_draw_prepare_attributes(cmdbuf, draw); if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS)) panvk_draw_prepare_vs_copy_desc_job(cmdbuf, draw); draw->tls = batch->tls.gpu; draw->fb = batch->fb.desc.gpu; result = panvk_draw_prepare_fs_rsd(cmdbuf, draw); if (result != VK_SUCCESS) return result; batch->tlsinfo.tls.size = MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, batch->tlsinfo.tls.size); if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS)) { VkResult result = panvk_per_arch(cmd_prepare_dyn_ssbos)( cmdbuf, desc_state, vs, vs_desc_state); if (result != VK_SUCCESS) return result; } if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, FS)) { VkResult result = panvk_per_arch(cmd_prepare_dyn_ssbos)( cmdbuf, desc_state, fs, fs_desc_state); if (result != VK_SUCCESS) return result; } panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, &draw->info); /* Viewport emission requires up-to-date {scale,offset}.z for min/max Z, * so we need to call it after calling cmd_prepare_draw_sysvals(), but * viewports are the same for all layers, so we only emit when layer_id=0. */ result = panvk_draw_prepare_viewport(cmdbuf, draw); if (result != VK_SUCCESS) return result; return VK_SUCCESS; } static void panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); VkResult result; /* If there's no vertex shader, we can skip the draw. */ if (!panvk_priv_mem_check_alloc(vs->rsd)) return; /* Needs to be done before get_fs() is called because it depends on * fs.required being initialized. */ cmdbuf->state.gfx.fs.required = fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state); result = prepare_draw(cmdbuf, draw); if (result != VK_SUCCESS) return; pan_pack_work_groups_compute(&draw->invocation, 1, draw->vertex_range, draw->info.instance.count, 1, 1, 1, true, false); struct panvk_batch *batch = cmdbuf->cur_batch; unsigned copy_desc_job_id = draw->jobs.vertex_copy_desc.gpu ? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, &draw->jobs.vertex_copy_desc, false) : 0; if (draw->jobs.frag_copy_desc.gpu) { /* We don't need to add frag_copy_desc as a dependency because the * tiler job doesn't execute the fragment shader, the fragment job * will, and the tiler/fragment synchronization happens at the batch * level. */ pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, &draw->jobs.frag_copy_desc, false); } uint32_t view_mask = cmdbuf->state.gfx.render.view_mask; assert(view_mask == 0 || util_bitcount(view_mask) <= batch->fb.layer_count); uint32_t enabled_layer_count = view_mask ? util_bitcount(view_mask) : cmdbuf->state.gfx.render.layer_count; const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf)); for (uint32_t i = 0; i < enabled_layer_count; i++) { result = panvk_draw_prepare_varyings(cmdbuf, draw); if (result != VK_SUCCESS) return; draw->info.layer_id = (view_mask != 0) ? u_bit_scan(&view_mask) : i; if (draw->info.layer_id > 0) { cmdbuf->state.gfx.sysvals.layer_id = draw->info.layer_id; gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); } result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, vs, 1); if (result != VK_SUCCESS) return; if (fs) { result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, fs, 1); if (result != VK_SUCCESS) return; } result = panvk_draw_prepare_tiler_context(cmdbuf, draw); if (result != VK_SUCCESS) return; if (vs->info.vs.idvs) { result = panvk_draw_prepare_idvs_job(cmdbuf, draw); if (result != VK_SUCCESS) return; pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, copy_desc_job_id, &draw->jobs.idvs, false); } else { result = panvk_draw_prepare_vertex_job(cmdbuf, draw); if (result != VK_SUCCESS) return; unsigned vjob_id = pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_VERTEX, false, false, 0, copy_desc_job_id, &draw->jobs.vertex, false); bool needs_tiling = !cmdbuf->vk.dynamic_graphics_state.rs.rasterizer_discard_enable || cmdbuf->state.gfx.occlusion_query.mode != MALI_OCCLUSION_MODE_DISABLED; if (needs_tiling) { panvk_draw_prepare_tiler_job(cmdbuf, draw); pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_TILER, false, false, vjob_id, 0, &draw->jobs.tiler, false); } } } clear_dirty_after_draw(cmdbuf); cmdbuf->state.gfx.vs.previous_draw_was_indirect = false; } static void panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) { const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); VkResult result; /* If there's no vertex shader, we can skip the draw. */ if (!panvk_priv_mem_check_alloc(vs->rsd)) return; /* Needs to be done before get_fs() is called because it depends on * fs.required being initialized. */ cmdbuf->state.gfx.fs.required = fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state); result = prepare_draw(cmdbuf, draw); if (result != VK_SUCCESS) return; struct panvk_batch *batch = cmdbuf->cur_batch; const struct vk_input_assembly_state *ia = &cmdbuf->vk.dynamic_graphics_state.ia; const struct vk_vertex_input_state *vi = cmdbuf->vk.dynamic_graphics_state.vi; unsigned copy_desc_job_id = draw->jobs.vertex_copy_desc.gpu ? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, &draw->jobs.vertex_copy_desc, false) : 0; if (draw->jobs.frag_copy_desc.gpu) { /* We don't need to add frag_copy_desc as a dependency because the * tiler job doesn't execute the fragment shader, the fragment job * will, and the tiler/fragment synchronization happens at the batch * level. */ pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, 0, &draw->jobs.frag_copy_desc, false); } uint32_t view_mask = cmdbuf->state.gfx.render.view_mask; assert(view_mask == 0 || util_bitcount(view_mask) <= batch->fb.layer_count); uint32_t enabled_layer_count = view_mask ? util_bitcount(view_mask) : cmdbuf->state.gfx.render.layer_count; const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf)); struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf); uint64_t index_min_max_res_ptr = 0; uint32_t job_before_indirect_helper = copy_desc_job_id; if (draw->info.index.size) { index_min_max_res_ptr = panvk_cmd_alloc_dev_mem( cmdbuf, desc, sizeof(struct libpan_draw_helper_index_min_max_result), 8) .gpu; const struct panlib_draw_index_minmax_search_helper_args args = { .index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr, .cmd = draw->info.indirect.buffer_dev_addr, .min_ptr = index_min_max_res_ptr + offsetof(struct libpan_draw_helper_index_min_max_result, min), .max_ptr = index_min_max_res_ptr + offsetof(struct libpan_draw_helper_index_min_max_result, max), }; struct libpan_draw_helper_index_min_max_result val = { .min = ((uint64_t)1 << (draw->info.index.size * 8)) - 1, .max = 0, }; uint64_t *raw_val = (uint64_t *)&val; struct pan_ptr write_job = pan_pool_alloc_desc(&cmdbuf->desc_pool.base, WRITE_VALUE_JOB); pan_section_pack(write_job.cpu, WRITE_VALUE_JOB, PAYLOAD, payload) { payload.type = MALI_WRITE_VALUE_TYPE_IMMEDIATE_64; payload.address = index_min_max_res_ptr; payload.immediate_value = *raw_val; }; unsigned write_job_id = pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false, false, 0, copy_desc_job_id, &write_job, false); util_dynarray_append(&batch->jobs, write_job.cpu); uint32_t index_count = cmdbuf->state.gfx.ib.size / draw->info.index.size; uint32_t wg_count = DIV_ROUND_UP(index_count, 65536); assert(wg_count <= 65536); panlib_draw_index_minmax_search_helper_struct( &precomp_ctx, panlib_1d_with_jm_deps(wg_count, 0, write_job_id), PANLIB_BARRIER_NONE, args, util_logbase2(draw->info.index.size), ia->primitive_restart_enable); job_before_indirect_helper = batch->vtc_jc.job_index; } for (uint32_t i = 0; i < enabled_layer_count; i++) { /* Force a new push uniform block to be allocated */ gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS); result = panvk_draw_prepare_varyings(cmdbuf, draw); if (result != VK_SUCCESS) return; draw->info.layer_id = (view_mask != 0) ? u_bit_scan(&view_mask) : i; if (draw->info.layer_id > 0) { cmdbuf->state.gfx.sysvals.layer_id = draw->info.layer_id; gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); } result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, vs, 1); if (result != VK_SUCCESS) return; if (fs) { result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, fs, 1); if (result != VK_SUCCESS) return; } result = panvk_draw_prepare_tiler_context(cmdbuf, draw); if (result != VK_SUCCESS) return; if (vs->info.vs.idvs) { result = panvk_draw_prepare_idvs_job(cmdbuf, draw); if (result != VK_SUCCESS) return; } else { result = panvk_draw_prepare_vertex_job(cmdbuf, draw); if (result != VK_SUCCESS) return; bool needs_tiling = !cmdbuf->vk.dynamic_graphics_state.rs.rasterizer_discard_enable || cmdbuf->state.gfx.occlusion_query.mode != MALI_OCCLUSION_MODE_DISABLED; if (needs_tiling) { result = panvk_draw_prepare_tiler_job(cmdbuf, draw); if (result != VK_SUCCESS) return; } } assert(draw->info.indirect.buffer_dev_addr != 0 || draw->info.index.size); uint32_t attrib_bufs_valid = vi->bindings_valid; uint32_t attribs_valid = vi->attributes_valid; uint64_t first_vertex_sysval = 0x8ull << 60; uint64_t first_instance_sysval = 0x8ull << 60; uint64_t raw_vertex_offset_sysval = 0x8ull << 60; if (shader_uses_sysval(vs, graphics, vs.first_vertex)) { first_vertex_sysval = cmdbuf->state.gfx.vs.push_uniforms + shader_remapped_sysval_offset( vs, sysval_offset(graphics, vs.first_vertex)); } if (shader_uses_sysval(vs, graphics, vs.base_instance)) { first_instance_sysval = cmdbuf->state.gfx.vs.push_uniforms + shader_remapped_sysval_offset( vs, sysval_offset(graphics, vs.base_instance)); } if (shader_uses_sysval(vs, graphics, vs.raw_vertex_offset)) { raw_vertex_offset_sysval = cmdbuf->state.gfx.vs.push_uniforms + shader_remapped_sysval_offset( vs, sysval_offset(graphics, vs.raw_vertex_offset)); } enum panlib_barrier indirect_barrier = PANLIB_BARRIER_JM_SUPPRESS_PREFETCH; struct panlib_precomp_grid indirect_grid = panlib_1d_with_jm_deps(1, 0, job_before_indirect_helper); if (draw->info.indirect.buffer_dev_addr != 0 && draw->info.index.size) { const struct panlib_draw_indexed_indirect_helper_args args = { .cmd = draw->info.indirect.buffer_dev_addr, .index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr, .index_min_max_res = index_min_max_res_ptr, .index_size = draw->info.index.size, .primitive_vertex_count = primitive_vertex_count( translate_prim_topology(ia->primitive_topology)), .varying_bufs_descs = draw->varying_bufs, .varying_bufs_info = draw->indirect_info.varying_bufs, .attrib_bufs_descs = draw->vs.attribute_bufs, .attrib_bufs_infos = draw->indirect_info.attrib_bufs, .attrib_bufs_valid = attrib_bufs_valid, .attribs_valid = attribs_valid, .attribs_descs = draw->vs.attributes, .attribs_infos = draw->indirect_info.attribs, .first_vertex_sysval = first_vertex_sysval, .first_instance_sysval = first_instance_sysval, .raw_vertex_offset_sysval = raw_vertex_offset_sysval, .idvs_job = vs->info.vs.idvs ? draw->jobs.idvs.gpu : 0, .vertex_job = draw->jobs.vertex.gpu, .tiler_job = draw->jobs.tiler.gpu, }; panlib_draw_indexed_indirect_helper_struct(&precomp_ctx, indirect_grid, indirect_barrier, args); } else if (draw->info.indirect.buffer_dev_addr != 0) { const struct panlib_draw_indirect_helper_args args = { .cmd = draw->info.indirect.buffer_dev_addr, .primitive_vertex_count = primitive_vertex_count( translate_prim_topology(ia->primitive_topology)), .varying_bufs_descs = draw->varying_bufs, .varying_bufs_info = draw->indirect_info.varying_bufs, .attrib_bufs_descs = draw->vs.attribute_bufs, .attrib_bufs_infos = draw->indirect_info.attrib_bufs, .attrib_bufs_valid = attrib_bufs_valid, .attribs_valid = attribs_valid, .attribs_descs = draw->vs.attributes, .attribs_infos = draw->indirect_info.attribs, .first_vertex_sysval = first_vertex_sysval, .first_instance_sysval = first_instance_sysval, .raw_vertex_offset_sysval = raw_vertex_offset_sysval, .idvs_job = vs->info.vs.idvs ? draw->jobs.idvs.gpu : 0, .vertex_job = draw->jobs.vertex.gpu, .tiler_job = draw->jobs.tiler.gpu, }; panlib_draw_indirect_helper_struct(&precomp_ctx, indirect_grid, indirect_barrier, args); } else { assert(false && "Invalid indirect draw"); } /* Grab the index of the indirect helper job */ uint32_t prev_job = batch->vtc_jc.job_index; if (vs->info.vs.idvs) { pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, prev_job, &draw->jobs.idvs, false); } else { unsigned vjob_id = pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_VERTEX, false, true, 0, prev_job, &draw->jobs.vertex, false); if (draw->jobs.tiler.gpu != 0) { pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_TILER, false, false, vjob_id, 0, &draw->jobs.tiler, false); } } } /* * We split every ~1024 indirect draw. * This is here for multiple reasons: * - The indirect varying buffer offset need to be reset at some point to * avoid going outside of bounds. * - It is possible to always end up with timeouts for batches with 4k draws * (see "dEQP-VK.api.command_buffers.many_indirect_draws_on_secondary") At * the same time, because of how TLS works on Mali, we should not split too * much as this will cause the TLS budget to go crazy. */ if (batch->vtc_jc.job_index > (5 * 1024)) { bool preload_fb = cmdbuf->cur_batch && cmdbuf->cur_batch->vtc_jc.first_tiler; panvk_per_arch(cmd_close_batch)(cmdbuf); if (preload_fb) panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf); batch = panvk_per_arch(cmd_open_batch)(cmdbuf); cmdbuf->state.gfx.vs.indirect_varying_bufs_infos = 0; } clear_dirty_after_draw(cmdbuf); cmdbuf->state.gfx.vs.previous_draw_was_indirect = true; } static unsigned padded_vertex_count(struct panvk_cmd_buffer *cmdbuf, uint32_t vertex_count, uint32_t instance_count) { if (instance_count == 1) return vertex_count; const struct panvk_shader_variant *vs = panvk_shader_hw_variant(cmdbuf->state.gfx.vs.shader); bool idvs = vs->info.vs.idvs; /* Index-Driven Vertex Shading requires different instances to * have different cache lines for position results. Each vertex * position is 16 bytes and the Mali cache line is 64 bytes, so * the instance count must be aligned to 4 vertices. */ if (idvs) vertex_count = ALIGN_POT(vertex_count, 4); return pan_padded_vertex_count(vertex_count); } VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); if (instanceCount == 0 || vertexCount == 0) return; /* gl_BaseVertexARB is a signed integer, and it should expose the value of * firstVertex in a non-indexed draw. */ assert(firstVertex < INT32_MAX); /* gl_BaseInstance is a signed integer, and it should expose the value of * firstInstnace. */ assert(firstInstance < INT32_MAX); struct panvk_draw_data draw = { .info = { .vertex.base = firstVertex, .vertex.raw_offset = firstVertex, .vertex.count = vertexCount, .instance.base = firstInstance, .instance.count = instanceCount, }, .vertex_range = vertexCount, .padded_vertex_count = padded_vertex_count(cmdbuf, vertexCount, instanceCount), }; panvk_cmd_draw(cmdbuf, &draw); } VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); if (instanceCount == 0 || indexCount == 0) return; /* gl_BaseInstance is a signed integer, and it should expose the value of * firstInstnace. */ assert(firstInstance < INT32_MAX); struct pan_ptr indirect_index_alloc = panvk_cmd_alloc_dev_mem( cmdbuf, desc, sizeof(struct VkDrawIndexedIndirectCommand), 8); struct VkDrawIndexedIndirectCommand *indirect_index_alloc_ptr = indirect_index_alloc.cpu; *indirect_index_alloc_ptr = (struct VkDrawIndexedIndirectCommand){ .indexCount = indexCount, .instanceCount = instanceCount, .firstIndex = firstIndex, .vertexOffset = vertexOffset, .firstInstance = firstInstance, }; struct panvk_draw_data draw = { .info = { .index.size = cmdbuf->state.gfx.ib.index_size, .indirect.buffer_dev_addr = indirect_index_alloc.gpu, .indirect.draw_count = 1, .indirect.stride = 0, }, }; panvk_cmd_draw_indirect(cmdbuf, &draw); } VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); VK_FROM_HANDLE(panvk_buffer, buffer, _buffer); if (drawCount == 0) return; /* We cannot support arbitrary draw count on JM */ assert(drawCount == 1); struct panvk_draw_data draw = { .info = { .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset), .indirect.draw_count = drawCount, .indirect.stride = stride, }, }; panvk_cmd_draw_indirect(cmdbuf, &draw); } VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); VK_FROM_HANDLE(panvk_buffer, buffer, _buffer); if (drawCount == 0) return; /* We cannot support arbitrary draw count on JM */ assert(drawCount == 1); struct panvk_draw_data draw = { .info = { .index.size = cmdbuf->state.gfx.ib.index_size, .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset), .indirect.draw_count = drawCount, .indirect.stride = stride, }, }; panvk_cmd_draw_indirect(cmdbuf, &draw); } VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx; bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT; /* When resuming from a suspended pass, the state should be unchanged. */ if (resuming && cmdbuf->cur_batch) { state->render.flags = pRenderingInfo->flags; } else { /* If we're not resuming, cur_batch should be NULL. However, this * currently isn't true because of how events are implemented. * * XXX: Rewrite events to not close and open batch and add an assert here. */ if (cmdbuf->cur_batch) panvk_per_arch(cmd_close_batch)(cmdbuf); panvk_per_arch(cmd_init_render_state)(cmdbuf, pRenderingInfo); if (resuming) panvk_per_arch(cmd_preload_fb_after_batch_split)(cmdbuf); } if (!cmdbuf->cur_batch) panvk_per_arch(cmd_open_batch)(cmdbuf); if (!resuming) panvk_per_arch(cmd_preload_render_area_border)(cmdbuf, pRenderingInfo); } VKAPI_ATTR void VKAPI_CALL panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); if (!(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) { struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s; for (unsigned i = 0; i < fbinfo->rt_count; i++) clear |= fbinfo->rts[i].clear; if (clear) panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf); panvk_per_arch(cmd_close_batch)(cmdbuf); cmdbuf->cur_batch = NULL; panvk_per_arch(cmd_meta_resolve_attachments)(cmdbuf); } }