/*
 * Copyright © 2024 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "genxml/gen_macros.h"

#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_jc.h"
#include "pan_props.h"

#include <vulkan/vulkan_core.h>

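/* Everything needed to emit a compute job: the workgroup grid dimensions,
 * the thread storage descriptor (TSD) address and the address of the push
 * uniform buffer.
 */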
struct panvk_dispatch_info {
   struct pan_compute_dim wg_count;
   mali_ptr tsd;
   mali_ptr push_uniforms;
};

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
                                uint32_t baseGroupX, uint32_t baseGroupY,
                                uint32_t baseGroupZ, uint32_t groupCountX,
                                uint32_t groupCountY, uint32_t groupCountZ)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   const struct panvk_shader *shader = cmdbuf->state.compute.shader;
   VkResult result;

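   /* A dispatch with an empty workgroup grid does no work. */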
   if (groupCountX == 0 || groupCountY == 0 || groupCountZ == 0)
      return;

   /* If there's no compute shader, we can skip the dispatch. */
   if (!panvk_priv_mem_dev_addr(shader->rsd))
      return;

   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   struct panvk_dispatch_info dispatch = {
      .wg_count = {groupCountX, groupCountY, groupCountZ},
   };

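   /* The dispatch gets its own batch: flush whatever is pending and open a
    * fresh one.
    */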
   panvk_per_arch(cmd_close_batch)(cmdbuf);
   struct panvk_batch *batch = panvk_per_arch(cmd_open_batch)(cmdbuf);

   struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;

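   /* Allocate the batch TLS descriptor and use it as the thread storage
    * descriptor for this dispatch.
    */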
   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);
   dispatch.tsd = batch->tls.gpu;

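   /* Upload push descriptors for the descriptor sets the shader actually
    * uses.
    */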
   result = panvk_per_arch(cmd_prepare_push_descs)(
      cmdbuf, desc_state, shader->desc_info.used_set_mask);
   if (result != VK_SUCCESS)
      return;

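   /* Fill the compute sysvals read by the shader: workgroup base offset,
    * workgroup count and workgroup-local size.
    */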
   struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals;
   sysvals->base.x = baseGroupX;
   sysvals->base.y = baseGroupY;
   sysvals->base.z = baseGroupZ;
   sysvals->num_work_groups.x = groupCountX;
   sysvals->num_work_groups.y = groupCountY;
   sysvals->num_work_groups.z = groupCountZ;
   sysvals->local_group_size.x = shader->local_size.x;
   sysvals->local_group_size.y = shader->local_size.y;
   sysvals->local_group_size.z = shader->local_size.z;

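   /* Upload the dynamic SSBO descriptors and expose their address through
    * the sysvals.
    */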
   result = panvk_per_arch(cmd_prepare_dyn_ssbos)(cmdbuf, desc_state, shader,
                                                  cs_desc_state);
   if (result != VK_SUCCESS)
      return;

   sysvals->desc.dyn_ssbos = cs_desc_state->dyn_ssbos;

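   /* Record the device address of each descriptor set used by the shader. */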
   for (uint32_t i = 0; i < MAX_SETS; i++) {
      if (shader->desc_info.used_set_mask & BITFIELD_BIT(i))
         sysvals->desc.sets[i] = desc_state->sets[i]->descs.dev;
   }

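   /* The sysvals just changed, so force a fresh push uniform buffer to be
    * emitted.
    */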
   cmdbuf->state.compute.push_uniforms = 0;

   if (!cmdbuf->state.compute.push_uniforms) {
      cmdbuf->state.compute.push_uniforms = panvk_per_arch(
         cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.compute.sysvals,
                                    sizeof(cmdbuf->state.compute.sysvals));
      if (!cmdbuf->state.compute.push_uniforms)
         return;
   }

   dispatch.push_uniforms = cmdbuf->state.compute.push_uniforms;

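   /* Emit the shader resource tables (UBOs, textures, samplers, images). */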
   result = panvk_per_arch(cmd_prepare_shader_desc_tables)(
      cmdbuf, desc_state, shader, cs_desc_state);
   if (result != VK_SUCCESS)
      return;

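   /* Some descriptors need to be copied into the shader tables by a separate
    * compute job; grab that job now if the shader needs one.
    */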
   struct panfrost_ptr copy_desc_job;
   result = panvk_per_arch(meta_get_copy_desc_job)(
      cmdbuf, shader, &cmdbuf->state.compute.desc_state, cs_desc_state, 0,
      &copy_desc_job);
   if (result != VK_SUCCESS)
      return;

   if (copy_desc_job.cpu)
      util_dynarray_append(&batch->jobs, void *, copy_desc_job.cpu);

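   /* Allocate the COMPUTE_JOB descriptor and track it in the batch job
    * list.
    */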
   struct panfrost_ptr job = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB);
   if (!job.gpu)
      return;

   util_dynarray_append(&batch->jobs, void *, job.cpu);

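   /* Encode the workgroup count and local size in the INVOCATION section. */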
   panfrost_pack_work_groups_compute(
      pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), dispatch.wg_count.x,
      dispatch.wg_count.y, dispatch.wg_count.z, shader->local_size.x,
      shader->local_size.y, shader->local_size.z, false, false);

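   /* job_task_split is derived from the workgroup size: the sum of
    * ceil(log2(dim + 1)) over the three local-size dimensions.
    */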
   pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(shader->local_size.x + 1) +
                           util_logbase2_ceil(shader->local_size.y + 1) +
                           util_logbase2_ceil(shader->local_size.z + 1);
   }

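   /* The DRAW section binds the renderer state descriptor, the resource
    * tables, the thread storage and the push uniforms.
    */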
   pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = panvk_priv_mem_dev_addr(shader->rsd);
      cfg.attributes = cs_desc_state->img_attrib_table;
      cfg.attribute_buffers =
         cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_IMG];
      cfg.thread_storage = dispatch.tsd;
      cfg.uniform_buffers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_UBO];
      cfg.push_uniforms = dispatch.push_uniforms;
      cfg.textures = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_TEXTURE];
      cfg.samplers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER];
   }

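   /* Chain the jobs: if a copy-descriptor job was emitted, add it first and
    * make the compute job depend on it.
    */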
   unsigned copy_desc_dep =
      copy_desc_job.gpu
         ? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false,
                          0, 0, &copy_desc_job, false)
         : 0;

   pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0,
                  copy_desc_dep, &job, false);

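   /* Size the thread-local and workgroup-local storage. WLS is allocated per
    * instance and per core, so the total scales with the instance count and
    * the core ID range.
    */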
   batch->tlsinfo.tls.size = shader->info.tls_size;
   batch->tlsinfo.wls.size = shader->info.wls_size;
   if (batch->tlsinfo.wls.size) {
      unsigned core_id_range;

      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
      batch->tlsinfo.wls.instances = pan_wls_instances(&dispatch.wg_count);
      batch->wls_total_size = pan_wls_adjust_size(batch->tlsinfo.wls.size) *
                              batch->tlsinfo.wls.instances * core_id_range;
   }

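   /* The dispatch lives in its own batch; close it now. */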
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
                                    VkBuffer _buffer, VkDeviceSize offset)
{
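   /* Indirect dispatch is not implemented yet. */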
   panvk_stub();
}