/*
 * Copyright © 2024 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "genxml/gen_macros.h"

#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_meta.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"

#include <vulkan/vulkan_core.h>

static VkResult
prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;

   if (cs_desc_state->driver_set.dev_addr)
      return VK_SUCCESS;

   const struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   const struct panvk_shader *cs = cmdbuf->state.compute.shader;
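   /* One descriptor per dynamic buffer, plus one slot for the dummy sampler. */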
   uint32_t desc_count = cs->desc_info.dyn_bufs.count + 1;
   struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
   struct panvk_opaque_desc *descs = driver_set.cpu;

   if (!driver_set.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Dummy sampler always comes first. */
   pan_pack(&descs[0], SAMPLER, _) {
   }

   panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, cs,
                                     (struct mali_buffer_packed *)(&descs[1]));

   cs_desc_state->driver_set.dev_addr = driver_set.gpu;
   cs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
   return VK_SUCCESS;
}

static VkResult
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
{
   cmdbuf->state.compute.push_uniforms = panvk_per_arch(
      cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.compute.sysvals,
                                 sizeof(cmdbuf->state.compute.sysvals));
   return cmdbuf->state.compute.push_uniforms ? VK_SUCCESS
                                              : VK_ERROR_OUT_OF_DEVICE_MEMORY;
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
                                uint32_t baseGroupX, uint32_t baseGroupY,
                                uint32_t baseGroupZ, uint32_t groupCountX,
                                uint32_t groupCountY, uint32_t groupCountZ)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   const struct panvk_shader *shader = cmdbuf->state.compute.shader;
   VkResult result;

   /* If there's no compute shader, we can skip the dispatch. */
   if (!panvk_priv_mem_dev_addr(shader->spd))
      return;

   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(cmdbuf->vk.base.device->physical);
   struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;

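   /* Allocate the thread storage descriptor (TSD) for this dispatch. */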
   struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tsd.gpu)
      return;

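   /* TLS covers per-thread scratch (register spilling), WLS covers the
    * workgroup-local (shared) memory.
    */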
   struct pan_tls_info tlsinfo = {
      .tls.size = shader->info.tls_size,
      .wls.size = shader->info.wls_size,
   };
   unsigned core_id_range;

   panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

   if (tlsinfo.wls.size) {
      /* TODO: Clamp WLS instance to some maximum WLS budget. */
      struct pan_compute_dim dim = {groupCountX, groupCountY, groupCountZ};

      tlsinfo.wls.instances = pan_wls_instances(&dim);

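      /* WLS is allocated once per instance on every core. */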
      unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
                                tlsinfo.wls.instances * core_id_range;

      tlsinfo.wls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
      if (!tlsinfo.wls.ptr)
         return;
   }

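   /* Track the biggest TLS size seen so far; the command buffer's shared TLS
    * descriptor is emitted with that size.
    */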
   cmdbuf->state.tls.info.tls.size =
      MAX2(shader->info.tls_size, cmdbuf->state.tls.info.tls.size);

   if (!cmdbuf->state.tls.desc.gpu) {
      cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
      if (!cmdbuf->state.tls.desc.gpu)
         return;
   }

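   /* Emit the per-dispatch local storage descriptor (TLS + WLS). */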
   GENX(pan_emit_tls)(&tlsinfo, tsd.cpu);

   result = panvk_per_arch(cmd_prepare_push_descs)(
      cmdbuf, desc_state, shader->desc_info.used_set_mask);
   if (result != VK_SUCCESS)
      return;

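   /* Fill the compute sysvals that end up in the push uniforms (FAUs). */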
   struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals;
   sysvals->num_work_groups.x = groupCountX;
   sysvals->num_work_groups.y = groupCountY;
   sysvals->num_work_groups.z = groupCountZ;
   sysvals->local_group_size.x = shader->local_size.x;
   sysvals->local_group_size.y = shader->local_size.y;
   sysvals->local_group_size.z = shader->local_size.z;

   result = prepare_driver_set(cmdbuf);
   if (result != VK_SUCCESS)
      return;

   cmdbuf->state.compute.push_uniforms = 0;
   result = prepare_push_uniforms(cmdbuf);
   if (result != VK_SUCCESS)
      return;

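   /* Emit the shader resource table from the descriptor sets bound for this
    * dispatch.
    */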
   result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
                                                         shader, cs_desc_state);
   if (result != VK_SUCCESS)
      return;

   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
   unsigned task_axis = MALI_TASK_AXIS_X;
   unsigned task_increment = 0;

   /* Copy the global TLS pointer to the per-job TSD. */
   cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu);
   cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
   cs_wait_slot(b, SB_ID(LS), false);
   cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
   cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
   cs_wait_slot(b, SB_ID(LS), false);

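   /* Fill the compute shader registers: resource table, FAU pointer, shader
    * program descriptor, TSD and the dispatch parameters.
    */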
   cs_update_compute_ctx(b) {
      cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);
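      /* The FAU region covers the push constants followed by the compute
       * sysvals; the FAU count, in 64-bit words, is packed in the top byte
       * of the pointer.
       */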
      uint32_t push_size = 256 + sizeof(struct panvk_compute_sysvals);
      uint64_t fau_count = DIV_ROUND_UP(push_size, 8);
      mali_ptr fau_ptr =
         cmdbuf->state.compute.push_uniforms | (fau_count << 56);
      cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
      cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd));
      cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu);

      /* Global attribute offset */
      cs_move32_to(b, cs_sr_reg32(b, 32), 0);

      struct mali_compute_size_workgroup_packed wg_size;
      pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
         cfg.workgroup_size_x = shader->local_size.x;
         cfg.workgroup_size_y = shader->local_size.y;
         cfg.workgroup_size_z = shader->local_size.z;
         cfg.allow_merging_workgroups = false;
      }
      cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
      cs_move32_to(b, cs_sr_reg32(b, 34), baseGroupX * shader->local_size.x);
      cs_move32_to(b, cs_sr_reg32(b, 35), baseGroupY * shader->local_size.y);
      cs_move32_to(b, cs_sr_reg32(b, 36), baseGroupZ * shader->local_size.z);
      cs_move32_to(b, cs_sr_reg32(b, 37), groupCountX);
      cs_move32_to(b, cs_sr_reg32(b, 38), groupCountY);
      cs_move32_to(b, cs_sr_reg32(b, 39), groupCountZ);

      /* Pick the task_axis and task_increment to maximize thread utilization. */
      unsigned threads_per_wg =
         shader->local_size.x * shader->local_size.y * shader->local_size.z;
      unsigned max_thread_cnt = panfrost_compute_max_thread_count(
         &phys_dev->kmod.props, shader->info.work_reg_count);
      unsigned threads_per_task = threads_per_wg;
      unsigned local_size[3] = {
         shader->local_size.x,
         shader->local_size.y,
         shader->local_size.z,
      };

      for (unsigned i = 0; i < 3; i++) {
         if (threads_per_task * local_size[i] >= max_thread_cnt) {
            /* We reached the thread limit: stop at the current axis and
             * calculate the increment so it doesn't exceed the per-core
             * thread capacity.
             */
            task_increment = max_thread_cnt / threads_per_task;
            break;
         } else if (task_axis == MALI_TASK_AXIS_Z) {
            /* We reached the Z axis, and there's still room to stuff more
             * threads. Pick the current axis grid size as our increment
             * as there's no point using something bigger.
             */
            task_increment = local_size[i];
            break;
         }

         threads_per_task *= local_size[i];
         task_axis++;
      }
   }

   assert(task_axis <= MALI_TASK_AXIS_Z);
   assert(task_increment > 0);

   panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);

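   /* Request the compute resources, launch the job, then release the
    * resources.
    */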
   cs_req_res(b, CS_COMPUTE_RES);
   cs_run_compute(b, task_increment, task_axis, false,
                  cs_shader_res_sel(0, 0, 0, 0));
   cs_req_res(b, 0);

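   /* Bump the compute subqueue syncobj once the dispatch is done; the add is
    * deferred until the iteration scoreboard of the current slot signals.
    */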
   struct cs_index sync_addr = cs_scratch_reg64(b, 0);
   struct cs_index iter_sb = cs_scratch_reg32(b, 2);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
   struct cs_index add_val = cs_scratch_reg64(b, 4);

   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
              BITFIELD_MASK(3),
              offsetof(struct panvk_cs_subqueue_context, syncobjs));
   cs_wait_slot(b, SB_ID(LS), false);

   cs_add64(b, sync_addr, sync_addr,
            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
   cs_move64_to(b, add_val, 1);

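   /* Pick the deferred-sync scoreboard matching the current iterator slot,
    * then advance the iterator to the next slot.
    */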
   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
      cs_case(b, x) {                                                          \
         cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG,                        \
                       add_val, sync_addr,                                     \
                       cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));       \
         cs_move32_to(b, iter_sb, next_iter_sb(x));                            \
      }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }

   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

   ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
                                    VkBuffer _buffer, VkDeviceSize offset)
{
   panvk_stub();
}