/*
 * Copyright © 2024 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "genxml/gen_macros.h"

#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_meta.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"

#include <vulkan/vulkan_core.h>

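/*
 * Prepare the driver-internal descriptor set for the compute shader: a dummy
 * sampler in slot 0 followed by the shader's dynamic buffer descriptors. The
 * set is cached in the command-buffer state and reused while it remains valid.
 */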
static VkResult
prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;

   if (cs_desc_state->driver_set.dev_addr)
      return VK_SUCCESS;

   const struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   const struct panvk_shader *cs = cmdbuf->state.compute.shader;
   uint32_t desc_count = cs->desc_info.dyn_bufs.count + 1;
   struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
   struct panvk_opaque_desc *descs = driver_set.cpu;

   if (!driver_set.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Dummy sampler always comes first. */
   pan_pack(&descs[0], SAMPLER, _) {
   }

   panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, cs,
                                     (struct mali_buffer_packed *)(&descs[1]));

   cs_desc_state->driver_set.dev_addr = driver_set.gpu;
   cs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
   return VK_SUCCESS;
}

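/*
 * Upload the compute sysvals as the push uniform buffer consumed by the
 * dispatch.
 */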
static VkResult
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
{
   cmdbuf->state.compute.push_uniforms = panvk_per_arch(
      cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.compute.sysvals,
                                 sizeof(cmdbuf->state.compute.sysvals));
   return cmdbuf->state.compute.push_uniforms ? VK_SUCCESS
                                              : VK_ERROR_OUT_OF_DEVICE_MEMORY;
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
                                uint32_t baseGroupX, uint32_t baseGroupY,
                                uint32_t baseGroupZ, uint32_t groupCountX,
                                uint32_t groupCountY, uint32_t groupCountZ)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   const struct panvk_shader *shader = cmdbuf->state.compute.shader;
   VkResult result;

   /* If there's no compute shader, we can skip the dispatch. */
   if (!panvk_priv_mem_dev_addr(shader->spd))
      return;

   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(cmdbuf->vk.base.device->physical);
   struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;

   struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tsd.gpu)
      return;

   struct pan_tls_info tlsinfo = {
      .tls.size = shader->info.tls_size,
      .wls.size = shader->info.wls_size,
   };
   unsigned core_id_range;

   panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

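   /* If the shader uses workgroup local storage (WLS), allocate one chunk per
    * WLS instance, replicated for every core reported by the kernel.
    */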
   if (tlsinfo.wls.size) {
      /* TODO: Clamp WLS instance to some maximum WLS budget. */
      struct pan_compute_dim dim = {groupCountX, groupCountY, groupCountZ};

      tlsinfo.wls.instances = pan_wls_instances(&dim);

      unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
                                tlsinfo.wls.instances * core_id_range;

      tlsinfo.wls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
      if (!tlsinfo.wls.ptr)
         return;
   }

   cmdbuf->state.tls.info.tls.size =
      MAX2(shader->info.tls_size, cmdbuf->state.tls.info.tls.size);

   if (!cmdbuf->state.tls.desc.gpu) {
      cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
      if (!cmdbuf->state.tls.desc.gpu)
         return;
   }

   GENX(pan_emit_tls)(&tlsinfo, tsd.cpu);

   result = panvk_per_arch(cmd_prepare_push_descs)(
      cmdbuf, desc_state, shader->desc_info.used_set_mask);
   if (result != VK_SUCCESS)
      return;

   struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals;
   sysvals->num_work_groups.x = groupCountX;
   sysvals->num_work_groups.y = groupCountY;
   sysvals->num_work_groups.z = groupCountZ;
   sysvals->local_group_size.x = shader->local_size.x;
   sysvals->local_group_size.y = shader->local_size.y;
   sysvals->local_group_size.z = shader->local_size.z;

   result = prepare_driver_set(cmdbuf);
   if (result != VK_SUCCESS)
      return;

   cmdbuf->state.compute.push_uniforms = 0;
   result = prepare_push_uniforms(cmdbuf);
   if (result != VK_SUCCESS)
      return;

   result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
                                                         shader, cs_desc_state);
   if (result != VK_SUCCESS)
      return;

   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
   unsigned task_axis = MALI_TASK_AXIS_X;
   unsigned task_increment = 0;

   /* Copy the global TLS pointer to the per-job TSD. */
   cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu);
   cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
   cs_wait_slot(b, SB_ID(LS), false);
   cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
   cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
   cs_wait_slot(b, SB_ID(LS), false);

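   /* Set up the compute state registers for the dispatch: resource table,
    * FAU pointer, shader program descriptor and local-storage descriptor,
    * followed by the workgroup size, base offsets and group counts.
    */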
   cs_update_compute_ctx(b) {
      cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);
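      /* The FAU buffer covers the push constants (assumed here to be the 256
       * bytes reserved for the API push constant range) plus the compute
       * sysvals; the number of 64-bit FAU words is packed into the upper bits
       * of the FAU pointer register.
       */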
      uint32_t push_size = 256 + sizeof(struct panvk_compute_sysvals);
      uint64_t fau_count = DIV_ROUND_UP(push_size, 8);
      mali_ptr fau_ptr =
         cmdbuf->state.compute.push_uniforms | (fau_count << 56);
      cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
      cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd));
      cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu);

      /* Global attribute offset */
      cs_move32_to(b, cs_sr_reg32(b, 32), 0);

      struct mali_compute_size_workgroup_packed wg_size;
      pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
         cfg.workgroup_size_x = shader->local_size.x;
         cfg.workgroup_size_y = shader->local_size.y;
         cfg.workgroup_size_z = shader->local_size.z;
         cfg.allow_merging_workgroups = false;
      }
      cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
      cs_move32_to(b, cs_sr_reg32(b, 34), baseGroupX * shader->local_size.x);
      cs_move32_to(b, cs_sr_reg32(b, 35), baseGroupY * shader->local_size.y);
      cs_move32_to(b, cs_sr_reg32(b, 36), baseGroupZ * shader->local_size.z);
      cs_move32_to(b, cs_sr_reg32(b, 37), groupCountX);
      cs_move32_to(b, cs_sr_reg32(b, 38), groupCountY);
      cs_move32_to(b, cs_sr_reg32(b, 39), groupCountZ);

      /* Pick the task_axis and task_increment to maximize thread utilization. */
      unsigned threads_per_wg =
         shader->local_size.x * shader->local_size.y * shader->local_size.z;
      unsigned max_thread_cnt = panfrost_compute_max_thread_count(
         &phys_dev->kmod.props, shader->info.work_reg_count);
      unsigned threads_per_task = threads_per_wg;
      unsigned local_size[3] = {
         shader->local_size.x,
         shader->local_size.y,
         shader->local_size.z,
      };

      for (unsigned i = 0; i < 3; i++) {
         if (threads_per_task * local_size[i] >= max_thread_cnt) {
            /* We reached our thread limit, stop at the current axis and
             * calculate the increment so it doesn't exceed the per-core
             * thread capacity.
             */
            task_increment = max_thread_cnt / threads_per_task;
            break;
         } else if (task_axis == MALI_TASK_AXIS_Z) {
            /* We reached the Z axis, and there's still room to stuff more
             * threads. Pick the current axis grid size as our increment
             * as there's no point using something bigger.
             */
            task_increment = local_size[i];
            break;
         }

         threads_per_task *= local_size[i];
         task_axis++;
      }
   }

   assert(task_axis <= MALI_TASK_AXIS_Z);
   assert(task_increment > 0);

   panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);

   cs_req_res(b, CS_COMPUTE_RES);
   cs_run_compute(b, task_increment, task_axis, false,
                  cs_shader_res_sel(0, 0, 0, 0));
   cs_req_res(b, 0);

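   /* Queue a deferred increment of the compute subqueue syncobj: the add only
    * lands once the scoreboard slot tracking this iteration has signaled, and
    * the cs_match() below advances the iteration scoreboard index for the
    * next job.
    */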
   struct cs_index sync_addr = cs_scratch_reg64(b, 0);
   struct cs_index iter_sb = cs_scratch_reg32(b, 2);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
   struct cs_index add_val = cs_scratch_reg64(b, 4);

   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
              BITFIELD_MASK(3),
              offsetof(struct panvk_cs_subqueue_context, syncobjs));
   cs_wait_slot(b, SB_ID(LS), false);

   cs_add64(b, sync_addr, sync_addr,
            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
   cs_move64_to(b, add_val, 1);

   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
      cs_case(b, x) {                                                          \
         cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG,                        \
                       add_val, sync_addr,                                     \
                       cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));       \
         cs_move32_to(b, iter_sb, next_iter_sb(x));                            \
      }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }

   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

   ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
                                    VkBuffer _buffer, VkDeviceSize offset)
{
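   /* Indirect dispatch is not implemented yet. */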
   panvk_stub();
}
286