/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_cmd_buffer.h"

#include "agx_bo.h"
#include "agx_device.h"
#include "agx_linker.h"
#include "agx_tilebuffer.h"
#include "agx_usc.h"
#include "hk_buffer.h"
#include "hk_cmd_pool.h"
#include "hk_descriptor_set.h"
#include "hk_descriptor_set_layout.h"
#include "hk_device.h"
#include "hk_device_memory.h"
#include "hk_entrypoints.h"
#include "hk_image_view.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "pool.h"
#include "shader_enums.h"
#include "vk_pipeline_layout.h"
#include "vk_synchronization.h"

#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "vulkan/vulkan_core.h"

static void
hk_descriptor_state_fini(struct hk_cmd_buffer *cmd,
                         struct hk_descriptor_state *desc)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   for (unsigned i = 0; i < HK_MAX_SETS; i++) {
      vk_free(&pool->vk.alloc, desc->push[i]);
      desc->push[i] = NULL;
   }
}

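/* Release everything the command buffer owns that must be reclaimed on reset
 * or destroy: descriptor state, uploader BOs, recorded control streams, and
 * any large BOs referenced by pool allocations.
 */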
static void
hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);
   struct hk_device *dev = hk_cmd_pool_device(pool);

   hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors);
   hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors);

   hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos);
   hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos);

   list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) {
      list_del(&it->node);
      hk_cs_destroy(it);
   }

   util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) {
      agx_bo_unreference(&dev->dev, *bo);
   }

   util_dynarray_clear(&cmd->large_bos);
}

static void
hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   hk_free_resettable_cmd_buffer(cmd);
   vk_command_buffer_finish(&cmd->vk);
   vk_free(&pool->vk.alloc, cmd);
}

static VkResult
hk_create_cmd_buffer(struct vk_command_pool *vk_pool,
                     VkCommandBufferLevel level,
                     struct vk_command_buffer **cmd_buffer_out)
{
   struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk);
   struct hk_device *dev = hk_cmd_pool_device(pool);
   struct hk_cmd_buffer *cmd;
   VkResult result;

   cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   result =
      vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->vk.alloc, cmd);
      return result;
   }

   util_dynarray_init(&cmd->large_bos, NULL);

   cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi;
   cmd->vk.dynamic_graphics_state.ms.sample_locations =
      &cmd->state.gfx._dynamic_sl;

   list_inithead(&cmd->uploader.main.bos);
   list_inithead(&cmd->uploader.usc.bos);
   list_inithead(&cmd->control_streams);

   *cmd_buffer_out = &cmd->vk;

   return VK_SUCCESS;
}

static void
hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
                    UNUSED VkCommandBufferResetFlags flags)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);

   vk_command_buffer_reset(&cmd->vk);
   hk_free_resettable_cmd_buffer(cmd);

   cmd->uploader.main.map = NULL;
   cmd->uploader.main.base = 0;
   cmd->uploader.main.offset = 0;
   cmd->uploader.usc.map = NULL;
   cmd->uploader.usc.base = 0;
   cmd->uploader.usc.offset = 0;

   cmd->current_cs.gfx = NULL;
   cmd->current_cs.cs = NULL;
   cmd->current_cs.post_gfx = NULL;
   cmd->current_cs.pre_gfx = NULL;

   /* TODO: clear pool! */

   memset(&cmd->state, 0, sizeof(cmd->state));
}

const struct vk_command_buffer_ops hk_cmd_buffer_ops = {
   .create = hk_create_cmd_buffer,
   .reset = hk_reset_cmd_buffer,
   .destroy = hk_destroy_cmd_buffer,
};

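/* Allocate a fresh BO from the command pool and track it on the command
 * buffer's main or USC uploader list, so it is returned to the pool when the
 * command buffer is reset or destroyed.
 */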
static VkResult
hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc,
                       struct hk_cmd_bo **bo_out)
{
   VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out);
   if (result != VK_SUCCESS)
      return result;

   if (usc)
      list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos);
   else
      list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos);

   return VK_SUCCESS;
}

struct agx_ptr
hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size,
                       uint32_t alignment, bool usc)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_uploader *uploader =
      usc ? &cmd->uploader.usc : &cmd->uploader.main;

   /* Specially handle large allocations owned by the command buffer, e.g. used
    * for statically allocated vertex output buffers with geometry shaders.
    */
   if (size > HK_CMD_BO_SIZE) {
      uint32_t flags = usc ? AGX_BO_LOW_VA : 0;
      struct agx_bo *bo =
         agx_bo_create(&dev->dev, size, flags, 0, "Large pool allocation");

      util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo);
      return (struct agx_ptr){
         .gpu = bo->va->addr,
         .cpu = bo->map,
      };
   }

   assert(size <= HK_CMD_BO_SIZE);
   assert(alignment > 0);

   uint32_t offset = align(uploader->offset, alignment);

   assert(offset <= HK_CMD_BO_SIZE);
   if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) {
      uploader->offset = offset + size;

      return (struct agx_ptr){
         .gpu = uploader->base + offset,
         .cpu = uploader->map + offset,
      };
   }

   struct hk_cmd_bo *bo;
   VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return (struct agx_ptr){0};
   }

   /* Pick whichever of the current upload BO and the new BO will have more
    * room left to be the BO for the next upload.  If our upload size is
    * bigger than the old offset, we're better off burning the whole new
    * upload BO on this one allocation and continuing on the current upload
    * BO.
    */
   if (uploader->map == NULL || size < uploader->offset) {
      uploader->map = bo->bo->map;
      uploader->base = bo->bo->va->addr;
      uploader->offset = size;
   }

   return (struct agx_ptr){
      .gpu = bo->bo->va->addr,
      .cpu = bo->map,
   };
}

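/* Copy data into the command buffer's upload pool and return its GPU address,
 * or 0 if the underlying pool allocation failed.
 */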
uint64_t
hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size,
               uint32_t alignment)
{
   struct agx_ptr T = hk_pool_alloc(cmd, size, alignment);
   if (unlikely(T.cpu == NULL))
      return 0;

   memcpy(T.cpu, data, size);
   return T.gpu;
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                      const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   hk_reset_cmd_buffer(&cmd->vk, 0);

   hk_cmd_buffer_begin_compute(cmd, pBeginInfo);
   hk_cmd_buffer_begin_graphics(cmd, pBeginInfo);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL &&
          "must end rendering before ending the command buffer");

   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);

   return vk_command_buffer_get_record_result(&cmd->vk);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
                       const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   /* The big hammer. We end both compute and graphics batches. Ending compute
    * here is necessary to properly handle graphics->compute dependencies.
    *
    * XXX: perf. */
   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_graphics(cmd);
}

void
hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
                    const gl_shader_stage *stages,
                    struct vk_shader **const shaders)
{
   struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk);

   for (uint32_t i = 0; i < stage_count; i++) {
      struct hk_api_shader *shader =
         container_of(shaders[i], struct hk_api_shader, vk);

      if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL)
         hk_cmd_bind_compute_shader(cmd, shader);
      else
         hk_cmd_bind_graphics_shader(cmd, stages[i], shader);
   }
}

static void
hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc,
                        const VkBindDescriptorSetsInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   /* From the Vulkan 1.3.275 spec:
    *
    *    "When binding a descriptor set (see Descriptor Set Binding) to
    *    set number N...
    *
    *    If, additionally, the previously bound descriptor set for set
    *    N was bound using a pipeline layout not compatible for set N,
    *    then all bindings in sets numbered greater than N are
    *    disturbed."
    *
    * This means that, if some earlier set gets bound in such a way that
    * it changes set_dynamic_buffer_start[s], this binding is implicitly
    * invalidated.  Therefore, we can always look at the current value
    * of set_dynamic_buffer_start[s] as the base of our dynamic buffer
    * range and it's only our responsibility to adjust all
    * set_dynamic_buffer_start[p] for p > s as needed.
    */
   uint8_t dyn_buffer_start =
      desc->root.set_dynamic_buffer_start[info->firstSet];

   uint32_t next_dyn_offset = 0;
   for (uint32_t i = 0; i < info->descriptorSetCount; ++i) {
      unsigned s = i + info->firstSet;
      VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]);

      if (desc->sets[s] != set) {
         if (set != NULL) {
            desc->root.sets[s] = hk_descriptor_set_addr(set);
            desc->set_sizes[s] = set->size;
         } else {
            desc->root.sets[s] = 0;
            desc->set_sizes[s] = 0;
         }
         desc->sets[s] = set;
         desc->sets_dirty |= BITFIELD_BIT(s);

         /* Binding descriptors invalidates push descriptors */
         desc->push_dirty &= ~BITFIELD_BIT(s);
      }

      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

      if (pipeline_layout->set_layouts[s] != NULL) {
         const struct hk_descriptor_set_layout *set_layout =
            vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]);

         if (set != NULL && set_layout->dynamic_buffer_count > 0) {
            for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) {
               struct hk_buffer_address addr = set->dynamic_buffers[j];
               addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j];
               desc->root.dynamic_buffers[dyn_buffer_start + j] = addr;
            }
            next_dyn_offset += set->layout->dynamic_buffer_count;
         }

         dyn_buffer_start += set_layout->dynamic_buffer_count;
      } else {
         assert(set == NULL);
      }
   }
   assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS);
   assert(next_dyn_offset <= info->dynamicOffsetCount);

   for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS;
        s++)
      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

   desc->root_dirty = true;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBindDescriptorSets2KHR(
   VkCommandBuffer commandBuffer,
   const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors,
                              pBindDescriptorSetsInfo);
   }

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors,
                              pBindDescriptorSetsInfo);
   }
}

static void
hk_push_constants(UNUSED struct hk_cmd_buffer *cmd,
                  struct hk_descriptor_state *desc,
                  const VkPushConstantsInfoKHR *info)
{
   memcpy(desc->root.push + info->offset, info->pValues, info->size);
   desc->root_dirty = true;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer,
                        const VkPushConstantsInfoKHR *pPushConstantsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
      hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
      hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo);
}

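/* Get (lazily allocating) the host-side push descriptor set for the given set
 * index. Pushing descriptors unbinds any regular set at that index and marks
 * the push set dirty so hk_cmd_buffer_flush_push_descriptors uploads it.
 */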
static struct hk_push_descriptor_set *
hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc, uint32_t set)
{
   assert(set < HK_MAX_SETS);
   if (unlikely(desc->push[set] == NULL)) {
      desc->push[set] =
         vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (unlikely(desc->push[set] == NULL)) {
         vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
         return NULL;
      }
   }

   /* Pushing descriptors replaces whatever sets are bound */
   desc->sets[set] = NULL;
   desc->push_dirty |= BITFIELD_BIT(set);

   return desc->push[set];
}

static void
hk_push_descriptor_set(struct hk_cmd_buffer *cmd,
                       struct hk_descriptor_state *desc,
                       const VkPushDescriptorSetInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   struct hk_push_descriptor_set *push_set =
      hk_cmd_push_descriptors(cmd, desc, info->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout =
      vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]);

   hk_push_descriptor_set_update(push_set, set_layout,
                                 info->descriptorWriteCount,
                                 info->pDescriptorWrites);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSet2KHR(
   VkCommandBuffer commandBuffer,
   const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors,
                             pPushDescriptorSetInfo);
   }

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors,
                             pPushDescriptorSetInfo);
   }
}

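/* Upload every dirty push descriptor set to the GPU and point the root
 * descriptor table at the uploaded copies.
 */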
void
hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd,
                                     struct hk_descriptor_state *desc)
{
   u_foreach_bit(set_idx, desc->push_dirty) {
      struct hk_push_descriptor_set *push_set = desc->push[set_idx];
      uint64_t push_set_addr = hk_pool_upload(
         cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT);

      desc->root.sets[set_idx] = push_set_addr;
      desc->set_sizes[set_idx] = sizeof(push_set->data);
   }

   desc->root_dirty = true;
   desc->push_dirty = 0;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSetWithTemplate2KHR(
   VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR
                                     *pPushDescriptorSetWithTemplateInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(vk_descriptor_update_template, template,
                  pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout,
                  pPushDescriptorSetWithTemplateInfo->layout);

   struct hk_descriptor_state *desc =
      hk_get_descriptors_state(cmd, template->bind_point);
   struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors(
      cmd, desc, pPushDescriptorSetWithTemplateInfo->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout(
      pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]);

   hk_push_descriptor_set_update_template(
      push_set, set_layout, template,
      pPushDescriptorSetWithTemplateInfo->pData);
}

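/* Snapshot the root descriptor table for the given bind point into the upload
 * pool and return its GPU address. The table's first field is the address of
 * the table itself, so shaders can locate it (see the static_assert in
 * hk_upload_usc_words).
 */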
uint64_t
hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd,
                          VkPipelineBindPoint bind_point)
{
   struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point);
   struct hk_root_descriptor_table *root = &desc->root;

   struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8);
   if (!root_ptr.gpu)
      return 0;

   root->root_desc_addr = root_ptr.gpu;

   memcpy(root_ptr.cpu, root, sizeof(*root));
   return root_ptr.gpu;
}

void
hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b,
                               struct hk_cmd_buffer *cmd)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;

   /* Upload texture/PBE descriptors for each render target so we can clear
    * spilled render targets.
    */
   struct agx_ptr descs =
      hk_pool_alloc(cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64);
   struct agx_texture_packed *desc = descs.cpu;
   if (!desc)
      return;

   for (unsigned i = 0; i < render->color_att_count; ++i) {
      struct hk_image_view *iview = render->color_att[i].iview;
      if (!iview) {
         /* XXX: probably should emit a null descriptor here...? */
         continue;
      }

      memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc));
      memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc));
   }

   desc = descs.cpu;

   /* Bind the base as u0_u1 for bindless access */
   agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8));
}

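/* Ensure the device has scratch memory reserved for this shader's main and
 * preamble spill requirements, and record which hardware stages in this
 * control stream need scratch.
 */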
void
hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                   struct hk_shader *s)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   uint32_t max_scratch_size =
      MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size);

   if (max_scratch_size == 0)
      return;

   unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0;

   /* Note: this uses the hardware stage, not the software stage */
   hk_device_alloc_scratch(dev, s->b.info.stage, max_scratch_size);

   switch (s->b.info.stage) {
   case PIPE_SHADER_FRAGMENT:
      cs->scratch.fs.main = true;
      cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size);
      break;
   case PIPE_SHADER_VERTEX:
      cs->scratch.vs.main = true;
      cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size);
      break;
   default:
      cs->scratch.cs.main = true;
      cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size);
      break;
   }
}

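/* Build and upload the USC words for a shader: bind the root descriptor table,
 * any stage-specific uniforms (vertex attribute bases, draw parameters, blend
 * constants, tilebuffer layout), and the linked shader's USC blob. Returns the
 * USC address of the uploaded words.
 */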
uint32_t
hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s,
                    struct hk_linked_shader *linked)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   enum pipe_shader_type sw_stage = s->info.stage;
   enum pipe_shader_type hw_stage = s->b.info.stage;

   unsigned constant_push_ranges =
      DIV_ROUND_UP(s->b.info.immediate_size_16, 64);
   unsigned push_ranges = 2;
   unsigned stage_ranges = 3;

   size_t usc_size =
      agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4);
   struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
   if (!t.cpu)
      return 0;

   struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);

   uint64_t root_ptr;

   if (sw_stage == PIPE_SHADER_COMPUTE)
      root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE);
   else
      root_ptr = cmd->state.gfx.root;

   static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0,
                 "self-reflective");

   agx_usc_uniform(&b, HK_ROOT_UNIFORM, 4, root_ptr);

   if (sw_stage == MESA_SHADER_VERTEX) {
      unsigned count =
         DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4);

      if (count) {
         agx_usc_uniform(
            &b, 0, 4 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_base));

         agx_usc_uniform(
            &b, 4 * count, 2 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_clamps));
      }

      if (cmd->state.gfx.draw_params)
         agx_usc_uniform(&b, 6 * count, 4, cmd->state.gfx.draw_params);

      if (cmd->state.gfx.draw_id_ptr)
         agx_usc_uniform(&b, (6 * count) + 4, 1, cmd->state.gfx.draw_id_ptr);

      if (hw_stage == MESA_SHADER_COMPUTE) {
         agx_usc_uniform(
            &b, (6 * count) + 8, 4,
            root_ptr + hk_root_descriptor_offset(draw.input_assembly));
      }
   } else if (sw_stage == MESA_SHADER_FRAGMENT) {
      if (agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) {
         hk_usc_upload_spilled_rt_descs(&b, cmd);
      }

      agx_usc_uniform(
         &b, 4, 8, root_ptr + hk_root_descriptor_offset(draw.blend_constant));

      /* The SHARED state is baked into linked->usc for non-fragment shaders. We
       * don't pass around the information to bake the tilebuffer layout.
       *
       * TODO: We probably could with some refactor.
       */
      agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc);
   }

   agx_usc_push_blob(&b, linked->usc.data, linked->usc.size);
   return agx_usc_addr(&dev->dev, t.gpu);
}

/* Specialized variant of hk_upload_usc_words for internal dispatches that do
 * not use any state except for some directly mapped uniforms.
 */
uint32_t
hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, struct hk_shader *s,
                           void *data, size_t data_size)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   assert(s->info.stage == MESA_SHADER_COMPUTE);
   assert(s->b.info.scratch_size == 0 && "you shouldn't be spilling!");
   assert(s->b.info.preamble_scratch_size == 0 && "you shouldn't be spilling!");

   unsigned constant_push_ranges =
      DIV_ROUND_UP(s->b.info.immediate_size_16, 64);

   size_t usc_size = agx_usc_size(constant_push_ranges + 7);
   struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
   if (!t.cpu)
      return 0;

   struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);

   /* Map the data directly as uniforms starting at u0 */
   agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2),
                   hk_pool_upload(cmd, data, data_size, 4));

   agx_usc_push_blob(&b, s->only_linked->usc.data, s->only_linked->usc.size);
   return agx_usc_addr(&dev->dev, t.gpu);
}

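/* Initialize a new graphics control stream: inherit the render pass's
 * tilebuffer layout, default to the partial background/ZLS programs, emit a
 * coherency barrier, and seed the baseline PPP state before dirtying all
 * graphics state so it gets re-emitted.
 */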
void
hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;
   uint8_t *map = cs->current;

   cs->tib = render->tilebuffer;

   /* Assume this is not the first control stream of the render pass, so
    * initially use the partial background program and ZLS control.
    * hk_BeginRendering will override.
    */
   cs->cr = render->cr;
   cs->cr.bg.main = render->cr.bg.partial;
   cs->cr.zls_control = render->cr.zls_control_partial;

   /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
    * with another that caused stale data to be cached and the CPU wrote to it
    * in the meantime.
    */
   agx_push(map, VDM_BARRIER, cfg) {
      cfg.usc_cache_inval = true;
   }

   struct AGX_PPP_HEADER present = {
      .w_clamp = true,
      .occlusion_query_2 = true,
      .output_unknown = true,
      .varying_word_2 = true,
      .viewport_count = 1, /* irrelevant */
   };

   size_t size = agx_ppp_update_size(&present);
   struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
   if (!T.cpu)
      return;

   struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);

   /* clang-format off */
   agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
   agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
   agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
   agx_ppp_push(&ppp, VARYING_2, cfg);
   /* clang-format on */

   agx_ppp_fini(&map, &ppp);
   cs->current = map;

   util_dynarray_init(&cs->scissor, NULL);
   util_dynarray_init(&cs->depth_bias, NULL);

   /* All graphics state must be re-emitted in each control stream */
   hk_cmd_buffer_dirty_all(cmd);
}

void
hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                       size_t space)
{
   bool vdm = cs->type == HK_CS_VDM;

   size_t link_length =
      vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH;

   /* Assert that we have space for a link tag */
   assert((cs->current + link_length) <= cs->end && "Encoder overflowed");

   /* Always leave room for a link tag, in case we run out of space later,
    * plus padding because VDM apparently overreads?
    *
    * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
    */
   space += link_length + 0x800;

   /* If there is room in the command buffer, we're done */
   if (likely((cs->end - cs->current) >= space))
      return;

   /* Otherwise, we need to allocate a new command buffer. We use memory owned
    * by the batch to simplify lifetime management for the BO.
    */
   size_t size = 65536;
   struct agx_ptr T = hk_pool_alloc(cmd, size, 256);

   /* Jump from the old control stream to the new control stream */
   if (vdm) {
      agx_pack(cs->current, VDM_STREAM_LINK, cfg) {
         cfg.target_lo = T.gpu & BITFIELD_MASK(32);
         cfg.target_hi = T.gpu >> 32;
      }
   } else {
      agx_pack(cs->current, CDM_STREAM_LINK, cfg) {
         cfg.target_lo = T.gpu & BITFIELD_MASK(32);
         cfg.target_hi = T.gpu >> 32;
      }
   }

   /* Swap out the control stream */
   cs->current = T.cpu;
   cs->end = cs->current + size;
   cs->stream_linked = true;
}