xref: /aosp_15_r20/external/mesa3d/src/imagination/vulkan/pvr_query_compute.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <string.h>
29 #include <vulkan/vulkan.h>
30 
31 #include "hwdef/rogue_hw_utils.h"
32 #include "pvr_bo.h"
33 #include "pvr_formats.h"
34 #include "pvr_pds.h"
35 #include "pvr_private.h"
36 #include "pvr_shader_factory.h"
37 #include "pvr_static_shaders.h"
38 #include "pvr_tex_state.h"
39 #include "pvr_types.h"
40 #include "vk_alloc.h"
41 #include "vk_command_pool.h"
42 #include "vk_util.h"
43 
pvr_init_primary_compute_pds_program(struct pvr_pds_compute_shader_program * program)44 static inline void pvr_init_primary_compute_pds_program(
45    struct pvr_pds_compute_shader_program *program)
46 {
47    pvr_pds_compute_shader_program_init(program);
48    program->local_input_regs[0] = 0;
49    /* Workgroup id is in reg0. */
50    program->work_group_input_regs[0] = 0;
51    program->flattened_work_groups = true;
52    program->kick_usc = true;
53 }
54 
pvr_create_compute_secondary_prog(struct pvr_device * device,const struct pvr_shader_factory_info * shader_factory_info,struct pvr_compute_query_shader * query_prog)55 static VkResult pvr_create_compute_secondary_prog(
56    struct pvr_device *device,
57    const struct pvr_shader_factory_info *shader_factory_info,
58    struct pvr_compute_query_shader *query_prog)
59 {
60    const size_t size =
61       pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
62    struct pvr_pds_descriptor_program_input sec_pds_program;
63    struct pvr_pds_info *info = &query_prog->info;
64    uint32_t staging_buffer_size;
65    uint32_t *staging_buffer;
66    VkResult result;
67 
68    info->entries =
69       vk_alloc(&device->vk.alloc, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
70    if (!info->entries)
71       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
72 
73    info->entries_size_in_bytes = size;
74 
75    sec_pds_program = (struct pvr_pds_descriptor_program_input){
76       .buffer_count = 1,
77       .buffers = {
78          [0] = {
79             .buffer_id = 0,
80             .source_offset = 0,
81             .type = PVR_BUFFER_TYPE_COMPILE_TIME,
82             .size_in_dwords = shader_factory_info->const_shared_regs,
83             .destination = shader_factory_info->explicit_const_start_offset,
84          }
85       },
86    };
87 
88    pvr_pds_generate_descriptor_upload_program(&sec_pds_program, NULL, info);
89 
90    staging_buffer_size = info->code_size_in_dwords;
91 
92    staging_buffer = vk_alloc(&device->vk.alloc,
93                              PVR_DW_TO_BYTES(staging_buffer_size),
94                              8,
95                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
96    if (!staging_buffer) {
97       vk_free(&device->vk.alloc, info->entries);
98       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
99    }
100 
101    pvr_pds_generate_descriptor_upload_program(&sec_pds_program,
102                                               staging_buffer,
103                                               info);
104 
105    assert(info->code_size_in_dwords <= staging_buffer_size);
106 
107    /* FIXME: Figure out the define for alignment of 16. */
108    result = pvr_gpu_upload_pds(device,
109                                NULL,
110                                0,
111                                0,
112                                staging_buffer,
113                                info->code_size_in_dwords,
114                                16,
115                                16,
116                                &query_prog->pds_sec_code);
117    if (result != VK_SUCCESS) {
118       vk_free(&device->vk.alloc, staging_buffer);
119       vk_free(&device->vk.alloc, info->entries);
120       return result;
121    }
122 
123    vk_free(&device->vk.alloc, staging_buffer);
124 
125    return VK_SUCCESS;
126 }
127 
128 static void
pvr_destroy_compute_secondary_prog(struct pvr_device * device,struct pvr_compute_query_shader * program)129 pvr_destroy_compute_secondary_prog(struct pvr_device *device,
130                                    struct pvr_compute_query_shader *program)
131 {
132    pvr_bo_suballoc_free(program->pds_sec_code.pvr_bo);
133    vk_free(&device->vk.alloc, program->info.entries);
134 }
135 
pvr_create_compute_query_program(struct pvr_device * device,const struct pvr_shader_factory_info * shader_factory_info,struct pvr_compute_query_shader * query_prog)136 static VkResult pvr_create_compute_query_program(
137    struct pvr_device *device,
138    const struct pvr_shader_factory_info *shader_factory_info,
139    struct pvr_compute_query_shader *query_prog)
140 {
141    const uint32_t cache_line_size =
142       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
143    struct pvr_pds_compute_shader_program pds_primary_prog;
144    VkResult result;
145 
146    /* No support for query constant calc program. */
147    assert(shader_factory_info->const_calc_prog_inst_bytes == 0);
148    /* No support for query coefficient update program. */
149    assert(shader_factory_info->coeff_update_prog_start == PVR_INVALID_INST);
150 
151    result = pvr_gpu_upload_usc(device,
152                                shader_factory_info->shader_code,
153                                shader_factory_info->code_size,
154                                cache_line_size,
155                                &query_prog->usc_bo);
156    if (result != VK_SUCCESS)
157       return result;
158 
159    pvr_init_primary_compute_pds_program(&pds_primary_prog);
160 
161    pvr_pds_setup_doutu(&pds_primary_prog.usc_task_control,
162                        query_prog->usc_bo->dev_addr.addr,
163                        shader_factory_info->temps_required,
164                        PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
165                        false);
166 
167    result =
168       pvr_pds_compute_shader_create_and_upload(device,
169                                                &pds_primary_prog,
170                                                &query_prog->pds_prim_code);
171    if (result != VK_SUCCESS)
172       goto err_free_usc_bo;
173 
174    query_prog->primary_data_size_dw = pds_primary_prog.data_size;
175    query_prog->primary_num_temps = pds_primary_prog.temps_used;
176 
177    result = pvr_create_compute_secondary_prog(device,
178                                               shader_factory_info,
179                                               query_prog);
180    if (result != VK_SUCCESS)
181       goto err_free_pds_prim_code_bo;
182 
183    return VK_SUCCESS;
184 
185 err_free_pds_prim_code_bo:
186    pvr_bo_suballoc_free(query_prog->pds_prim_code.pvr_bo);
187 
188 err_free_usc_bo:
189    pvr_bo_suballoc_free(query_prog->usc_bo);
190 
191    return result;
192 }
193 
194 /* TODO: See if we can dedup this with pvr_setup_descriptor_mappings() or
195  * pvr_setup_descriptor_mappings().
196  */
pvr_write_compute_query_pds_data_section(struct pvr_cmd_buffer * cmd_buffer,const struct pvr_compute_query_shader * query_prog,struct pvr_private_compute_pipeline * pipeline)197 static VkResult pvr_write_compute_query_pds_data_section(
198    struct pvr_cmd_buffer *cmd_buffer,
199    const struct pvr_compute_query_shader *query_prog,
200    struct pvr_private_compute_pipeline *pipeline)
201 {
202    const struct pvr_pds_info *const info = &query_prog->info;
203    struct pvr_suballoc_bo *pvr_bo;
204    const uint8_t *entries;
205    uint32_t *dword_buffer;
206    uint64_t *qword_buffer;
207    VkResult result;
208 
209    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
210                                      cmd_buffer->device->heaps.pds_heap,
211                                      PVR_DW_TO_BYTES(info->data_size_in_dwords),
212                                      &pvr_bo);
213    if (result != VK_SUCCESS)
214       return result;
215 
216    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
217    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
218 
219    entries = (uint8_t *)info->entries;
220 
221    /* TODO: Remove this when we can test this path and make sure that this is
222     * not needed. If it's needed we should probably be using LITERAL entries for
223     * this instead.
224     */
225    memset(dword_buffer, 0xFE, PVR_DW_TO_BYTES(info->data_size_in_dwords));
226 
227    pipeline->pds_shared_update_data_size_dw = info->data_size_in_dwords;
228 
229    for (uint32_t i = 0; i < info->entry_count; i++) {
230       const struct pvr_const_map_entry *const entry_header =
231          (struct pvr_const_map_entry *)entries;
232 
233       switch (entry_header->type) {
234       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
235          const struct pvr_const_map_entry_literal32 *const literal =
236             (struct pvr_const_map_entry_literal32 *)entries;
237 
238          PVR_WRITE(dword_buffer,
239                    literal->literal_value,
240                    literal->const_offset,
241                    info->data_size_in_dwords);
242 
243          entries += sizeof(*literal);
244          break;
245       }
246       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL64: {
247          const struct pvr_const_map_entry_literal64 *const literal =
248             (struct pvr_const_map_entry_literal64 *)entries;
249 
250          PVR_WRITE(qword_buffer,
251                    literal->literal_value,
252                    literal->const_offset,
253                    info->data_size_in_dwords);
254 
255          entries += sizeof(*literal);
256          break;
257       }
258       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
259          const struct pvr_const_map_entry_doutu_address *const doutu_addr =
260             (struct pvr_const_map_entry_doutu_address *)entries;
261          const pvr_dev_addr_t exec_addr =
262             PVR_DEV_ADDR_OFFSET(query_prog->pds_sec_code.pvr_bo->dev_addr,
263                                 query_prog->pds_sec_code.code_offset);
264          uint64_t addr = 0ULL;
265 
266          pvr_set_usc_execution_address64(&addr, exec_addr.addr);
267 
268          PVR_WRITE(qword_buffer,
269                    addr | doutu_addr->doutu_control,
270                    doutu_addr->const_offset,
271                    info->data_size_in_dwords);
272 
273          entries += sizeof(*doutu_addr);
274          break;
275       }
276       case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
277          const struct pvr_const_map_entry_special_buffer *special_buff_entry =
278             (struct pvr_const_map_entry_special_buffer *)entries;
279 
280          switch (special_buff_entry->buffer_type) {
281          case PVR_BUFFER_TYPE_COMPILE_TIME: {
282             uint64_t addr = pipeline->const_buffer_addr.addr;
283 
284             PVR_WRITE(qword_buffer,
285                       addr,
286                       special_buff_entry->const_offset,
287                       info->data_size_in_dwords);
288             break;
289          }
290 
291          default:
292             unreachable("Unsupported special buffer type.");
293          }
294 
295          entries += sizeof(*special_buff_entry);
296          break;
297       }
298       default:
299          unreachable("Unsupported data section map");
300       }
301    }
302 
303    pipeline->pds_shared_update_data_offset =
304       pvr_bo->dev_addr.addr -
305       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
306 
307    return VK_SUCCESS;
308 }
309 
/* Emit the compute dispatch for a query program into the current
 * occlusion-query sub command: shared register update, kernel kick, then a
 * fence.
 */
static void pvr_write_private_compute_dispatch(
   struct pvr_cmd_buffer *cmd_buffer,
   struct pvr_private_compute_pipeline *pipeline,
   uint32_t num_query_indices)
{
   struct pvr_sub_cmd *sub_cmd = cmd_buffer->state.current_sub_cmd;
   /* One workgroup per 32 query indices, flattened to one dimension. */
   const uint32_t workgroup_size[PVR_WORKGROUP_DIMENSIONS] = {
      DIV_ROUND_UP(num_query_indices, 32),
      1,
      1,
   };

   assert(sub_cmd->type == PVR_SUB_CMD_TYPE_OCCLUSION_QUERY);

   pvr_compute_update_shared_private(cmd_buffer, &sub_cmd->compute, pipeline);

   pvr_compute_update_kernel_private(cmd_buffer,
                                     &sub_cmd->compute,
                                     pipeline,
                                     workgroup_size);

   pvr_compute_generate_fence(cmd_buffer, &sub_cmd->compute, false);
}
331 
332 static void
pvr_destroy_compute_query_program(struct pvr_device * device,struct pvr_compute_query_shader * program)333 pvr_destroy_compute_query_program(struct pvr_device *device,
334                                   struct pvr_compute_query_shader *program)
335 {
336    pvr_destroy_compute_secondary_prog(device, program);
337    pvr_bo_suballoc_free(program->pds_prim_code.pvr_bo);
338    pvr_bo_suballoc_free(program->usc_bo);
339 }
340 
pvr_create_multibuffer_compute_query_program(struct pvr_device * device,const struct pvr_shader_factory_info * const * shader_factory_info,struct pvr_compute_query_shader * query_programs)341 static VkResult pvr_create_multibuffer_compute_query_program(
342    struct pvr_device *device,
343    const struct pvr_shader_factory_info *const *shader_factory_info,
344    struct pvr_compute_query_shader *query_programs)
345 {
346    const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
347    VkResult result;
348    uint32_t i;
349 
350    for (i = 0; i < core_count; i++) {
351       result = pvr_create_compute_query_program(device,
352                                                 shader_factory_info[i],
353                                                 &query_programs[i]);
354       if (result != VK_SUCCESS)
355          goto err_destroy_compute_query_program;
356    }
357 
358    return VK_SUCCESS;
359 
360 err_destroy_compute_query_program:
361    for (uint32_t j = 0; j < i; j++)
362       pvr_destroy_compute_query_program(device, &query_programs[j]);
363 
364    return result;
365 }
366 
pvr_device_create_compute_query_programs(struct pvr_device * device)367 VkResult pvr_device_create_compute_query_programs(struct pvr_device *device)
368 {
369    const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
370    VkResult result;
371 
372    result = pvr_create_compute_query_program(device,
373                                              &availability_query_write_info,
374                                              &device->availability_shader);
375    if (result != VK_SUCCESS)
376       return result;
377 
378    device->copy_results_shaders =
379       vk_alloc(&device->vk.alloc,
380                sizeof(*device->copy_results_shaders) * core_count,
381                8,
382                VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
383    if (!device->copy_results_shaders) {
384       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
385       goto err_destroy_availability_query_program;
386    }
387 
388    result = pvr_create_multibuffer_compute_query_program(
389       device,
390       copy_query_results_collection,
391       device->copy_results_shaders);
392    if (result != VK_SUCCESS)
393       goto err_vk_free_copy_results_shaders;
394 
395    device->reset_queries_shaders =
396       vk_alloc(&device->vk.alloc,
397                sizeof(*device->reset_queries_shaders) * core_count,
398                8,
399                VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
400    if (!device->reset_queries_shaders) {
401       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
402       goto err_destroy_copy_results_query_programs;
403    }
404 
405    result = pvr_create_multibuffer_compute_query_program(
406       device,
407       reset_query_collection,
408       device->reset_queries_shaders);
409    if (result != VK_SUCCESS)
410       goto err_vk_free_reset_queries_shaders;
411 
412    return VK_SUCCESS;
413 
414 err_vk_free_reset_queries_shaders:
415    vk_free(&device->vk.alloc, device->reset_queries_shaders);
416 
417 err_destroy_copy_results_query_programs:
418    for (uint32_t i = 0; i < core_count; i++) {
419       pvr_destroy_compute_query_program(device,
420                                         &device->copy_results_shaders[i]);
421    }
422 
423 err_vk_free_copy_results_shaders:
424    vk_free(&device->vk.alloc, device->copy_results_shaders);
425 
426 err_destroy_availability_query_program:
427    pvr_destroy_compute_query_program(device, &device->availability_shader);
428 
429    return result;
430 }
431 
pvr_device_destroy_compute_query_programs(struct pvr_device * device)432 void pvr_device_destroy_compute_query_programs(struct pvr_device *device)
433 {
434    const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
435 
436    pvr_destroy_compute_query_program(device, &device->availability_shader);
437 
438    for (uint32_t i = 0; i < core_count; i++) {
439       pvr_destroy_compute_query_program(device,
440                                         &device->copy_results_shaders[i]);
441       pvr_destroy_compute_query_program(device,
442                                         &device->reset_queries_shaders[i]);
443    }
444 
445    vk_free(&device->vk.alloc, device->copy_results_shaders);
446    vk_free(&device->vk.alloc, device->reset_queries_shaders);
447 }
448 
/* Fill tex_info with texture state describing a linear, single-mip,
 * single-sample R32_UINT buffer of `width` texels at `addr`, set up for
 * non-normalized index lookups.
 *
 * tex_info is fully (re)initialized; callers may pass an uninitialized
 * struct.
 */
static void pvr_init_tex_info(const struct pvr_device_info *dev_info,
                              struct pvr_texture_state_info *tex_info,
                              uint32_t width,
                              pvr_dev_addr_t addr)
{
   /* Look up the swizzle for the format we are about to program. Previously
    * this read tex_info->format, which is uninitialized on the first call
    * from pvr_add_query_program() — an uninitialized read and potentially
    * the wrong swizzle.
    */
   const uint8_t *swizzle_arr = pvr_get_format_swizzle(VK_FORMAT_R32_UINT);
   /* NOTE(review): a 1D view is selected when neither TPU feature is
    * present — presumably the older TPU only supports index lookups on 1D
    * views; confirm against the TPU feature docs.
    */
   bool is_view_1d = !PVR_HAS_FEATURE(dev_info, tpu_extended_integer_lookup) &&
                     !PVR_HAS_FEATURE(dev_info, tpu_image_state_v2);

   *tex_info = (struct pvr_texture_state_info){
      .format = VK_FORMAT_R32_UINT,
      .mem_layout = PVR_MEMLAYOUT_LINEAR,
      .flags = PVR_TEXFLAGS_INDEX_LOOKUP,
      .type = is_view_1d ? VK_IMAGE_VIEW_TYPE_1D : VK_IMAGE_VIEW_TYPE_2D,
      .is_cube = false,
      .tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
      .extent = { .width = width, .height = 1, .depth = 0 },
      .array_size = 1,
      .base_level = 0,
      .mip_levels = 1,
      .mipmaps_present = false,
      .sample_count = 1,
      .stride = width,
      .offset = 0,
      .swizzle = { [0] = swizzle_arr[0],
                   [1] = swizzle_arr[1],
                   [2] = swizzle_arr[2],
                   [3] = swizzle_arr[3] },
      .addr = addr,
   };
}
481 
482 /* TODO: Split this function into per program type functions. */
pvr_add_query_program(struct pvr_cmd_buffer * cmd_buffer,const struct pvr_query_info * query_info)483 VkResult pvr_add_query_program(struct pvr_cmd_buffer *cmd_buffer,
484                                const struct pvr_query_info *query_info)
485 {
486    struct pvr_device *device = cmd_buffer->device;
487    const uint32_t core_count = device->pdevice->dev_runtime_info.core_count;
488    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
489    const struct pvr_shader_factory_info *shader_factory_info;
490    uint64_t sampler_state[ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
491    const struct pvr_compute_query_shader *query_prog;
492    struct pvr_private_compute_pipeline pipeline;
493    const uint32_t buffer_count = core_count;
494    struct pvr_texture_state_info tex_info;
495    uint32_t num_query_indices;
496    uint32_t *const_buffer;
497    struct pvr_suballoc_bo *pvr_bo;
498    VkResult result;
499 
500    pvr_csb_pack (&sampler_state[0U], TEXSTATE_SAMPLER, reg) {
501       reg.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
502       reg.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
503       reg.addrmode_w = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
504       reg.minfilter = PVRX(TEXSTATE_FILTER_POINT);
505       reg.magfilter = PVRX(TEXSTATE_FILTER_POINT);
506       reg.non_normalized_coords = true;
507       reg.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
508    }
509 
510    /* clang-format off */
511    pvr_csb_pack (&sampler_state[1], TEXSTATE_SAMPLER_WORD1, sampler_word1) {}
512    /* clang-format on */
513 
514    switch (query_info->type) {
515    case PVR_QUERY_TYPE_AVAILABILITY_WRITE:
516       /* Adds a compute shader (fenced on the last 3D) that writes a non-zero
517        * value in availability_bo at every index in index_bo.
518        */
519       query_prog = &device->availability_shader;
520       shader_factory_info = &availability_query_write_info;
521       num_query_indices = query_info->availability_write.num_query_indices;
522       break;
523 
524    case PVR_QUERY_TYPE_COPY_QUERY_RESULTS:
525       /* Adds a compute shader to copy availability and query value data. */
526       query_prog = &device->copy_results_shaders[buffer_count - 1];
527       shader_factory_info = copy_query_results_collection[buffer_count - 1];
528       num_query_indices = query_info->copy_query_results.query_count;
529       break;
530 
531    case PVR_QUERY_TYPE_RESET_QUERY_POOL:
532       /* Adds a compute shader to reset availability and query value data. */
533       query_prog = &device->reset_queries_shaders[buffer_count - 1];
534       shader_factory_info = reset_query_collection[buffer_count - 1];
535       num_query_indices = query_info->reset_query_pool.query_count;
536       break;
537 
538    default:
539       unreachable("Invalid query type");
540    }
541 
542    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer,
543                                          PVR_SUB_CMD_TYPE_OCCLUSION_QUERY);
544    if (result != VK_SUCCESS)
545       return result;
546 
547    pipeline.pds_code_offset = query_prog->pds_prim_code.code_offset;
548    pipeline.pds_data_offset = query_prog->pds_prim_code.data_offset;
549 
550    pipeline.pds_shared_update_code_offset =
551       query_prog->pds_sec_code.code_offset;
552    pipeline.pds_data_size_dw = query_prog->primary_data_size_dw;
553    pipeline.pds_temps_used = query_prog->primary_num_temps;
554 
555    pipeline.coeff_regs_count = shader_factory_info->coeff_regs;
556    pipeline.unified_store_regs_count = shader_factory_info->input_regs;
557    pipeline.const_shared_regs_count = shader_factory_info->const_shared_regs;
558 
559    const_buffer =
560       vk_alloc(&cmd_buffer->vk.pool->alloc,
561                PVR_DW_TO_BYTES(shader_factory_info->const_shared_regs),
562                8,
563                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
564    if (!const_buffer) {
565       return vk_command_buffer_set_error(&cmd_buffer->vk,
566                                          VK_ERROR_OUT_OF_HOST_MEMORY);
567    }
568 
569    /* clang-format off */
570 #define DRIVER_CONST(index)                                            \
571    assert(shader_factory_info->driver_const_location_map[index] <      \
572           shader_factory_info->const_shared_regs);                     \
573    const_buffer[shader_factory_info->driver_const_location_map[index]]
574    /* clang-format on */
575 
576    switch (query_info->type) {
577    case PVR_QUERY_TYPE_AVAILABILITY_WRITE: {
578       uint64_t image_sampler_state[3][ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
579       uint32_t image_sampler_idx = 0;
580 
581       memcpy(&image_sampler_state[image_sampler_idx][0],
582              &sampler_state[0],
583              sizeof(sampler_state));
584       image_sampler_idx++;
585 
586       pvr_init_tex_info(dev_info,
587                         &tex_info,
588                         num_query_indices,
589                         query_info->availability_write.index_bo->dev_addr);
590 
591       result = pvr_pack_tex_state(device,
592                                   &tex_info,
593                                   &image_sampler_state[image_sampler_idx][0]);
594       if (result != VK_SUCCESS) {
595          vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
596          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
597       }
598 
599       image_sampler_idx++;
600 
601       pvr_init_tex_info(
602          dev_info,
603          &tex_info,
604          query_info->availability_write.num_queries,
605          query_info->availability_write.availability_bo->dev_addr);
606 
607       result = pvr_pack_tex_state(device,
608                                   &tex_info,
609                                   &image_sampler_state[image_sampler_idx][0]);
610       if (result != VK_SUCCESS) {
611          vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
612          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
613       }
614 
615       image_sampler_idx++;
616 
617       memcpy(&const_buffer[0],
618              &image_sampler_state[0][0],
619              sizeof(image_sampler_state));
620 
621       /* Only PVR_QUERY_AVAILABILITY_WRITE_COUNT driver consts allowed. */
622       assert(shader_factory_info->num_driver_consts ==
623              PVR_QUERY_AVAILABILITY_WRITE_COUNT);
624 
625       DRIVER_CONST(PVR_QUERY_AVAILABILITY_WRITE_INDEX_COUNT) =
626          num_query_indices;
627       break;
628    }
629 
630    case PVR_QUERY_TYPE_COPY_QUERY_RESULTS: {
631       PVR_FROM_HANDLE(pvr_query_pool,
632                       pool,
633                       query_info->copy_query_results.query_pool);
634       PVR_FROM_HANDLE(pvr_buffer,
635                       buffer,
636                       query_info->copy_query_results.dst_buffer);
637       const uint32_t image_sampler_state_arr_size =
638          (buffer_count + 2) * ROGUE_NUM_TEXSTATE_SAMPLER_WORDS;
639       uint32_t image_sampler_idx = 0;
640       pvr_dev_addr_t addr;
641       uint64_t offset;
642 
643       STACK_ARRAY(uint64_t, image_sampler_state, image_sampler_state_arr_size);
644       if (!image_sampler_state) {
645          vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
646 
647          return vk_command_buffer_set_error(&cmd_buffer->vk,
648                                             VK_ERROR_OUT_OF_HOST_MEMORY);
649       }
650 
651 #define SAMPLER_ARR_2D(_arr, _i, _j) \
652    _arr[_i * ROGUE_NUM_TEXSTATE_SAMPLER_WORDS + _j]
653 
654       memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
655              &sampler_state[0],
656              sizeof(sampler_state));
657       image_sampler_idx++;
658 
659       offset = query_info->copy_query_results.first_query * sizeof(uint32_t);
660 
661       addr = PVR_DEV_ADDR_OFFSET(pool->availability_buffer->dev_addr, offset);
662 
663       pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
664 
665       result = pvr_pack_tex_state(
666          device,
667          &tex_info,
668          &SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0));
669       if (result != VK_SUCCESS) {
670          vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
671          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
672       }
673 
674       image_sampler_idx++;
675 
676       for (uint32_t i = 0; i < buffer_count; i++) {
677          addr = PVR_DEV_ADDR_OFFSET(pool->result_buffer->dev_addr,
678                                     offset + i * pool->result_stride);
679 
680          pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
681 
682          result = pvr_pack_tex_state(
683             device,
684             &tex_info,
685             &SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0));
686          if (result != VK_SUCCESS) {
687             vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
688             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
689          }
690 
691          image_sampler_idx++;
692       }
693 
694       memcpy(&const_buffer[0],
695              &SAMPLER_ARR_2D(image_sampler_state, 0, 0),
696              image_sampler_state_arr_size * sizeof(image_sampler_state[0]));
697 
698       STACK_ARRAY_FINISH(image_sampler_state);
699 
700       /* Only PVR_COPY_QUERY_POOL_RESULTS_COUNT driver consts allowed. */
701       assert(shader_factory_info->num_driver_consts ==
702              PVR_COPY_QUERY_POOL_RESULTS_COUNT);
703 
704       /* Assert if no memory is bound to destination buffer. */
705       assert(buffer->dev_addr.addr);
706 
707       addr = buffer->dev_addr;
708       addr.addr += query_info->copy_query_results.dst_offset;
709       addr.addr += query_info->copy_query_results.first_query *
710                    query_info->copy_query_results.stride;
711 
712       DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_INDEX_COUNT) = num_query_indices;
713       DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_BASE_ADDRESS_LOW) = addr.addr &
714                                                                    0xFFFFFFFF;
715       DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_BASE_ADDRESS_HIGH) = addr.addr >>
716                                                                     32;
717       DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_DEST_STRIDE) =
718          query_info->copy_query_results.stride;
719       DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_PARTIAL_RESULT_FLAG) =
720          query_info->copy_query_results.flags & VK_QUERY_RESULT_PARTIAL_BIT;
721       DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_64_BIT_FLAG) =
722          query_info->copy_query_results.flags & VK_QUERY_RESULT_64_BIT;
723       DRIVER_CONST(PVR_COPY_QUERY_POOL_RESULTS_WITH_AVAILABILITY_FLAG) =
724          query_info->copy_query_results.flags &
725          VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
726       break;
727    }
728 
729    case PVR_QUERY_TYPE_RESET_QUERY_POOL: {
730       PVR_FROM_HANDLE(pvr_query_pool,
731                       pool,
732                       query_info->reset_query_pool.query_pool);
733       const uint32_t image_sampler_state_arr_size =
734          (buffer_count + 2) * ROGUE_NUM_TEXSTATE_SAMPLER_WORDS;
735       uint32_t image_sampler_idx = 0;
736       pvr_dev_addr_t addr;
737       uint64_t offset;
738 
739       STACK_ARRAY(uint64_t, image_sampler_state, image_sampler_state_arr_size);
740       if (!image_sampler_state) {
741          vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
742 
743          return vk_command_buffer_set_error(&cmd_buffer->vk,
744                                             VK_ERROR_OUT_OF_HOST_MEMORY);
745       }
746 
747       memcpy(&SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0),
748              &sampler_state[0],
749              sizeof(sampler_state));
750       image_sampler_idx++;
751 
752       offset = query_info->reset_query_pool.first_query * sizeof(uint32_t);
753 
754       for (uint32_t i = 0; i < buffer_count; i++) {
755          addr = PVR_DEV_ADDR_OFFSET(pool->result_buffer->dev_addr,
756                                     offset + i * pool->result_stride);
757 
758          pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
759 
760          result = pvr_pack_tex_state(
761             device,
762             &tex_info,
763             &SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0));
764          if (result != VK_SUCCESS) {
765             vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
766             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
767          }
768 
769          image_sampler_idx++;
770       }
771 
772       addr = PVR_DEV_ADDR_OFFSET(pool->availability_buffer->dev_addr, offset);
773 
774       pvr_init_tex_info(dev_info, &tex_info, num_query_indices, addr);
775 
776       result = pvr_pack_tex_state(
777          device,
778          &tex_info,
779          &SAMPLER_ARR_2D(image_sampler_state, image_sampler_idx, 0));
780       if (result != VK_SUCCESS) {
781          vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
782          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
783       }
784 
785       image_sampler_idx++;
786 
787 #undef SAMPLER_ARR_2D
788 
789       memcpy(&const_buffer[0],
790              &image_sampler_state[0],
791              image_sampler_state_arr_size * sizeof(image_sampler_state[0]));
792 
793       STACK_ARRAY_FINISH(image_sampler_state);
794 
795       /* Only PVR_RESET_QUERY_POOL_COUNT driver consts allowed. */
796       assert(shader_factory_info->num_driver_consts ==
797              PVR_RESET_QUERY_POOL_COUNT);
798 
799       DRIVER_CONST(PVR_RESET_QUERY_POOL_INDEX_COUNT) = num_query_indices;
800       break;
801    }
802 
803    default:
804       unreachable("Invalid query type");
805    }
806 
807 #undef DRIVER_CONST
808 
809    for (uint32_t i = 0; i < shader_factory_info->num_static_const; i++) {
810       const struct pvr_static_buffer *load =
811          &shader_factory_info->static_const_buffer[i];
812 
813       /* Assert if static const is out of range. */
814       assert(load->dst_idx < shader_factory_info->const_shared_regs);
815       const_buffer[load->dst_idx] = load->value;
816    }
817 
818    result = pvr_cmd_buffer_upload_general(
819       cmd_buffer,
820       const_buffer,
821       PVR_DW_TO_BYTES(shader_factory_info->const_shared_regs),
822       &pvr_bo);
823    if (result != VK_SUCCESS) {
824       vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
825 
826       return result;
827    }
828 
829    pipeline.const_buffer_addr = pvr_bo->dev_addr;
830 
831    vk_free(&cmd_buffer->vk.pool->alloc, const_buffer);
832 
833    /* PDS data section for the secondary/constant upload. */
834    result = pvr_write_compute_query_pds_data_section(cmd_buffer,
835                                                      query_prog,
836                                                      &pipeline);
837    if (result != VK_SUCCESS)
838       return result;
839 
840    pipeline.workgroup_size.width = ROGUE_MAX_INSTANCES_PER_TASK;
841    pipeline.workgroup_size.height = 1;
842    pipeline.workgroup_size.depth = 1;
843 
844    pvr_write_private_compute_dispatch(cmd_buffer, &pipeline, num_query_indices);
845 
846    return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
847 }
848