1 /*
2  * Copyright 2016 Red Hat Inc.
3  * SPDX-License-Identifier: MIT
4  *
5  * Based on anv:
6  * Copyright © 2015 Intel Corporation
7  */
8 
9 #include "tu_query_pool.h"
10 
11 #include <fcntl.h>
12 
13 #include "nir/nir_builder.h"
14 #include "util/os_time.h"
15 
16 #include "vk_util.h"
17 
18 #include "tu_buffer.h"
19 #include "tu_cmd_buffer.h"
20 #include "tu_cs.h"
21 #include "tu_device.h"
22 #include "tu_rmv.h"
23 
24 #include "common/freedreno_gpu_event.h"
25 
26 #define NSEC_PER_SEC 1000000000ull
27 #define WAIT_TIMEOUT 5
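/* Number of RBBM_PRIMCTR counters captured for pipeline statistics queries:
 * PRIMCTR_0 through PRIMCTR_10, each occupying a LO/HI register pair.
 */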
28 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
29 
30 struct PACKED query_slot {
31    uint64_t available;
32 };
33 
34 struct PACKED occlusion_query_slot {
35    struct query_slot common;
36    uint64_t _padding0;
37 
38    uint64_t begin;
39    uint64_t result;
40    uint64_t end;
41    uint64_t _padding1;
42 };
43 
44 struct PACKED timestamp_query_slot {
45    struct query_slot common;
46    uint64_t result;
47 };
48 
49 struct PACKED primitive_slot_value {
50    uint64_t values[2];
51 };
52 
53 struct PACKED pipeline_stat_query_slot {
54    struct query_slot common;
55    uint64_t results[STAT_COUNT];
56 
57    uint64_t begin[STAT_COUNT];
58    uint64_t end[STAT_COUNT];
59 };
60 
61 struct PACKED primitive_query_slot {
62    struct query_slot common;
63    /* The result of transform feedback queries is two integer values:
64     *   results[0] is the count of primitives written,
65     *   results[1] is the count of primitives generated.
66     * A result is also stored for each of the 4 streams, one per slot.
67     */
68    uint64_t results[2];
69 
70    /* Primitive counters also need to be 16-byte aligned. */
71    uint64_t _padding;
72 
73    struct primitive_slot_value begin[4];
74    struct primitive_slot_value end[4];
75 };
76 
77 struct PACKED perfcntr_query_slot {
78    uint64_t result;
79    uint64_t begin;
80    uint64_t end;
81 };
82 
83 struct PACKED perf_query_slot {
84    struct query_slot common;
85    struct perfcntr_query_slot perfcntr;
86 };
87 
88 struct PACKED primitives_generated_query_slot {
89    struct query_slot common;
90    uint64_t result;
91    uint64_t begin;
92    uint64_t end;
93 };
94 
95 /* Returns the IOVA or mapped address of a given uint64_t field
96  * in a given slot of a query pool. */
97 #define query_iova(type, pool, query, field)                               \
98    pool->bo->iova + pool->query_stride * (query) + offsetof(type, field)
99 #define query_addr(type, pool, query, field)                               \
100    (uint64_t *) ((char *) pool->bo->map + pool->query_stride * (query) +   \
101                  offsetof(type, field))
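/* For illustration (a sketch, not used by the driver): with an occlusion
 * pool, where query_stride == sizeof(struct occlusion_query_slot), the
 * "begin" field of slot N is located at
 *
 *    query_iova(struct occlusion_query_slot, pool, N, begin)
 *       == pool->bo->iova + N * pool->query_stride
 *          + offsetof(struct occlusion_query_slot, begin)
 */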
102 
103 #define occlusion_query_iova(pool, query, field)                           \
104    query_iova(struct occlusion_query_slot, pool, query, field)
105 #define occlusion_query_addr(pool, query, field)                           \
106    query_addr(struct occlusion_query_slot, pool, query, field)
107 
108 #define pipeline_stat_query_iova(pool, query, field, idx)                  \
109    pool->bo->iova + pool->query_stride * (query) +                         \
110       offsetof_arr(struct pipeline_stat_query_slot, field, (idx))
111 
112 #define primitive_query_iova(pool, query, field, stream_id, i)             \
113    query_iova(struct primitive_query_slot, pool, query, field) +           \
114       sizeof_field(struct primitive_query_slot, field[0]) * (stream_id) +  \
115       offsetof_arr(struct primitive_slot_value, values, (i))
116 
117 #define perf_query_iova(pool, query, field, i)                             \
118    pool->bo->iova + pool->query_stride * (query) +                         \
119    sizeof(struct query_slot) +                                             \
120    sizeof(struct perfcntr_query_slot) * (i) +                              \
121    offsetof(struct perfcntr_query_slot, field)
122 
123 #define primitives_generated_query_iova(pool, query, field)                \
124    query_iova(struct primitives_generated_query_slot, pool, query, field)
125 
126 #define query_available_iova(pool, query)                                  \
127    query_iova(struct query_slot, pool, query, available)
128 
129 #define query_result_iova(pool, query, type, i)                            \
130    pool->bo->iova + pool->query_stride * (query) +                         \
131    sizeof(struct query_slot) + sizeof(type) * (i)
132 
133 #define query_result_addr(pool, query, type, i)                            \
134    (uint64_t *) ((char *) pool->bo->map + pool->query_stride * (query) +   \
135                  sizeof(struct query_slot) + sizeof(type) * (i))
136 
137 #define query_is_available(slot) slot->available
138 
139 static const VkPerformanceCounterUnitKHR
140 fd_perfcntr_type_to_vk_unit[] = {
141    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
142    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
143    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
144    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
145    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
146    /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert the value. */
147    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
148    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
149    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
150    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
151    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
152    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
153    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
154 };
155 
156 /* TODO: This is taken from the freedreno implementation, where only
157  * UINT64 is used. It should be confirmed against the blob vulkan driver
158  * once it starts supporting perf queries.
159  */
160 static const VkPerformanceCounterStorageKHR
161 fd_perfcntr_type_to_vk_storage[] = {
162    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
163    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
164    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
165    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
166    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
167    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
168    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
169    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
170    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
171    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
172    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
173    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
174 };
175 
176 /*
177  * Returns a pointer to a given slot in a query pool.
178  */
179 static struct query_slot *
180 slot_address(struct tu_query_pool *pool, uint32_t query)
181 {
182    return (struct query_slot *) ((char *) pool->bo->map +
183                                  query * pool->query_stride);
184 }
185 
186 static void
187 perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
188                uint32_t index, uint32_t *gid, uint32_t *cid)
189 
190 {
191    uint32_t i;
192 
193    for (i = 0; i < group_count; i++) {
194       if (group[i].num_countables > index) {
195          *gid = i;
196          *cid = index;
197          break;
198       }
199       index -= group[i].num_countables;
200    }
201 
202    assert(i < group_count);
203 }
204 
205 static int
206 compare_perfcntr_pass(const void *a, const void *b)
207 {
208    return ((struct tu_perf_query_data *)a)->pass -
209           ((struct tu_perf_query_data *)b)->pass;
210 }
211 
212 VKAPI_ATTR VkResult VKAPI_CALL
213 tu_CreateQueryPool(VkDevice _device,
214                    const VkQueryPoolCreateInfo *pCreateInfo,
215                    const VkAllocationCallbacks *pAllocator,
216                    VkQueryPool *pQueryPool)
217 {
218    VK_FROM_HANDLE(tu_device, device, _device);
219    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
220    assert(pCreateInfo->queryCount > 0);
221 
222    uint32_t pool_size, slot_size;
223    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
224 
225    pool_size = sizeof(struct tu_query_pool);
226 
227    switch (pCreateInfo->queryType) {
228    case VK_QUERY_TYPE_OCCLUSION:
229       slot_size = sizeof(struct occlusion_query_slot);
230       break;
231    case VK_QUERY_TYPE_TIMESTAMP:
232       slot_size = sizeof(struct timestamp_query_slot);
233       break;
234    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
235       slot_size = sizeof(struct primitive_query_slot);
236       break;
237    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
238       slot_size = sizeof(struct primitives_generated_query_slot);
239       break;
240    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
241       perf_query_info =
242             vk_find_struct_const(pCreateInfo->pNext,
243                                  QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
244       assert(perf_query_info);
245 
246       slot_size = sizeof(struct perf_query_slot) +
247                   sizeof(struct perfcntr_query_slot) *
248                   (perf_query_info->counterIndexCount - 1);
249 
250       /* Size of the pool->perf_query_data array */
251       pool_size += sizeof(struct tu_perf_query_data) *
252                    perf_query_info->counterIndexCount;
253       break;
254    }
255    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
256       slot_size = sizeof(struct pipeline_stat_query_slot);
257       break;
258    default:
259       unreachable("Invalid query type");
260    }
261 
262    struct tu_query_pool *pool = (struct tu_query_pool *)
263          vk_query_pool_create(&device->vk, pCreateInfo,
264                               pAllocator, pool_size);
265    if (!pool)
266       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
267 
268    if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
269       pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
270                                       &pool->perf_group_count);
271 
272       pool->counter_index_count = perf_query_info->counterIndexCount;
273 
274       /* Build the data for all requested perf counters, so we can derive the
275        * correct group id, countable id, counter register and pass index from
276        * only the counter index the application provides at each command submit.
277        *
278        * Also, since this data will be sorted by pass index later, we keep the
279        * original indices and store perfcntr results according to them, so apps
280        * get correct results for their own indices.
281        */
282       uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
283       memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
284       memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
285 
286       for (uint32_t i = 0; i < pool->counter_index_count; i++) {
287          uint32_t gid = 0, cid = 0;
288 
289          perfcntr_index(pool->perf_group, pool->perf_group_count,
290                         perf_query_info->pCounterIndices[i], &gid, &cid);
291 
292          pool->perf_query_data[i].gid = gid;
293          pool->perf_query_data[i].cid = cid;
294          pool->perf_query_data[i].app_idx = i;
295 
296          /* When the counter registers are over capacity (num_counters),
297           * reset them and move on to the next pass.
298           */
299          if (regs[gid] < pool->perf_group[gid].num_counters) {
300             pool->perf_query_data[i].cntr_reg = regs[gid]++;
301             pool->perf_query_data[i].pass = pass[gid];
302          } else {
303             pool->perf_query_data[i].pass = ++pass[gid];
304             pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
305             regs[gid]++;
306          }
307       }
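      /* Worked example (illustrative): if a group exposes num_counters == 4
       * and 6 countables are requested from it, the first four get
       * cntr_reg 0..3 in pass 0, the fifth wraps to cntr_reg 0 in pass 1,
       * and the sixth gets cntr_reg 1 in pass 1, so two passes (submits)
       * are needed.
       */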
308 
309       /* Sort by pass index so we can easily prepare a command stream in
310        * ascending pass order.
311        */
312       qsort(pool->perf_query_data, pool->counter_index_count,
313             sizeof(pool->perf_query_data[0]),
314             compare_perfcntr_pass);
315    }
316 
317    VkResult result = tu_bo_init_new(device, &pool->vk.base, &pool->bo,
318          pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
319    if (result != VK_SUCCESS) {
320       vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
321       return result;
322    }
323 
324    result = tu_bo_map(device, pool->bo, NULL);
325    if (result != VK_SUCCESS) {
326       tu_bo_finish(device, pool->bo);
327       vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
328       return result;
329    }
330 
331    /* Initialize all query statuses to unavailable */
332    memset(pool->bo->map, 0, pool->bo->size);
333 
334    pool->size = pCreateInfo->queryCount;
335    pool->query_stride = slot_size;
336 
337    TU_RMV(query_pool_create, device, pool);
338 
339    *pQueryPool = tu_query_pool_to_handle(pool);
340 
341    return VK_SUCCESS;
342 }
343 
344 VKAPI_ATTR void VKAPI_CALL
345 tu_DestroyQueryPool(VkDevice _device,
346                     VkQueryPool _pool,
347                     const VkAllocationCallbacks *pAllocator)
348 {
349    VK_FROM_HANDLE(tu_device, device, _device);
350    VK_FROM_HANDLE(tu_query_pool, pool, _pool);
351 
352    if (!pool)
353       return;
354 
355    TU_RMV(resource_destroy, device, pool);
356 
357    tu_bo_finish(device, pool->bo);
358    vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
359 }
360 
361 static uint32_t
362 get_result_count(struct tu_query_pool *pool)
363 {
364    switch (pool->vk.query_type) {
365    /* Occlusion and timestamp queries write one integer value */
366    case VK_QUERY_TYPE_OCCLUSION:
367    case VK_QUERY_TYPE_TIMESTAMP:
368    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
369       return 1;
370    /* Transform feedback queries write two integer values */
371    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
372       return 2;
373    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
374       return util_bitcount(pool->vk.pipeline_statistics);
375    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
376       return pool->counter_index_count;
377    default:
378       assert(!"Invalid query type");
379       return 0;
380    }
381 }
382 
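/* Consumes the lowest set VK_QUERY_PIPELINE_STATISTIC_* bit from *statistics
 * (via u_bit_scan) and maps it to the index of the corresponding
 * RBBM_PRIMCTR counter in the slot's begin/end/results arrays.
 */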
383 static uint32_t
384 statistics_index(uint32_t *statistics)
385 {
386    uint32_t stat;
387    stat = u_bit_scan(statistics);
388 
389    switch (1 << stat) {
390    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
391    case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
392       return 0;
393    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
394       return 1;
395    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
396       return 2;
397    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
398       return 4;
399    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
400       return 5;
401    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
402       return 6;
403    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
404       return 7;
405    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
406       return 8;
407    case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
408       return 9;
409    case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
410       return 10;
411    default:
412       return 0;
413    }
414 }
415 
416 static bool
417 is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
418 {
419    return pipeline_statistics &
420           (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
421            VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
422            VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
423            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
424            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
425            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
426            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
427            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
428            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
429 }
430 
431 static bool
432 is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
433 {
434    return pipeline_statistics &
435           VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
436 }
437 
438 static bool
439 is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
440 {
441    return pipeline_statistics &
442           VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
443 }
444 
445 /* Wait on the availability status of a query up until a timeout. */
446 static VkResult
447 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
448                    uint32_t query)
449 {
450    /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
451     * scheduler friendly way instead of busy polling once the patch has landed
452     * upstream. */
453    struct query_slot *slot = slot_address(pool, query);
454    uint64_t abs_timeout = os_time_get_absolute_timeout(
455          WAIT_TIMEOUT * NSEC_PER_SEC);
456    while(os_time_get_nano() < abs_timeout) {
457       if (query_is_available(slot))
458          return VK_SUCCESS;
459    }
460    return vk_error(device, VK_TIMEOUT);
461 }
462 
463 /* Writes a query value to a buffer from the CPU. */
464 static void
465 write_query_value_cpu(char* base,
466                       uint32_t offset,
467                       uint64_t value,
468                       VkQueryResultFlags flags)
469 {
470    if (flags & VK_QUERY_RESULT_64_BIT) {
471       *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
472    } else {
473       *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
474    }
475 }
476 
477 static VkResult
478 get_query_pool_results(struct tu_device *device,
479                        struct tu_query_pool *pool,
480                        uint32_t firstQuery,
481                        uint32_t queryCount,
482                        size_t dataSize,
483                        void *pData,
484                        VkDeviceSize stride,
485                        VkQueryResultFlags flags)
486 {
487    assert(dataSize >= stride * queryCount);
488 
489    char *result_base = (char *) pData;
490    VkResult result = VK_SUCCESS;
491    for (uint32_t i = 0; i < queryCount; i++) {
492       uint32_t query = firstQuery + i;
493       struct query_slot *slot = slot_address(pool, query);
494       bool available = query_is_available(slot);
495       uint32_t result_count = get_result_count(pool);
496       uint32_t statistics = pool->vk.pipeline_statistics;
497 
498       if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
499          VkResult wait_result = wait_for_available(device, pool, query);
500          if (wait_result != VK_SUCCESS)
501             return wait_result;
502          available = true;
503       } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
504          /* From the Vulkan 1.1.130 spec:
505           *
506           *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
507           *    both not set then no result values are written to pData for
508           *    queries that are in the unavailable state at the time of the
509           *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
510           *    availability state is still written to pData for those queries
511           *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
512           */
513          result = VK_NOT_READY;
514          if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
515             result_base += stride;
516             continue;
517          }
518       }
519 
520       for (uint32_t k = 0; k < result_count; k++) {
521          if (available) {
522             uint64_t *result;
523 
524             if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
525                uint32_t stat_idx = statistics_index(&statistics);
526                result = query_result_addr(pool, query, uint64_t, stat_idx);
527             } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
528                result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
529             } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
530                assert(k == 0);
531                result = occlusion_query_addr(pool, query, result);
532             } else {
533                result = query_result_addr(pool, query, uint64_t, k);
534             }
535 
536             write_query_value_cpu(result_base, k, *result, flags);
537          } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
538              /* From the Vulkan 1.1.130 spec:
539               *
540               *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
541               *   is not set, and the query’s status is unavailable, an
542               *   intermediate result value between zero and the final result
543               *   value is written to pData for that query.
544               *
545               * Just return 0 here for simplicity since it's a valid result.
546               */
547             write_query_value_cpu(result_base, k, 0, flags);
548       }
549 
550       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
551          /* From the Vulkan 1.1.130 spec:
552           *
553           *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
554           *    integer value written for each query is non-zero if the query’s
555           *    status was available or zero if the status was unavailable.
556           */
557          write_query_value_cpu(result_base, result_count, available, flags);
558 
559       result_base += stride;
560    }
561    return result;
562 }
563 
564 VKAPI_ATTR VkResult VKAPI_CALL
565 tu_GetQueryPoolResults(VkDevice _device,
566                        VkQueryPool queryPool,
567                        uint32_t firstQuery,
568                        uint32_t queryCount,
569                        size_t dataSize,
570                        void *pData,
571                        VkDeviceSize stride,
572                        VkQueryResultFlags flags)
573 {
574    VK_FROM_HANDLE(tu_device, device, _device);
575    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
576    assert(firstQuery + queryCount <= pool->size);
577 
578    if (vk_device_is_lost(&device->vk))
579       return VK_ERROR_DEVICE_LOST;
580 
581    switch (pool->vk.query_type) {
582    case VK_QUERY_TYPE_OCCLUSION:
583    case VK_QUERY_TYPE_TIMESTAMP:
584    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
585    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
586    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
587    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
588       return get_query_pool_results(device, pool, firstQuery, queryCount,
589                                     dataSize, pData, stride, flags);
590    default:
591       assert(!"Invalid query type");
592    }
593    return VK_SUCCESS;
594 }
595 
596 /* Copies a query value from one buffer to another from the GPU. */
597 static void
598 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
599                      struct tu_cs *cs,
600                      uint64_t src_iova,
601                      uint64_t base_write_iova,
602                      uint32_t offset,
603                      VkQueryResultFlags flags) {
604    uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
605          sizeof(uint64_t) : sizeof(uint32_t);
606    uint64_t write_iova = base_write_iova + (offset * element_size);
607 
608    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
609    uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
610          CP_MEM_TO_MEM_0_DOUBLE : 0;
611    tu_cs_emit(cs, mem_to_mem_flags);
612    tu_cs_emit_qw(cs, write_iova);
613    tu_cs_emit_qw(cs, src_iova);
614 }
615 
616 template <chip CHIP>
617 static void
618 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
619                              struct tu_cs *cs,
620                              struct tu_query_pool *pool,
621                              uint32_t firstQuery,
622                              uint32_t queryCount,
623                              struct tu_buffer *buffer,
624                              VkDeviceSize dstOffset,
625                              VkDeviceSize stride,
626                              VkQueryResultFlags flags)
627 {
628    /* Flush cache for the buffer to copy to. */
629    tu_emit_cache_flush<CHIP>(cmdbuf);
630 
631    /* From the Vulkan 1.1.130 spec:
632     *
633     *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
634     *    uses of vkCmdResetQueryPool in the same queue, without any additional
635     *    synchronization.
636     *
637     * To ensure that previous writes to the available bit are coherent, first
638     * wait for all writes to complete.
639     */
640    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
641 
642    for (uint32_t i = 0; i < queryCount; i++) {
643       uint32_t query = firstQuery + i;
644       uint64_t available_iova = query_available_iova(pool, query);
645       uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
646       uint32_t result_count = get_result_count(pool);
647       uint32_t statistics = pool->vk.pipeline_statistics;
648 
649       /* Wait for the available bit to be set if executed with the
650        * VK_QUERY_RESULT_WAIT_BIT flag. */
651       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
652          tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
653          tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
654                         CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
655          tu_cs_emit_qw(cs, available_iova);
656          tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
657          tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
658          tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
659       }
660 
661       for (uint32_t k = 0; k < result_count; k++) {
662          uint64_t result_iova;
663 
664          if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
665             uint32_t stat_idx = statistics_index(&statistics);
666             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
667          } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
668             result_iova = query_result_iova(pool, query,
669                                             struct perfcntr_query_slot, k);
670          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
671             assert(k == 0);
672             result_iova = occlusion_query_iova(pool, query, result);
673          } else {
674             result_iova = query_result_iova(pool, query, uint64_t, k);
675          }
676 
677          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
678             /* Unconditionally copying the bo->result into the buffer here is
679              * valid because we only set bo->result on vkCmdEndQuery. Thus, even
680              * if the query is unavailable, this will copy the correct partial
681              * value of 0.
682              */
683             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
684                                  k /* offset */, flags);
685          } else {
686             /* Conditionally copy bo->result into the buffer based on whether the
687              * query is available.
688              *
689              * NOTE: For the conditional packets to be executed, CP_COND_EXEC
690              * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
691              * that 0 < available < 2, aka available == 1.
692              */
693             tu_cs_reserve(cs, 7 + 6);
694             tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
695             tu_cs_emit_qw(cs, available_iova);
696             tu_cs_emit_qw(cs, available_iova);
697             tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
698             tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
699 
700             /* Start of conditional execution */
701             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
702                               k /* offset */, flags);
703             /* End of conditional execution */
704          }
705       }
706 
707       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
708          copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
709                               result_count /* offset */, flags);
710       }
711    }
712 }
713 
714 template <chip CHIP>
715 VKAPI_ATTR void VKAPI_CALL
716 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
717                            VkQueryPool queryPool,
718                            uint32_t firstQuery,
719                            uint32_t queryCount,
720                            VkBuffer dstBuffer,
721                            VkDeviceSize dstOffset,
722                            VkDeviceSize stride,
723                            VkQueryResultFlags flags)
724 {
725    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
726    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
727    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
728    struct tu_cs *cs = &cmdbuf->cs;
729    assert(firstQuery + queryCount <= pool->size);
730 
731    switch (pool->vk.query_type) {
732    case VK_QUERY_TYPE_OCCLUSION:
733    case VK_QUERY_TYPE_TIMESTAMP:
734    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
735    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
736    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
737       return emit_copy_query_pool_results<CHIP>(cmdbuf, cs, pool, firstQuery,
738                                                 queryCount, buffer, dstOffset,
739                                                 stride, flags);
740    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
741       unreachable("allowCommandBufferQueryCopies is false");
742    default:
743       assert(!"Invalid query type");
744    }
745 }
746 TU_GENX(tu_CmdCopyQueryPoolResults);
747 
748 static void
749 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
750                       struct tu_query_pool *pool,
751                       uint32_t firstQuery,
752                       uint32_t queryCount)
753 {
754    struct tu_cs *cs = &cmdbuf->cs;
755 
756    for (uint32_t i = 0; i < queryCount; i++) {
757       uint32_t query = firstQuery + i;
758       uint32_t statistics = pool->vk.pipeline_statistics;
759 
760       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
761       tu_cs_emit_qw(cs, query_available_iova(pool, query));
762       tu_cs_emit_qw(cs, 0x0);
763 
764       for (uint32_t k = 0; k < get_result_count(pool); k++) {
765          uint64_t result_iova;
766 
767          if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
768             uint32_t stat_idx = statistics_index(&statistics);
769             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
770          } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
771             result_iova = query_result_iova(pool, query,
772                                             struct perfcntr_query_slot, k);
773          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
774             assert(k == 0);
775             result_iova = occlusion_query_iova(pool, query, result);
776          } else {
777             result_iova = query_result_iova(pool, query, uint64_t, k);
778          }
779 
780          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
781          tu_cs_emit_qw(cs, result_iova);
782          tu_cs_emit_qw(cs, 0x0);
783       }
784    }
785 
786 }
787 
788 VKAPI_ATTR void VKAPI_CALL
789 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
790                      VkQueryPool queryPool,
791                      uint32_t firstQuery,
792                      uint32_t queryCount)
793 {
794    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
795    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
796 
797    switch (pool->vk.query_type) {
798    case VK_QUERY_TYPE_TIMESTAMP:
799    case VK_QUERY_TYPE_OCCLUSION:
800    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
801    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
802    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
803    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
804       emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
805       break;
806    default:
807       assert(!"Invalid query type");
808    }
809 }
810 
811 VKAPI_ATTR void VKAPI_CALL
812 tu_ResetQueryPool(VkDevice device,
813                   VkQueryPool queryPool,
814                   uint32_t firstQuery,
815                   uint32_t queryCount)
816 {
817    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
818 
819    for (uint32_t i = 0; i < queryCount; i++) {
820       struct query_slot *slot = slot_address(pool, i + firstQuery);
821       slot->available = 0;
822 
823       for (uint32_t k = 0; k < get_result_count(pool); k++) {
824          uint64_t *res;
825 
826          if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
827             res = query_result_addr(pool, i + firstQuery,
828                                     struct perfcntr_query_slot, k);
829          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
830             assert(k == 0);
831             res = occlusion_query_addr(pool, i + firstQuery, result);
832          } else {
833             res = query_result_addr(pool, i + firstQuery, uint64_t, k);
834          }
835 
836          *res = 0;
837       }
838    }
839 }
840 
841 template <chip CHIP>
842 static void
843 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
844                            struct tu_query_pool *pool,
845                            uint32_t query)
846 {
847    /* From the Vulkan 1.1.130 spec:
848     *
849     *    A query must begin and end inside the same subpass of a render pass
850     *    instance, or must both begin and end outside of a render pass
851     *    instance.
852     *
853     * Unlike on an immediate-mode renderer, Turnip renders all tiles on
854     * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
855     * query begins/ends inside the same subpass of a render pass, we need to
856     * record the packets on the secondary draw command stream. cmdbuf->draw_cs
857     * is then run on every tile during render, so we just need to accumulate
858     * sample counts in slot->result to compute the query result.
859     */
860    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
861 
862    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
863 
864    tu_cs_emit_regs(cs,
865                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
866 
867    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
868       tu_cs_emit_regs(cs,
869                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
870       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
871       tu_cs_emit(cs, ZPASS_DONE);
872       if (CHIP == A7XX) {
873          /* Copied from blob's cmdstream, not sure why it is done. */
874          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
875          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
876       }
877    } else {
878       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
879       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
880                                        .write_sample_count = true).value);
881       tu_cs_emit_qw(cs, begin_iova);
882 
883       /* ZPASS_DONE events should come in begin-end pairs. When emitting an
884        * occlusion query outside of a renderpass, we emit a fake end event that
885        * closes the previous one, since the autotuner's use of ZPASS_DONE could
886        * otherwise cause problems. This event writes into the end field of the
887        * query slot, but it will be overwritten by the events emitted in
888        * emit_end_occlusion_query with the proper value.
889        * When inside a renderpass, the corresponding ZPASS_DONE event will be
890        * emitted in emit_end_occlusion_query. We note the use of ZPASS_DONE on
891        * the state object, enabling the autotuner to optimize its own events.
892        */
893       if (!cmdbuf->state.pass) {
894          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
895          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
896                                           .write_sample_count = true,
897                                           .sample_count_end_offset = true,
898                                           .write_accum_sample_count_diff = true).value);
899          tu_cs_emit_qw(cs, begin_iova);
900       } else {
901          cmdbuf->state.rp.has_zpass_done_sample_count_write_in_rp = true;
902       }
903    }
904 }
905 
906 template <chip CHIP>
907 static void
908 emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
909                       struct tu_query_pool *pool,
910                       uint32_t query)
911 {
912    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
913    uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin, 0);
914 
915    if (is_pipeline_query_with_vertex_stage(pool->vk.pipeline_statistics)) {
916       bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
917       cmdbuf->state.prim_counters_running++;
918 
919       /* Prevent starting the primitive counters when they are supposed to be
920        * stopped for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
921        */
922       if (need_cond_exec) {
923          tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
924                         CP_COND_REG_EXEC_0_SYSMEM |
925                         CP_COND_REG_EXEC_0_BINNING);
926       }
927 
928       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
929 
930       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
931       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
932       tu_cs_emit(cs, 0);
933 
934       if (need_cond_exec) {
935          tu_cond_exec_end(cs);
936       }
937    }
938 
939    if (is_pipeline_query_with_fragment_stage(pool->vk.pipeline_statistics)) {
940       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_FRAGMENT_CTRS);
941    }
942 
943    if (is_pipeline_query_with_compute_stage(pool->vk.pipeline_statistics)) {
944       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_COMPUTE_CTRS);
945    }
946 
947    tu_cs_emit_wfi(cs);
948 
949    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
950    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
951                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
952                   CP_REG_TO_MEM_0_64B);
953    tu_cs_emit_qw(cs, begin_iova);
954 }
955 
956 static void
957 emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
958 {
959    tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
960    tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
961                         REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
962                   A6XX_CP_REG_TEST_0_BIT(pass) |
963                   A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
964    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
965 }
966 
967 static void
968 emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
969                            struct tu_query_pool *pool,
970                            uint32_t query)
971 {
972    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
973    uint32_t last_pass = ~0;
974 
975    if (cmdbuf->state.pass) {
976       cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
977    }
978 
979    /* Querying perf counters happens in these steps:
980     *
981     *  0) A scratch reg selects the pass index for the perf counter query.
982     *     Command streams that set each pass index in that reg are prepared
983     *     at device creation time. See tu_CreateDevice in tu_device.c.
984     *  1) In begin/end query, emit command streams that read all requested
985     *     perf counters for every pass, guarded by CP_REG_TEST/
986     *     CP_COND_REG_EXEC, which test the scratch reg holding the pass
987     *     index. See emit_perfcntrs_pass_start.
988     *  2) At each submit, pick the cs that sets the proper pass index in the
989     *     reg and prepend it to the command buffer. See
990     *     tu_queue_build_msm_gem_submit_cmds in tu_knl_drm_msm.cc and
991     *     tu_knl_drm_virtio.cc, and kgsl_queue_submit in tu_knl_kgsl.cc.
992     *  3) If the pass bit in the reg is set, the command stream under
993     *     CP_COND_REG_EXEC is executed.
994     */
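   /* For example (illustrative): if the requested counters span passes 0..2,
    * the stream emitted below contains three CP_REG_TEST/CP_COND_REG_EXEC
    * regions, one per pass. The command buffer is submitted once per pass,
    * each time prefixed with a stream that sets bit "pass" in the scratch
    * reg, so exactly one region executes per submit.
    */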
995 
996    tu_cs_emit_wfi(cs);
997 
998    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
999       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1000 
1001       if (last_pass != data->pass) {
1002          last_pass = data->pass;
1003 
1004          if (data->pass != 0)
1005             tu_cond_exec_end(cs);
1006          emit_perfcntrs_pass_start(cs, data->pass);
1007       }
1008 
1009       const struct fd_perfcntr_counter *counter =
1010             &pool->perf_group[data->gid].counters[data->cntr_reg];
1011       const struct fd_perfcntr_countable *countable =
1012             &pool->perf_group[data->gid].countables[data->cid];
1013 
1014       tu_cs_emit_pkt4(cs, counter->select_reg, 1);
1015       tu_cs_emit(cs, countable->selector);
1016    }
1017    tu_cond_exec_end(cs);
1018 
1019    last_pass = ~0;
1020    tu_cs_emit_wfi(cs);
1021 
1022    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1023       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1024 
1025       if (last_pass != data->pass) {
1026          last_pass = data->pass;
1027 
1028          if (data->pass != 0)
1029             tu_cond_exec_end(cs);
1030          emit_perfcntrs_pass_start(cs, data->pass);
1031       }
1032 
1033       const struct fd_perfcntr_counter *counter =
1034             &pool->perf_group[data->gid].counters[data->cntr_reg];
1035 
1036       uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1037 
1038       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1039       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1040                      CP_REG_TO_MEM_0_64B);
1041       tu_cs_emit_qw(cs, begin_iova);
1042    }
1043    tu_cond_exec_end(cs);
1044 }
1045 
1046 template <chip CHIP>
1047 static void
1048 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
1049                      struct tu_query_pool *pool,
1050                      uint32_t query,
1051                      uint32_t stream_id)
1052 {
1053    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1054    uint64_t begin_iova = primitive_query_iova(pool, query, begin, 0, 0);
1055 
1056    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
1057    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1058 }
1059 
1060 template <chip CHIP>
1061 static void
1062 emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1063                                 struct tu_query_pool *pool,
1064                                 uint32_t query)
1065 {
1066    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1067    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1068 
1069    if (cmdbuf->state.pass) {
1070       cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1071    } else {
1072       cmdbuf->state.prim_generated_query_running_before_rp = true;
1073    }
1074 
1075    cmdbuf->state.prim_counters_running++;
1076 
1077    if (cmdbuf->state.pass) {
1078       /* Primitives that passed all tests are still counted in each tile
1079        * even with HW binning beforehand. Do not permit that double counting.
1080        */
1081       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1082                            CP_COND_REG_EXEC_0_SYSMEM |
1083                            CP_COND_REG_EXEC_0_BINNING);
1084    }
1085 
1086    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
1087 
1088    tu_cs_emit_wfi(cs);
1089 
1090    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1091    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1092                   CP_REG_TO_MEM_0_CNT(2) |
1093                   CP_REG_TO_MEM_0_64B);
1094    tu_cs_emit_qw(cs, begin_iova);
1095 
1096    if (cmdbuf->state.pass) {
1097       tu_cond_exec_end(cs);
1098    }
1099 }
1100 
1101 template <chip CHIP>
1102 VKAPI_ATTR void VKAPI_CALL
1103 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1104                            VkQueryPool queryPool,
1105                            uint32_t query,
1106                            VkQueryControlFlags flags,
1107                            uint32_t index)
1108 {
1109    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1110    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1111    assert(query < pool->size);
1112 
1113    switch (pool->vk.query_type) {
1114    case VK_QUERY_TYPE_OCCLUSION:
1115       /* In freedreno, there is no implementation difference between
1116        * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1117        * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1118        */
1119       emit_begin_occlusion_query<CHIP>(cmdbuf, pool, query);
1120       break;
1121    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1122       emit_begin_xfb_query<CHIP>(cmdbuf, pool, query, index);
1123       break;
1124    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1125       emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
1126       break;
1127    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1128       emit_begin_perf_query(cmdbuf, pool, query);
1129       break;
1130    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1131       emit_begin_stat_query<CHIP>(cmdbuf, pool, query);
1132       break;
1133    case VK_QUERY_TYPE_TIMESTAMP:
1134       unreachable("Unimplemented query type");
1135    default:
1136       assert(!"Invalid query type");
1137    }
1138 }
1139 TU_GENX(tu_CmdBeginQueryIndexedEXT);
1140 
1141 template <chip CHIP>
1142 static void
1143 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1144                          struct tu_query_pool *pool,
1145                          uint32_t query)
1146 {
1147    /* Ending an occlusion query happens in a few steps:
1148     *    1) Set the slot->end to UINT64_MAX.
1149     *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1150     *       write the current sample count value into slot->end.
1151     *    3) Since (2) is asynchronous, wait until slot->end is not equal to
1152     *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1153     *    4) Accumulate the results of the query (slot->end - slot->begin) into
1154     *       slot->result.
1155     *    5) If vkCmdEndQuery is *not* called from within the scope of a render
1156     *       pass, set the slot's available bit since the query is now done.
1157     *    6) If vkCmdEndQuery *is* called from within the scope of a render
1158     *       pass, we cannot mark as available yet since the commands in
1159     *       draw_cs are not run until vkCmdEndRenderPass.
1160     */
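   /* Note (a rough sketch of the accumulation): inside a render pass these
    * packets are recorded in draw_cs, which is replayed for each tile during
    * GMEM rendering, so slot->result ends up accumulating the per-tile
    * (end - begin) differences.
    */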
1161    const struct tu_render_pass *pass = cmdbuf->state.pass;
1162    struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1163 
1164    struct tu_cs *epilogue_cs = &cmdbuf->cs;
1165    if (pass)
1166       /* Technically, queries should be tracked per-subpass, but here we track
1167        * at the render pass level to simplify the code a bit. This is safe
1168        * because the only commands that use the available bit are
1169        * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1170        * cannot be invoked from inside a render pass scope.
1171        */
1172       epilogue_cs = &cmdbuf->draw_epilogue_cs;
1173 
1174    uint64_t available_iova = query_available_iova(pool, query);
1175    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1176    uint64_t result_iova = occlusion_query_iova(pool, query, result);
1177    uint64_t end_iova = occlusion_query_iova(pool, query, end);
1178 
1179    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1180       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1181       tu_cs_emit_qw(cs, end_iova);
1182       tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1183 
1184       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1185    }
1186 
1187    tu_cs_emit_regs(cs,
1188                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1189 
1190    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1191       tu_cs_emit_regs(cs,
1192                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1193       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1194       tu_cs_emit(cs, ZPASS_DONE);
1195       if (CHIP == A7XX) {
1196          /* Copied from blob's cmdstream, not sure why it is done. */
1197          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1198          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
1199       }
1200 
1201       tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1202       tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1203                      CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
1204       tu_cs_emit_qw(cs, end_iova);
1205       tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1206       tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1207       tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1208 
1209       /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1210       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1211       tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1212       tu_cs_emit_qw(cs, result_iova);
1213       tu_cs_emit_qw(cs, result_iova);
1214       tu_cs_emit_qw(cs, end_iova);
1215       tu_cs_emit_qw(cs, begin_iova);
1216 
1217       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1218    } else {
1219       /* When outside of a renderpass, potential autotuner activity can cause
1220        * interference between ZPASS_DONE event pairs. In that case, like at the
1221        * beginning of the occlusion query, a fake ZPASS_DONE event is emitted to
1222        * compose a begin-end event pair. The first event will write into the end
1223        * field, but that will be overwritten by the second ZPASS_DONE which will
1224        * also handle the diff accumulation.
1225        */
1226       if (!cmdbuf->state.pass) {
1227          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
1228          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1229                                           .write_sample_count = true).value);
1230          tu_cs_emit_qw(cs, end_iova);
1231       }
1232 
1233       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
1234       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1235                                        .write_sample_count = true,
1236                                        .sample_count_end_offset = true,
1237                                        .write_accum_sample_count_diff = true).value);
1238       tu_cs_emit_qw(cs, begin_iova);
1239 
1240       tu_cs_emit_wfi(cs);
1241 
1242       if (cmdbuf->device->physical_device->info->a7xx.has_generic_clear) {
1243          /* If the next renderpass uses the same depth attachment and clears
1244           * it with a generic clear, ZPASS_DONE may somehow read stale values;
1245           * those are apparently invalidated by CCU_INVALIDATE_DEPTH.
1246           * See dEQP-VK.fragment_operations.early_fragment.sample_count_early_fragment_tests_depth_*
1247           */
1248          tu_emit_event_write<CHIP>(cmdbuf, epilogue_cs,
1249                                    FD_CCU_INVALIDATE_DEPTH);
1250       }
1251    }
1252 
1253    tu_cs_emit_pkt7(epilogue_cs, CP_MEM_WRITE, 4);
1254    tu_cs_emit_qw(epilogue_cs, available_iova);
1255    tu_cs_emit_qw(epilogue_cs, 0x1);
1256 }
1257 
1258 /* PRIMITIVE_CTRS is used for two distinct queries:
1259  * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
1260  * - VK_QUERY_TYPE_PIPELINE_STATISTICS
1261  * If one is nested inside other - STOP_PRIMITIVE_CTRS should be emitted
1262  * only for outer query.
1263  *
1264  * Also, pipeline stat query could run outside of renderpass and prim gen
1265  * query inside of secondary cmd buffer - for such case we ought to track
1266  * the status of pipeline stats query.
1267  */
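/* The global vtx_stats_query_not_running flag ties the two cases together:
 * emit_begin_stat_query clears it when the vertex-stage counters start, and
 * the VK_QUERY_TYPE_PIPELINE_STATISTICS branch below sets it back to 1, so a
 * primitives-generated query in a secondary command buffer can make its
 * FD_STOP_PRIMITIVE_CTRS conditional on it via CP_COND_EXEC.
 */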
1268 template <chip CHIP>
1269 static void
1270 emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1271                          struct tu_cs *cs,
1272                          enum VkQueryType query_type)
1273 {
1274    bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1275    cmdbuf->state.prim_counters_running--;
1276    if (cmdbuf->state.prim_counters_running == 0) {
1277       bool need_cond_exec =
1278          is_secondary &&
1279          query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1280          is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1281 
1282       if (!need_cond_exec) {
1283          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1284       } else {
1285          tu_cs_reserve(cs, 7 + 2);
1286          /* Check that the pipeline stats query is not running; only then
1287           * do we stop the counter.
1288           */
1289          tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1290          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1291          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1292          tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1293          tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1294 
1295          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1296       }
1297    }
1298 
1299    if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1300       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1301       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1302       tu_cs_emit(cs, 1);
1303    }
1304 }
1305 
1306 template <chip CHIP>
1307 static void
1308 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1309                     struct tu_query_pool *pool,
1310                     uint32_t query)
1311 {
1312    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1313    uint64_t end_iova = pipeline_stat_query_iova(pool, query, end, 0);
1314    uint64_t available_iova = query_available_iova(pool, query);
1315    uint64_t result_iova;
1316    uint64_t stat_start_iova;
1317    uint64_t stat_stop_iova;
1318 
1319    if (is_pipeline_query_with_vertex_stage(pool->vk.pipeline_statistics)) {
1320       /* No need to conditionally execute STOP_PRIMITIVE_CTRS when we are
1321        * inside a VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query within a
1322        * renderpass, because it is already stopped.
1323        */
1324       emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1325    }
1326 
1327    if (is_pipeline_query_with_fragment_stage(pool->vk.pipeline_statistics)) {
1328       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_FRAGMENT_CTRS);
1329    }
1330 
1331    if (is_pipeline_query_with_compute_stage(pool->vk.pipeline_statistics)) {
1332       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
1333    }
1334 
1335    tu_cs_emit_wfi(cs);
1336 
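   /* Snapshot all RBBM_PRIMCTR lo/hi register pairs into this slot's end[]
    * array in a single burst read.
    */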
1337    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1338    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1339                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1340                   CP_REG_TO_MEM_0_64B);
1341    tu_cs_emit_qw(cs, end_iova);
1342 
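   /* For each statistics counter: result[i] += end[i] - begin[i], done as a
    * 64-bit CP_MEM_TO_MEM with NEG_C negating the begin value.
    */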
1343    for (int i = 0; i < STAT_COUNT; i++) {
1344       result_iova = query_result_iova(pool, query, uint64_t, i);
1345       stat_start_iova = pipeline_stat_query_iova(pool, query, begin, i);
1346       stat_stop_iova = pipeline_stat_query_iova(pool, query, end, i);
1347 
1348       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1349       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1350                      CP_MEM_TO_MEM_0_DOUBLE |
1351                      CP_MEM_TO_MEM_0_NEG_C);
1352 
1353       tu_cs_emit_qw(cs, result_iova);
1354       tu_cs_emit_qw(cs, result_iova);
1355       tu_cs_emit_qw(cs, stat_stop_iova);
1356       tu_cs_emit_qw(cs, stat_start_iova);
1357    }
1358 
1359    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1360 
1361    if (cmdbuf->state.pass)
1362       cs = &cmdbuf->draw_epilogue_cs;
1363 
1364    /* Set the availability to 1 */
1365    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1366    tu_cs_emit_qw(cs, available_iova);
1367    tu_cs_emit_qw(cs, 0x1);
1368 }
1369 
1370 static void
1371 emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1372                     struct tu_query_pool *pool,
1373                     uint32_t query)
1374 {
1375    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1376    uint64_t available_iova = query_available_iova(pool, query);
1377    uint64_t end_iova;
1378    uint64_t begin_iova;
1379    uint64_t result_iova;
1380    uint32_t last_pass = ~0;
1381 
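   /* Counter reads are grouped per pass: emit_perfcntrs_pass_start() opens a
    * conditional-execution region (closed by tu_cond_exec_end()) so that,
    * presumably, only the commands belonging to the selected pass execute.
    */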
1382    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1383       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1384 
1385       if (last_pass != data->pass) {
1386          last_pass = data->pass;
1387 
1388          if (data->pass != 0)
1389             tu_cond_exec_end(cs);
1390          emit_perfcntrs_pass_start(cs, data->pass);
1391       }
1392 
1393       const struct fd_perfcntr_counter *counter =
1394             &pool->perf_group[data->gid].counters[data->cntr_reg];
1395 
1396       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1397 
1398       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1399       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1400                      CP_REG_TO_MEM_0_64B);
1401       tu_cs_emit_qw(cs, end_iova);
1402    }
1403    tu_cond_exec_end(cs);
1404 
1405    last_pass = ~0;
1406    tu_cs_emit_wfi(cs);
1407 
1408    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1409       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1410 
1411       if (last_pass != data->pass) {
1412          last_pass = data->pass;
1413 
1414 
1415          if (data->pass != 0)
1416             tu_cond_exec_end(cs);
1417          emit_perfcntrs_pass_start(cs, data->pass);
1418       }
1419 
1420       result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1421              data->app_idx);
1422       begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1423       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1424 
1425       /* result += end - begin */
1426       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1427       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1428                      CP_MEM_TO_MEM_0_DOUBLE |
1429                      CP_MEM_TO_MEM_0_NEG_C);
1430 
1431       tu_cs_emit_qw(cs, result_iova);
1432       tu_cs_emit_qw(cs, result_iova);
1433       tu_cs_emit_qw(cs, end_iova);
1434       tu_cs_emit_qw(cs, begin_iova);
1435    }
1436    tu_cond_exec_end(cs);
1437 
1438    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1439 
1440    if (cmdbuf->state.pass)
1441       cs = &cmdbuf->draw_epilogue_cs;
1442 
1443    /* Set the availability to 1 */
1444    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1445    tu_cs_emit_qw(cs, available_iova);
1446    tu_cs_emit_qw(cs, 0x1);
1447 }
1448 
1449 template <chip CHIP>
1450 static void
1451 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1452                    struct tu_query_pool *pool,
1453                    uint32_t query,
1454                    uint32_t stream_id)
1455 {
1456    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1457 
1458    uint64_t end_iova = primitive_query_iova(pool, query, end, 0, 0);
1459    uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1460    uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1461    uint64_t begin_written_iova = primitive_query_iova(pool, query, begin, stream_id, 0);
1462    uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin, stream_id, 1);
1463    uint64_t end_written_iova = primitive_query_iova(pool, query, end, stream_id, 0);
1464    uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
1465    uint64_t available_iova = query_available_iova(pool, query);
1466 
1467    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1468    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
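   /* The VPC_SO_STREAM_COUNTS write plus FD_WRITE_PRIMITIVE_COUNTS above make
    * the hardware dump the streamout primitive counters for every stream to
    * end_iova, i.e. into this slot's end[] array.
    */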
1469 
1470    tu_cs_emit_wfi(cs);
1471    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
1472 
1473    /* Set the count of written primitives */
1474    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1475    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1476                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1477    tu_cs_emit_qw(cs, result_written_iova);
1478    tu_cs_emit_qw(cs, result_written_iova);
1479    tu_cs_emit_qw(cs, end_written_iova);
1480    tu_cs_emit_qw(cs, begin_written_iova);
1481 
1482    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
1483 
1484    /* Set the count of generated primitives */
1485    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1486    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1487                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1488    tu_cs_emit_qw(cs, result_generated_iova);
1489    tu_cs_emit_qw(cs, result_generated_iova);
1490    tu_cs_emit_qw(cs, end_generated_iova);
1491    tu_cs_emit_qw(cs, begin_generated_iova);
1492 
1493    /* Set the availability to 1 */
1494    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1495    tu_cs_emit_qw(cs, available_iova);
1496    tu_cs_emit_qw(cs, 0x1);
1497 }
1498 
1499 template <chip CHIP>
1500 static void
1501 emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1502                               struct tu_query_pool *pool,
1503                               uint32_t query)
1504 {
1505    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1506 
1507    if (!cmdbuf->state.pass) {
1508       cmdbuf->state.prim_generated_query_running_before_rp = false;
1509    }
1510 
1511    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1512    uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1513    uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1514    uint64_t available_iova = query_available_iova(pool, query);
1515 
1516    if (cmdbuf->state.pass) {
1517       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1518                              CP_COND_REG_EXEC_0_SYSMEM |
1519                              CP_COND_REG_EXEC_0_BINNING);
1520    }
1521 
1522    tu_cs_emit_wfi(cs);
1523 
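   /* Snapshot the counter pair (RBBM_PRIMCTR_7 lo/hi) backing the
    * primitives-generated result into the end field; the accumulate below
    * then yields end - begin.
    */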
1524    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1525    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1526                   CP_REG_TO_MEM_0_CNT(2) |
1527                   CP_REG_TO_MEM_0_64B);
1528    tu_cs_emit_qw(cs, end_iova);
1529 
1530    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1531    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1532                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
1533    tu_cs_emit_qw(cs, result_iova);
1534    tu_cs_emit_qw(cs, result_iova);
1535    tu_cs_emit_qw(cs, end_iova);
1536    tu_cs_emit_qw(cs, begin_iova);
1537 
1538    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1539 
1540    /* This must come after waiting for the mem writes so that we have
1541     * up-to-date info about which query is running.
1542     */
1543    emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1544 
1545    if (cmdbuf->state.pass) {
1546       tu_cond_exec_end(cs);
1547    }
1548 
1549    if (cmdbuf->state.pass)
1550       cs = &cmdbuf->draw_epilogue_cs;
1551 
1552    /* Set the availability to 1 */
1553    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1554    tu_cs_emit_qw(cs, available_iova);
1555    tu_cs_emit_qw(cs, 0x1);
1556 }
1557 
1558 /* Implement this bit of spec text from section 17.2 "Query Operation":
1559  *
1560  *     If queries are used while executing a render pass instance that has
1561  *     multiview enabled, the query uses N consecutive query indices in the
1562  *     query pool (starting at query) where N is the number of bits set in the
1563  *     view mask in the subpass the query is used in. How the numerical
1564  *     results of the query are distributed among the queries is
1565  *     implementation-dependent. For example, some implementations may write
1566  *     each view’s results to a distinct query, while other implementations
1567  *     may write the total result to the first query and write zero to the
1568  *     other queries. However, the sum of the results in all the queries must
1569  *     accurately reflect the total result of the query summed over all views.
1570  *     Applications can sum the results from all the queries to compute the
1571  *     total result.
1572  *
1573  * Since we execute all views at once, we write zero to the other queries.
1574  * Furthermore, because queries must be reset before use, and we set the
1575  * result to 0 in vkCmdResetQueryPool(), we just need to mark them as available.
1576  */
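/* For example, with a subpass view mask of 0b0111 (three views), a query at
 * index Q consumes slots Q, Q+1 and Q+2: the accumulated result lands in slot
 * Q, while slots Q+1 and Q+2 keep their reset value of 0 and are merely
 * flagged available by handle_multiview_queries() below.
 */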
1577 
1578 static void
1579 handle_multiview_queries(struct tu_cmd_buffer *cmd,
1580                          struct tu_query_pool *pool,
1581                          uint32_t query)
1582 {
1583    if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
1584       return;
1585 
1586    unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
1587    struct tu_cs *cs = &cmd->draw_epilogue_cs;
1588 
1589    for (uint32_t i = 1; i < views; i++) {
1590       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1591       tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
1592       tu_cs_emit_qw(cs, 0x1);
1593    }
1594 }
1595 
1596 template <chip CHIP>
1597 VKAPI_ATTR void VKAPI_CALL
1598 tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
1599                          VkQueryPool queryPool,
1600                          uint32_t query,
1601                          uint32_t index)
1602 {
1603    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1604    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1605    assert(query < pool->size);
1606 
1607    switch (pool->vk.query_type) {
1608    case VK_QUERY_TYPE_OCCLUSION:
1609       emit_end_occlusion_query<CHIP>(cmdbuf, pool, query);
1610       break;
1611    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1612       assert(index < 4);
1613       emit_end_xfb_query<CHIP>(cmdbuf, pool, query, index);
1614       break;
1615    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1616       emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
1617       break;
1618    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1619       emit_end_perf_query(cmdbuf, pool, query);
1620       break;
1621    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1622       emit_end_stat_query<CHIP>(cmdbuf, pool, query);
1623       break;
1624    case VK_QUERY_TYPE_TIMESTAMP:
1625       unreachable("Unimplemented query type");
1626    default:
1627       assert(!"Invalid query type");
1628    }
1629 
1630    handle_multiview_queries(cmdbuf, pool, query);
1631 }
1632 TU_GENX(tu_CmdEndQueryIndexedEXT);
1633 
1634 VKAPI_ATTR void VKAPI_CALL
1635 tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
1636                       VkPipelineStageFlagBits2 pipelineStage,
1637                       VkQueryPool queryPool,
1638                       uint32_t query)
1639 {
1640    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1641    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1642 
1643    /* Inside a render pass, just write the timestamp multiple times so that
1644     * the user gets the last one if we use GMEM. There isn't really much
1645     * better we can do, and this seems to be what the blob does too.
1646     */
1647    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
1648 
1649    /* Stages that will already have been executed by the time the CP executes
1650     * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
1651     * indirect stage counts as top-of-pipe too.
1652     */
1653    VkPipelineStageFlags2 top_of_pipe_flags =
1654       VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1655       VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
1656 
1657    if (pipelineStage & ~top_of_pipe_flags) {
1658       /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
1659        * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
1660        * complete.
1661        *
1662        * Stalling the CP like this is really unfortunate, but I don't think
1663        * there's a better solution that allows all 48 bits of precision
1664        * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
1665        */
1666       tu_cs_emit_wfi(cs);
1667    }
1668 
1669    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1670    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
1671                   CP_REG_TO_MEM_0_CNT(2) |
1672                   CP_REG_TO_MEM_0_64B);
1673    tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
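   /* The 64-bit ALWAYS_ON counter value is captured as the timestamp;
    * applications scale query results by
    * VkPhysicalDeviceLimits::timestampPeriod to convert them to nanoseconds.
    */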
1674 
1675    /* Only flag availability once the entire renderpass is done, similar to
1676     * the begin/end path.
1677     */
1678    cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
1679 
1680    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1681    tu_cs_emit_qw(cs, query_available_iova(pool, query));
1682    tu_cs_emit_qw(cs, 0x1);
1683 
1684    /* From the spec for vkCmdWriteTimestamp:
1685     *
1686     *    If vkCmdWriteTimestamp is called while executing a render pass
1687     *    instance that has multiview enabled, the timestamp uses N consecutive
1688     *    query indices in the query pool (starting at query) where N is the
1689     *    number of bits set in the view mask of the subpass the command is
1690     *    executed in. The resulting query values are determined by an
1691     *    implementation-dependent choice of one of the following behaviors:
1692     *
1693     *    -   The first query is a timestamp value and (if more than one bit is
1694     *        set in the view mask) zero is written to the remaining queries.
1695     *        If two timestamps are written in the same subpass, the sum of the
1696     *        execution time of all views between those commands is the
1697     *        difference between the first query written by each command.
1698     *
1699     *    -   All N queries are timestamp values. If two timestamps are written
1700     *        in the same subpass, the sum of the execution time of all views
1701     *        between those commands is the sum of the difference between
1702     *        corresponding queries written by each command. The difference
1703     *        between corresponding queries may be the execution time of a
1704     *        single view.
1705     *
1706     * We execute all views in the same draw call, so we implement the first
1707     * option, the same as regular queries.
1708     */
1709    handle_multiview_queries(cmd, pool, query);
1710 }
1711 
1712 VKAPI_ATTR VkResult VKAPI_CALL
1713 tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1714     VkPhysicalDevice                            physicalDevice,
1715     uint32_t                                    queueFamilyIndex,
1716     uint32_t*                                   pCounterCount,
1717     VkPerformanceCounterKHR*                    pCounters,
1718     VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
1719 {
1720    VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1721 
1722    uint32_t desc_count = *pCounterCount;
1723    uint32_t group_count;
1724    const struct fd_perfcntr_group *group =
1725          fd_perfcntrs(&phydev->dev_id, &group_count);
1726 
1727    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
1728    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
1729                           pCounterDescriptions, &desc_count);
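   /* vk_outarray implements the usual Vulkan two-call idiom: with a NULL
    * array it only counts, otherwise it fills up to the caller-provided
    * capacity and vk_outarray_status() reports VK_INCOMPLETE on truncation.
    */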
1730 
1731    for (int i = 0; i < group_count; i++) {
1732       for (int j = 0; j < group[i].num_countables; j++) {
1733 
1734          vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1735             counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
1736             counter->unit =
1737                   fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
1738             counter->storage =
1739                   fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
1740 
1741             unsigned char sha1_result[20];
1742             _mesa_sha1_compute(group[i].countables[j].name,
1743                                strlen(group[i].countables[j].name),
1744                                sha1_result);
1745             memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1746          }
1747 
1748          vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
1749             desc->flags = 0;
1750 
1751             snprintf(desc->name, sizeof(desc->name),
1752                      "%s", group[i].countables[j].name);
1753             snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
1754             snprintf(desc->description, sizeof(desc->description),
1755                      "%s: %s performance counter",
1756                      group[i].name, group[i].countables[j].name);
1757          }
1758       }
1759    }
1760 
1761    return vk_outarray_status(&out);
1762 }
1763 
1764 VKAPI_ATTR void VKAPI_CALL
1765 tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1766       VkPhysicalDevice                            physicalDevice,
1767       const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
1768       uint32_t*                                   pNumPasses)
1769 {
1770    VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1771    uint32_t group_count = 0;
1772    uint32_t gid = 0, cid = 0, n_passes;
1773    const struct fd_perfcntr_group *group =
1774          fd_perfcntrs(&phydev->dev_id, &group_count);
1775 
1776    uint32_t counters_requested[group_count];
1777    memset(counters_requested, 0x0, sizeof(counters_requested));
1778    *pNumPasses = 1;
1779 
1780    for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
1781       perfcntr_index(group, group_count,
1782                      pPerformanceQueryCreateInfo->pCounterIndices[i],
1783                      &gid, &cid);
1784 
1785       counters_requested[gid]++;
1786    }
1787 
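   /* Each pass can program at most group[i].num_counters hardware counters
    * from a group, so e.g. 9 requested countables in a group with 4 physical
    * counters need DIV_ROUND_UP(9, 4) = 3 passes; the reported pass count is
    * the maximum over all groups (at least 1).
    */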
1788    for (uint32_t i = 0; i < group_count; i++) {
1789       n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
1790       *pNumPasses = MAX2(*pNumPasses, n_passes);
1791    }
1792 }
1793 
1794 VKAPI_ATTR VkResult VKAPI_CALL
1795 tu_AcquireProfilingLockKHR(VkDevice device,
1796                            const VkAcquireProfilingLockInfoKHR* pInfo)
1797 {
1798    /* TODO. Probably there's something to do for kgsl. */
1799    return VK_SUCCESS;
1800 }
1801 
1802 VKAPI_ATTR void VKAPI_CALL
1803 tu_ReleaseProfilingLockKHR(VkDevice device)
1804 {
1805    /* TODO. Probably there's something to do for kgsl. */
1806    return;
1807 }
1808