xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan_hasvk/genX_query.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 
30 #include "anv_private.h"
31 
32 #include "util/os_time.h"
33 
34 #include "genxml/gen_macros.h"
35 #include "genxml/genX_pack.h"
36 
37 /* We reserve:
38  *    - GPR 14 for perf queries
39  *    - GPR 15 for conditional rendering
40  */
41 #define MI_BUILDER_NUM_ALLOC_GPRS 14
42 #define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
43 #define __gen_get_batch_dwords anv_batch_emit_dwords
44 #define __gen_address_offset anv_address_add
45 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
46 #include "common/mi_builder.h"
47 #include "perf/intel_perf.h"
48 #include "perf/intel_perf_mdapi.h"
49 #include "perf/intel_perf_regs.h"
50 
51 #include "vk_util.h"
52 
53 static struct anv_address
54 anv_query_address(struct anv_query_pool *pool, uint32_t query)
55 {
56    return (struct anv_address) {
57       .bo = pool->bo,
58       .offset = query * pool->stride,
59    };
60 }
61 
62 VkResult genX(CreateQueryPool)(
63     VkDevice                                    _device,
64     const VkQueryPoolCreateInfo*                pCreateInfo,
65     const VkAllocationCallbacks*                pAllocator,
66     VkQueryPool*                                pQueryPool)
67 {
68    ANV_FROM_HANDLE(anv_device, device, _device);
69    const struct anv_physical_device *pdevice = device->physical;
70 #if GFX_VER >= 8
71    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
72    struct intel_perf_counter_pass *counter_pass;
73    struct intel_perf_query_info **pass_query;
74    uint32_t n_passes = 0;
75 #endif
76    uint32_t data_offset = 0;
77    VK_MULTIALLOC(ma);
78    VkResult result;
79 
80    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
81 
82    /* Query pool slots are made up of some number of 64-bit values packed
83     * tightly together. For most query types, the first 64-bit value is
84     * the "available" bit, which is 0 when the query is unavailable and 1 when
85     * it is available. The 64-bit values that follow are determined by the
86     * type of query.
87     *
88     * For performance queries, OA reports must be aligned to 64 bytes, so we
89     * put those first and keep the "available" bit at the end, together with
90     * some other counters.
91     */
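   /* For example, an occlusion slot below is 1 + 2 uint64s (availability plus
    * begin/end depth counts) for a 24-byte stride, and a pipeline statistics
    * slot is 1 + 2 * util_bitcount(pipelineStatistics) uint64s.
    */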
92    uint32_t uint64s_per_slot = 0;
93 
94    VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);
95 
96    VkQueryPipelineStatisticFlags pipeline_statistics = 0;
97    switch (pCreateInfo->queryType) {
98    case VK_QUERY_TYPE_OCCLUSION:
99       /* Occlusion queries have two values: begin and end. */
100       uint64s_per_slot = 1 + 2;
101       break;
102    case VK_QUERY_TYPE_TIMESTAMP:
103       /* Timestamps just have the one timestamp value */
104       uint64s_per_slot = 1 + 1;
105       break;
106    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
107       pipeline_statistics = pCreateInfo->pipelineStatistics;
108       /* We're going to trust this field implicitly so we need to ensure that
109        * no unhandled extension bits leak in.
110        */
111       pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
112 
113       /* Statistics queries have a min and max for every statistic */
114       uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
115       break;
116    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
117       /* Transform feedback queries are 4 values, begin/end for
118        * primitives written and primitives needed.
119        */
120       uint64s_per_slot = 1 + 4;
121       break;
122    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
123       const struct intel_perf_query_field_layout *layout =
124          &pdevice->perf->query_layout;
125 
126       uint64s_per_slot = 2; /* availability + marker */
127       /* Align to the requirement of the layout */
128       uint64s_per_slot = align(uint64s_per_slot,
129                                DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
130       data_offset = uint64s_per_slot * sizeof(uint64_t);
131       /* Add the query data for begin & end commands */
132       uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
133       break;
134    }
135 #if GFX_VER >= 8
136    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
137       const struct intel_perf_query_field_layout *layout =
138          &pdevice->perf->query_layout;
139 
140       perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
141                                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
142       n_passes = intel_perf_get_n_passes(pdevice->perf,
143                                          perf_query_info->pCounterIndices,
144                                          perf_query_info->counterIndexCount,
145                                          NULL);
146       vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
147                              perf_query_info->counterIndexCount);
148       vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
149                              n_passes);
150       uint64s_per_slot = 4 /* availability + small batch */;
151       /* Align to the requirement of the layout */
152       uint64s_per_slot = align(uint64s_per_slot,
153                                DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
154       data_offset = uint64s_per_slot * sizeof(uint64_t);
155       /* Add the query data for begin & end commands */
156       uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
157       /* Multiply by the number of passes */
158       uint64s_per_slot *= n_passes;
159       break;
160    }
161 #endif
162    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
163       /* Query has two values: begin and end. */
164       uint64s_per_slot = 1 + 2;
165       break;
166    default:
167       assert(!"Invalid query type");
168    }
169 
170    if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
171                              VK_OBJECT_TYPE_QUERY_POOL))
172       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
173 
174    pool->type = pCreateInfo->queryType;
175    pool->pipeline_statistics = pipeline_statistics;
176    pool->stride = uint64s_per_slot * sizeof(uint64_t);
177    pool->slots = pCreateInfo->queryCount;
178 
179    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
180       pool->data_offset = data_offset;
181       pool->snapshot_size = (pool->stride - data_offset) / 2;
182    }
183 #if GFX_VER >= 8
184    else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
185       pool->pass_size = pool->stride / n_passes;
186       pool->data_offset = data_offset;
187       pool->snapshot_size = (pool->pass_size - data_offset) / 2;
188       pool->n_counters = perf_query_info->counterIndexCount;
189       pool->counter_pass = counter_pass;
190       intel_perf_get_counters_passes(pdevice->perf,
191                                      perf_query_info->pCounterIndices,
192                                      perf_query_info->counterIndexCount,
193                                      pool->counter_pass);
194       pool->n_passes = n_passes;
195       pool->pass_query = pass_query;
196       intel_perf_get_n_passes(pdevice->perf,
197                               perf_query_info->pCounterIndices,
198                               perf_query_info->counterIndexCount,
199                               pool->pass_query);
200    }
201 #endif
202 
203    uint64_t size = pool->slots * (uint64_t)pool->stride;
204    result = anv_device_alloc_bo(device, "query-pool", size,
205                                 ANV_BO_ALLOC_MAPPED |
206                                 ANV_BO_ALLOC_SNOOPED,
207                                 0 /* explicit_address */,
208                                 &pool->bo);
209    if (result != VK_SUCCESS)
210       goto fail;
211 
212 #if GFX_VER >= 8
213    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
214       for (uint32_t p = 0; p < pool->n_passes; p++) {
215          struct mi_builder b;
216          struct anv_batch batch = {
217             .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
218             .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
219          };
220          batch.next = batch.start;
221 
222          mi_builder_init(&b, device->info, &batch);
223          mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
224                       mi_imm(p * (uint64_t)pool->pass_size));
225          anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
226       }
227    }
228 #endif
229 
230    *pQueryPool = anv_query_pool_to_handle(pool);
231 
232    return VK_SUCCESS;
233 
234  fail:
235    vk_free2(&device->vk.alloc, pAllocator, pool);
236 
237    return result;
238 }
239 
240 void genX(DestroyQueryPool)(
241     VkDevice                                    _device,
242     VkQueryPool                                 _pool,
243     const VkAllocationCallbacks*                pAllocator)
244 {
245    ANV_FROM_HANDLE(anv_device, device, _device);
246    ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
247 
248    if (!pool)
249       return;
250 
251    anv_device_release_bo(device, pool->bo);
252    vk_object_free(&device->vk, pAllocator, pool);
253 }
254 
255 #if GFX_VER >= 8
256 /**
257  * VK_KHR_performance_query layout:
258  *
259  * --------------------------------------------
260  * |       availability (8b)       | |        |
261  * |-------------------------------| |        |
262  * |      Small batch loading      | |        |
263  * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
264  * |            (24b)              | | Pass 0 |
265  * |-------------------------------| |        |
266  * |       some padding (see       | |        |
267  * | query_field_layout:alignment) | |        |
268  * |-------------------------------| |        |
269  * |           query data          | |        |
270  * | (2 * query_field_layout:size) | |        |
271  * |-------------------------------|--        | Query 0
272  * |       availability (8b)       | |        |
273  * |-------------------------------| |        |
274  * |      Small batch loading      | |        |
275  * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
276  * |            (24b)              | | Pass 1 |
277  * |-------------------------------| |        |
278  * |       some padding (see       | |        |
279  * | query_field_layout:alignment) | |        |
280  * |-------------------------------| |        |
281  * |           query data          | |        |
282  * | (2 * query_field_layout:size) | |        |
283  * |-------------------------------|-----------
284  * |       availability (8b)       | |        |
285  * |-------------------------------| |        |
286  * |      Small batch loading      | |        |
287  * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
288  * |            (24b)              | | Pass 0 |
289  * |-------------------------------| |        |
290  * |       some padding (see       | |        |
291  * | query_field_layout:alignment) | |        |
292  * |-------------------------------| |        |
293  * |           query data          | |        |
294  * | (2 * query_field_layout:size) | |        |
295  * |-------------------------------|--        | Query 1
296  * |               ...             | |        |
297  * --------------------------------------------
298  */
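/* With this layout, the data for (query, pass) lives at
 *    query * pool->stride + pass * pool->pass_size
 * with the begin snapshot at +pool->data_offset and the end snapshot at
 * +pool->data_offset + pool->snapshot_size, as computed by the helpers below.
 */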
299 
300 static uint64_t
301 khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
302 {
303    return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
304 }
305 
306 static uint64_t
307 khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
308 {
309    return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
310       pool->data_offset + (end ? pool->snapshot_size : 0);
311 }
312 
313 static struct anv_address
314 khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
315 {
316    return anv_address_add(
317       (struct anv_address) { .bo = pool->bo, },
318       khr_perf_query_availability_offset(pool, query, pass));
319 }
320 
321 static struct anv_address
322 khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
323 {
324    return anv_address_add(
325       (struct anv_address) { .bo = pool->bo, },
326       khr_perf_query_data_offset(pool, query, pass, end));
327 }
328 
329 static bool
330 khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
331 {
332    if (anv_batch_has_error(&cmd_buffer->batch))
333       return false;
334 
335    if (cmd_buffer->self_mod_locations)
336       return true;
337 
338    struct anv_device *device = cmd_buffer->device;
339    const struct anv_physical_device *pdevice = device->physical;
340 
341    cmd_buffer->self_mod_locations =
342       vk_alloc(&cmd_buffer->vk.pool->alloc,
343                pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
344                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
345 
346    if (!cmd_buffer->self_mod_locations) {
347       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
348       return false;
349    }
350 
351    return true;
352 }
353 #endif
354 
355 /**
356  * VK_INTEL_performance_query layout:
357  *
358  * ---------------------------------
359  * |       availability (8b)       |
360  * |-------------------------------|
361  * |          marker (8b)          |
362  * |-------------------------------|
363  * |       some padding (see       |
364  * | query_field_layout:alignment) |
365  * |-------------------------------|
366  * |           query data          |
367  * | (2 * query_field_layout:size) |
368  * ---------------------------------
369  */
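/* The marker sits at a fixed 8-byte offset in the slot; the begin and end
 * snapshots sit at pool->data_offset and pool->data_offset +
 * pool->snapshot_size respectively, as returned by the helpers below.
 */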
370 
371 static uint32_t
372 intel_perf_marker_offset(void)
373 {
374    return 8;
375 }
376 
377 static uint32_t
378 intel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
379 {
380    return pool->data_offset + (end ? pool->snapshot_size : 0);
381 }
382 
383 static void
384 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
385                        uint32_t value_index, uint64_t result)
386 {
387    if (flags & VK_QUERY_RESULT_64_BIT) {
388       uint64_t *dst64 = dst_slot;
389       dst64[value_index] = result;
390    } else {
391       uint32_t *dst32 = dst_slot;
392       dst32[value_index] = result;
393    }
394 }
395 
396 static void *
397 query_slot(struct anv_query_pool *pool, uint32_t query)
398 {
399    return pool->bo->map + query * pool->stride;
400 }
401 
402 static bool
403 query_is_available(struct anv_query_pool *pool, uint32_t query)
404 {
405 #if GFX_VER >= 8
406    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
407       for (uint32_t p = 0; p < pool->n_passes; p++) {
408          volatile uint64_t *slot =
409             pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
410          if (!slot[0])
411             return false;
412       }
413       return true;
414    }
415 #endif
416 
417    return *(volatile uint64_t *)query_slot(pool, query);
418 }
419 
420 static VkResult
421 wait_for_available(struct anv_device *device,
422                    struct anv_query_pool *pool, uint32_t query)
423 {
424    uint64_t abs_timeout_ns = os_time_get_absolute_timeout(2 * NSEC_PER_SEC);
425 
426    while (os_time_get_nano() < abs_timeout_ns) {
427       if (query_is_available(pool, query))
428          return VK_SUCCESS;
429       VkResult status = vk_device_check_status(&device->vk);
430       if (status != VK_SUCCESS)
431          return status;
432    }
433 
434    return vk_device_set_lost(&device->vk, "query timeout");
435 }
436 
437 VkResult genX(GetQueryPoolResults)(
438     VkDevice                                    _device,
439     VkQueryPool                                 queryPool,
440     uint32_t                                    firstQuery,
441     uint32_t                                    queryCount,
442     size_t                                      dataSize,
443     void*                                       pData,
444     VkDeviceSize                                stride,
445     VkQueryResultFlags                          flags)
446 {
447    ANV_FROM_HANDLE(anv_device, device, _device);
448    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
449 
450    assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
451           pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
452           pool->type == VK_QUERY_TYPE_TIMESTAMP ||
453           pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
454           pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
455           pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
456           pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
457 
458    if (vk_device_is_lost(&device->vk))
459       return VK_ERROR_DEVICE_LOST;
460 
461    if (pData == NULL)
462       return VK_SUCCESS;
463 
464    void *data_end = pData + dataSize;
465 
466    VkResult status = VK_SUCCESS;
467    for (uint32_t i = 0; i < queryCount; i++) {
468       bool available = query_is_available(pool, firstQuery + i);
469 
470       if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
471          status = wait_for_available(device, pool, firstQuery + i);
472          if (status != VK_SUCCESS) {
473             return status;
474          }
475 
476          available = true;
477       }
478 
479       /* From the Vulkan 1.0.42 spec:
480        *
481        *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
482        *    both not set then no result values are written to pData for
483        *    queries that are in the unavailable state at the time of the call,
484        *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
485        *    availability state is still written to pData for those queries if
486        *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
487        *
488        * From VK_KHR_performance_query:
489        *
490        *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
491        *     that the result should contain the number of counters that were recorded
492        *     into a query pool of type VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR."
493        */
494       bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
495 
496       uint32_t idx = 0;
497       switch (pool->type) {
498       case VK_QUERY_TYPE_OCCLUSION:
499       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
500          uint64_t *slot = query_slot(pool, firstQuery + i);
501          if (write_results) {
502             /* From the Vulkan 1.2.132 spec:
503              *
504              *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
505              *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
506              *    is unavailable, an intermediate result value between zero and
507              *    the final result value is written to pData for that query."
508              */
509             uint64_t result = available ? slot[2] - slot[1] : 0;
510             cpu_write_query_result(pData, flags, idx, result);
511          }
512          idx++;
513          break;
514       }
515 
516       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
517          uint64_t *slot = query_slot(pool, firstQuery + i);
518          uint32_t statistics = pool->pipeline_statistics;
519          while (statistics) {
520             uint32_t stat = u_bit_scan(&statistics);
521             if (write_results) {
522                uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
523 
524                /* WaDividePSInvocationCountBy4:HSW,BDW */
525                if ((device->info->ver == 8 || device->info->verx10 == 75) &&
526                    (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
527                   result >>= 2;
528 
529                cpu_write_query_result(pData, flags, idx, result);
530             }
531             idx++;
532          }
533          assert(idx == util_bitcount(pool->pipeline_statistics));
534          break;
535       }
536 
537       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
538          uint64_t *slot = query_slot(pool, firstQuery + i);
539          if (write_results)
540             cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
541          idx++;
542          if (write_results)
543             cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
544          idx++;
545          break;
546       }
547 
548       case VK_QUERY_TYPE_TIMESTAMP: {
549          uint64_t *slot = query_slot(pool, firstQuery + i);
550          if (write_results)
551             cpu_write_query_result(pData, flags, idx, slot[1]);
552          idx++;
553          break;
554       }
555 
556 #if GFX_VER >= 8
557       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
558          const struct anv_physical_device *pdevice = device->physical;
559          assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
560                           VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
561          for (uint32_t p = 0; p < pool->n_passes; p++) {
562             const struct intel_perf_query_info *query = pool->pass_query[p];
563             struct intel_perf_query_result result;
564             intel_perf_query_result_clear(&result);
565             intel_perf_query_result_accumulate_fields(&result, query,
566                                                       pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
567                                                       pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
568                                                       false /* no_oa_accumulate */);
569             anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
570          }
571          break;
572       }
573 #endif
574 
575       case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
576          if (!write_results)
577             break;
578          const void *query_data = query_slot(pool, firstQuery + i);
579          const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
580          struct intel_perf_query_result result;
581          intel_perf_query_result_clear(&result);
582          intel_perf_query_result_accumulate_fields(&result, query,
583                                                    query_data + intel_perf_query_data_offset(pool, false),
584                                                    query_data + intel_perf_query_data_offset(pool, true),
585                                                    false /* no_oa_accumulate */);
586          intel_perf_query_result_write_mdapi(pData, stride,
587                                              device->info,
588                                              query, &result);
589          const uint64_t *marker = query_data + intel_perf_marker_offset();
590          intel_perf_query_mdapi_write_marker(pData, stride, device->info, *marker);
591          break;
592       }
593 
594       default:
595          unreachable("invalid pool type");
596       }
597 
598       if (!write_results)
599          status = VK_NOT_READY;
600 
601       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
602          cpu_write_query_result(pData, flags, idx, available);
603 
604       pData += stride;
605       if (pData >= data_end)
606          break;
607    }
608 
609    return status;
610 }
611 
612 static void
613 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
614                     struct anv_address addr)
615 {
616    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
617    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
618 
619    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
620       pc.DestinationAddressType  = DAT_PPGTT;
621       pc.PostSyncOperation       = WritePSDepthCount;
622       pc.DepthStallEnable        = true;
623       pc.Address                 = addr;
624    }
625 }
626 
627 static void
628 emit_query_mi_availability(struct mi_builder *b,
629                            struct anv_address addr,
630                            bool available)
631 {
632    mi_store(b, mi_mem64(addr), mi_imm(available));
633 }
634 
635 static void
636 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
637                            struct anv_address addr,
638                            bool available)
639 {
640    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
641    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
642 
643    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
644       pc.DestinationAddressType  = DAT_PPGTT;
645       pc.PostSyncOperation       = WriteImmediateData;
646       pc.Address                 = addr;
647       pc.ImmediateData           = available;
648    }
649 }
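/* Availability can be signaled two ways: emit_query_mi_availability() uses an
 * MI store executed by the command streamer, while emit_query_pc_availability()
 * uses a PIPE_CONTROL post-sync write ordered against the 3D pipeline.
 * emit_zero_queries() below picks whichever matches how the query values
 * themselves are written.
 */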
650 
651 /**
652  * Goes through a series of consecutive query indices in the given pool,
653  * setting all element values to 0 and marking them as available.
654  */
655 static void
656 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
657                   struct mi_builder *b, struct anv_query_pool *pool,
658                   uint32_t first_index, uint32_t num_queries)
659 {
660    switch (pool->type) {
661    case VK_QUERY_TYPE_OCCLUSION:
662    case VK_QUERY_TYPE_TIMESTAMP:
663       /* These queries are written with a PIPE_CONTROL, so clear them using a
664        * PIPE_CONTROL as well; that way we don't have to synchronize between
665        * two types of operations.
666        */
667       assert((pool->stride % 8) == 0);
668       for (uint32_t i = 0; i < num_queries; i++) {
669          struct anv_address slot_addr =
670             anv_query_address(pool, first_index + i);
671 
672          for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
673             emit_query_pc_availability(cmd_buffer,
674                                        anv_address_add(slot_addr, qword * 8),
675                                        false);
676          }
677          emit_query_pc_availability(cmd_buffer, slot_addr, true);
678       }
679       break;
680 
681    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
682    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
683    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
684       for (uint32_t i = 0; i < num_queries; i++) {
685          struct anv_address slot_addr =
686             anv_query_address(pool, first_index + i);
687          mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
688          emit_query_mi_availability(b, slot_addr, true);
689       }
690       break;
691 
692 #if GFX_VER >= 8
693    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
694       for (uint32_t i = 0; i < num_queries; i++) {
695          for (uint32_t p = 0; p < pool->n_passes; p++) {
696             mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
697                          0, 2 * pool->snapshot_size);
698             emit_query_mi_availability(b,
699                                        khr_perf_query_availability_address(pool, first_index + i, p),
700                                        true);
701          }
702       }
703       break;
704    }
705 #endif
706 
707    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
708       for (uint32_t i = 0; i < num_queries; i++) {
709          struct anv_address slot_addr =
710             anv_query_address(pool, first_index + i);
711          mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
712          emit_query_mi_availability(b, slot_addr, true);
713       }
714       break;
715 
716    default:
717       unreachable("Unsupported query type");
718    }
719 }
720 
721 void genX(CmdResetQueryPool)(
722     VkCommandBuffer                             commandBuffer,
723     VkQueryPool                                 queryPool,
724     uint32_t                                    firstQuery,
725     uint32_t                                    queryCount)
726 {
727    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
728    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
729 
730    switch (pool->type) {
731    case VK_QUERY_TYPE_OCCLUSION:
732       for (uint32_t i = 0; i < queryCount; i++) {
733          emit_query_pc_availability(cmd_buffer,
734                                     anv_query_address(pool, firstQuery + i),
735                                     false);
736       }
737       break;
738 
739    case VK_QUERY_TYPE_TIMESTAMP: {
740       for (uint32_t i = 0; i < queryCount; i++) {
741          emit_query_pc_availability(cmd_buffer,
742                                     anv_query_address(pool, firstQuery + i),
743                                     false);
744       }
745 
746       /* Add a CS stall here to make sure the PIPE_CONTROL above has
747        * completed. Otherwise some timestamps written later with MI_STORE_*
748        * commands might race with the PIPE_CONTROL in the loop above.
749        */
750       anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT,
751                                 "vkCmdResetQueryPool of timestamps");
752       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
753       break;
754    }
755 
756    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
757    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
758    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
759       struct mi_builder b;
760       mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
761 
762       for (uint32_t i = 0; i < queryCount; i++)
763          emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
764       break;
765    }
766 
767 #if GFX_VER >= 8
768    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
769       struct mi_builder b;
770       mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
771 
772       for (uint32_t i = 0; i < queryCount; i++) {
773          for (uint32_t p = 0; p < pool->n_passes; p++) {
774             emit_query_mi_availability(
775                &b,
776                khr_perf_query_availability_address(pool, firstQuery + i, p),
777                false);
778          }
779       }
780       break;
781    }
782 #endif
783 
784    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
785       struct mi_builder b;
786       mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
787 
788       for (uint32_t i = 0; i < queryCount; i++)
789          emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
790       break;
791    }
792 
793    default:
794       unreachable("Unsupported query type");
795    }
796 }
797 
798 void genX(ResetQueryPool)(
799     VkDevice                                    _device,
800     VkQueryPool                                 queryPool,
801     uint32_t                                    firstQuery,
802     uint32_t                                    queryCount)
803 {
804    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
805 
806    for (uint32_t i = 0; i < queryCount; i++) {
807       if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
808 #if GFX_VER >= 8
809          for (uint32_t p = 0; p < pool->n_passes; p++) {
810             uint64_t *pass_slot = pool->bo->map +
811                khr_perf_query_availability_offset(pool, firstQuery + i, p);
812             *pass_slot = 0;
813          }
814 #endif
815       } else {
816          uint64_t *slot = query_slot(pool, firstQuery + i);
817          *slot = 0;
818       }
819    }
820 }
821 
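/* Indexed by the bit position of each VkQueryPipelineStatisticFlagBits flag;
 * the STATIC_ASSERT in emit_pipeline_stat() checks that every bit of
 * ANV_PIPELINE_STATISTICS_MASK has a register here.
 */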
822 static const uint32_t vk_pipeline_stat_to_reg[] = {
823    GENX(IA_VERTICES_COUNT_num),
824    GENX(IA_PRIMITIVES_COUNT_num),
825    GENX(VS_INVOCATION_COUNT_num),
826    GENX(GS_INVOCATION_COUNT_num),
827    GENX(GS_PRIMITIVES_COUNT_num),
828    GENX(CL_INVOCATION_COUNT_num),
829    GENX(CL_PRIMITIVES_COUNT_num),
830    GENX(PS_INVOCATION_COUNT_num),
831    GENX(HS_INVOCATION_COUNT_num),
832    GENX(DS_INVOCATION_COUNT_num),
833    GENX(CS_INVOCATION_COUNT_num),
834 };
835 
836 static void
837 emit_pipeline_stat(struct mi_builder *b, uint32_t stat,
838                    struct anv_address addr)
839 {
840    STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
841                  (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
842 
843    assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
844    mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
845 }
846 
847 static void
848 emit_xfb_query(struct mi_builder *b, uint32_t stream,
849                struct anv_address addr)
850 {
851    assert(stream < MAX_XFB_STREAMS);
852 
853    mi_store(b, mi_mem64(anv_address_add(addr, 0)),
854                mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
855    mi_store(b, mi_mem64(anv_address_add(addr, 16)),
856                mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
857 }
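/* With the offsets used in CmdBegin/EndQueryIndexedEXT, an XFB slot ends up
 * as: [0] availability, [1]/[2] primitives-written begin/end, [3]/[4]
 * primitives-needed begin/end, matching the slot[2] - slot[1] and
 * slot[4] - slot[3] reads in GetQueryPoolResults.
 */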
858 
859 static void
860 emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
861                       struct anv_query_pool *pool,
862                       struct mi_builder *b,
863                       struct anv_address query_addr,
864                       bool end)
865 {
866    const struct intel_perf_query_field_layout *layout =
867       &cmd_buffer->device->physical->perf->query_layout;
868    struct anv_address data_addr =
869       anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));
870 
871    for (uint32_t f = 0; f < layout->n_fields; f++) {
872       const struct intel_perf_query_field *field =
873          &layout->fields[end ? f : (layout->n_fields - 1 - f)];
874 
875       switch (field->type) {
876       case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
877          anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
878             rpc.MemoryAddress = anv_address_add(data_addr, field->location);
879          }
880          break;
881 
882       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
883       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
884       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
885       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
886       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
887          struct anv_address addr = anv_address_add(data_addr, field->location);
888          struct mi_value src = field->size == 8 ?
889             mi_reg64(field->mmio_offset) :
890             mi_reg32(field->mmio_offset);
891          struct mi_value dst = field->size == 8 ?
892             mi_mem64(addr) : mi_mem32(addr);
893          mi_store(b, dst, src);
894          break;
895       }
896 
897       default:
898          unreachable("Invalid query field");
899          break;
900       }
901    }
902 }
903 
904 void genX(CmdBeginQuery)(
905     VkCommandBuffer                             commandBuffer,
906     VkQueryPool                                 queryPool,
907     uint32_t                                    query,
908     VkQueryControlFlags                         flags)
909 {
910    genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
911 }
912 
913 void genX(CmdBeginQueryIndexedEXT)(
914     VkCommandBuffer                             commandBuffer,
915     VkQueryPool                                 queryPool,
916     uint32_t                                    query,
917     VkQueryControlFlags                         flags,
918     uint32_t                                    index)
919 {
920    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
921    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
922    struct anv_address query_addr = anv_query_address(pool, query);
923 
924    struct mi_builder b;
925    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
926 
927    switch (pool->type) {
928    case VK_QUERY_TYPE_OCCLUSION:
929       emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
930       break;
931 
932    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
933       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
934          pc.CommandStreamerStallEnable = true;
935          pc.StallAtPixelScoreboard = true;
936       }
937       mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
938                    mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
939       break;
940 
941    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
942       /* TODO: This might only be necessary for certain stats */
943       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
944          pc.CommandStreamerStallEnable = true;
945          pc.StallAtPixelScoreboard = true;
946       }
947 
948       uint32_t statistics = pool->pipeline_statistics;
949       uint32_t offset = 8;
950       while (statistics) {
951          uint32_t stat = u_bit_scan(&statistics);
952          emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
953          offset += 16;
954       }
955       break;
956    }
957 
958    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
959       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
960          pc.CommandStreamerStallEnable = true;
961          pc.StallAtPixelScoreboard = true;
962       }
963       emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
964       break;
965 
966 #if GFX_VER >= 8
967    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
968       if (!khr_perf_query_ensure_relocs(cmd_buffer))
969          return;
970 
971       const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
972       const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
973 
974       uint32_t reloc_idx = 0;
975       for (uint32_t end = 0; end < 2; end++) {
976          for (uint32_t r = 0; r < layout->n_fields; r++) {
977             const struct intel_perf_query_field *field =
978                &layout->fields[end ? r : (layout->n_fields - 1 - r)];
979             struct mi_value reg_addr =
980                mi_iadd(
981                   &b,
982                   mi_imm(intel_canonical_address(pool->bo->offset +
983                                                  khr_perf_query_data_offset(pool, query, 0, end) +
984                                                  field->location)),
985                   mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
986             cmd_buffer->self_mod_locations[reloc_idx++] =
987                mi_store_relocated_address_reg64(&b, reg_addr);
988 
989             if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
990                 field->size == 8) {
991                reg_addr =
992                   mi_iadd(
993                      &b,
994                      mi_imm(intel_canonical_address(pool->bo->offset +
995                                                     khr_perf_query_data_offset(pool, query, 0, end) +
996                                                     field->location + 4)),
997                      mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
998                cmd_buffer->self_mod_locations[reloc_idx++] =
999                   mi_store_relocated_address_reg64(&b, reg_addr);
1000             }
1001          }
1002       }
1003 
1004       struct mi_value availability_write_offset =
1005          mi_iadd(
1006             &b,
1007             mi_imm(
1008                intel_canonical_address(
1009                   pool->bo->offset +
1010                   khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
1011             mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
1012       cmd_buffer->self_mod_locations[reloc_idx++] =
1013          mi_store_relocated_address_reg64(&b, availability_write_offset);
1014 
1015       assert(reloc_idx == pdevice->n_perf_query_commands);
1016 
1017       const struct intel_device_info *devinfo = cmd_buffer->device->info;
1018       const enum intel_engine_class engine_class = cmd_buffer->queue_family->engine_class;
1019       mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
1020 
1021       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1022          pc.CommandStreamerStallEnable = true;
1023          pc.StallAtPixelScoreboard = true;
1024       }
1025       cmd_buffer->perf_query_pool = pool;
1026 
1027       cmd_buffer->perf_reloc_idx = 0;
1028       for (uint32_t r = 0; r < layout->n_fields; r++) {
1029          const struct intel_perf_query_field *field =
1030             &layout->fields[layout->n_fields - 1 - r];
1031          void *dws;
1032 
1033          switch (field->type) {
1034          case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
1035             dws = anv_batch_emitn(&cmd_buffer->batch,
1036                                   GENX(MI_REPORT_PERF_COUNT_length),
1037                                   GENX(MI_REPORT_PERF_COUNT),
1038                                   .MemoryAddress = query_addr /* Will be overwritten */);
1039             mi_resolve_relocated_address_token(
1040                &b,
1041                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1042                dws + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
1043             break;
1044 
1045          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
1046          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
1047          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
1048          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1049          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1050             dws =
1051                anv_batch_emitn(&cmd_buffer->batch,
1052                                GENX(MI_STORE_REGISTER_MEM_length),
1053                                GENX(MI_STORE_REGISTER_MEM),
1054                                .RegisterAddress = field->mmio_offset,
1055                                .MemoryAddress = query_addr /* Will be overwritten */ );
1056             mi_resolve_relocated_address_token(
1057                &b,
1058                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1059                dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1060             if (field->size == 8) {
1061                dws =
1062                   anv_batch_emitn(&cmd_buffer->batch,
1063                                   GENX(MI_STORE_REGISTER_MEM_length),
1064                                   GENX(MI_STORE_REGISTER_MEM),
1065                                   .RegisterAddress = field->mmio_offset + 4,
1066                                   .MemoryAddress = query_addr /* Will be overwritten */ );
1067                mi_resolve_relocated_address_token(
1068                   &b,
1069                   cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1070                   dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1071             }
1072             break;
1073 
1074          default:
1075             unreachable("Invalid query field");
1076             break;
1077          }
1078       }
1079       break;
1080    }
1081 #endif
1082 
1083    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
1084       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1085          pc.CommandStreamerStallEnable = true;
1086          pc.StallAtPixelScoreboard = true;
1087       }
1088       emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
1089       break;
1090    }
1091 
1092    default:
1093       unreachable("");
1094    }
1095 }
1096 
1097 void genX(CmdEndQuery)(
1098     VkCommandBuffer                             commandBuffer,
1099     VkQueryPool                                 queryPool,
1100     uint32_t                                    query)
1101 {
1102    genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
1103 }
1104 
1105 void genX(CmdEndQueryIndexedEXT)(
1106     VkCommandBuffer                             commandBuffer,
1107     VkQueryPool                                 queryPool,
1108     uint32_t                                    query,
1109     uint32_t                                    index)
1110 {
1111    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1112    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1113    struct anv_address query_addr = anv_query_address(pool, query);
1114 
1115    struct mi_builder b;
1116    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1117 
1118    switch (pool->type) {
1119    case VK_QUERY_TYPE_OCCLUSION:
1120       emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
1121       emit_query_pc_availability(cmd_buffer, query_addr, true);
1122       break;
1123 
1124    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1125       /* Ensure previous commands have completed before capturing the register
1126        * value.
1127        */
1128       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1129          pc.CommandStreamerStallEnable = true;
1130          pc.StallAtPixelScoreboard = true;
1131       }
1132 
1133       mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)),
1134                    mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
1135       emit_query_mi_availability(&b, query_addr, true);
1136       break;
1137 
1138    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1139       /* TODO: This might only be necessary for certain stats */
1140       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1141          pc.CommandStreamerStallEnable = true;
1142          pc.StallAtPixelScoreboard = true;
1143       }
1144 
1145       uint32_t statistics = pool->pipeline_statistics;
1146       uint32_t offset = 16;
1147       while (statistics) {
1148          uint32_t stat = u_bit_scan(&statistics);
1149          emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
1150          offset += 16;
1151       }
1152 
1153       emit_query_mi_availability(&b, query_addr, true);
1154       break;
1155    }
1156 
1157    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1158       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1159          pc.CommandStreamerStallEnable = true;
1160          pc.StallAtPixelScoreboard = true;
1161       }
1162 
1163       emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
1164       emit_query_mi_availability(&b, query_addr, true);
1165       break;
1166 
1167 #if GFX_VER >= 8
1168    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1169       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1170          pc.CommandStreamerStallEnable = true;
1171          pc.StallAtPixelScoreboard = true;
1172       }
1173       cmd_buffer->perf_query_pool = pool;
1174 
1175       if (!khr_perf_query_ensure_relocs(cmd_buffer))
1176          return;
1177 
1178       const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
1179       const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
1180 
1181       void *dws;
1182       for (uint32_t r = 0; r < layout->n_fields; r++) {
1183          const struct intel_perf_query_field *field = &layout->fields[r];
1184 
1185          switch (field->type) {
1186          case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
1187             dws = anv_batch_emitn(&cmd_buffer->batch,
1188                                   GENX(MI_REPORT_PERF_COUNT_length),
1189                                   GENX(MI_REPORT_PERF_COUNT),
1190                                   .MemoryAddress = query_addr /* Will be overwritten */);
1191             mi_resolve_relocated_address_token(
1192                &b,
1193                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1194                dws + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
1195             break;
1196 
1197          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
1198          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
1199          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
1200          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1201          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1202             dws =
1203                anv_batch_emitn(&cmd_buffer->batch,
1204                                GENX(MI_STORE_REGISTER_MEM_length),
1205                                GENX(MI_STORE_REGISTER_MEM),
1206                                .RegisterAddress = field->mmio_offset,
1207                                .MemoryAddress = query_addr /* Will be overwritten */ );
1208             mi_resolve_relocated_address_token(
1209                &b,
1210                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1211                dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1212             if (field->size == 8) {
1213                dws =
1214                   anv_batch_emitn(&cmd_buffer->batch,
1215                                   GENX(MI_STORE_REGISTER_MEM_length),
1216                                   GENX(MI_STORE_REGISTER_MEM),
1217                                   .RegisterAddress = field->mmio_offset + 4,
1218                                   .MemoryAddress = query_addr /* Will be overwritten */ );
1219                mi_resolve_relocated_address_token(
1220                   &b,
1221                   cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1222                   dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1223             }
1224             break;
1225 
1226          default:
1227             unreachable("Invalid query field");
1228             break;
1229          }
1230       }
1231 
1232       dws =
1233          anv_batch_emitn(&cmd_buffer->batch,
1234                          GENX(MI_STORE_DATA_IMM_length),
1235                          GENX(MI_STORE_DATA_IMM),
1236                          .ImmediateData = true);
1237       mi_resolve_relocated_address_token(
1238          &b,
1239          cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1240          dws + GENX(MI_STORE_DATA_IMM_Address_start) / 8);
1241 
1242       assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
1243       break;
1244    }
1245 #endif
1246 
1247    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
1248       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1249          pc.CommandStreamerStallEnable = true;
1250          pc.StallAtPixelScoreboard = true;
1251       }
1252       uint32_t marker_offset = intel_perf_marker_offset();
1253       mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
1254                    mi_imm(cmd_buffer->intel_perf_marker));
1255       emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
1256       emit_query_mi_availability(&b, query_addr, true);
1257       break;
1258    }
1259 
1260    default:
1261       unreachable("");
1262    }
1263 
1264    /* When multiview is active the spec requires that N consecutive query
1265     * indices are used, where N is the number of active views in the subpass.
1266     * The spec allows us to write the results to only one of the queries,
1267     * but we still need to manage result availability for all the query indices.
1268     * Since we only emit a single query for all active views in the
1269     * first index, mark the other query indices as being already available
1270     * with result 0.
1271     */
1272    if (cmd_buffer->state.gfx.view_mask) {
1273       const uint32_t num_queries =
1274          util_bitcount(cmd_buffer->state.gfx.view_mask);
1275       if (num_queries > 1)
1276          emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1277    }
1278 }
1279 
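/* MMIO offset of the command streamer TIMESTAMP register. */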
1280 #define TIMESTAMP 0x2358
1281 
1282 void genX(CmdWriteTimestamp2)(
1283     VkCommandBuffer                             commandBuffer,
1284     VkPipelineStageFlags2                       stage,
1285     VkQueryPool                                 queryPool,
1286     uint32_t                                    query)
1287 {
1288    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1289    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1290    struct anv_address query_addr = anv_query_address(pool, query);
1291 
1292    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1293 
1294    struct mi_builder b;
1295    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1296 
1297    if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
1298       mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
1299                    mi_reg64(TIMESTAMP));
1300       emit_query_mi_availability(&b, query_addr, true);
1301    } else {
1302       /* Everything else is bottom-of-pipe */
1303       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
1304       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1305 
1306       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1307          pc.DestinationAddressType  = DAT_PPGTT;
1308          pc.PostSyncOperation       = WriteTimestamp;
1309          pc.Address                 = anv_address_add(query_addr, 8);
1310 
1311          if (GFX_VER == 9 && cmd_buffer->device->info->gt == 4)
1312             pc.CommandStreamerStallEnable = true;
1313       }
1314       emit_query_pc_availability(cmd_buffer, query_addr, true);
1315    }
1316 
1317 
1318    /* When multiview is active the spec requires that N consecutive query
1319     * indices are used, where N is the number of active views in the subpass.
1320     * The spec allows us to write the results to only one of the queries,
1321     * but we still need to manage result availability for all the query indices.
1322     * Since we only emit a single query for all active views in the
1323     * first index, mark the other query indices as being already available
1324     * with result 0.
1325     */
1326    if (cmd_buffer->state.gfx.view_mask) {
1327       const uint32_t num_queries =
1328          util_bitcount(cmd_buffer->state.gfx.view_mask);
1329       if (num_queries > 1)
1330          emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1331    }
1332 }
1333 
1334 #if GFX_VERx10 >= 75
1335 
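/* MMIO offsets of the MI_PREDICATE source and result registers used for
 * predicated (conditional) result writes below.
 */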
1336 #define MI_PREDICATE_SRC0    0x2400
1337 #define MI_PREDICATE_SRC1    0x2408
1338 #define MI_PREDICATE_RESULT  0x2418
1339 
1340 /**
1341  * Writes the results of a query to dst_addr if the value at poll_addr is equal
1342  * to the reference value.
1343  */
1344 static void
1345 gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
1346                             struct mi_builder *b,
1347                             struct anv_address poll_addr,
1348                             struct anv_address dst_addr,
1349                             uint64_t ref_value,
1350                             VkQueryResultFlags flags,
1351                             uint32_t value_index,
1352                             struct mi_value query_result)
1353 {
1354    mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
1355    mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
1356    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1357       mip.LoadOperation    = LOAD_LOAD;
1358       mip.CombineOperation = COMBINE_SET;
1359       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1360    }
1361 
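   /* The predicate compares the 64-bit value at poll_addr (SRC0) with
    * ref_value (SRC1); mi_store_if() emits predicated stores, so the result
    * below is only written when the two match.
    */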
1362    if (flags & VK_QUERY_RESULT_64_BIT) {
1363       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1364       mi_store_if(b, mi_mem64(res_addr), query_result);
1365    } else {
1366       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1367       mi_store_if(b, mi_mem32(res_addr), query_result);
1368    }
1369 }
1370 
1371 static void
1372 gpu_write_query_result(struct mi_builder *b,
1373                        struct anv_address dst_addr,
1374                        VkQueryResultFlags flags,
1375                        uint32_t value_index,
1376                        struct mi_value query_result)
1377 {
1378    if (flags & VK_QUERY_RESULT_64_BIT) {
1379       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1380       mi_store(b, mi_mem64(res_addr), query_result);
1381    } else {
1382       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1383       mi_store(b, mi_mem32(res_addr), query_result);
1384    }
1385 }
1386 
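/* Compute end - begin for a pair of 64-bit counter values stored at addr and
 * addr + 8, using MI math on the command streamer.
 */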
1387 static struct mi_value
1388 compute_query_result(struct mi_builder *b, struct anv_address addr)
1389 {
1390    return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
1391                      mi_mem64(anv_address_add(addr, 0)));
1392 }
1393 
1394 void genX(CmdCopyQueryPoolResults)(
1395     VkCommandBuffer                             commandBuffer,
1396     VkQueryPool                                 queryPool,
1397     uint32_t                                    firstQuery,
1398     uint32_t                                    queryCount,
1399     VkBuffer                                    destBuffer,
1400     VkDeviceSize                                destOffset,
1401     VkDeviceSize                                destStride,
1402     VkQueryResultFlags                          flags)
1403 {
1404    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1405    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1406    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1407 
1408    struct mi_builder b;
1409    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1410    struct mi_value result;
1411 
1412    /* If render target writes are ongoing, request a render target cache flush
1413     * to ensure proper ordering of the commands from the 3d pipe and the
1414     * command streamer.
1415     */
1416    if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
1417       anv_add_pending_pipe_bits(cmd_buffer,
1418                                 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
1419                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
1420                                 "CopyQueryPoolResults");
1421    }
1422 
1423    if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
1424        (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
1425        /* Occlusion & timestamp queries are written using a PIPE_CONTROL and,
1426         * because we're about to copy values with MI commands, we need to
1427         * stall the command streamer to make sure the PIPE_CONTROL values have
1428         * landed; otherwise we could see inconsistent values & availability.
1429         *
1430         *  From the Vulkan spec:
1431         *
1432         *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1433         *     previous uses of vkCmdResetQueryPool in the same queue, without
1434         *     any additional synchronization."
1435         */
1436        pool->type == VK_QUERY_TYPE_OCCLUSION ||
1437        pool->type == VK_QUERY_TYPE_TIMESTAMP) {
1438       anv_add_pending_pipe_bits(cmd_buffer,
1439                                 ANV_PIPE_CS_STALL_BIT,
1440                                 "CopyQueryPoolResults");
1441       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1442    }
1443 
1444    struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
1445    for (uint32_t i = 0; i < queryCount; i++) {
1446       struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
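      /* Index of the next 32-bit or 64-bit result value to write into the
       * destination buffer for this query.
       */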
1447       uint32_t idx = 0;
1448       switch (pool->type) {
1449       case VK_QUERY_TYPE_OCCLUSION:
1450       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1451          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1452          /* Like in the case of vkGetQueryPoolResults, if the query is
1453           * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
1454           * conservatively write 0 as the query result. If the
1455           * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
1456           */
1457          gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1458                1 /* available */, flags, idx, result);
1459          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
1460             gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1461                   0 /* unavailable */, flags, idx, mi_imm(0));
1462          }
1463          idx++;
1464          break;
1465 
1466       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1467          uint32_t statistics = pool->pipeline_statistics;
1468          while (statistics) {
1469             uint32_t stat = u_bit_scan(&statistics);
1470 
1471             result = compute_query_result(&b, anv_address_add(query_addr,
1472                                                               idx * 16 + 8));
1473 
1474             /* WaDividePSInvocationCountBy4:HSW,BDW */
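            /* The raw fragment shader invocation count is 4x too high on
             * these platforms, hence the shift right by 2.
             */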
1475             if ((cmd_buffer->device->info->ver == 8 ||
1476                  cmd_buffer->device->info->verx10 == 75) &&
1477                 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1478                result = mi_ushr32_imm(&b, result, 2);
1479             }
1480 
1481             gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1482          }
1483          assert(idx == util_bitcount(pool->pipeline_statistics));
1484          break;
1485       }
1486 
1487       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1488          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1489          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1490          result = compute_query_result(&b, anv_address_add(query_addr, 24));
1491          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1492          break;
1493 
1494       case VK_QUERY_TYPE_TIMESTAMP:
1495          result = mi_mem64(anv_address_add(query_addr, 8));
1496          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1497          break;
1498 
1499 #if GFX_VER >= 8
1500       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1501          unreachable("Copy KHR performance query results not implemented");
1502          break;
1503 #endif
1504 
1505       default:
1506          unreachable("unhandled query type");
1507       }
1508 
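      /* With VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, append the query's 64-bit
       * availability word (stored at offset 0 of the slot) after the result
       * values.
       */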
1509       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1510          gpu_write_query_result(&b, dest_addr, flags, idx,
1511                                 mi_mem64(query_addr));
1512       }
1513 
1514       dest_addr = anv_address_add(dest_addr, destStride);
1515    }
1516 }
1517 
1518 #else
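/* Pre-Haswell command streamers lack the MI_MATH support that the GPU-side
 * result computation above relies on.
 */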
1519 void genX(CmdCopyQueryPoolResults)(
1520     VkCommandBuffer                             commandBuffer,
1521     VkQueryPool                                 queryPool,
1522     uint32_t                                    firstQuery,
1523     uint32_t                                    queryCount,
1524     VkBuffer                                    destBuffer,
1525     VkDeviceSize                                destOffset,
1526     VkDeviceSize                                destStride,
1527     VkQueryResultFlags                          flags)
1528 {
1529    anv_finishme("Queries not yet supported on Ivy Bridge");
1530 }
1531 #endif
1532