1 /*
2 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "nvk_query_pool.h"
6
7 #include "nvk_buffer.h"
8 #include "nvk_cmd_buffer.h"
9 #include "nvk_device.h"
10 #include "nvk_entrypoints.h"
11 #include "nvk_event.h"
12 #include "nvk_mme.h"
13 #include "nvk_physical_device.h"
14 #include "nvkmd/nvkmd.h"
15
16 #include "vk_common_entrypoints.h"
17 #include "vk_meta.h"
18 #include "vk_pipeline.h"
19
20 #include "compiler/nir/nir.h"
21 #include "compiler/nir/nir_builder.h"
22
23 #include "util/os_time.h"
24
25 #include "nv_push_cl906f.h"
26 #include "nv_push_cl9097.h"
27 #include "nv_push_cla0c0.h"
28 #include "nv_push_clc597.h"
29
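/* A single 16 B query report as consumed by this file: the 64-bit counter
 * value followed by the 64-bit GPU timestamp, matching what
 * SET_REPORT_SEMAPHORE writes with STRUCTURE_SIZE_FOUR_WORDS. Most query
 * types below store a begin/end pair of these and report the delta of the
 * two values.
 */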
30 struct nvk_query_report {
31 uint64_t value;
32 uint64_t timestamp;
33 };
34
35 VKAPI_ATTR VkResult VKAPI_CALL
36 nvk_CreateQueryPool(VkDevice device,
37 const VkQueryPoolCreateInfo *pCreateInfo,
38 const VkAllocationCallbacks *pAllocator,
39 VkQueryPool *pQueryPool)
40 {
41 VK_FROM_HANDLE(nvk_device, dev, device);
42 struct nvk_physical_device *pdev = nvk_device_physical(dev);
43 struct nvk_query_pool *pool;
44 VkResult result;
45
46 pool = vk_query_pool_create(&dev->vk, pCreateInfo,
47 pAllocator, sizeof(*pool));
48 if (!pool)
49 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
50
51    /* We place the availability dwords first, followed by the report data */
52 pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
53 sizeof(struct nvk_query_report));
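   /* Example: a 3-query occlusion pool has three 4 B availability dwords at
    * offsets 0..11, padded so query_start = 16, followed by one 32 B slot
    * per query (a begin/end pair of reports) at
    * query_start + query * query_stride.
    */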
54
55 uint32_t reports_per_query;
56 switch (pCreateInfo->queryType) {
57 case VK_QUERY_TYPE_OCCLUSION:
58 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
59 reports_per_query = 2;
60 break;
61 case VK_QUERY_TYPE_TIMESTAMP:
62 reports_per_query = 1;
63 break;
64 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
65 reports_per_query = 2 * util_bitcount(pool->vk.pipeline_statistics);
66 break;
67 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
68       /* 2 for primitives succeeded, 2 for primitives needed */
69 reports_per_query = 4;
70 break;
71 default:
72 unreachable("Unsupported query type");
73 }
74 pool->query_stride = reports_per_query * sizeof(struct nvk_query_report);
75
76 if (pool->vk.query_count > 0) {
77 uint32_t mem_size = pool->query_start +
78 pool->query_stride * pool->vk.query_count;
79 result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &dev->vk.base,
80 mem_size, 0 /* align_B */,
81 NVKMD_MEM_GART,
82 NVKMD_MEM_MAP_RDWR,
83 &pool->mem);
84 if (result != VK_SUCCESS) {
85 vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
86 return result;
87 }
88
89 if (pdev->debug_flags & NVK_DEBUG_ZERO_MEMORY)
90 memset(pool->mem->map, 0, mem_size);
91 }
92
93 *pQueryPool = nvk_query_pool_to_handle(pool);
94
95 return VK_SUCCESS;
96 }
97
98 VKAPI_ATTR void VKAPI_CALL
99 nvk_DestroyQueryPool(VkDevice device,
100 VkQueryPool queryPool,
101 const VkAllocationCallbacks *pAllocator)
102 {
103 VK_FROM_HANDLE(nvk_device, dev, device);
104 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
105
106 if (!pool)
107 return;
108
109 if (pool->mem)
110 nvkmd_mem_unref(pool->mem);
111 vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
112 }
113
114 static uint64_t
115 nvk_query_available_addr(struct nvk_query_pool *pool, uint32_t query)
116 {
117 assert(query < pool->vk.query_count);
118 return pool->mem->va->addr + query * sizeof(uint32_t);
119 }
120
121 static nir_def *
122 nvk_nir_available_addr(nir_builder *b, nir_def *pool_addr,
123 nir_def *query)
124 {
125 nir_def *offset = nir_imul_imm(b, query, sizeof(uint32_t));
126 return nir_iadd(b, pool_addr, nir_u2u64(b, offset));
127 }
128
129 static uint32_t *
130 nvk_query_available_map(struct nvk_query_pool *pool, uint32_t query)
131 {
132 assert(query < pool->vk.query_count);
133 return (uint32_t *)pool->mem->map + query;
134 }
135
136 static uint64_t
137 nvk_query_offset(struct nvk_query_pool *pool, uint32_t query)
138 {
139 assert(query < pool->vk.query_count);
140 return pool->query_start + query * pool->query_stride;
141 }
142
143 static uint64_t
144 nvk_query_report_addr(struct nvk_query_pool *pool, uint32_t query)
145 {
146 return pool->mem->va->addr + nvk_query_offset(pool, query);
147 }
148
149 static nir_def *
150 nvk_nir_query_report_addr(nir_builder *b, nir_def *pool_addr,
151 nir_def *query_start, nir_def *query_stride,
152 nir_def *query)
153 {
154 nir_def *offset =
155 nir_iadd(b, query_start, nir_umul_2x32_64(b, query, query_stride));
156 return nir_iadd(b, pool_addr, offset);
157 }
158
159 static struct nvk_query_report *
160 nvk_query_report_map(struct nvk_query_pool *pool, uint32_t query)
161 {
162 return (void *)((char *)pool->mem->map + nvk_query_offset(pool, query));
163 }
164
165 /**
166 * Goes through a series of consecutive query indices in the given pool,
167  * marking each of them as available. Used for the extra queries consumed
167  * by multiview; no report data is written here.
168 */
169 static void
170 emit_zero_queries(struct nvk_cmd_buffer *cmd, struct nvk_query_pool *pool,
171 uint32_t first_index, uint32_t num_queries)
172 {
173 switch (pool->vk.query_type) {
174 case VK_QUERY_TYPE_OCCLUSION:
175 case VK_QUERY_TYPE_TIMESTAMP:
176 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
177 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
178 for (uint32_t i = 0; i < num_queries; i++) {
179 uint64_t addr = nvk_query_available_addr(pool, first_index + i);
180
181 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
182 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
183 P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
184 P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
185 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
186 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
187 .operation = OPERATION_RELEASE,
188 .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
189 .pipeline_location = PIPELINE_LOCATION_ALL,
190 .structure_size = STRUCTURE_SIZE_ONE_WORD,
191 });
192 }
193 break;
194 }
195 default:
196 unreachable("Unsupported query type");
197 }
198 }
199
200 VKAPI_ATTR void VKAPI_CALL
201 nvk_ResetQueryPool(VkDevice device,
202 VkQueryPool queryPool,
203 uint32_t firstQuery,
204 uint32_t queryCount)
205 {
206 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
207
208 uint32_t *available = nvk_query_available_map(pool, firstQuery);
209 memset(available, 0, queryCount * sizeof(*available));
210 }
211
212 VKAPI_ATTR void VKAPI_CALL
213 nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer,
214 VkQueryPool queryPool,
215 uint32_t firstQuery,
216 uint32_t queryCount)
217 {
218 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
219 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
220
221 for (uint32_t i = 0; i < queryCount; i++) {
222 uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);
223
224 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
225 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
226 P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
227 P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
228 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
229 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
230 .operation = OPERATION_RELEASE,
231 .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
232 .pipeline_location = PIPELINE_LOCATION_ALL,
233 .structure_size = STRUCTURE_SIZE_ONE_WORD,
234 });
235 }
236
237 /* Wait for the above writes to complete. This prevents WaW hazards on any
238 * later query availability updates and ensures vkCmdCopyQueryPoolResults
239 * will see the query as unavailable if it happens before the query is
240 * completed again.
241 */
242 for (uint32_t i = 0; i < queryCount; i++) {
243 uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);
244
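      /* This acquire goes through the host (906F) class: SEMAPHOREB takes
       * the lower address bits in dword units (hence the >> 2), and the
       * acquire only completes once the availability dword reads back 0,
       * with acquire_switch letting the host switch channels while waiting.
       */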
245 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
246 __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
247 P_NV906F_SEMAPHOREA(p, addr >> 32);
248 P_NV906F_SEMAPHOREB(p, (addr & UINT32_MAX) >> 2);
249 P_NV906F_SEMAPHOREC(p, 0);
250 P_NV906F_SEMAPHORED(p, {
251 .operation = OPERATION_ACQUIRE,
252 .acquire_switch = ACQUIRE_SWITCH_ENABLED,
253 .release_size = RELEASE_SIZE_4BYTE,
254 });
255 }
256 }
257
258 VKAPI_ATTR void VKAPI_CALL
259 nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
260 VkPipelineStageFlags2 stage,
261 VkQueryPool queryPool,
262 uint32_t query)
263 {
264 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
265 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
266
267 struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
268
269 uint64_t report_addr = nvk_query_report_addr(pool, query);
270 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
271 P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
272 P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
273 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
274 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
275 .operation = OPERATION_REPORT_ONLY,
276 .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
277 .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
278 });
279
280 uint64_t available_addr = nvk_query_available_addr(pool, query);
281 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
282 P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
283 P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
284 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
285 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
286 .operation = OPERATION_RELEASE,
287 .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
288 .pipeline_location = PIPELINE_LOCATION_ALL,
289 .structure_size = STRUCTURE_SIZE_ONE_WORD,
290 });
291
292 /* From the Vulkan spec:
293 *
294 * "If vkCmdWriteTimestamp2 is called while executing a render pass
295 * instance that has multiview enabled, the timestamp uses N consecutive
296 * query indices in the query pool (starting at query) where N is the
297 * number of bits set in the view mask of the subpass the command is
298 * executed in. The resulting query values are determined by an
299 * implementation-dependent choice of one of the following behaviors:"
300 *
301 * In our case, only the first query is used, so we emit zeros for the
302 * remaining queries, as described in the first behavior listed in the
303 * Vulkan spec:
304 *
305 * "The first query is a timestamp value and (if more than one bit is set
306 * in the view mask) zero is written to the remaining queries."
307 */
308 if (cmd->state.gfx.render.view_mask != 0) {
309 const uint32_t num_queries =
310 util_bitcount(cmd->state.gfx.render.view_mask);
311 if (num_queries > 1)
312 emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
313 }
314 }
315
316 struct nvk_3d_stat_query {
317 VkQueryPipelineStatisticFlagBits flag;
318 uint8_t loc;
319 uint8_t report;
320 };
321
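/* Each enabled statistic consumes one begin/end report pair in the query
 * slot. Compute shader invocations have no 3D-class report, so they are
 * written via the NVK_MME_WRITE_CS_INVOCATIONS macro instead; UINT8_MAX
 * marks that entry as "not a 3D report".
 */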
322 /* This must remain sorted in flag order */
323 static const struct nvk_3d_stat_query nvk_3d_stat_queries[] = {{
324 .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT,
325 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
326 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED,
327 }, {
328 .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
329 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
330 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED,
331 }, {
332 .flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT,
333 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER,
334 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS,
335 }, {
336 .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT,
337 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
338 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS,
339 }, {
340 .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
341 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
342 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED,
343 }, {
344 .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
345 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
346 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS,
347 }, {
348 .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
349 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
350 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED,
351 }, {
352 .flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
353 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER,
354 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS,
355 }, {
356 .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
357 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER,
358 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS,
359 }, {
360 .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
361 .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER,
362 .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS,
363 }, {
364 .flag = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT,
365 .loc = UINT8_MAX,
366 .report = UINT8_MAX,
367 }};
368
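/* Stores a 32-bit value to a GPU address from inside an MME macro by
 * emitting SET_REPORT_SEMAPHORE_A..D. The raw 0x10000000 immediate is the
 * SET_REPORT_SEMAPHORE_D encoding used here for a one-word write, so only
 * the 32-bit payload v lands at addr (no timestamp).
 */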
369 static void
370 mme_store_global(struct mme_builder *b,
371 struct mme_value64 addr,
372 struct mme_value v)
373 {
374 mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
375 mme_emit_addr64(b, addr);
376 mme_emit(b, v);
377 mme_emit(b, mme_imm(0x10000000));
378 }
379
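/* MME macro which writes the 64-bit compute shader invocation counter,
 * accumulated in the MME shadow scratch registers elsewhere in the driver,
 * to the 64-bit destination address passed as macro parameters.
 */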
380 void
381 nvk_mme_write_cs_invocations(struct mme_builder *b)
382 {
383 struct mme_value64 dst_addr = mme_load_addr64(b);
384
385 struct mme_value accum_hi = mme_state(b,
386 NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI));
387 struct mme_value accum_lo = mme_state(b,
388 NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO));
389 struct mme_value64 accum = mme_value64(accum_lo, accum_hi);
390
391 mme_store_global(b, dst_addr, accum.lo);
392 mme_store_global(b, mme_add64(b, dst_addr, mme_imm64(4)), accum.hi);
393 }
394
395 static void
396 nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
397 struct nvk_query_pool *pool,
398 uint32_t query, uint32_t index,
399 bool end)
400 {
401 uint64_t report_addr = nvk_query_report_addr(pool, query) +
402 end * sizeof(struct nvk_query_report);
403
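   /* When ending a query, the report lands in the second report of each
    * begin/end pair (+16 B), and we also emit FLUSH_PENDING_WRITES (2
    * dwords) plus the 5-dword availability release at the bottom, hence the
    * 7 extra dwords.
    */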
404 uint32_t end_size = 7 * end;
405
406 struct nv_push *p;
407 switch (pool->vk.query_type) {
408 case VK_QUERY_TYPE_OCCLUSION:
409 p = nvk_cmd_buffer_push(cmd, 7 + end_size);
410
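      /* Turn ZPASS pixel counting on for the duration of the query (and
       * back off at the end); the report below snapshots the running
       * counter.
       */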
411 P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, !end);
412
413 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
414 P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
415 P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
416 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
417 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
418 .operation = OPERATION_REPORT_ONLY,
419 .pipeline_location = PIPELINE_LOCATION_ALL,
420 .report = REPORT_ZPASS_PIXEL_CNT64,
421 .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
422 .flush_disable = true,
423 });
424 break;
425
426 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
427 uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
428 p = nvk_cmd_buffer_push(cmd, stat_count * 5 + end_size);
429
430 ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics;
431 for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) {
432 const struct nvk_3d_stat_query *sq = &nvk_3d_stat_queries[i];
433 if (!(stats_left & sq->flag))
434 continue;
435
436 /* The 3D stat queries array MUST be sorted */
437 assert(!(stats_left & (sq->flag - 1)));
438
439 if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) {
440 P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
441 P_INLINE_DATA(p, report_addr >> 32);
442 P_INLINE_DATA(p, report_addr);
443 } else {
444 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
445 P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
446 P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
447 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
448 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
449 .operation = OPERATION_REPORT_ONLY,
450 .pipeline_location = sq->loc,
451 .report = sq->report,
452 .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
453 .flush_disable = true,
454 });
455 }
456
457 report_addr += 2 * sizeof(struct nvk_query_report);
458 stats_left &= ~sq->flag;
459 }
460 break;
461 }
462
463 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
464 const uint32_t xfb_reports[] = {
465 NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_SUCCEEDED,
466 NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_NEEDED,
467 };
468 p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports) + end_size);
469 for (uint32_t i = 0; i < ARRAY_SIZE(xfb_reports); ++i) {
470 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
471 P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
472 P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
473 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
474 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
475 .operation = OPERATION_REPORT_ONLY,
476 .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
477 .report = xfb_reports[i],
478 .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
479 .sub_report = index,
480 .flush_disable = true,
481 });
482 report_addr += 2 * sizeof(struct nvk_query_report);
483 }
484 break;
485 }
486
487 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
488 p = nvk_cmd_buffer_push(cmd, 5 + end_size);
489
490 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
491 P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
492 P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
493 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
494 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
495 .operation = OPERATION_REPORT_ONLY,
496 .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
497 .report = REPORT_VTG_PRIMITIVES_OUT,
498 .sub_report = index,
499 .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
500 .flush_disable = true,
501 });
502 break;
503
504 default:
505 unreachable("Unsupported query type");
506 }
507
508 if (end) {
509 P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);
510
511 uint64_t available_addr = nvk_query_available_addr(pool, query);
512 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
513 P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
514 P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
515 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
516 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
517 .operation = OPERATION_RELEASE,
518 .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
519 .pipeline_location = PIPELINE_LOCATION_ALL,
520 .structure_size = STRUCTURE_SIZE_ONE_WORD,
521 });
522 }
523 }
524
525 VKAPI_ATTR void VKAPI_CALL
526 nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
527 VkQueryPool queryPool,
528 uint32_t query,
529 VkQueryControlFlags flags,
530 uint32_t index)
531 {
532 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
533 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
534
535 nvk_cmd_begin_end_query(cmd, pool, query, index, false);
536 }
537
538 VKAPI_ATTR void VKAPI_CALL
539 nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
540 VkQueryPool queryPool,
541 uint32_t query,
542 uint32_t index)
543 {
544 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
545 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
546
547 nvk_cmd_begin_end_query(cmd, pool, query, index, true);
548
549 /* From the Vulkan spec:
550 *
551 * "If queries are used while executing a render pass instance that has
552 * multiview enabled, the query uses N consecutive query indices in
553 * the query pool (starting at query) where N is the number of bits set
554 * in the view mask in the subpass the query is used in. How the
555 * numerical results of the query are distributed among the queries is
556 * implementation-dependent."
557 *
558 * In our case, only the first query is used, so we emit zeros for the
559 * remaining queries.
560 */
561 if (cmd->state.gfx.render.view_mask != 0) {
562 const uint32_t num_queries =
563 util_bitcount(cmd->state.gfx.render.view_mask);
564 if (num_queries > 1)
565 emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
566 }
567 }
568
569 static bool
570 nvk_query_is_available(struct nvk_query_pool *pool, uint32_t query)
571 {
572 uint32_t *available = nvk_query_available_map(pool, query);
573 return p_atomic_read(available) != 0;
574 }
575
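/* How long vkGetQueryPoolResults() with WAIT_BIT polls before we give up
 * and declare the device lost: 2 seconds, in nanoseconds.
 */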
576 #define NVK_QUERY_TIMEOUT 2000000000ull
577
578 static VkResult
579 nvk_query_wait_for_available(struct nvk_device *dev,
580 struct nvk_query_pool *pool,
581 uint32_t query)
582 {
583 uint64_t abs_timeout_ns = os_time_get_absolute_timeout(NVK_QUERY_TIMEOUT);
584
585 while (os_time_get_nano() < abs_timeout_ns) {
586 if (nvk_query_is_available(pool, query))
587 return VK_SUCCESS;
588
589 VkResult status = vk_device_check_status(&dev->vk);
590 if (status != VK_SUCCESS)
591 return status;
592 }
593
594 return vk_device_set_lost(&dev->vk, "query timeout");
595 }
596
597 static void
598 cpu_write_query_result(void *dst, uint32_t idx,
599 VkQueryResultFlags flags,
600 uint64_t result)
601 {
602 if (flags & VK_QUERY_RESULT_64_BIT) {
603 uint64_t *dst64 = dst;
604 dst64[idx] = result;
605 } else {
606 uint32_t *dst32 = dst;
607 dst32[idx] = result;
608 }
609 }
610
611 static void
612 cpu_get_query_delta(void *dst, const struct nvk_query_report *src,
613 uint32_t idx, VkQueryResultFlags flags)
614 {
615 uint64_t delta = src[idx * 2 + 1].value - src[idx * 2].value;
616 cpu_write_query_result(dst, idx, flags, delta);
617 }
618
619 VKAPI_ATTR VkResult VKAPI_CALL
620 nvk_GetQueryPoolResults(VkDevice device,
621 VkQueryPool queryPool,
622 uint32_t firstQuery,
623 uint32_t queryCount,
624 size_t dataSize,
625 void *pData,
626 VkDeviceSize stride,
627 VkQueryResultFlags flags)
628 {
629 VK_FROM_HANDLE(nvk_device, dev, device);
630 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
631
632 if (vk_device_is_lost(&dev->vk))
633 return VK_ERROR_DEVICE_LOST;
634
635 VkResult status = VK_SUCCESS;
636 for (uint32_t i = 0; i < queryCount; i++) {
637 const uint32_t query = firstQuery + i;
638
639 bool available = nvk_query_is_available(pool, query);
640
641 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
642 status = nvk_query_wait_for_available(dev, pool, query);
643 if (status != VK_SUCCESS)
644 return status;
645
646 available = true;
647 }
648
649 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
650
651 const struct nvk_query_report *src = nvk_query_report_map(pool, query);
652 assert(i * stride < dataSize);
653 void *dst = (char *)pData + i * stride;
654
655 uint32_t available_dst_idx = 1;
656 switch (pool->vk.query_type) {
657 case VK_QUERY_TYPE_OCCLUSION:
658 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
659 if (write_results)
660 cpu_get_query_delta(dst, src, 0, flags);
661 break;
662 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
663 uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
664 available_dst_idx = stat_count;
665 if (write_results) {
666 for (uint32_t j = 0; j < stat_count; j++)
667 cpu_get_query_delta(dst, src, j, flags);
668 }
669 break;
670 }
671 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
672 const int prims_succeeded_idx = 0;
673 const int prims_needed_idx = 1;
674 available_dst_idx = 2;
675 if (write_results) {
676 cpu_get_query_delta(dst, src, prims_succeeded_idx, flags);
677 cpu_get_query_delta(dst, src, prims_needed_idx, flags);
678 }
679 break;
680 }
681 case VK_QUERY_TYPE_TIMESTAMP:
682 if (write_results)
683 cpu_write_query_result(dst, 0, flags, src->timestamp);
684 break;
685 default:
686 unreachable("Unsupported query type");
687 }
688
689 if (!write_results)
690 status = VK_NOT_READY;
691
692 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
693 cpu_write_query_result(dst, available_dst_idx, flags, available);
694 }
695
696 return status;
697 }
698
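/* Push constants for the copy-queries shader. The field order and offsets
 * must match push_fields in build_copy_queries_shader(); load_struct_var()
 * addresses them by field index.
 */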
699 struct nvk_copy_query_push {
700 uint64_t pool_addr;
701 uint32_t query_start;
702 uint32_t query_stride;
703 uint32_t first_query;
704 uint32_t query_count;
705 uint64_t dst_addr;
706 uint64_t dst_stride;
707 uint32_t flags;
708 };
709
710 static nir_def *
711 load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
712 {
713 nir_deref_instr *deref =
714 nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
715 return nir_load_deref(b, deref);
716 }
717
718 static void
719 nir_write_query_result(nir_builder *b, nir_def *dst_addr,
720 nir_def *idx, nir_def *flags,
721 nir_def *result)
722 {
723 assert(result->num_components == 1);
724 assert(result->bit_size == 64);
725
726 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
727 {
728 nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 8));
729 nir_store_global(b, nir_iadd(b, dst_addr, offset), 8, result, 0x1);
730 }
731 nir_push_else(b, NULL);
732 {
733 nir_def *result32 = nir_u2u32(b, result);
734 nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 4));
735 nir_store_global(b, nir_iadd(b, dst_addr, offset), 4, result32, 0x1);
736 }
737 nir_pop_if(b, NULL);
738 }
739
740 static void
741 nir_get_query_delta(nir_builder *b, nir_def *dst_addr,
742 nir_def *report_addr, nir_def *idx,
743 nir_def *flags)
744 {
745 nir_def *offset =
746 nir_imul_imm(b, idx, 2 * sizeof(struct nvk_query_report));
747 nir_def *begin_addr =
748 nir_iadd(b, report_addr, nir_i2i64(b, offset));
749 nir_def *end_addr =
750 nir_iadd_imm(b, begin_addr, sizeof(struct nvk_query_report));
751
752    /* nvk_query_report::value is the first uint64_t */
753 nir_def *begin = nir_load_global(b, begin_addr, 16, 1, 64);
754 nir_def *end = nir_load_global(b, end_addr, 16, 1, 64);
755
756 nir_def *delta = nir_isub(b, end, begin);
757
758 nir_write_query_result(b, dst_addr, idx, flags, delta);
759 }
760
761 static void
762 nvk_nir_copy_query(nir_builder *b, nir_variable *push, nir_def *i)
763 {
764 nir_def *pool_addr = load_struct_var(b, push, 0);
765 nir_def *query_start = nir_u2u64(b, load_struct_var(b, push, 1));
766 nir_def *query_stride = load_struct_var(b, push, 2);
767 nir_def *first_query = load_struct_var(b, push, 3);
768 nir_def *dst_addr = load_struct_var(b, push, 5);
769 nir_def *dst_stride = load_struct_var(b, push, 6);
770 nir_def *flags = load_struct_var(b, push, 7);
771
772 nir_def *query = nir_iadd(b, first_query, i);
773
774 nir_def *avail_addr = nvk_nir_available_addr(b, pool_addr, query);
775 nir_def *available =
776 nir_i2b(b, nir_load_global(b, avail_addr, 4, 1, 32));
777
778 nir_def *partial = nir_test_mask(b, flags, VK_QUERY_RESULT_PARTIAL_BIT);
779 nir_def *write_results = nir_ior(b, available, partial);
780
781 nir_def *report_addr =
782 nvk_nir_query_report_addr(b, pool_addr, query_start, query_stride,
783 query);
784 nir_def *dst_offset = nir_imul(b, nir_u2u64(b, i), dst_stride);
785
786    /* Timestamp queries are the only ones that use a single report */
787 nir_def *is_timestamp =
788 nir_ieq_imm(b, query_stride, sizeof(struct nvk_query_report));
789
790 nir_def *one = nir_imm_int(b, 1);
791 nir_def *num_reports;
792 nir_push_if(b, is_timestamp);
793 {
794 nir_push_if(b, write_results);
795 {
796 /* This is the timestamp case. We add 8 because we're loading
797 * nvk_query_report::timestamp.
798 */
799 nir_def *timestamp =
800 nir_load_global(b, nir_iadd_imm(b, report_addr, 8), 8, 1, 64);
801
802 nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
803 nir_imm_int(b, 0), flags, timestamp);
804 }
805 nir_pop_if(b, NULL);
806 }
807 nir_push_else(b, NULL);
808 {
809 /* Everything that isn't a timestamp has the invariant that the
810 * number of destination entries is equal to the query stride divided
811 * by the size of two reports.
812 */
813 num_reports = nir_udiv_imm(b, query_stride,
814 2 * sizeof(struct nvk_query_report));
815
816 nir_push_if(b, write_results);
817 {
818 nir_variable *r =
819 nir_local_variable_create(b->impl, glsl_uint_type(), "r");
820 nir_store_var(b, r, nir_imm_int(b, 0), 0x1);
821
822 nir_push_loop(b);
823 {
824 nir_break_if(b, nir_ige(b, nir_load_var(b, r), num_reports));
825
826 nir_get_query_delta(b, nir_iadd(b, dst_addr, dst_offset),
827 report_addr, nir_load_var(b, r), flags);
828
829 nir_store_var(b, r, nir_iadd_imm(b, nir_load_var(b, r), 1), 0x1);
830 }
831 nir_pop_loop(b, NULL);
832 }
833 nir_pop_if(b, NULL);
834 }
835 nir_pop_if(b, NULL);
836
837 num_reports = nir_if_phi(b, one, num_reports);
838
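   /* num_reports doubles as the destination index of the availability word:
    * 1 for timestamps, otherwise the number of begin/end pairs, mirroring
    * available_dst_idx in nvk_GetQueryPoolResults().
    */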
839 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
840 {
841 nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
842 num_reports, flags, nir_b2i64(b, available));
843 }
844 nir_pop_if(b, NULL);
845 }
846
847 static nir_shader *
848 build_copy_queries_shader(void)
849 {
850 nir_builder build =
851 nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
852 "nvk-meta-copy-queries");
853 nir_builder *b = &build;
854
855 struct glsl_struct_field push_fields[] = {
856 { .type = glsl_uint64_t_type(), .name = "pool_addr", .offset = 0 },
857 { .type = glsl_uint_type(), .name = "query_start", .offset = 8 },
858 { .type = glsl_uint_type(), .name = "query_stride", .offset = 12 },
859 { .type = glsl_uint_type(), .name = "first_query", .offset = 16 },
860 { .type = glsl_uint_type(), .name = "query_count", .offset = 20 },
861 { .type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24 },
862 { .type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32 },
863 { .type = glsl_uint_type(), .name = "flags", .offset = 40 },
864 };
865 const struct glsl_type *push_iface_type =
866 glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
867 GLSL_INTERFACE_PACKING_STD140,
868 false /* row_major */, "push");
869 nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
870 push_iface_type, "push");
871
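   /* One query per invocation, 32 invocations per workgroup, matching the
    * DIV_ROUND_UP(query_count, 32) dispatch in
    * nvk_meta_copy_query_pool_results().
    */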
872 b->shader->info.workgroup_size[0] = 32;
873 nir_def *wg_id = nir_load_workgroup_id(b);
874 nir_def *i = nir_iadd(b, nir_load_subgroup_invocation(b),
875 nir_imul_imm(b, nir_channel(b, wg_id, 0), 32));
876
877 nir_def *query_count = load_struct_var(b, push, 4);
878 nir_push_if(b, nir_ilt(b, i, query_count));
879 {
880 nvk_nir_copy_query(b, push, i);
881 }
882 nir_pop_if(b, NULL);
883
884 return build.shader;
885 }
886
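/* Installs the freshly compiled shader in *shader_ptr unless another thread
 * beat us to it, in which case the duplicate is destroyed and the winner is
 * returned. This keeps get_copy_queries_shader() lock-free.
 */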
887 static struct nvk_shader *
888 atomic_set_or_destroy_shader(struct nvk_device *dev,
889 struct nvk_shader **shader_ptr,
890 struct nvk_shader *shader,
891 const VkAllocationCallbacks *alloc)
892 {
893 struct nvk_shader *old_shader = p_atomic_cmpxchg(shader_ptr, NULL, shader);
894 if (old_shader == NULL) {
895 return shader;
896 } else {
897 vk_shader_destroy(&dev->vk, &shader->vk, alloc);
898 return old_shader;
899 }
900 }
901
902 static VkResult
903 get_copy_queries_shader(struct nvk_device *dev,
904 struct nvk_shader **shader_out)
905 {
906 struct nvk_shader *shader = p_atomic_read(&dev->copy_queries);
907 if (shader != NULL) {
908 *shader_out = shader;
909 return VK_SUCCESS;
910 }
911
912 nir_shader *nir = build_copy_queries_shader();
913 VkResult result = nvk_compile_nir_shader(dev, nir, &dev->vk.alloc, &shader);
914 if (result != VK_SUCCESS)
915 return result;
916
917 *shader_out = atomic_set_or_destroy_shader(dev, &dev->copy_queries,
918 shader, &dev->vk.alloc);
919
920 return VK_SUCCESS;
921 }
922
923 static void
924 nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd,
925 struct nvk_query_pool *pool,
926 uint32_t first_query,
927 uint32_t query_count,
928 uint64_t dst_addr,
929 uint64_t dst_stride,
930 VkQueryResultFlags flags)
931 {
932 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
933
934 struct nvk_shader *shader;
935 VkResult result = get_copy_queries_shader(dev, &shader);
936 if (result != VK_SUCCESS) {
937 vk_command_buffer_set_error(&cmd->vk, result);
938 return;
939 }
940
941 const struct nvk_copy_query_push push = {
942 .pool_addr = pool->mem->va->addr,
943 .query_start = pool->query_start,
944 .query_stride = pool->query_stride,
945 .first_query = first_query,
946 .query_count = query_count,
947 .dst_addr = dst_addr,
948 .dst_stride = dst_stride,
949 .flags = flags,
950 };
951 nvk_cmd_dispatch_shader(cmd, shader, &push, sizeof(push),
952 DIV_ROUND_UP(query_count, 32), 1, 1);
953 }
954
955 VKAPI_ATTR void VKAPI_CALL
956 nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
957 VkQueryPool queryPool,
958 uint32_t firstQuery,
959 uint32_t queryCount,
960 VkBuffer dstBuffer,
961 VkDeviceSize dstOffset,
962 VkDeviceSize stride,
963 VkQueryResultFlags flags)
964 {
965 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
966 VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
967 VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer);
968
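   /* With VK_QUERY_RESULT_WAIT_BIT, acquire (>= 1) on each availability
    * dword through the host class before launching the copy shader so it
    * only ever reads final results.
    */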
969 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
970 for (uint32_t i = 0; i < queryCount; i++) {
971 uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i);
972
973 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
974 __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
975 P_NV906F_SEMAPHOREA(p, avail_addr >> 32);
976 P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2);
977 P_NV906F_SEMAPHOREC(p, 1);
978 P_NV906F_SEMAPHORED(p, {
979 .operation = OPERATION_ACQ_GEQ,
980 .acquire_switch = ACQUIRE_SWITCH_ENABLED,
981 .release_size = RELEASE_SIZE_4BYTE,
982 });
983 }
984 }
985
986 uint64_t dst_addr = nvk_buffer_address(dst_buffer, dstOffset);
987 nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
988 dst_addr, stride, flags);
989 }
990
991