/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */
8
9 #include <assert.h>
10 #include <fcntl.h>
11 #include <stdbool.h>
12 #include <string.h>
13
14 #include "bvh/bvh.h"
15 #include "meta/radv_meta.h"
16 #include "nir/nir_builder.h"
17 #include "util/u_atomic.h"
18 #include "vulkan/vulkan_core.h"
19 #include "radv_cs.h"
20 #include "radv_entrypoints.h"
21 #include "radv_perfcounter.h"
22 #include "radv_query.h"
23 #include "radv_rmv.h"
24 #include "sid.h"
25 #include "vk_acceleration_structure.h"
26 #include "vk_common_entrypoints.h"
27 #include "vk_shader_module.h"
28
29 #define TIMESTAMP_NOT_READY UINT64_MAX
30
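/* Remaps each VkQueryPipelineStatisticFlagBits bit (indexed as ffs(bit) - 1) to the 64-bit result
 * slot used for that counter inside a sampled pipeline-statistics block; see
 * radv_get_pipelinestat_query_offset() below.
 */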
31 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10, 13, 11, 12};
32
33 static unsigned
radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
35 {
36 uint32_t idx = ffs(query) - 1;
37 return pipeline_statistics_indices[idx] * 8;
38 }
39
40 static unsigned
radv_get_pipelinestat_query_size(struct radv_device *device)
42 {
/* GFX10_3 only has 11 valid pipeline statistics counters, but to emulate mesh/task shader
 * invocations it is easier to use the same block size as GFX11 (14 counters).
 */
46 const struct radv_physical_device *pdev = radv_device_physical(device);
47 unsigned num_results = pdev->info.gfx_level >= GFX10_3 ? 14 : 11;
48 return num_results * 8;
49 }
50
51 static bool
radv_occlusion_query_use_l2(const struct radv_physical_device *pdev)
53 {
54 /* Occlusion query writes don't go through L2 on GFX6-8 which means the driver would need to
55 * flush caches before every read in shaders or use MTYPE=3 (ie. uncached) in the buffer
56 * descriptor to bypass L2. Use the WAIT_REG_MEM logic instead which is easier to implement.
57 */
58 return pdev->info.gfx_level >= GFX9;
59 }
60
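/* Stores the 32-bit availability value at the given offset when requested; if VK_QUERY_RESULT_64_BIT
 * is set, the value is zero-extended to 64 bits so it matches the element size the caller expects.
 */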
61 static void
radv_store_availability(nir_builder *b, nir_def *flags, nir_def *dst_buf, nir_def *offset, nir_def *value32)
63 {
64 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
65
66 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
67
68 nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
69
70 nir_push_else(b, NULL);
71
72 nir_store_ssbo(b, value32, dst_buf, offset);
73
74 nir_pop_if(b, NULL);
75
76 nir_pop_if(b, NULL);
77 }
78
79 static nir_shader *
build_occlusion_query_shader(struct radv_device *device)
81 {
82 /* the shader this builds is roughly
83 *
84 * push constants {
85 * uint32_t flags;
86 * uint32_t dst_stride;
87 * };
88 *
89 * uint32_t src_stride = 16 * db_count;
90 *
91 * location(binding = 0) buffer dst_buf;
92 * location(binding = 1) buffer src_buf;
93 *
94 * void main() {
95 * uint64_t result = 0;
96 * uint64_t src_offset = src_stride * global_id.x;
97 * uint64_t dst_offset = dst_stride * global_id.x;
98 * bool available = true;
99 * for (int i = 0; i < db_count; ++i) {
100 * if (enabled_rb_mask & BITFIELD64_BIT(i)) {
101 * uint64_t start = src_buf[src_offset + 16 * i];
102 * uint64_t end = src_buf[src_offset + 16 * i + 8];
103 * if ((start & (1ull << 63)) && (end & (1ull << 63)))
104 * result += end - start;
105 * else
106 * available = false;
107 * }
108 * }
109 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
110 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
111 * if (flags & VK_QUERY_RESULT_64_BIT)
112 * dst_buf[dst_offset] = result;
113 * else
 * dst_buf[dst_offset] = (uint32_t)result;
115 * }
116 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
117 * dst_buf[dst_offset + elem_size] = available;
118 * }
119 * }
120 */
121 const struct radv_physical_device *pdev = radv_device_physical(device);
122 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
123 b.shader->info.workgroup_size[0] = 64;
124
125 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
126 nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
127 nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
128 nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
129 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
130 uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
131 unsigned db_count = pdev->info.max_render_backends;
132
133 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
134
135 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
136 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
137
138 nir_def *global_id = get_global_ids(&b, 1);
139
140 nir_def *input_stride = nir_imm_int(&b, db_count * 16);
141 nir_def *input_base = nir_imul(&b, input_stride, global_id);
142 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
143 nir_def *output_base = nir_imul(&b, output_stride, global_id);
144
145 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
146 nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
147 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
148
149 if (radv_occlusion_query_use_l2(pdev)) {
150 nir_def *query_result_wait = nir_test_mask(&b, flags, VK_QUERY_RESULT_WAIT_BIT);
151 nir_push_if(&b, query_result_wait);
152 {
153 /* Wait on the upper word of the last DB entry. */
154 nir_push_loop(&b);
155 {
156 const uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
157
/* Prevent the SSBO load from being moved out of the loop. */
159 nir_scoped_memory_barrier(&b, SCOPE_INVOCATION, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo);
160
161 nir_def *load_offset = nir_iadd_imm(&b, input_base, rb_avail_offset);
162 nir_def *load = nir_load_ssbo(&b, 1, 32, src_buf, load_offset, .align_mul = 4, .access = ACCESS_COHERENT);
163
164 nir_push_if(&b, nir_ige_imm(&b, load, 0x80000000));
165 {
166 nir_jump(&b, nir_jump_break);
167 }
168 nir_pop_if(&b, NULL);
169 }
170 nir_pop_loop(&b, NULL);
171 }
172 nir_pop_if(&b, NULL);
173 }
174
175 nir_push_loop(&b);
176
177 nir_def *current_outer_count = nir_load_var(&b, outer_counter);
178 radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
179
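/* Skip render backends that are disabled (e.g. harvested); they never write Z-pass counter pairs. */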
180 nir_def *enabled_cond = nir_iand_imm(&b, nir_ishl(&b, nir_imm_int64(&b, 1), current_outer_count), enabled_rb_mask);
181
182 nir_push_if(&b, nir_i2b(&b, enabled_cond));
183
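/* Each render backend slot is a begin/end pair of 64-bit Z-pass counts; bit 63 of each value is the
 * availability marker checked below.
 */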
184 nir_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
185 load_offset = nir_iadd(&b, input_base, load_offset);
186
187 nir_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
188
189 nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
190 nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
191
192 nir_def *start_done = nir_ilt_imm(&b, nir_load_var(&b, start), 0);
193 nir_def *end_done = nir_ilt_imm(&b, nir_load_var(&b, end), 0);
194
195 nir_push_if(&b, nir_iand(&b, start_done, end_done));
196
197 nir_store_var(&b, result,
198 nir_iadd(&b, nir_load_var(&b, result), nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
199 0x1);
200
201 nir_push_else(&b, NULL);
202
203 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
204
205 nir_pop_if(&b, NULL);
206 nir_pop_if(&b, NULL);
207 nir_pop_loop(&b, NULL);
208
209 /* Store the result if complete or if partial results have been requested. */
210
211 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
212 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
213 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
214
215 nir_push_if(&b, result_is_64bit);
216
217 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
218
219 nir_push_else(&b, NULL);
220
221 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, .align_mul = 8);
222
223 nir_pop_if(&b, NULL);
224 nir_pop_if(&b, NULL);
225
226 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
227 nir_b2i32(&b, nir_load_var(&b, available)));
228
229 return b.shader;
230 }
231
232 static nir_shader *
build_pipeline_statistics_query_shader(struct radv_device *device)
234 {
unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
236
237 /* the shader this builds is roughly
238 *
239 * push constants {
240 * uint32_t flags;
241 * uint32_t dst_stride;
242 * uint32_t stats_mask;
243 * uint32_t avail_offset;
244 * };
245 *
246 * uint32_t src_stride = pipelinestat_block_size * 2;
247 *
248 * location(binding = 0) buffer dst_buf;
249 * location(binding = 1) buffer src_buf;
250 *
251 * void main() {
252 * uint64_t src_offset = src_stride * global_id.x;
253 * uint64_t dst_base = dst_stride * global_id.x;
254 * uint64_t dst_offset = dst_base;
255 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
256 * uint32_t elem_count = stats_mask >> 16;
257 * uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
258 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
259 * dst_buf[dst_offset + elem_count * elem_size] = available32;
260 * }
261 * if ((bool)available32) {
262 * // repeat 11 times:
 *       if (stats_mask & (1 << 0)) {
 *          uint64_t start = src_buf[src_offset + 8 * indices[0]];
 *          uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
 *          uint64_t result = end - start;
 *          if (flags & VK_QUERY_RESULT_64_BIT)
 *             dst_buf[dst_offset] = result;
 *          else
 *             dst_buf[dst_offset] = (uint32_t)result;
 *          dst_offset += elem_size;
 *       }
272 * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
273 * // Set everything to 0 as we don't know what is valid.
274 * for (int i = 0; i < elem_count; ++i)
275 * dst_buf[dst_base + elem_size * i] = 0;
276 * }
277 * }
278 */
279 const struct radv_physical_device *pdev = radv_device_physical(device);
280 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
281 b.shader->info.workgroup_size[0] = 64;
282
283 nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
284 nir_variable *result = nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
285 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
286
287 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
288 nir_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
289 nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
290
291 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
292 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
293
294 nir_def *global_id = get_global_ids(&b, 1);
295
296 nir_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
297 nir_def *input_base = nir_imul(&b, input_stride, global_id);
298 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
299 nir_def *output_base = nir_imul(&b, output_stride, global_id);
300
301 avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
302
303 nir_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
304 nir_store_var(&b, available, nir_i2b(&b, available32), 0x1);
305
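/* When task shader invocations are emulated, their begin/end counters are sampled from the ACE
 * queue, so only report the query as available once both samples have their availability bit
 * (bit 31 of the upper dword) set.
 */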
306 if (pdev->emulate_mesh_shader_queries) {
307 nir_push_if(&b, nir_test_mask(&b, stats_mask, VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT));
308 {
309 const uint32_t idx = ffs(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT) - 1;
310
311 nir_def *avail_start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + 4);
312 nir_def *avail_start = nir_load_ssbo(&b, 1, 32, src_buf, avail_start_offset);
313
314 nir_def *avail_end_offset =
315 nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + pipelinestat_block_size + 4);
316 nir_def *avail_end = nir_load_ssbo(&b, 1, 32, src_buf, avail_end_offset);
317
318 nir_def *task_invoc_result_available =
319 nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avail_start, avail_end), 0x80000000));
320
321 nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), task_invoc_result_available), 0x1);
322 }
323 nir_pop_if(&b, NULL);
324 }
325
326 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
327 nir_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
328 nir_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
329
330 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
331 nir_b2i32(&b, nir_load_var(&b, available)));
332
333 nir_push_if(&b, nir_load_var(&b, available));
334
335 nir_store_var(&b, output_offset, output_base, 0x1);
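/* Results are tightly packed in the destination: walk the statistics in Vulkan bit order and only
 * advance the output offset for the counters the pool actually requested.
 */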
336 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
337 nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
338
339 nir_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
340 nir_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
341
342 nir_def *end_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
343 nir_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
344
345 nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
346
347 /* Store result */
348 nir_push_if(&b, result_is_64bit);
349
350 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
351
352 nir_push_else(&b, NULL);
353
354 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
355
356 nir_pop_if(&b, NULL);
357
358 nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size), 0x1);
359
360 nir_pop_if(&b, NULL);
361 }
362
363 nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
364
365 nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
366
367 /* Stores zeros in all outputs. */
368
369 nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
370 nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
371
372 nir_loop *loop = nir_push_loop(&b);
373
374 nir_def *current_counter = nir_load_var(&b, counter);
375 radv_break_on_count(&b, counter, elem_count);
376
377 nir_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
378 nir_push_if(&b, result_is_64bit);
379
380 nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
381
382 nir_push_else(&b, NULL);
383
384 nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
385
386 nir_pop_if(&b, NULL);
387
388 nir_pop_loop(&b, loop);
389 nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
390 nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
391 return b.shader;
392 }
393
394 static nir_shader *
build_tfb_query_shader(struct radv_device *device)
396 {
397 /* the shader this builds is roughly
398 *
399 * uint32_t src_stride = 32;
400 *
401 * location(binding = 0) buffer dst_buf;
402 * location(binding = 1) buffer src_buf;
403 *
404 * void main() {
405 * uint64_t result[2] = {};
406 * bool available = false;
407 * uint64_t src_offset = src_stride * global_id.x;
408 * uint64_t dst_offset = dst_stride * global_id.x;
409 * uint64_t *src_data = src_buf[src_offset];
410 * uint32_t avail = (src_data[0] >> 32) &
411 * (src_data[1] >> 32) &
412 * (src_data[2] >> 32) &
413 * (src_data[3] >> 32);
414 * if (avail & 0x80000000) {
415 * result[0] = src_data[3] - src_data[1];
416 * result[1] = src_data[2] - src_data[0];
417 * available = true;
418 * }
419 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
420 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
421 * if (flags & VK_QUERY_RESULT_64_BIT) {
422 * dst_buf[dst_offset] = result;
423 * } else {
424 * dst_buf[dst_offset] = (uint32_t)result;
425 * }
426 * }
427 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
428 * dst_buf[dst_offset + result_size] = available;
429 * }
430 * }
431 */
432 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
433 b.shader->info.workgroup_size[0] = 64;
434
435 /* Create and initialize local variables. */
436 nir_variable *result = nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
437 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
438
439 nir_store_var(&b, result, nir_replicate(&b, nir_imm_int64(&b, 0), 2), 0x3);
440 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
441
442 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
443
444 /* Load resources. */
445 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
446 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
447
448 /* Compute global ID. */
449 nir_def *global_id = get_global_ids(&b, 1);
450
451 /* Compute src/dst strides. */
452 nir_def *input_stride = nir_imm_int(&b, 32);
453 nir_def *input_base = nir_imul(&b, input_stride, global_id);
454 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
455 nir_def *output_base = nir_imul(&b, output_stride, global_id);
456
457 /* Load data from the query pool. */
458 nir_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
459 nir_def *load2 = nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
460
461 /* Check if result is available. */
462 nir_def *avails[2];
463 avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
464 avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
465 nir_def *result_is_available = nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
466
467 /* Only compute result if available. */
468 nir_push_if(&b, result_is_available);
469
470 /* Pack values. */
471 nir_def *packed64[4];
472 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
473 packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
474 packed64[2] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
475 packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
476
477 /* Compute result. */
478 nir_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
479 nir_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
480
481 nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
482 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
483
484 nir_pop_if(&b, NULL);
485
486 /* Determine if result is 64 or 32 bit. */
487 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
488 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
489
490 /* Store the result if complete or partial results have been requested. */
491 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
492
493 /* Store result. */
494 nir_push_if(&b, result_is_64bit);
495
496 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
497
498 nir_push_else(&b, NULL);
499
500 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
501
502 nir_pop_if(&b, NULL);
503 nir_pop_if(&b, NULL);
504
505 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
506 nir_b2i32(&b, nir_load_var(&b, available)));
507
508 return b.shader;
509 }
510
511 static nir_shader *
build_timestamp_query_shader(struct radv_device *device)
513 {
514 /* the shader this builds is roughly
515 *
516 * uint32_t src_stride = 8;
517 *
518 * location(binding = 0) buffer dst_buf;
519 * location(binding = 1) buffer src_buf;
520 *
521 * void main() {
522 * uint64_t result = 0;
523 * bool available = false;
524 * uint64_t src_offset = src_stride * global_id.x;
525 * uint64_t dst_offset = dst_stride * global_id.x;
526 * uint64_t timestamp = src_buf[src_offset];
527 * if (timestamp != TIMESTAMP_NOT_READY) {
528 * result = timestamp;
529 * available = true;
530 * }
531 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
532 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
533 * if (flags & VK_QUERY_RESULT_64_BIT) {
534 * dst_buf[dst_offset] = result;
535 * } else {
536 * dst_buf[dst_offset] = (uint32_t)result;
537 * }
538 * }
539 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
540 * dst_buf[dst_offset + result_size] = available;
541 * }
542 * }
543 */
544 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
545 b.shader->info.workgroup_size[0] = 64;
546
547 /* Create and initialize local variables. */
548 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
549 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
550
551 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
552 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
553
554 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
555
556 /* Load resources. */
557 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
558 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
559
560 /* Compute global ID. */
561 nir_def *global_id = get_global_ids(&b, 1);
562
563 /* Compute src/dst strides. */
564 nir_def *input_stride = nir_imm_int(&b, 8);
565 nir_def *input_base = nir_imul(&b, input_stride, global_id);
566 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
567 nir_def *output_base = nir_imul(&b, output_stride, global_id);
568
569 /* Load data from the query pool. */
570 nir_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
571
572 /* Pack the timestamp. */
573 nir_def *timestamp;
574 timestamp = nir_pack_64_2x32(&b, nir_trim_vector(&b, load, 2));
575
576 /* Check if result is available. */
577 nir_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
578
579 /* Only store result if available. */
580 nir_push_if(&b, result_is_available);
581
582 nir_store_var(&b, result, timestamp, 0x1);
583 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
584
585 nir_pop_if(&b, NULL);
586
587 /* Determine if result is 64 or 32 bit. */
588 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
589 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
590
591 /* Store the result if complete or partial results have been requested. */
592 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
593
594 /* Store result. */
595 nir_push_if(&b, result_is_64bit);
596
597 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
598
599 nir_push_else(&b, NULL);
600
601 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
602
603 nir_pop_if(&b, NULL);
604
605 nir_pop_if(&b, NULL);
606
607 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
608 nir_b2i32(&b, nir_load_var(&b, available)));
609
610 return b.shader;
611 }
612
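/* A primitives-generated query slot is a 32-byte streamout-statistics sample; when the NGG path has to
 * be emulated via GDS, two extra 64-bit begin/end counters are appended to the slot.
 */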
613 #define RADV_PGQ_STRIDE 32
614 #define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 8 * 2)
615
616 static nir_shader *
build_pg_query_shader(struct radv_device *device)
618 {
619 /* the shader this builds is roughly
620 *
621 * uint32_t src_stride = 32;
622 *
623 * location(binding = 0) buffer dst_buf;
624 * location(binding = 1) buffer src_buf;
625 *
626 * void main() {
627 * uint64_t result = {};
628 * bool available = false;
629 * uint64_t src_offset = src_stride * global_id.x;
630 * uint64_t dst_offset = dst_stride * global_id.x;
631 * uint64_t *src_data = src_buf[src_offset];
632 * uint32_t avail = (src_data[0] >> 32) &
633 * (src_data[2] >> 32);
634 * if (avail & 0x80000000) {
635 * result = src_data[2] - src_data[0];
636 * if (use_gds) {
637 * uint32_t ngg_gds_result = 0;
 * ngg_gds_result += src_data[5] - src_data[4];
639 * result += (uint64_t)ngg_gds_result;
640 * }
641 * available = true;
642 * }
643 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
644 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
645 * if (flags & VK_QUERY_RESULT_64_BIT) {
646 * dst_buf[dst_offset] = result;
647 * } else {
648 * dst_buf[dst_offset] = (uint32_t)result;
649 * }
650 * }
651 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
652 * dst_buf[dst_offset + result_size] = available;
653 * }
654 * }
655 */
656 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
657 b.shader->info.workgroup_size[0] = 64;
658
659 /* Create and initialize local variables. */
660 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
661 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
662
663 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
664 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
665
666 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
667
668 /* Load resources. */
669 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
670 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
671
672 /* Compute global ID. */
673 nir_def *global_id = get_global_ids(&b, 1);
674
675 /* Determine if the query pool uses GDS for NGG. */
676 nir_def *uses_gds = nir_i2b(&b, nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20));
677
678 /* Compute src/dst strides. */
679 nir_def *input_stride =
680 nir_bcsel(&b, uses_gds, nir_imm_int(&b, RADV_PGQ_STRIDE_GDS), nir_imm_int(&b, RADV_PGQ_STRIDE));
681 nir_def *input_base = nir_imul(&b, input_stride, global_id);
682 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
683 nir_def *output_base = nir_imul(&b, output_stride, global_id);
684
685 /* Load data from the query pool. */
686 nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
687 nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
688
689 /* Check if result is available. */
690 nir_def *avails[2];
691 avails[0] = nir_channel(&b, load1, 1);
692 avails[1] = nir_channel(&b, load2, 1);
693 nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1);
694
695 nir_push_if(&b, uses_gds);
696 {
697 nir_def *gds_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4);
698 nir_def *gds_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4);
699 nir_def *gds_result_available =
700 nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, gds_avail_start, gds_avail_end), 0x80000000));
701
702 nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), gds_result_available), 0x1);
703 }
704 nir_pop_if(&b, NULL);
705
706 /* Only compute result if available. */
707 nir_push_if(&b, nir_load_var(&b, available));
708
709 /* Pack values. */
710 nir_def *packed64[2];
711 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
712 packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
713
714 /* Compute result. */
715 nir_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
716
717 nir_store_var(&b, result, primitive_storage_needed, 0x1);
718
719 nir_push_if(&b, uses_gds);
720 {
721 nir_def *gds_start =
722 nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
723 nir_def *gds_end =
724 nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4);
725
726 nir_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
727
728 nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), nir_u2u64(&b, ngg_gds_result)), 0x1);
729 }
730 nir_pop_if(&b, NULL);
731
732 nir_pop_if(&b, NULL);
733
734 /* Determine if result is 64 or 32 bit. */
735 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
736 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
737
738 /* Store the result if complete or partial results have been requested. */
739 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
740
741 /* Store result. */
742 nir_push_if(&b, result_is_64bit);
743
744 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
745
746 nir_push_else(&b, NULL);
747
748 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
749
750 nir_pop_if(&b, NULL);
751 nir_pop_if(&b, NULL);
752
753 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
754 nir_b2i32(&b, nir_load_var(&b, available)));
755
756 return b.shader;
757 }
758
759 static nir_shader *
build_ms_prim_gen_query_shader(struct radv_device *device)
761 {
762 /* the shader this builds is roughly
763 *
764 * uint32_t src_stride = 32;
765 *
766 * location(binding = 0) buffer dst_buf;
767 * location(binding = 1) buffer src_buf;
768 *
769 * void main() {
770 * uint64_t result = {};
771 * bool available = false;
772 * uint64_t src_offset = src_stride * global_id.x;
773 * uint64_t dst_offset = dst_stride * global_id.x;
774 * uint64_t *src_data = src_buf[src_offset];
775 * uint32_t avail = (src_data[0] >> 32) & (src_data[1] >> 32);
776 * if (avail & 0x80000000) {
777 * result = src_data[1] - src_data[0];
778 * available = true;
779 * }
780 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
781 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
782 * if (flags & VK_QUERY_RESULT_64_BIT) {
783 * dst_buf[dst_offset] = result;
784 * } else {
785 * dst_buf[dst_offset] = (uint32_t)result;
786 * }
787 * }
788 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
789 * dst_buf[dst_offset + result_size] = available;
790 * }
791 * }
792 */
793 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "ms_prim_gen_query");
794 b.shader->info.workgroup_size[0] = 64;
795
796 /* Create and initialize local variables. */
797 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
798 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
799
800 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
801 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
802
803 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
804
805 /* Load resources. */
806 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
807 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
808
809 /* Compute global ID. */
810 nir_def *global_id = get_global_ids(&b, 1);
811
812 /* Compute src/dst strides. */
813 nir_def *input_base = nir_imul_imm(&b, global_id, 16);
814 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
815 nir_def *output_base = nir_imul(&b, output_stride, global_id);
816
817 /* Load data from the query pool. */
818 nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
819 nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd_imm(&b, input_base, 8), .align_mul = 16);
820
821 /* Check if result is available. */
822 nir_def *avails[2];
823 avails[0] = nir_channel(&b, load1, 1);
824 avails[1] = nir_channel(&b, load2, 1);
825 nir_def *result_is_available = nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000));
826
827 /* Only compute result if available. */
828 nir_push_if(&b, result_is_available);
829
830 /* Pack values. */
831 nir_def *packed64[2];
832 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
833 packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
834
835 /* Compute result. */
836 nir_def *ms_prim_gen = nir_isub(&b, packed64[1], packed64[0]);
837
838 nir_store_var(&b, result, ms_prim_gen, 0x1);
839
840 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
841
842 nir_pop_if(&b, NULL);
843
844 /* Determine if result is 64 or 32 bit. */
845 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
846 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
847
848 /* Store the result if complete or partial results have been requested. */
849 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
850
851 /* Store result. */
852 nir_push_if(&b, result_is_64bit);
853
854 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
855
856 nir_push_else(&b, NULL);
857
858 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
859
860 nir_pop_if(&b, NULL);
861 nir_pop_if(&b, NULL);
862
863 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
864 nir_b2i32(&b, nir_load_var(&b, available)));
865
866 return b.shader;
867 }
868
869 static VkResult
radv_device_init_meta_query_state_internal(struct radv_device *device)
871 {
872 const struct radv_physical_device *pdev = radv_device_physical(device);
873 VkResult result;
874 nir_shader *occlusion_cs = NULL;
875 nir_shader *pipeline_statistics_cs = NULL;
876 nir_shader *tfb_cs = NULL;
877 nir_shader *timestamp_cs = NULL;
878 nir_shader *pg_cs = NULL;
879 nir_shader *ms_prim_gen_cs = NULL;
880
881 mtx_lock(&device->meta_state.mtx);
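/* The pipelines are created lazily; if another thread already built them while we were waiting for
 * the lock, there is nothing left to do.
 */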
882 if (device->meta_state.query.pipeline_statistics_query_pipeline) {
883 mtx_unlock(&device->meta_state.mtx);
884 return VK_SUCCESS;
885 }
886 occlusion_cs = build_occlusion_query_shader(device);
887 pipeline_statistics_cs = build_pipeline_statistics_query_shader(device);
888 tfb_cs = build_tfb_query_shader(device);
889 timestamp_cs = build_timestamp_query_shader(device);
890 pg_cs = build_pg_query_shader(device);
891
892 if (pdev->emulate_mesh_shader_queries)
893 ms_prim_gen_cs = build_ms_prim_gen_query_shader(device);
894
895 const VkDescriptorSetLayoutBinding bindings[] = {
896 {.binding = 0,
897 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
898 .descriptorCount = 1,
899 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT},
900 {
901 .binding = 1,
902 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
903 .descriptorCount = 1,
904 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
905 },
906 };
907
908 result = radv_meta_create_descriptor_set_layout(device, 2, bindings, &device->meta_state.query.ds_layout);
909 if (result != VK_SUCCESS)
910 goto fail;
911
912 const VkPushConstantRange pc_range = {
913 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
914 .size = 20,
915 };
916
917 result = radv_meta_create_pipeline_layout(device, &device->meta_state.query.ds_layout, 1, &pc_range,
918 &device->meta_state.query.p_layout);
919 if (result != VK_SUCCESS)
920 goto fail;
921
922 result = radv_meta_create_compute_pipeline(device, occlusion_cs, device->meta_state.query.p_layout,
923 &device->meta_state.query.occlusion_query_pipeline);
924 if (result != VK_SUCCESS)
925 goto fail;
926
927 result = radv_meta_create_compute_pipeline(device, pipeline_statistics_cs, device->meta_state.query.p_layout,
928 &device->meta_state.query.pipeline_statistics_query_pipeline);
929 if (result != VK_SUCCESS)
930 goto fail;
931
932 result = radv_meta_create_compute_pipeline(device, tfb_cs, device->meta_state.query.p_layout,
933 &device->meta_state.query.tfb_query_pipeline);
934 if (result != VK_SUCCESS)
935 goto fail;
936
937 result = radv_meta_create_compute_pipeline(device, timestamp_cs, device->meta_state.query.p_layout,
938 &device->meta_state.query.timestamp_query_pipeline);
939 if (result != VK_SUCCESS)
940 goto fail;
941
942 result = radv_meta_create_compute_pipeline(device, pg_cs, device->meta_state.query.p_layout,
943 &device->meta_state.query.pg_query_pipeline);
944
945 if (pdev->emulate_mesh_shader_queries) {
946 result = radv_meta_create_compute_pipeline(device, ms_prim_gen_cs, device->meta_state.query.p_layout,
947 &device->meta_state.query.ms_prim_gen_query_pipeline);
948 }
949
950 fail:
951 ralloc_free(occlusion_cs);
952 ralloc_free(pipeline_statistics_cs);
953 ralloc_free(tfb_cs);
954 ralloc_free(pg_cs);
955 ralloc_free(ms_prim_gen_cs);
956 ralloc_free(timestamp_cs);
957 mtx_unlock(&device->meta_state.mtx);
958 return result;
959 }
960
961 VkResult
radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
963 {
964 if (on_demand)
965 return VK_SUCCESS;
966
967 return radv_device_init_meta_query_state_internal(device);
968 }
969
970 void
radv_device_finish_meta_query_state(struct radv_device *device)
972 {
973 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.tfb_query_pipeline,
974 &device->meta_state.alloc);
975 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.pipeline_statistics_query_pipeline,
976 &device->meta_state.alloc);
977 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.occlusion_query_pipeline,
978 &device->meta_state.alloc);
979 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.timestamp_query_pipeline,
980 &device->meta_state.alloc);
981 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.pg_query_pipeline,
982 &device->meta_state.alloc);
983 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.ms_prim_gen_query_pipeline,
984 &device->meta_state.alloc);
985 radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout,
986 &device->meta_state.alloc);
987 device->vk.dispatch_table.DestroyDescriptorSetLayout(radv_device_to_handle(device),
988 device->meta_state.query.ds_layout, &device->meta_state.alloc);
989 }
990
991 static void
radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, struct radeon_winsys_bo *src_bo,
                  struct radeon_winsys_bo *dst_bo, uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
                  uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags, uint32_t pipeline_stats_mask,
                  uint32_t avail_offset, bool uses_gds)
996 {
997 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
998 struct radv_meta_saved_state saved_state;
999 struct radv_buffer src_buffer, dst_buffer;
1000
1001 if (!*pipeline) {
1002 VkResult ret = radv_device_init_meta_query_state_internal(device);
1003 if (ret != VK_SUCCESS) {
1004 vk_command_buffer_set_error(&cmd_buffer->vk, ret);
1005 return;
1006 }
1007 }
1008
1009 /* VK_EXT_conditional_rendering says that copy commands should not be
1010 * affected by conditional rendering.
1011 */
1012 radv_meta_save(&saved_state, cmd_buffer,
1013 RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS |
1014 RADV_META_SUSPEND_PREDICATING);
1015
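/* The source range must cover both the per-query result slots and, for pipeline statistics, the
 * availability dwords located past them; the destination only needs dst_size for the last query.
 */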
1016 uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
1017 uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
1018
1019 radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
1020 radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
1021
1022 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1023
1024 radv_meta_push_descriptor_set(
1025 cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, 2,
1026 (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1027 .dstBinding = 0,
1028 .dstArrayElement = 0,
1029 .descriptorCount = 1,
1030 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1031 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1032 .offset = 0,
1033 .range = VK_WHOLE_SIZE}},
1034 {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1035 .dstBinding = 1,
1036 .dstArrayElement = 0,
1037 .descriptorCount = 1,
1038 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1039 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1040 .offset = 0,
1041 .range = VK_WHOLE_SIZE}}});
1042
1043 /* Encode the number of elements for easy access by the shader. */
1044 pipeline_stats_mask &= (1 << (radv_get_pipelinestat_query_size(device) / 8)) - 1;
1045 pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1046
1047 avail_offset -= src_offset;
1048
1049 struct {
1050 uint32_t flags;
1051 uint32_t dst_stride;
1052 uint32_t pipeline_stats_mask;
1053 uint32_t avail_offset;
1054 uint32_t uses_gds;
1055 } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_gds};
1056
1057 vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout,
1058 VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
1059
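/* Make the query data visible to the copy shader: results may have been written through paths that
 * bypass L2 (e.g. occlusion counters on GFX6-8), so invalidate L2 and the vector cache first.
 */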
1060 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1061
1062 if (flags & VK_QUERY_RESULT_WAIT_BIT)
1063 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1064
1065 radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1066
/* Ensure that the query copy dispatch is complete before a potential vkCmdResetQueryPool because
1068 * there is an implicit execution dependency from each such query command to all query commands
1069 * previously submitted to the same queue.
1070 */
1071 cmd_buffer->active_query_flush_bits |=
1072 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1073
1074 radv_buffer_finish(&src_buffer);
1075 radv_buffer_finish(&dst_buffer);
1076
1077 radv_meta_restore(&saved_state, cmd_buffer);
1078 }
1079
1080 static void
radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
                        struct radv_query_pool *pool)
1083 {
1084 if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1085 radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1086
1087 if (pool->bo)
1088 radv_bo_destroy(device, &pool->vk.base, pool->bo);
1089
1090 radv_rmv_log_resource_destroy(device, (uint64_t)radv_query_pool_to_handle(pool));
1091 vk_query_pool_finish(&pool->vk);
1092 vk_free2(&device->vk.alloc, pAllocator, pool);
1093 }
1094
1095 static VkResult
radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *pCreateInfo,
                       const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1098 {
1099 struct radv_physical_device *pdev = radv_device_physical(device);
1100 VkResult result;
1101 size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1102 ? sizeof(struct radv_pc_query_pool)
1103 : sizeof(struct radv_query_pool);
1104
1105 struct radv_query_pool *pool =
1106 vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1107
1108 if (!pool)
1109 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1110
1111 vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
1112
1113 /* The number of primitives generated by geometry shader invocations is only counted by the
1114 * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1115 * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1116 *
1117 * The number of geometry shader invocations is correctly counted by the hardware for both NGG
 * and the legacy GS path, but it also increments for NGG VS/TES because they are merged with GS. To
 * keep this counter from incrementing in that case, it is emulated as well.
1120 */
1121 pool->uses_gds = (pdev->emulate_ngg_gs_query_pipeline_stat &&
1122 (pool->vk.pipeline_statistics & (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
1123 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT))) ||
1124 (pdev->use_ngg && pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) ||
1125 (pdev->emulate_mesh_shader_queries &&
1126 (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT ||
1127 pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT));
1128
1129 /* The number of task shader invocations needs to be queried on ACE. */
1130 pool->uses_ace = (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1131
1132 switch (pCreateInfo->queryType) {
1133 case VK_QUERY_TYPE_OCCLUSION:
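/* One 16-byte begin/end pair of Z-pass counts per render backend. */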
1134 pool->stride = 16 * pdev->info.max_render_backends;
1135 break;
1136 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1137 pool->stride = radv_get_pipelinestat_query_size(device) * 2;
1138 break;
1139 case VK_QUERY_TYPE_TIMESTAMP:
1140 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1141 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1142 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1143 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1144 pool->stride = 8;
1145 break;
1146 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1147 pool->stride = 32;
1148 break;
1149 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1150 if (pool->uses_gds && pdev->info.gfx_level < GFX11) {
1151 /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
1152 * allocate 2x64-bit values for the GDS counters.
1153 */
1154 pool->stride = RADV_PGQ_STRIDE_GDS;
1155 } else {
1156 pool->stride = RADV_PGQ_STRIDE;
1157 }
1158 break;
1159 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1160 result = radv_pc_init_query_pool(pdev, pCreateInfo, (struct radv_pc_query_pool *)pool);
1161
1162 if (result != VK_SUCCESS) {
1163 radv_destroy_query_pool(device, pAllocator, pool);
1164 return vk_error(device, result);
1165 }
1166 break;
1167 }
1168 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1169 if (pdev->info.gfx_level >= GFX11) {
1170 /* GFX11 natively supports mesh generated primitives with pipeline statistics. */
1171 pool->stride = radv_get_pipelinestat_query_size(device) * 2;
1172 } else {
1173 assert(pdev->emulate_mesh_shader_queries);
1174 pool->stride = 16;
1175 }
1176 break;
1177 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
1178 pool->stride = 48;
1179 break;
1180 default:
1181 unreachable("creating unhandled query type");
1182 }
1183
1184 pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1185 pool->size = pool->availability_offset;
1186 if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1187 (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11))
1188 pool->size += 4 * pCreateInfo->queryCount;
1189
1190 result = radv_bo_create(device, &pool->vk.base, pool->size, 64, RADEON_DOMAIN_GTT,
1191 RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_QUERY_POOL, 0, false, &pool->bo);
1192 if (result != VK_SUCCESS) {
1193 radv_destroy_query_pool(device, pAllocator, pool);
1194 return vk_error(device, result);
1195 }
1196
1197 pool->ptr = radv_buffer_map(device->ws, pool->bo);
1198 if (!pool->ptr) {
1199 radv_destroy_query_pool(device, pAllocator, pool);
1200 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1201 }
1202
1203 *pQueryPool = radv_query_pool_to_handle(pool);
1204 radv_rmv_log_query_pool_create(device, *pQueryPool);
1205 return VK_SUCCESS;
1206 }
1207
1208 VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1211 {
1212 VK_FROM_HANDLE(radv_device, device, _device);
1213 return radv_create_query_pool(device, pCreateInfo, pAllocator, pQueryPool);
1214 }
1215
1216 VKAPI_ATTR void VKAPI_CALL
radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
1218 {
1219 VK_FROM_HANDLE(radv_device, device, _device);
1220 VK_FROM_HANDLE(radv_query_pool, pool, _pool);
1221
1222 if (!pool)
1223 return;
1224
1225 radv_destroy_query_pool(device, pAllocator, pool);
1226 }
1227
1228 static inline uint64_t
radv_get_rel_timeout_for_query(VkQueryType type)
1230 {
1231 /*
1232 * Certain queries are only possible on certain types of queues
1233 * so pick the TDR timeout of the highest possible type
 * and double it to ensure GetQueryPoolResults completes in finite time.
1235 *
1236 * (compute has longer TDR than gfx, other rings)
1237 */
1238 switch (type) {
1239 case VK_QUERY_TYPE_OCCLUSION:
1240 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1241 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1242 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1243 return radv_get_tdr_timeout_for_ip(AMD_IP_GFX) * 2;
1244 default:
1245 return radv_get_tdr_timeout_for_ip(AMD_IP_COMPUTE) * 2;
1246 }
1247 }
1248
1249 VKAPI_ATTR VkResult VKAPI_CALL
radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount,
                         size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)
1252 {
1253 VK_FROM_HANDLE(radv_device, device, _device);
1254 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
1255 const struct radv_physical_device *pdev = radv_device_physical(device);
1256 char *data = pData;
1257 VkResult result = VK_SUCCESS;
1258
1259 if (vk_device_is_lost(&device->vk))
1260 return VK_ERROR_DEVICE_LOST;
1261
1262 for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
1263 char *dest = data;
1264 unsigned query = firstQuery + query_idx;
1265 char *src = pool->ptr + query * pool->stride;
1266 uint32_t available;
1267 bool timed_out = false;
1268 uint64_t atimeout = os_time_get_absolute_timeout(radv_get_rel_timeout_for_query(pool->vk.query_type));
1269
1270 switch (pool->vk.query_type) {
1271 case VK_QUERY_TYPE_TIMESTAMP:
1272 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1273 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1274 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1275 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
1276 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1277 uint64_t value;
1278
1279 do {
1280 value = p_atomic_read(&src64->value);
1281 } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1282 !(timed_out = (atimeout < os_time_get_nano())));
1283
1284 available = value != TIMESTAMP_NOT_READY;
1285
1286 if (timed_out)
1287 result = VK_ERROR_DEVICE_LOST;
1288 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1289 result = VK_NOT_READY;
1290
1291 if (flags & VK_QUERY_RESULT_64_BIT) {
1292 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1293 *(uint64_t *)dest = value;
1294 dest += 8;
1295 } else {
1296 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1297 *(uint32_t *)dest = (uint32_t)value;
1298 dest += 4;
1299 }
1300 break;
1301 }
1302 case VK_QUERY_TYPE_OCCLUSION: {
1303 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1304 uint32_t db_count = pdev->info.max_render_backends;
1305 uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
1306 uint64_t sample_count = 0;
1307 available = 1;
1308
1309 for (int i = 0; i < db_count; ++i) {
1310 uint64_t start, end;
1311
1312 if (!(enabled_rb_mask & (1ull << i)))
1313 continue;
1314
1315 do {
1316 start = p_atomic_read(&src64[2 * i].value);
1317 end = p_atomic_read(&src64[2 * i + 1].value);
1318 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1319 !(timed_out = (atimeout < os_time_get_nano())));
1320
1321 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1322 available = 0;
1323 else {
1324 sample_count += end - start;
1325 }
1326 }
1327
1328 if (timed_out)
1329 result = VK_ERROR_DEVICE_LOST;
1330 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1331 result = VK_NOT_READY;
1332
1333 if (flags & VK_QUERY_RESULT_64_BIT) {
1334 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1335 *(uint64_t *)dest = sample_count;
1336 dest += 8;
1337 } else {
1338 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1339 *(uint32_t *)dest = sample_count;
1340 dest += 4;
1341 }
1342 break;
1343 }
1344 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1345 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1346 const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1347
1348 do {
1349 available = p_atomic_read(avail_ptr);
1350
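/* Task shader invocation counters are sampled from the ACE queue; also require their begin/end
 * availability bits before declaring the whole query available.
 */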
1351 if (pool->uses_ace && pdev->emulate_mesh_shader_queries) {
1352 const uint32_t task_invoc_offset =
1353 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1354 const uint32_t *avail_ptr_start = (const uint32_t *)(src + task_invoc_offset + 4);
1355 const uint32_t *avail_ptr_stop =
1356 (const uint32_t *)(src + pipelinestat_block_size + task_invoc_offset + 4);
1357
1358 if (!(p_atomic_read(avail_ptr_start) & 0x80000000) || !(p_atomic_read(avail_ptr_stop) & 0x80000000))
1359 available = 0;
1360 }
1361 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1362
1363 if (timed_out)
1364 result = VK_ERROR_DEVICE_LOST;
1365 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1366 result = VK_NOT_READY;
1367
1368 const uint64_t *start = (uint64_t *)src;
1369 const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1370 if (flags & VK_QUERY_RESULT_64_BIT) {
1371 uint64_t *dst = (uint64_t *)dest;
1372 dest += util_bitcount(pool->vk.pipeline_statistics) * 8;
1373 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1374 if (pool->vk.pipeline_statistics & (1u << i)) {
1375 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1376 *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
1377 }
1378 dst++;
1379 }
1380 }
1381
1382 } else {
1383 uint32_t *dst = (uint32_t *)dest;
1384 dest += util_bitcount(pool->vk.pipeline_statistics) * 4;
1385 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1386 if (pool->vk.pipeline_statistics & (1u << i)) {
1387 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1388 *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
1389 }
1390 dst++;
1391 }
1392 }
1393 }
1394 break;
1395 }
1396 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1397 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1398 uint64_t num_primitives_written;
1399 uint64_t primitive_storage_needed;
1400
1401 /* SAMPLE_STREAMOUTSTATS stores this structure:
1402 * {
1403 * u64 NumPrimitivesWritten;
1404 * u64 PrimitiveStorageNeeded;
1405 * }
1406 */
1407 do {
1408 available = 1;
1409 for (int j = 0; j < 4; j++) {
1410 if (!(p_atomic_read(&src64[j].value) & 0x8000000000000000UL))
1411 available = 0;
1412 }
1413 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1414
1415 if (timed_out)
1416 result = VK_ERROR_DEVICE_LOST;
1417 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1418 result = VK_NOT_READY;
1419
1420 num_primitives_written = p_atomic_read_relaxed(&src64[3].value) - p_atomic_read_relaxed(&src64[1].value);
1421 primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
1422
1423 if (flags & VK_QUERY_RESULT_64_BIT) {
1424 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1425 *(uint64_t *)dest = num_primitives_written;
1426 dest += 8;
1427 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1428 *(uint64_t *)dest = primitive_storage_needed;
1429 dest += 8;
1430 } else {
1431 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1432 *(uint32_t *)dest = num_primitives_written;
1433 dest += 4;
1434 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1435 *(uint32_t *)dest = primitive_storage_needed;
1436 dest += 4;
1437 }
1438 break;
1439 }
1440 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1441 const bool uses_gds_query = pool->uses_gds && pdev->info.gfx_level < GFX11;
1442 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1443 uint64_t primitive_storage_needed;
1444
1445 /* SAMPLE_STREAMOUTSTATS stores this structure:
1446 * {
1447 * u64 NumPrimitivesWritten;
1448 * u64 PrimitiveStorageNeeded;
1449 * }
1450 */
1451 do {
1452 available = 1;
1453 if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
1454 !(p_atomic_read(&src64[2].value) & 0x8000000000000000UL)) {
1455 available = 0;
1456 }
1457 if (uses_gds_query && (!(p_atomic_read(&src64[4].value) & 0x8000000000000000UL) ||
1458 !(p_atomic_read(&src64[5].value) & 0x8000000000000000UL))) {
1459 available = 0;
1460 }
1461 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1462
1463 if (timed_out)
1464 result = VK_ERROR_DEVICE_LOST;
1465 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1466 result = VK_NOT_READY;
1467
1468 primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
1469
1470 if (uses_gds_query) {
1471 /* Accumulate the result that was copied from GDS in case an NGG shader has been used. */
1472 primitive_storage_needed += p_atomic_read_relaxed(&src64[5].value) - p_atomic_read_relaxed(&src64[4].value);
1473 }
1474
1475 if (flags & VK_QUERY_RESULT_64_BIT) {
1476 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1477 *(uint64_t *)dest = primitive_storage_needed;
1478 dest += 8;
1479 } else {
1480 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1481 *(uint32_t *)dest = primitive_storage_needed;
1482 dest += 4;
1483 }
1484 break;
1485 }
1486 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1487 struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
1488 const p_atomic_uint64_t *src64 = (const p_atomic_uint64_t *)src;
1489 bool avail;
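/* The last num_passes qwords of the query slot are per-pass completion markers;
 * a zero value means that pass has not finished yet.
 */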
1490 do {
1491 avail = true;
1492 for (unsigned i = 0; i < pc_pool->num_passes; ++i)
1493 if (!p_atomic_read(&src64[pool->stride / 8 - i - 1].value))
1494 avail = false;
1495 } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1496
1497 available = avail;
1498
1499 radv_pc_get_results(pc_pool, &src64->value, dest);
1500 dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
1501 break;
1502 }
1503 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
1504 uint64_t ms_prim_gen;
1505
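/* On GFX11+ the mesh primitives generated count is stored in the emulated pipeline
 * statistics block (internal slot 13) and gated by the query's availability dword;
 * on older chips it is a begin/end pair of counters copied from GDS.
 */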
1506 if (pdev->info.gfx_level >= GFX11) {
1507 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1508 const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1509
1510 do {
1511 available = p_atomic_read(avail_ptr);
1512 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1513 !(timed_out = (atimeout < os_time_get_nano())));
1514
1515 const uint64_t *start = (uint64_t *)src;
1516 const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1517
1518 ms_prim_gen = stop[pipeline_statistics_indices[13]] - start[pipeline_statistics_indices[13]];
1519 } else {
1520 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1521
1522 do {
1523 available = 1;
1524 if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
1525 !(p_atomic_read(&src64[1].value) & 0x8000000000000000UL)) {
1526 available = 0;
1527 }
1528 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1529 !(timed_out = (atimeout < os_time_get_nano())));
1530
1531 ms_prim_gen = p_atomic_read_relaxed(&src64[1].value) - p_atomic_read_relaxed(&src64[0].value);
1532 }
1533
1534 if (timed_out)
1535 result = VK_ERROR_DEVICE_LOST;
1536 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1537 result = VK_NOT_READY;
1538
1539 if (flags & VK_QUERY_RESULT_64_BIT) {
1540 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1541 *(uint64_t *)dest = ms_prim_gen;
1542 dest += 8;
1543 } else {
1544 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1545 *(uint32_t *)dest = ms_prim_gen;
1546 dest += 4;
1547 }
1548 break;
1549 }
1550 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR: {
1551 uint32_t *src32 = (uint32_t *)src;
1552 uint32_t value;
1553 do {
1554 value = p_atomic_read(&src32[1]);
1555 } while (value != 1 && (flags & VK_QUERY_RESULT_WAIT_BIT));
1556
1557 available = value != 0;
1558
1559 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1560 result = VK_NOT_READY;
1561
1562 if (flags & VK_QUERY_RESULT_64_BIT) {
1563 uint64_t *dest64 = (uint64_t *)dest;
1564 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1565 dest64[0] = src32[5];
1566 dest64[1] = src32[6];
1567 }
1568 dest += 16;
1569 if (flags & VK_QUERY_RESULT_WITH_STATUS_BIT_KHR) {
1570 dest64[2] = 1;
1571 dest += 8;
1572 }
1573 } else {
1574 uint32_t *dest32 = (uint32_t *)dest;
1575 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1576 dest32[0] = src32[5];
1577 dest32[1] = src32[6];
1578 }
1579 dest += 8;
1580 if (flags & VK_QUERY_RESULT_WITH_STATUS_BIT_KHR) {
1581 dest32[2] = 1;
1582 dest += 4;
1583 }
1584 }
1585 break;
1586 }
1587 default:
1588 unreachable("trying to get results of unhandled query type");
1589 }
1590
1591 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1592 if (flags & VK_QUERY_RESULT_64_BIT) {
1593 *(uint64_t *)dest = available;
1594 } else {
1595 *(uint32_t *)dest = available;
1596 }
1597 }
1598 }
1599
1600 if (result == VK_ERROR_DEVICE_LOST)
1601 vk_device_set_lost(&device->vk, "GetQueryPoolResults timed out");
1602
1603 return result;
1604 }
1605
1606 static void
1607 emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
1608 {
1609 if (cmd_buffer->pending_reset_query) {
1610 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1611 /* Only need to flush caches if the query pool size is
1612 * large enough to be reset using the compute shader
1613 * path. Small pools don't need any cache flushes
1614 * because we use a CP DMA clear.
1615 */
1616 radv_emit_cache_flush(cmd_buffer);
1617 }
1618 }
1619 }
1620
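/* Size in bytes of one query result as written to the destination buffer,
 * including the optional trailing availability value.
 */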
1621 static size_t
1622 radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
1623 {
1624 unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
1625 switch (pool->vk.query_type) {
1626 case VK_QUERY_TYPE_TIMESTAMP:
1627 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1628 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1629 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1630 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1631 case VK_QUERY_TYPE_OCCLUSION:
1632 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1633 values += 1;
1634 break;
1635 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1636 values += util_bitcount(pool->vk.pipeline_statistics);
1637 break;
1638 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1639 values += 2;
1640 break;
1641 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1642 values += 1;
1643 break;
1644 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
1645 values += 1;
1646 break;
1647 default:
1648 unreachable("trying to get size of unhandled query type");
1649 }
1650 return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
1651 }
1652
1653 VKAPI_ATTR void VKAPI_CALL
1654 radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
1655 uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride,
1656 VkQueryResultFlags flags)
1657 {
1658 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1659 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
1660 VK_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1661 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1662 const struct radv_physical_device *pdev = radv_device_physical(device);
1663 const struct radv_instance *instance = radv_physical_device_instance(pdev);
1664 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1665 uint64_t va = radv_buffer_get_va(pool->bo);
1666 uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1667 size_t dst_size = radv_query_result_size(pool, flags);
1668 dest_va += dst_buffer->offset + dstOffset;
1669
1670 if (!queryCount)
1671 return;
1672
1673 radv_cs_add_buffer(device->ws, cmd_buffer->cs, pool->bo);
1674 radv_cs_add_buffer(device->ws, cmd_buffer->cs, dst_buffer->bo);
1675
1676 /* Work around game engines that forget to properly specify WAIT_BIT because some drivers
1677 * implicitly synchronize before the query copy.
1678 */
1679 if (instance->drirc.flush_before_query_copy)
1680 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1681
1682 /* From the Vulkan spec 1.1.108:
1683 *
1684 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1685 * previous uses of vkCmdResetQueryPool in the same queue, without any
1686 * additional synchronization."
1687 *
1688 * So, we have to flush the caches if the compute shader path was used.
1689 */
1690 emit_query_flush(cmd_buffer, pool);
1691
1692 switch (pool->vk.query_type) {
1693 case VK_QUERY_TYPE_OCCLUSION:
1694 if (!radv_occlusion_query_use_l2(pdev)) {
1695 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1696 uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
1697 uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
1698 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1699 unsigned query = firstQuery + i;
1700 uint64_t src_va = va + query * pool->stride + rb_avail_offset;
1701
1702 radeon_check_space(device->ws, cs, 7);
1703
1704 /* Waits on the upper word of the last DB entry */
1705 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff);
1706 }
1707 }
1708 }
1709
1710 radv_query_shader(cmd_buffer, &device->meta_state.query.occlusion_query_pipeline, pool->bo, dst_buffer->bo,
1711 firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1712 queryCount, flags, 0, 0, false);
1713 break;
1714 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1715 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1716 const uint32_t task_invoc_offset =
1717 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1718 const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1719
1720 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1721 unsigned query = firstQuery + i;
1722
1723 radeon_check_space(device->ws, cs, 7);
1724
1725 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1726
1727 /* This waits on the ME. All copies below are done on the ME. */
1728 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1729
1730 if (pool->uses_ace && pdev->emulate_mesh_shader_queries) {
1731 const uint64_t src_va = va + query * pool->stride;
1732 const uint64_t start_va = src_va + task_invoc_offset + 4;
1733 const uint64_t stop_va = start_va + pipelinestat_block_size;
1734
1735 radeon_check_space(device->ws, cs, 7 * 2);
1736
1737 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, start_va, 0x80000000, 0xffffffff);
1738 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, stop_va, 0x80000000, 0xffffffff);
1739 }
1740 }
1741 }
1742 radv_query_shader(cmd_buffer, &device->meta_state.query.pipeline_statistics_query_pipeline, pool->bo,
1743 dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
1744 dst_size, queryCount, flags, pool->vk.pipeline_statistics,
1745 pool->availability_offset + 4 * firstQuery, false);
1746 break;
1747 case VK_QUERY_TYPE_TIMESTAMP:
1748 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1749 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1750 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1751 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1752 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1753 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1754 unsigned query = firstQuery + i;
1755 uint64_t local_src_va = va + query * pool->stride;
1756
1757 radeon_check_space(device->ws, cs, 7);
1758
1759 /* Wait on the high 32 bits of the timestamp in
1760 * case the low part is 0xffffffff.
1761 */
1762 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4, TIMESTAMP_NOT_READY >> 32,
1763 0xffffffff);
1764 }
1765 }
1766
1767 radv_query_shader(cmd_buffer, &device->meta_state.query.timestamp_query_pipeline, pool->bo, dst_buffer->bo,
1768 firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1769 queryCount, flags, 0, 0, false);
1770 break;
1771 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1772 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1773 for (unsigned i = 0; i < queryCount; i++) {
1774 unsigned query = firstQuery + i;
1775 uint64_t src_va = va + query * pool->stride;
1776
1777 radeon_check_space(device->ws, cs, 7 * 4);
1778
1779 /* Wait on the upper word of all results. */
1780 for (unsigned j = 0; j < 4; j++, src_va += 8) {
1781 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1782 }
1783 }
1784 }
1785
1786 radv_query_shader(cmd_buffer, &device->meta_state.query.tfb_query_pipeline, pool->bo, dst_buffer->bo,
1787 firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1788 queryCount, flags, 0, 0, false);
1789 break;
1790 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1791 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1792 const bool uses_gds_query = pool->uses_gds && pdev->info.gfx_level < GFX11;
1793
1794 for (unsigned i = 0; i < queryCount; i++) {
1795 unsigned query = firstQuery + i;
1796 uint64_t src_va = va + query * pool->stride;
1797
1798 radeon_check_space(device->ws, cs, 7 * 4);
1799
1800 /* Wait on the upper word of the PrimitiveStorageNeeded result. */
1801 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1802 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1803
1804 if (uses_gds_query) {
1805 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff);
1806 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff);
1807 }
1808 }
1809 }
1810
1811 radv_query_shader(cmd_buffer, &device->meta_state.query.pg_query_pipeline, pool->bo, dst_buffer->bo,
1812 firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1813 queryCount, flags, 0, 0, pool->uses_gds && pdev->info.gfx_level < GFX11);
1814 break;
1815 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
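/* On GFX11+ the result sits in the emulated pipeline statistics block, so reuse the pipeline
 * statistics copy shader with only the internal mesh-primitives slot selected (1 << 13);
 * older chips use the dedicated mesh prim gen copy shader.
 */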
1816 if (pdev->info.gfx_level >= GFX11) {
1817 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1818 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1819 unsigned query = firstQuery + i;
1820
1821 radeon_check_space(device->ws, cs, 7);
1822
1823 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1824
1825 /* This waits on the ME. All copies below are done on the ME. */
1826 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1827 }
1828 }
1829 radv_query_shader(cmd_buffer, &device->meta_state.query.pipeline_statistics_query_pipeline, pool->bo,
1830 dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride,
1831 stride, dst_size, queryCount, flags, 1 << 13, pool->availability_offset + 4 * firstQuery,
1832 false);
1833 } else {
1834 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1835 for (unsigned i = 0; i < queryCount; i++) {
1836 unsigned query = firstQuery + i;
1837 uint64_t src_va = va + query * pool->stride;
1838
1839 radeon_check_space(device->ws, cs, 7 * 2);
1840
1841 /* Wait on the upper word. */
1842 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1843 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 12, 0x80000000, 0xffffffff);
1844 }
1845 }
1846
1847 radv_query_shader(cmd_buffer, &device->meta_state.query.ms_prim_gen_query_pipeline, pool->bo, dst_buffer->bo,
1848 firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1849 queryCount, flags, 0, 0, false);
1850 }
1851 break;
1852 default:
1853 unreachable("trying to get results of unhandled query type");
1854 }
1855 }
1856
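/* Value a query slot is cleared to on reset: timestamp-style pools are filled with
 * TIMESTAMP_NOT_READY so unwritten results can be detected, everything else is zero-filled.
 */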
1857 static uint32_t
1858 query_clear_value(VkQueryType type)
1859 {
1860 switch (type) {
1861 case VK_QUERY_TYPE_TIMESTAMP:
1862 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1863 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1864 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1865 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1866 return (uint32_t)TIMESTAMP_NOT_READY;
1867 default:
1868 return 0;
1869 }
1870 }
1871
1872 VKAPI_ATTR void VKAPI_CALL
1873 radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
1874 {
1875 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1876 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
1877 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1878 const struct radv_physical_device *pdev = radv_device_physical(device);
1879 uint32_t value = query_clear_value(pool->vk.query_type);
1880 uint32_t flush_bits = 0;
1881
1882 if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC || cmd_buffer->qf == RADV_QUEUE_VIDEO_ENC)
1883 /* video queries don't work like this */
1884 return;
1885
1886 /* Make sure to sync all previous work if the given command buffer has
1887 * pending active queries. Otherwise the GPU might write query data
1888 * after the reset operation.
1889 */
1890 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1891
1892 flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
1893 queryCount * pool->stride, value);
1894
1895 if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1896 (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11)) {
1897 flush_bits |=
1898 radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1899 radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4, queryCount * 4, 0);
1900 }
1901
1902 if (flush_bits) {
1903 /* Only need to flush caches for the compute shader path. */
1904 cmd_buffer->pending_reset_query = true;
1905 cmd_buffer->state.flush_bits |= flush_bits;
1906 }
1907 }
1908
1909 VKAPI_ATTR void VKAPI_CALL
1910 radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
1911 {
1912 VK_FROM_HANDLE(radv_device, device, _device);
1913 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
1914 const struct radv_physical_device *pdev = radv_device_physical(device);
1915
1916 uint32_t value = query_clear_value(pool->vk.query_type);
1917 uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
1918 uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1919
1920 for (uint32_t *p = data; p != data_end; ++p)
1921 *p = value;
1922
1923 if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1924 (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11)) {
1925 memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
1926 }
1927 }
1928
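/* Map a streamout stream index to the corresponding SAMPLE_STREAMOUTSTATS* event type. */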
1929 static unsigned
1930 event_type_for_stream(unsigned stream)
1931 {
1932 switch (stream) {
1933 default:
1934 case 0:
1935 return V_028A90_SAMPLE_STREAMOUTSTATS;
1936 case 1:
1937 return V_028A90_SAMPLE_STREAMOUTSTATS1;
1938 case 2:
1939 return V_028A90_SAMPLE_STREAMOUTSTATS2;
1940 case 3:
1941 return V_028A90_SAMPLE_STREAMOUTSTATS3;
1942 }
1943 }
1944
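/* Emit SAMPLE_STREAMOUTSTATS* so the hardware dumps the streamout statistics block for the
 * given stream at the provided VA.
 */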
1945 static void
1946 emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1947 {
1948 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1949 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1950
1951 radeon_check_space(device->ws, cs, 4);
1952
1953 assert(index < MAX_SO_STREAMS);
1954
1955 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1956 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1957 radeon_emit(cs, va);
1958 radeon_emit(cs, va >> 32);
1959 }
1960
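/* Copy a 32-bit shader query counter from GDS to memory with COPY_DATA (write-confirmed). */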
1961 static void
1962 gfx10_copy_gds_query(struct radeon_cmdbuf *cs, uint32_t gds_offset, uint64_t va)
1963 {
1964 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1965 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
1966 radeon_emit(cs, gds_offset);
1967 radeon_emit(cs, 0);
1968 radeon_emit(cs, va);
1969 radeon_emit(cs, va >> 32);
1970 }
1971
1972 static void
1973 gfx10_copy_gds_query_gfx(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
1974 {
1975 /* Make sure GDS is idle before copying the value. */
1976 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
1977 radv_emit_cache_flush(cmd_buffer);
1978
1979 gfx10_copy_gds_query(cmd_buffer->cs, gds_offset, va);
1980 }
1981
1982 static void
1983 gfx10_copy_gds_query_ace(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
1984 {
1985 /* Make sure GDS is idle before copying the value. */
1986 cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
1987 radv_gang_cache_flush(cmd_buffer);
1988
1989 gfx10_copy_gds_query(cmd_buffer->gang.cs, gds_offset, va);
1990 }
1991
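/* Keep the START/STOP_PIPELINE_STATS flush bits in sync with the number of active
 * pipeline-stats-style queries: request START when the first one begins and STOP when the
 * last one ends.
 */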
1992 static void
1993 radv_update_hw_pipelinestat(struct radv_cmd_buffer *cmd_buffer)
1994 {
1995 const uint32_t num_pipeline_stat_queries = radv_get_num_pipeline_stat_queries(cmd_buffer);
1996
1997 if (num_pipeline_stat_queries == 0) {
1998 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1999 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
2000 } else if (num_pipeline_stat_queries == 1) {
2001 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
2002 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
2003 }
2004 }
2005
2006 static void
2007 emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, VkQueryType query_type,
2008 VkQueryControlFlags flags, uint32_t index)
2009 {
2010 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2011 const struct radv_physical_device *pdev = radv_device_physical(device);
2012 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2013 switch (query_type) {
2014 case VK_QUERY_TYPE_OCCLUSION:
2015 radeon_check_space(device->ws, cs, 11);
2016
2017 ++cmd_buffer->state.active_occlusion_queries;
2018 if (cmd_buffer->state.active_occlusion_queries == 1) {
2019 if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
2020 /* This is the first occlusion query; enable
2021 * the hint if the precision bit is set.
2022 */
2023 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
2024 }
2025
2026 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2027 } else {
2028 if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) && !cmd_buffer->state.perfect_occlusion_queries_enabled) {
2029 /* This is not the first query, but this one
2030 * needs precise results, so DB_COUNT_CONTROL
2031 * has to be updated accordingly.
2032 */
2033 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
2034
2035 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2036 }
2037 }
2038
2039 if (pdev->info.gfx_level >= GFX11 && pdev->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
2040 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
2041 } else {
2042 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2043 if (pdev->info.gfx_level >= GFX11) {
2044 radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
2045 } else {
2046 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
2047 }
2048 }
2049 radeon_emit(cs, va);
2050 radeon_emit(cs, va >> 32);
2051 break;
2052 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2053 radeon_check_space(device->ws, cs, 4);
2054
2055 ++cmd_buffer->state.active_pipeline_queries;
2056
2057 radv_update_hw_pipelinestat(cmd_buffer);
2058
2059 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
2060 uint32_t cs_invoc_offset =
2061 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
2062 va += cs_invoc_offset;
2063 }
2064
2065 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2066 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2067 radeon_emit(cs, va);
2068 radeon_emit(cs, va >> 32);
2069
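/* Shader-emulated counters live in GDS; copy their current values into the matching slots of
 * this begin block so results can be computed as end - begin, like the HW counters.
 */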
2070 if (pool->uses_gds) {
2071 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
2072 uint32_t gs_prim_offset =
2073 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
2074
2075 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, va + gs_prim_offset);
2076 }
2077
2078 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) {
2079 uint32_t gs_invoc_offset =
2080 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT);
2081
2082 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_INVOCATION_OFFSET, va + gs_invoc_offset);
2083 }
2084
2085 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
2086 uint32_t mesh_invoc_offset =
2087 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
2088
2089 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
2090 }
2091
2092 /* Record that the command buffer needs GDS. */
2093 cmd_buffer->gds_needed = true;
2094
2095 if (!cmd_buffer->state.active_pipeline_gds_queries)
2096 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2097
2098 cmd_buffer->state.active_pipeline_gds_queries++;
2099 }
2100
2101 if (pool->uses_ace) {
2102 uint32_t task_invoc_offset =
2103 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2104
2105 if (pdev->info.gfx_level >= GFX11) {
2106 va += task_invoc_offset;
2107
2108 radeon_check_space(device->ws, cmd_buffer->gang.cs, 4);
2109
2110 radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2111 radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2112 radeon_emit(cmd_buffer->gang.cs, va);
2113 radeon_emit(cmd_buffer->gang.cs, va >> 32);
2114 } else {
2115 radeon_check_space(device->ws, cmd_buffer->gang.cs, 11);
2116
2117 gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
2118 radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
2119
2120 /* Record that the command buffer needs GDS. */
2121 cmd_buffer->gds_needed = true;
2122
2123 if (!cmd_buffer->state.active_pipeline_ace_queries)
2124 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2125
2126 cmd_buffer->state.active_pipeline_ace_queries++;
2127 }
2128 }
2129 break;
2130 }
2131 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2132 if (pdev->use_ngg_streamout) {
2133 /* generated prim counter */
2134 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
2135 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2136
2137 /* written prim counter */
2138 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 8);
2139 radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
2140
2141 /* Record that the command buffer needs GDS. */
2142 cmd_buffer->gds_needed = true;
2143
2144 if (!cmd_buffer->state.active_prims_xfb_gds_queries)
2145 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2146
2147 cmd_buffer->state.active_prims_xfb_gds_queries++;
2148 } else {
2149 cmd_buffer->state.active_prims_xfb_queries++;
2150
2151 radv_update_hw_pipelinestat(cmd_buffer);
2152
2153 emit_sample_streamout(cmd_buffer, va, index);
2154 }
2155 break;
2156 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2157 if (pdev->info.gfx_level >= GFX11) {
2158 /* On GFX11+, the primitives generated query always uses GDS. */
2159 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
2160 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2161
2162 /* Record that the command buffer needs GDS. */
2163 cmd_buffer->gds_needed = true;
2164
2165 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2166 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2167
2168 cmd_buffer->state.active_prims_gen_gds_queries++;
2169 } else {
2170 if (!cmd_buffer->state.active_prims_gen_queries) {
2171 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
2172
2173 cmd_buffer->state.active_prims_gen_queries++;
2174
2175 if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
2176 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
2177 }
2178 } else {
2179 cmd_buffer->state.active_prims_gen_queries++;
2180 }
2181
2182 radv_update_hw_pipelinestat(cmd_buffer);
2183
2184 if (pool->uses_gds) {
2185 /* generated prim counter */
2186 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32);
2187 radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000);
2188
2189 /* Record that the command buffer needs GDS. */
2190 cmd_buffer->gds_needed = true;
2191
2192 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2193 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2194
2195 cmd_buffer->state.active_prims_gen_gds_queries++;
2196 }
2197
2198 emit_sample_streamout(cmd_buffer, va, index);
2199 }
2200 break;
2201 }
2202 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2203 radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2204 break;
2205 }
2206 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2207 if (pdev->info.gfx_level >= GFX11) {
2208 radeon_check_space(device->ws, cs, 4);
2209
2210 ++cmd_buffer->state.active_pipeline_queries;
2211
2212 radv_update_hw_pipelinestat(cmd_buffer);
2213
2214 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2215 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2216 radeon_emit(cs, va);
2217 radeon_emit(cs, va >> 32);
2218 } else {
2219 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va);
2220 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2221
2222 /* Record that the command buffer needs GDS. */
2223 cmd_buffer->gds_needed = true;
2224
2225 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2226 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2227
2228 cmd_buffer->state.active_prims_gen_gds_queries++;
2229 }
2230 break;
2231 }
2232 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
2233 cmd_buffer->video.feedback_query_va = va;
2234 break;
2235 default:
2236 unreachable("beginning unhandled query type");
2237 }
2238 }
2239
2240 static void
2241 emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint64_t avail_va,
2242 VkQueryType query_type, uint32_t index)
2243 {
2244 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2245 const struct radv_physical_device *pdev = radv_device_physical(device);
2246 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2247 switch (query_type) {
2248 case VK_QUERY_TYPE_OCCLUSION:
2249 radeon_check_space(device->ws, cs, 14);
2250
2251 cmd_buffer->state.active_occlusion_queries--;
2252 if (cmd_buffer->state.active_occlusion_queries == 0) {
2253 /* Reset the perfect occlusion queries hint now that no
2254 * queries are active.
2255 */
2256 cmd_buffer->state.perfect_occlusion_queries_enabled = false;
2257
2258 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2259 }
2260
2261 if (pdev->info.gfx_level >= GFX11 && pdev->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
2262 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
2263 } else {
2264 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2265 if (pdev->info.gfx_level >= GFX11) {
2266 radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
2267 } else {
2268 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
2269 }
2270 }
2271 radeon_emit(cs, va + 8);
2272 radeon_emit(cs, (va + 8) >> 32);
2273
2274 break;
2275 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2276 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
2277
2278 radeon_check_space(device->ws, cs, 16);
2279
2280 cmd_buffer->state.active_pipeline_queries--;
2281
2282 radv_update_hw_pipelinestat(cmd_buffer);
2283
2284 va += pipelinestat_block_size;
2285
2286 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
2287 uint32_t cs_invoc_offset =
2288 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
2289 va += cs_invoc_offset;
2290 }
2291
2292 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2293 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2294 radeon_emit(cs, va);
2295 radeon_emit(cs, va >> 32);
2296
2297 if (pool->uses_gds) {
2298 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
2299 uint32_t gs_prim_offset =
2300 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
2301
2302 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, va + gs_prim_offset);
2303 }
2304
2305 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) {
2306 uint32_t gs_invoc_offset =
2307 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT);
2308
2309 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_INVOCATION_OFFSET, va + gs_invoc_offset);
2310 }
2311
2312 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
2313 uint32_t mesh_invoc_offset =
2314 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
2315
2316 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
2317 }
2318
2319 cmd_buffer->state.active_pipeline_gds_queries--;
2320
2321 if (!cmd_buffer->state.active_pipeline_gds_queries)
2322 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2323 }
2324
2325 if (pool->uses_ace) {
2326 uint32_t task_invoc_offset =
2327 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2328
2329 if (pdev->info.gfx_level >= GFX11) {
2330 va += task_invoc_offset;
2331
2332 radeon_check_space(device->ws, cmd_buffer->gang.cs, 4);
2333
2334 radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2335 radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2336 radeon_emit(cmd_buffer->gang.cs, va);
2337 radeon_emit(cmd_buffer->gang.cs, va >> 32);
2338 } else {
2339 radeon_check_space(device->ws, cmd_buffer->gang.cs, 11);
2340
2341 gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
2342 radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
2343
2344 cmd_buffer->state.active_pipeline_ace_queries--;
2345
2346 if (!cmd_buffer->state.active_pipeline_ace_queries)
2347 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2348 }
2349 }
2350
2351 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
2352 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
2353 break;
2354 }
2355 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2356 if (pdev->use_ngg_streamout) {
2357 /* generated prim counter */
2358 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
2359 radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
2360
2361 /* written prim counter */
2362 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 24);
2363 radv_cs_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
2364
2365 cmd_buffer->state.active_prims_xfb_gds_queries--;
2366
2367 if (!cmd_buffer->state.active_prims_xfb_gds_queries)
2368 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2369 } else {
2370 cmd_buffer->state.active_prims_xfb_queries--;
2371
2372 radv_update_hw_pipelinestat(cmd_buffer);
2373
2374 emit_sample_streamout(cmd_buffer, va + 16, index);
2375 }
2376 break;
2377 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2378 if (pdev->info.gfx_level >= GFX11) {
2379 /* On GFX11+, the primitives generated query always uses GDS. */
2380 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
2381 radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
2382
2383 cmd_buffer->state.active_prims_gen_gds_queries--;
2384
2385 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2386 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2387 } else {
2388 if (cmd_buffer->state.active_prims_gen_queries == 1) {
2389 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
2390
2391 cmd_buffer->state.active_prims_gen_queries--;
2392
2393 if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
2394 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
2395 }
2396 } else {
2397 cmd_buffer->state.active_prims_gen_queries--;
2398 }
2399
2400 radv_update_hw_pipelinestat(cmd_buffer);
2401
2402 if (pool->uses_gds) {
2403 /* generated prim counter */
2404 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40);
2405 radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000);
2406
2407 cmd_buffer->state.active_prims_gen_gds_queries--;
2408
2409 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2410 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2411 }
2412
2413 emit_sample_streamout(cmd_buffer, va + 16, index);
2414 }
2415 break;
2416 }
2417 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2418 radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2419 break;
2420 }
2421 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2422 if (pdev->info.gfx_level >= GFX11) {
2423 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
2424
2425 radeon_check_space(device->ws, cs, 16);
2426
2427 cmd_buffer->state.active_pipeline_queries--;
2428
2429 radv_update_hw_pipelinestat(cmd_buffer);
2430
2431 va += pipelinestat_block_size;
2432
2433 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2434 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2435 radeon_emit(cs, va);
2436 radeon_emit(cs, va >> 32);
2437
2438 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
2439 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1,
2440 cmd_buffer->gfx9_eop_bug_va);
2441 } else {
2442 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va + 8);
2443 radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
2444
2445 cmd_buffer->state.active_prims_gen_gds_queries--;
2446
2447 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2448 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2449 }
2450 break;
2451 }
2452 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
2453 cmd_buffer->video.feedback_query_va = 0;
2454 break;
2455 default:
2456 unreachable("ending unhandled query type");
2457 }
2458
2459 cmd_buffer->active_query_flush_bits |=
2460 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
2461 if (pdev->info.gfx_level >= GFX9) {
2462 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2463 }
2464 }
2465
2466 VKAPI_ATTR void VKAPI_CALL
2467 radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
2468 VkQueryControlFlags flags, uint32_t index)
2469 {
2470 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2471 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2472 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2473 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2474 uint64_t va = radv_buffer_get_va(pool->bo);
2475
2476 radv_cs_add_buffer(device->ws, cs, pool->bo);
2477
2478 emit_query_flush(cmd_buffer, pool);
2479
2480 va += pool->stride * query;
2481
2482 if (pool->uses_ace) {
2483 if (!radv_gang_init(cmd_buffer))
2484 return;
2485
2486 radv_cs_add_buffer(device->ws, cmd_buffer->gang.cs, pool->bo);
2487 }
2488
2489 emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, flags, index);
2490 }
2491
2492 VKAPI_ATTR void VKAPI_CALL
2493 radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, uint32_t index)
2494 {
2495 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2496 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2497 uint64_t va = radv_buffer_get_va(pool->bo);
2498 uint64_t avail_va = va + pool->availability_offset + 4 * query;
2499 va += pool->stride * query;
2500
2501 /* We do not need to add the pool BO to the list because the query must
2502 * currently be active, which means the BO is already in the list.
2503 */
2504 emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, index);
2505
2506 /*
2507 * For multiview we have to emit a query for each bit in the mask;
2508 * however, the first query we emit gets the totals for all the
2509 * operations, so we don't want a real value in the other
2510 * queries. This emits a fake begin/end sequence so the waiting
2511 * code gets a completed query value and doesn't hang, but the
2512 * query returns 0.
2513 */
2514 if (cmd_buffer->state.render.view_mask) {
2515 for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.render.view_mask); i++) {
2516 va += pool->stride;
2517 avail_va += 4;
2518 emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, 0, 0);
2519 emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, 0);
2520 }
2521 }
2522 }
2523
2524 void
2525 radv_write_timestamp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkPipelineStageFlags2 stage)
2526 {
2527 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2528 const struct radv_physical_device *pdev = radv_device_physical(device);
2529 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2530
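/* TOP_OF_PIPE timestamps can be written immediately by copying the GPU clock counter with
 * COPY_DATA; any later stage requires a bottom-of-pipe EOP timestamp so that preceding work
 * has completed.
 */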
2531 if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
2532 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2533 radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM | COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
2534 COPY_DATA_DST_SEL(V_370_MEM));
2535 radeon_emit(cs, 0);
2536 radeon_emit(cs, 0);
2537 radeon_emit(cs, va);
2538 radeon_emit(cs, va >> 32);
2539 } else {
2540 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
2541 EOP_DST_SEL_MEM, EOP_DATA_SEL_TIMESTAMP, va, 0, cmd_buffer->gfx9_eop_bug_va);
2542 }
2543 }
2544
2545 VKAPI_ATTR void VKAPI_CALL
2546 radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool,
2547 uint32_t query)
2548 {
2549 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2550 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2551 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2552 const struct radv_physical_device *pdev = radv_device_physical(device);
2553 const struct radv_instance *instance = radv_physical_device_instance(pdev);
2554 const unsigned num_queries = MAX2(util_bitcount(cmd_buffer->state.render.view_mask), 1);
2555 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2556 const uint64_t va = radv_buffer_get_va(pool->bo);
2557 uint64_t query_va = va + pool->stride * query;
2558
2559 radv_cs_add_buffer(device->ws, cs, pool->bo);
2560
2561 assert(cmd_buffer->qf != RADV_QUEUE_VIDEO_DEC &&
2562 cmd_buffer->qf != RADV_QUEUE_VIDEO_ENC);
2563
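/* On the transfer queue, timestamps are written with the SDMA global timestamp packet,
 * one per view when multiview is active.
 */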
2564 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
2565 if (instance->drirc.flush_before_timestamp_write) {
2566 radeon_check_space(device->ws, cmd_buffer->cs, 1);
2567 radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
2568 }
2569
2570 for (unsigned i = 0; i < num_queries; ++i, query_va += pool->stride) {
2571 radeon_check_space(device->ws, cmd_buffer->cs, 3);
2572 radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
2573 radeon_emit(cs, query_va);
2574 radeon_emit(cs, query_va >> 32);
2575 }
2576 return;
2577 }
2578
2579 if (instance->drirc.flush_before_timestamp_write) {
2580 /* Make sure previously launched waves have finished */
2581 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
2582 }
2583
2584 radv_emit_cache_flush(cmd_buffer);
2585
2586 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 28 * num_queries);
2587
2588 for (unsigned i = 0; i < num_queries; i++) {
2589 radv_write_timestamp(cmd_buffer, query_va, stage);
2590 query_va += pool->stride;
2591 }
2592
2593 cmd_buffer->active_query_flush_bits |=
2594 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
2595 if (pdev->info.gfx_level >= GFX9) {
2596 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2597 }
2598
2599 assert(cmd_buffer->cs->cdw <= cdw_max);
2600 }
2601
2602 VKAPI_ATTR void VKAPI_CALL
2603 radv_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
2604 const VkAccelerationStructureKHR *pAccelerationStructures,
2605 VkQueryType queryType, VkQueryPool queryPool, uint32_t firstQuery)
2606 {
2607 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2608 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2609 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2610 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2611 uint64_t pool_va = radv_buffer_get_va(pool->bo);
2612 uint64_t query_va = pool_va + pool->stride * firstQuery;
2613
2614 radv_cs_add_buffer(device->ws, cs, pool->bo);
2615
2616 radv_emit_cache_flush(cmd_buffer);
2617
2618 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 6 * accelerationStructureCount);
2619
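/* Each result is a single 64-bit field copied straight out of the acceleration structure
 * header into the query slot.
 */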
2620 for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
2621 VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
2622 uint64_t va = vk_acceleration_structure_get_va(accel_struct);
2623
2624 switch (queryType) {
2625 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2626 va += offsetof(struct radv_accel_struct_header, compacted_size);
2627 break;
2628 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2629 va += offsetof(struct radv_accel_struct_header, serialization_size);
2630 break;
2631 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2632 va += offsetof(struct radv_accel_struct_header, instance_count);
2633 break;
2634 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2635 va += offsetof(struct radv_accel_struct_header, size);
2636 break;
2637 default:
2638 unreachable("Unhandle accel struct query type.");
2639 }
2640
2641 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2642 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
2643 COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
2644 radeon_emit(cs, va);
2645 radeon_emit(cs, va >> 32);
2646 radeon_emit(cs, query_va);
2647 radeon_emit(cs, query_va >> 32);
2648
2649 query_va += pool->stride;
2650 }
2651
2652 assert(cmd_buffer->cs->cdw <= cdw_max);
2653 }
2654