/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

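/* Shader-based ("SH") queries for gfx11.
 *
 * Streamout-related queries (PRIMITIVES_GENERATED/EMITTED, SO_STATISTICS and
 * the SO overflow predicates) are accumulated by shaders into fixed-size
 * slots (struct gfx11_sh_query_buffer_mem) inside a chain of buffers kept in
 * sctx->shader_query_buffers. A query remembers the slot range between its
 * begin and end calls; results are obtained by summing the per-stream
 * counters over that range, either on the CPU (get_result) or with a compute
 * shader (get_result_resource).
 */

/* Atom emit callback: each emit of the shader_query atom consumes the
 * currently bound slot by advancing the write head of the most recent
 * query buffer.
 */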
static void emit_shader_query(struct si_context *sctx, unsigned index)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx11_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx11_sh_query_buffer_mem);
}

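/* Drop one reference from every buffer in the chain from "first" to "last"
 * (inclusive) and free buffers that become unreferenced, except for the
 * newest one (it may not be full yet) and the oldest one (kept for recycling
 * by gfx11_alloc_query_buffer).
 */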
static void gfx11_release_query_buffers(struct si_context *sctx,
                                        struct gfx11_sh_query_buffer *first,
                                        struct gfx11_sh_query_buffer *last)
{
   while (first) {
      struct gfx11_sh_query_buffer *qbuf = first;
      if (first != last)
         first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

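/* Make sure the most recent query buffer has room for one more slot and bind
 * that slot as the internal GS query buffer (SI_GS_QUERY_BUF). If the current
 * buffer is full, recycle the oldest idle buffer or allocate a new one; fresh
 * buffers get the high bit set in every counter as required by the
 * SET_PREDICATION packet.
 */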
static bool gfx11_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx11_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx11_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx11_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx11_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx11_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx11_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
   SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx11_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   gfx11_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

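/* Begin the query: release buffers kept from a previous begin/end pair, bind
 * a fresh slot and remember where this query's slot range starts.
 */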
static bool gfx11_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   gfx11_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx11_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

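/* End the query: record where its slot range ends and write a bottom-of-pipe
 * fence after the last used slot, which gfx11_sh_query_get_result_resource
 * can wait on. When the last active query ends, the internal query buffer
 * binding is removed again.
 */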
static bool gfx11_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx11_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx11_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx11_alloc_query_buffer(sctx);
   } else {
      si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

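/* Accumulate the counters of one slot into the result. Bit 63, which the
 * buffer initialization sets for SET_PREDICATION, is masked off before
 * summing.
 */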
static void gfx11_sh_query_add_result(struct gfx11_sh_query *query,
                                      struct gfx11_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

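/* CPU readback path: walk the buffer chain from the last buffer back to the
 * first, map each one and accumulate every slot inside the query's range.
 */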
static bool gfx11_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx11_sh_query_buffer *qbuf = query->last;;
        qbuf = list_entry(qbuf->list.prev, struct gfx11_sh_query_buffer, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx11_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx11_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

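/* GPU result path: run the query-result compute shader over each buffer in
 * the chain. Partial results are carried between dispatches through a small
 * scratch allocation (ssbo[1]/ssbo[2]); the dispatch that processes the last
 * buffer writes the final value into "resource" at "offset".
 */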
static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               enum pipe_query_flags flags,
                                               enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx11_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = 4 * sizeof(uint64_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   bool is_result_64bit = result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64;
   if (is_result_64bit)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   /* TODO: Range-invalidate GL2 */
   if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) {
      sctx->barrier_flags |= SI_BARRIER_INV_L2;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
   }

   for (struct gfx11_sh_query_buffer *qbuf = query->first;;
        qbuf = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list)) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = is_result_64bit ? 8 : 4;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

      if (flags & PIPE_QUERY_WAIT) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx11_sh_query_buffer_mem);
         va += offsetof(struct gfx11_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      /* ssbo[2] is either tmp_buffer or resource */
      assert(ssbo[2].buffer);

      unsigned writable_bitmask = (1 << 2) | (ssbo[1].buffer ? 1 << 1 : 0);

      si_barrier_before_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);
      si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader, 3, ssbo,
                                    writable_bitmask, false);
      si_barrier_after_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);

      if (qbuf == query->last)
         break;
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx11_sh_query_ops = {
   .destroy = gfx11_sh_query_destroy,
   .begin = gfx11_sh_query_begin,
   .end = gfx11_sh_query_end,
   .get_result = gfx11_sh_query_get_result,
   .get_result_resource = gfx11_sh_query_get_result_resource,
};

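/* Create a shader-based query; "index" selects the streamout stream for the
 * per-stream query types.
 */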
struct pipe_query *gfx11_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx11_sh_query *query = CALLOC_STRUCT(gfx11_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx11_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void si_gfx11_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

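/* Free all shader query buffers at context destruction; no queries may still
 * be referencing them at this point.
 */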
void si_gfx11_destroy_query(struct si_context *sctx)
{
   if (!sctx->shader_query_buffers.next)
      return;

   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx11_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}