/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

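/* Emit callback of the shader_query atom: retire the currently bound
 * gfx11_sh_query_buffer_mem slot by advancing the write head of the most
 * recent buffer. The next gfx11_alloc_query_buffer call binds a new slot. */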
static void emit_shader_query(struct si_context *sctx, unsigned index)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx11_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx11_sh_query_buffer_mem);
}

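/* Drop one reference from each buffer in the range [first, last] and free
 * buffers whose refcount reaches zero, except the newest buffer (it may not
 * be full yet) and the oldest one (kept for recycling). */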
static void gfx11_release_query_buffers(struct si_context *sctx,
                                        struct gfx11_sh_query_buffer *first,
                                        struct gfx11_sh_query_buffer *last)
{
   while (first) {
      struct gfx11_sh_query_buffer *qbuf = first;
      if (first != last)
         first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

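/* Make sure a gfx11_sh_query_buffer_mem slot is available and bound as the
 * GS query SSBO: reuse the current buffer if it still has room, otherwise
 * recycle the oldest idle buffer or allocate a fresh one. Returns false on
 * allocation failure. */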
static bool gfx11_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx11_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx11_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

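   /* No buffer could be recycled: allocate a fresh one. */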
   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx11_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx11_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

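   /* Each gfx11_sh_query_buffer_mem slot spans 32 qwords: 16 primitive
    * counters followed by the fence word (cleared here, written by the CP
    * at query end). */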
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx11_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

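   /* Bind the slot at the current write head as the internal GS query
    * buffer and tell shaders that streamout queries are enabled. */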
success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx11_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
   SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx11_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   gfx11_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

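/* Begin the query: record the buffer and offset at which results start
 * accumulating and pin that buffer with a reference. */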
static bool gfx11_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   gfx11_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx11_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

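/* End the query: record where the results stop and ask the CP to write the
 * bottom-of-pipe fence of the last completed slot. If this was the last
 * active query, unbind the query buffer. */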
static bool gfx11_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx11_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx11_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries <= 0) {
      si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

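/* Accumulate one gfx11_sh_query_buffer_mem slot into the result. The mask
 * strips bit 63, which is pre-set on every counter for SET_PREDICATION
 * compatibility (see gfx11_alloc_query_buffer). */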
static void gfx11_sh_query_add_result(struct gfx11_sh_query *query,
                                      struct gfx11_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

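/* CPU readback path: map each buffer in the query's chain and sum all slots
 * between the recorded begin and end offsets. */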
static bool gfx11_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);
   for (struct gfx11_sh_query_buffer *qbuf = query->last;;
        qbuf = list_entry(qbuf->list.prev, struct gfx11_sh_query_buffer, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx11_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx11_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

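/* GPU-side result path: reduce all slots with the query-result compute
 * shader, carrying partial results between buffers through a small scratch
 * allocation, and write the final value (or its availability when
 * index < 0) into the destination resource. */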
static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               enum pipe_query_flags flags,
                                               enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx11_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

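   /* Judging from the cases below, the low bits of config select the mode
    * (0 = sum counters at `offset`, 1 = availability only, 2 = stream
    * overflow, 3 = any-stream overflow) and bit 3 (value 8) requests a
    * 64-bit result write. */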
   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = 4 * sizeof(uint64_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   bool is_result_64bit = result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64;
   if (is_result_64bit)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

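   /* SSBO 0 is the chunk of query slots to reduce, SSBO 1 carries the
    * partial result in from the previous dispatch, and SSBO 2 carries it
    * out (or is the destination buffer on the last dispatch). */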
   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   /* TODO: Range-invalidate GL2 */
   if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) {
      sctx->barrier_flags |= SI_BARRIER_INV_L2;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
   }

   for (struct gfx11_sh_query_buffer *qbuf = query->first;;
        qbuf = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list)) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = is_result_64bit ? 8 : 4;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

      if (flags & PIPE_QUERY_WAIT) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx11_sh_query_buffer_mem);
         va += offsetof(struct gfx11_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      /* ssbo[2] is either tmp_buffer or resource */
      assert(ssbo[2].buffer);

      unsigned writable_bitmask = (1 << 2) | (ssbo[1].buffer ? 1 << 1 : 0);

      si_barrier_before_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);
      si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader, 3, ssbo,
                                    writable_bitmask, false);
      si_barrier_after_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);

      if (qbuf == query->last)
         break;
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx11_sh_query_ops = {
   .destroy = gfx11_sh_query_destroy,
   .begin = gfx11_sh_query_begin,
   .end = gfx11_sh_query_end,
   .get_result = gfx11_sh_query_get_result,
   .get_result_resource = gfx11_sh_query_get_result_resource,
};

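/* Create a streamout/primitive query; for multi-stream query types, index
 * selects the streamout stream. */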
struct pipe_query *gfx11_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx11_sh_query *query = CALLOC_STRUCT(gfx11_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx11_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

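/* Called at context creation: register the emit callback of the
 * shader_query atom. */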
void si_gfx11_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

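/* Called at context destruction: free all remaining query buffers. The
 * list head is unset if si_gfx11_init_query never ran. */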
void si_gfx11_destroy_query(struct si_context *sctx)
{
   if (!sctx->shader_query_buffers.next)
      return;

   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx11_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}
457