/*
 * Copyright © 2014 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "freedreno_context.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"

#include "fd4_context.h"
#include "fd4_draw.h"
#include "fd4_format.h"
#include "fd4_query.h"

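/* Sample counters dumped out by the ZPASS_DONE event; only ctr[0] is
 * consumed by count_samples() below:
 */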
struct fd_rb_samp_ctrs {
   uint64_t ctr[16];
};

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */

static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_hw_sample *samp =
      fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

   /* low bits of sample addr should be zero (since they are control
    * flags in RB_SAMPLE_COUNT_CONTROL):
    */
   assert((samp->offset & 0x3) == 0);

   /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
    * HW_QUERY_BASE_REG register:
    */
   OUT_PKT3(ring, CP_SET_CONSTANT, 3);
   OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
   OUT_RING(ring, HW_QUERY_BASE_REG);
   OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset);

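   /* a zero-size point draw; seems to be needed for the ZPASS_DONE
    * event below to latch the counters:
    */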
   OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
   OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
                        INDEX4_SIZE_32_BIT, USE_VISIBILITY));
   OUT_RING(ring, 1); /* NumInstances */
   OUT_RING(ring, 0); /* NumIndices */

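   /* ZPASS_DONE triggers the sample counters to be copied out to the
    * address programmed in RB_SAMPLE_COUNT_ADDR above:
    */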
   fd_event_write(batch, ring, ZPASS_DONE);

   return samp;
}

static uint64_t
count_samples(const struct fd_rb_samp_ctrs *start,
              const struct fd_rb_samp_ctrs *end)
{
   return end->ctr[0] - start->ctr[0];
}

static void
occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start,
                                    const void *end,
                                    union pipe_query_result *result)
{
   uint64_t n = count_samples(start, end);
   result->u64 += n;
}

static void
occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start,
                                      const void *end,
                                      union pipe_query_result *result)
{
   uint64_t n = count_samples(start, end);
   result->b |= (n > 0);
}

/*
 * Time Elapsed Query:
 *
 * Note: we could in theory support timestamp queries, but they
 * won't give sensible results for tilers.
 */

static void
time_elapsed_enable(struct fd_context *ctx,
                    struct fd_ringbuffer *ring) assert_dt
{
   /* Right now, the assignment of countable to counter register is
    * just hard coded.  If we start exposing more countables than we
    * have counters, we will need to be more clever.
    */
   struct fd_batch *batch = fd_context_batch(ctx);
   fd_wfi(batch, ring);
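   /* select the CP_ALWAYS_COUNT countable (counts every CP clock
    * cycle) on CP performance counter 0:
    */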
   OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
   OUT_RING(ring, CP_ALWAYS_COUNT);
   fd_batch_reference(&batch, NULL);
}

static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch,
                        struct fd_ringbuffer *ring) assert_dt
{
   struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

   /* use unused part of vsc_size_mem as scratch space, to avoid
    * extra allocation:
    */
   struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
   const int sample_off = 128;
   const int addr_off = sample_off + 8;
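   /* scratch layout: 8 bytes of saved counter value at sample_off,
    * followed by the computed destination address at addr_off:
    */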

   assert(batch->ctx->screen->max_freq > 0);

   /* Basic issue is that we need to read counter value to a relative
    * destination (with per-tile offset) rather than absolute dest
    * addr.  But there is no pm4 packet that can do that.  This is
    * where it would be *really* nice if we could write our own fw
    * since afaict implementing the sort of packet we need would be
    * trivial.
    *
    * Instead, we:
    * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
    * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
    * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
    *     address to the per-sample offset in the scratch buffer
    * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
    *     to CP_ME_NRT_ADDR
    * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
    *     buffer to CP_ME_NRT_DATA to trigger the write out to query
    *     result buffer
    *
    * Straightforward, right?
    *
    * Maybe could swap the order of things in the scratch buffer to
    * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
    * shot, but that's really just polishing a turd..
    */

   fd_wfi(batch, ring);

   /* (1) copy sample counter _LO and _HI to scratch: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
                     CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* ok... here we really *would* like to use the CP_SET_CONSTANT
    * mode which can add a constant to value in reg2 and write to
    * reg1... *but* that only works for banked/context registers,
    * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
    * CP math to the scratch buffer instead:
    *
    * (note first 8 bytes are counter value, use offset 0x8 for
    * address calculation)
    */

   /* (2) per-sample offset to scratch bo: */
   OUT_PKT3(ring, CP_MEM_WRITE, 2);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
   OUT_RING(ring, samp->offset);

   /* (3) now add to that the per-tile base: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
                     CP_REG_TO_MEM_0_ACCUMULATE |
                     CP_REG_TO_MEM_0_CNT(0)); /* readback 1 reg */
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* (4) now copy that back to CP_ME_NRT_ADDR: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* (5) and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
    * to trigger the write to result buffer
    */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* and again to get the value of the _HI reg from scratch: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

   /* Sigh.. */

   return samp;
}

static void
time_elapsed_accumulate_result(struct fd_context *ctx, const void *start,
                               const void *end, union pipe_query_result *result)
{
   uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
   /* max_freq is in Hz, convert cycle count to ns: */
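   /* e.g. at max_freq = 500 MHz, one cycle is 2ns, so n cycles -> 2n ns */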
   result->u64 += n * 1000000000 / ctx->screen->max_freq;
}

static void
timestamp_accumulate_result(struct fd_context *ctx, const void *start,
                            const void *end, union pipe_query_result *result)
{
   /* just return the value from the first tile: */
   if (result->u64 != 0)
      return;
   uint64_t n = *(uint64_t *)start;
   /* max_freq is in Hz, convert cycle count to ns: */
   result->u64 = n * 1000000000 / ctx->screen->max_freq;
}

static const struct fd_hw_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_counter_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = time_elapsed_accumulate_result,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more nonsensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
static const struct fd_hw_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = timestamp_accumulate_result,
};

void
fd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->create_query = fd_hw_create_query;
   ctx->query_prepare = fd_hw_query_prepare;
   ctx->query_prepare_tile = fd_hw_query_prepare_tile;
   ctx->query_update_batch = fd_hw_query_update_batch;

   fd_hw_query_register_provider(pctx, &occlusion_counter);
   fd_hw_query_register_provider(pctx, &occlusion_predicate);
   fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
   fd_hw_query_register_provider(pctx, &time_elapsed);
   fd_hw_query_register_provider(pctx, &timestamp);
}
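
/* For reference, a minimal sketch of how a gallium frontend would
 * exercise the queries wired up above (not part of this driver file):
 *
 *    struct pipe_query *q =
 *       pctx->create_query(pctx, PIPE_QUERY_OCCLUSION_COUNTER, 0);
 *    pctx->begin_query(pctx, q);
 *    ... emit draws ...
 *    pctx->end_query(pctx, q);
 *
 *    union pipe_query_result res;
 *    pctx->get_query_result(pctx, q, true, &res);
 *    pctx->destroy_query(pctx, q);
 */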