/*
 * Copyright © 2017 Rob Clark <[email protected]>
 * Copyright © 2018 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#define FD_BO_NO_HARDPIN 1

/* NOTE: see https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A5xx-Queries */

#include "freedreno_query_acc.h"
#include "freedreno_resource.h"

#include "fd6_context.h"
#include "fd6_emit.h"
#include "fd6_query.h"

#include "fd6_pack.h"

/* g++ is picky about offsets that cannot be resolved at compile time, so
 * roll our own __offsetof()
 */
#define __offsetof(type, field) \
   ({ type _x = {}; ((uint8_t *)&_x.field) - ((uint8_t *)&_x);})
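/* Unlike the standard offsetof(), this also works for offsets that are only
 * known at runtime, e.g. with a non-constant array index as used by
 * primitives_reloc() below:
 *
 *    int idx = ...;
 *    __offsetof(struct fd6_primitives_sample, stop[idx].emitted);
 */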

struct PACKED fd6_query_sample {
   struct fd_acc_query_sample base;

   /* The RB_SAMPLE_COUNT_ADDR destination needs to be 16-byte aligned: */
   uint64_t pad;

   uint64_t start;
   uint64_t result;
   uint64_t stop;
};
FD_DEFINE_CAST(fd_acc_query_sample, fd6_query_sample);

/* offset of a single field of an array of fd6_query_sample: */
#define query_sample_idx(aq, idx, field) \
   fd_resource((aq)->prsc)->bo, \
   (idx * sizeof(struct fd6_query_sample)) + \
      offsetof(struct fd6_query_sample, field), \
   0, 0

/* offset of a single field of fd6_query_sample: */
#define query_sample(aq, field) query_sample_idx(aq, 0, field)
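/* Note: these expand to the trailing (bo, offset, ..) args expected by
 * OUT_RELOC() and fd6_record_ts(); e.g. query_sample(aq, stop) becomes
 * roughly:
 *
 *    fd_resource((aq)->prsc)->bo,
 *    (0 * sizeof(struct fd6_query_sample)) +
 *       offsetof(struct fd6_query_sample, stop),
 *    0, 0
 */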

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */

template <chip CHIP>
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->draw;

   ASSERT_ALIGNED(struct fd6_query_sample, start, 16);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, query_sample(aq, start));

      fd6_event_write<CHIP>(ctx, ring, FD_ZPASS_DONE);

      /* Copied from blob's cmdstream, not sure why it is done. */
      if (CHIP == A7XX) {
         fd6_event_write<CHIP>(ctx, ring, FD_CCU_CLEAN_DEPTH);
      }
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
         CP_EVENT_WRITE7_0(
            .event = ZPASS_DONE,
            .write_sample_count = true,
         ),
         EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
      );
      OUT_PKT(ring, CP_EVENT_WRITE7,
         CP_EVENT_WRITE7_0(
            .event = ZPASS_DONE,
            .write_sample_count = true,
            .sample_count_end_offset = true,
            .write_accum_sample_count_diff = true,
         ),
         EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
      );
   }
}

template <chip CHIP>
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->draw;

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT7(ring, CP_MEM_WRITE, 4);
      OUT_RELOC(ring, query_sample(aq, stop));
      OUT_RING(ring, 0xffffffff);
      OUT_RING(ring, 0xffffffff);

      OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
   }

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   ASSERT_ALIGNED(struct fd6_query_sample, stop, 16);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, query_sample(aq, stop));

      fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);

      /* To avoid stalling in the draw buffer, emit the code to compute the
       * counter delta in the epilogue ring.
       */
      struct fd_ringbuffer *epilogue = fd_batch_get_tile_epilogue(batch);

      OUT_PKT7(epilogue, CP_WAIT_REG_MEM, 6);
      OUT_RING(epilogue, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                         CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
      OUT_RELOC(epilogue, query_sample(aq, stop));
      OUT_RING(epilogue, CP_WAIT_REG_MEM_3_REF(0xffffffff));
      OUT_RING(epilogue, CP_WAIT_REG_MEM_4_MASK(0xffffffff));
      OUT_RING(epilogue, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

      /* result += stop - start: */
      OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9);
      OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */
      OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */
      OUT_RELOC(epilogue, query_sample(aq, stop));   /* srcB */
      OUT_RELOC(epilogue, query_sample(aq, start));  /* srcC */
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
         CP_EVENT_WRITE7_0(
            .event = ZPASS_DONE,
            .write_sample_count = true,
         ),
         EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, stop)),
      );
      OUT_PKT(ring, CP_EVENT_WRITE7,
         CP_EVENT_WRITE7_0(
            .event = ZPASS_DONE,
            .write_sample_count = true,
            .sample_count_end_offset = true,
            .write_accum_sample_count_diff = true,
         ),
         /* Note: SQE is adding offsets to the iova: SAMPLE_COUNT_END_OFFSET
          * causes the result to be written to iova+16, and
          * WRITE_ACCUM_SAMP_COUNT_DIFF does *(iova + 8) += *(iova + 16) - *iova
          *
          * It just so happens this is the layout we already use for
          * start/result/stop, so we just give the start address in all
          * cases.
          */
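         /* Concretely, with iova = &start: the struct layout puts start at
          * +0, result at +8, and stop at +16, so this one packet writes the
          * current sample count to 'stop' and accumulates
          * result += stop - start, matching the pre-a7xx CP_MEM_TO_MEM path.
          */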
         EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
      );
   }
}

static void
occlusion_counter_result(struct fd_acc_query *aq,
                         struct fd_acc_query_sample *s,
                         union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->u64 = sp->result;
}

static void
occlusion_counter_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                                  enum pipe_query_value_type result_type,
                                  int index, struct fd_resource *dst,
                                  unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}

static void
occlusion_predicate_result(struct fd_acc_query *aq,
                           struct fd_acc_query_sample *s,
                           union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->b = !!sp->result;
}

static void
occlusion_predicate_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                                    enum pipe_query_value_type result_type,
                                    int index, struct fd_resource *dst,
                                    unsigned offset)
{
   /* This is a bit annoying, but we need to turn the result into a one or
    * zero: use a CP_COND_WRITE to overwrite the result with a one if it
    * is non-zero. This doesn't change the results if the query is also
    * read on the CPU (ie. occlusion_predicate_result()).
    */
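   /* Packet layout refresher: REF=0 with FUNCTION(WRITE_NE) means "write if
    * *poll_addr != 0", and the final two dwords are the 64b value that gets
    * written (lo=1, hi=0).
    */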
   OUT_PKT7(ring, CP_COND_WRITE5, 9);
   OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
                  CP_COND_WRITE5_0_POLL(POLL_MEMORY) |
                  CP_COND_WRITE5_0_WRITE_MEMORY);
   OUT_RELOC(ring, query_sample(aq, result)); /* POLL_ADDR_LO/HI */
   OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
   OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
   OUT_RELOC(ring, query_sample(aq, result)); /* WRITE_ADDR_LO/HI */
   OUT_RING(ring, 1);
   OUT_RING(ring, 0);

   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}

template <chip CHIP>
static const struct fd_acc_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume<CHIP>,
   .pause = occlusion_pause<CHIP>,
   .result = occlusion_counter_result,
   .result_resource = occlusion_counter_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume<CHIP>,
   .pause = occlusion_pause<CHIP>,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume<CHIP>,
   .pause = occlusion_pause<CHIP>,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};

/*
 * Timestamp Queries:
 */

template <chip CHIP>
static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   fd6_record_ts<CHIP>(ring, query_sample(aq, start));
}

template <chip CHIP>
static void
time_elapsed_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   fd6_record_ts<CHIP>(ring, query_sample(aq, stop));

   OUT_WFI5(ring);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}

static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
   /* We captured a timestamp in timestamp_resume(), nothing to do here. */
}

/* timestamp logging for u_trace: */
template <chip CHIP>
static void
record_timestamp(struct fd_ringbuffer *ring, struct fd_bo *bo, unsigned offset)
{
   fd6_record_ts<CHIP>(ring, bo, offset, 0, 0);
}

static void
time_elapsed_accumulate_result(struct fd_acc_query *aq,
                               struct fd_acc_query_sample *s,
                               union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->u64 = ticks_to_ns(sp->result);
}

static void
time_elapsed_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                             enum pipe_query_value_type result_type,
                             int index, struct fd_resource *dst,
                             unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}

static void
timestamp_accumulate_result(struct fd_acc_query *aq,
                            struct fd_acc_query_sample *s,
                            union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->u64 = ticks_to_ns(sp->start);
}

static void
timestamp_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                          enum pipe_query_value_type result_type,
                          int index, struct fd_resource *dst,
                          unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, start));
}

template <chip CHIP>
static const struct fd_acc_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume<CHIP>,
   .pause = time_elapsed_pause<CHIP>,
   .result = time_elapsed_accumulate_result,
   .result_resource = time_elapsed_result_resource,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler. But it is needed by the qapitrace profile heatmap. If you
 * add in a binning pass, the results get even more nonsensical. So
 * we just return the timestamp on the last tile and hope that is
 * kind of good enough.
 */

template <chip CHIP>
static const struct fd_acc_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume<CHIP>,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,
   .result_resource = timestamp_result_resource,
};

struct PACKED fd6_pipeline_stats_sample {
   struct fd_acc_query_sample base;

   uint64_t start, stop, result;
};
FD_DEFINE_CAST(fd_acc_query_sample, fd6_pipeline_stats_sample);

#define stats_reloc(ring, aq, field) \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \
             offsetof(struct fd6_pipeline_stats_sample, field), 0, 0);
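/* Note that unlike query_sample()/query_sample_idx() above, which expand to
 * just the bo/offset argument list, this macro emits the whole OUT_RELOC()
 * itself.
 */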

/* Mapping of counters to pipeline stats:
 *
 *   Gallium (PIPE_STAT_QUERY_x) | Vulkan (VK_QUERY_PIPELINE_STATISTIC_x_BIT) | hw counter
 *   ----------------------------+--------------------------------------------+----------------
 *   IA_VERTICES                 | INPUT_ASSEMBLY_VERTICES                    | RBBM_PRIMCTR_0
 *   IA_PRIMITIVES               | INPUT_ASSEMBLY_PRIMITIVES                  | RBBM_PRIMCTR_1
 *   VS_INVOCATIONS              | VERTEX_SHADER_INVOCATIONS                  | RBBM_PRIMCTR_0
 *   GS_INVOCATIONS              | GEOMETRY_SHADER_INVOCATIONS                | RBBM_PRIMCTR_5
 *   GS_PRIMITIVES               | GEOMETRY_SHADER_PRIMITIVES                 | RBBM_PRIMCTR_6
 *   C_INVOCATIONS               | CLIPPING_INVOCATIONS                       | RBBM_PRIMCTR_7
 *   C_PRIMITIVES                | CLIPPING_PRIMITIVES                        | RBBM_PRIMCTR_8
 *   PS_INVOCATIONS              | FRAGMENT_SHADER_INVOCATIONS                | RBBM_PRIMCTR_9
 *   HS_INVOCATIONS              | TESSELLATION_CONTROL_SHADER_PATCHES        | RBBM_PRIMCTR_2
 *   DS_INVOCATIONS              | TESSELLATION_EVALUATION_SHADER_INVOCATIONS | RBBM_PRIMCTR_4
 *   CS_INVOCATIONS              | COMPUTE_SHADER_INVOCATIONS                 | RBBM_PRIMCTR_10
 *
 * Note that while the spec allows that "Vertices corresponding to incomplete
 * primitives may contribute to the count", in our case they do not, so
 * IA_VERTICES and VS_INVOCATIONS are the same thing.
 */
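/* The hw counter column is what stats_counter_index() below returns; each
 * RBBM_PRIMCTR_n is a 64b LO/HI register pair, hence the
 * REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx) addressing (with CNT(2)) when
 * snapshotting the counters.
 */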

enum stats_type {
   STATS_PRIMITIVE,
   STATS_FRAGMENT,
   STATS_COMPUTE,
};

static const struct {
   enum fd_gpu_event start, stop;
} stats_counter_events[] = {
   [STATS_PRIMITIVE] = { FD_START_PRIMITIVE_CTRS, FD_STOP_PRIMITIVE_CTRS },
   [STATS_FRAGMENT]  = { FD_START_FRAGMENT_CTRS,  FD_STOP_FRAGMENT_CTRS },
   [STATS_COMPUTE]   = { FD_START_COMPUTE_CTRS,   FD_STOP_COMPUTE_CTRS },
};

static enum stats_type
get_stats_type(struct fd_acc_query *aq)
{
   if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
      return STATS_PRIMITIVE;

   switch (aq->base.index) {
   case PIPE_STAT_QUERY_PS_INVOCATIONS: return STATS_FRAGMENT;
   case PIPE_STAT_QUERY_CS_INVOCATIONS: return STATS_COMPUTE;
   default:
      return STATS_PRIMITIVE;
   }
}

static unsigned
stats_counter_index(struct fd_acc_query *aq)
{
   if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
      return 7;

   switch (aq->base.index) {
   case PIPE_STAT_QUERY_IA_VERTICES:    return 0;
   case PIPE_STAT_QUERY_IA_PRIMITIVES:  return 1;
   case PIPE_STAT_QUERY_VS_INVOCATIONS: return 0;
   case PIPE_STAT_QUERY_GS_INVOCATIONS: return 5;
   case PIPE_STAT_QUERY_GS_PRIMITIVES:  return 6;
   case PIPE_STAT_QUERY_C_INVOCATIONS:  return 7;
   case PIPE_STAT_QUERY_C_PRIMITIVES:   return 8;
   case PIPE_STAT_QUERY_PS_INVOCATIONS: return 9;
   case PIPE_STAT_QUERY_HS_INVOCATIONS: return 2;
   case PIPE_STAT_QUERY_DS_INVOCATIONS: return 4;
   case PIPE_STAT_QUERY_CS_INVOCATIONS: return 10;
   default:
      return 0;
   }
}

static void
log_pipeline_stats(struct fd6_pipeline_stats_sample *ps, unsigned idx)
{
#ifdef DEBUG_COUNTERS
   const char *labels[] = {
      "VS_INVOCATIONS",
      "IA_PRIMITIVES",
      "HS_INVOCATIONS",
      "??",
      "DS_INVOCATIONS",
      "GS_INVOCATIONS",
      "GS_PRIMITIVES",
      "C_INVOCATIONS",
      "C_PRIMITIVES",
      "PS_INVOCATIONS",
      "CS_INVOCATIONS",
   };

   mesa_logd("  counter\t\tstart\t\t\tstop\t\t\tdiff");
   mesa_logd("  RBBM_PRIMCTR_%d\t0x%016" PRIx64 "\t0x%016" PRIx64 "\t%" PRIi64 "\t%s",
             idx, ps->start, ps->stop, ps->stop - ps->start, labels[idx]);
#endif
}

template <chip CHIP>
static void
pipeline_stats_resume(struct fd_acc_query *aq, struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;
   enum stats_type type = get_stats_type(aq);
   unsigned idx = stats_counter_index(aq);
   unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);

   OUT_WFI5(ring);

   OUT_PKT7(ring, CP_REG_TO_MEM, 3);
   OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_REG(reg));
   stats_reloc(ring, aq, start);

   assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));

   if (!batch->pipeline_stats_queries_active[type])
      fd6_event_write<CHIP>(batch->ctx, ring, stats_counter_events[type].start);
   batch->pipeline_stats_queries_active[type]++;
}

template <chip CHIP>
static void
pipeline_stats_pause(struct fd_acc_query *aq, struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;
   enum stats_type type = get_stats_type(aq);
   unsigned idx = stats_counter_index(aq);
   unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);

   OUT_WFI5(ring);

   /* snapshot the end values: */
   OUT_PKT7(ring, CP_REG_TO_MEM, 3);
   OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_REG(reg));
   stats_reloc(ring, aq, stop);

   assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));
   assert(batch->pipeline_stats_queries_active[type] > 0);

   batch->pipeline_stats_queries_active[type]--;
   if (!batch->pipeline_stats_queries_active[type])
      fd6_event_write<CHIP>(batch->ctx, ring, stats_counter_events[type].stop);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x40000000);
   stats_reloc(ring, aq, result);
   stats_reloc(ring, aq, result);
   stats_reloc(ring, aq, stop);
   stats_reloc(ring, aq, start);
}

static void
pipeline_stats_result(struct fd_acc_query *aq,
                      struct fd_acc_query_sample *s,
                      union pipe_query_result *result)
{
   struct fd6_pipeline_stats_sample *ps = fd6_pipeline_stats_sample(s);

   log_pipeline_stats(ps, stats_counter_index(aq));

   result->u64 = ps->result;
}

static void
pipeline_stats_result_resource(struct fd_acc_query *aq,
                               struct fd_ringbuffer *ring,
                               enum pipe_query_value_type result_type,
                               int index, struct fd_resource *dst,
                               unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_pipeline_stats_sample, result));
}

template <chip CHIP>
static const struct fd_acc_sample_provider primitives_generated = {
   .query_type = PIPE_QUERY_PRIMITIVES_GENERATED,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume<CHIP>,
   .pause = pipeline_stats_pause<CHIP>,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider pipeline_statistics_single = {
   .query_type = PIPE_QUERY_PIPELINE_STATISTICS_SINGLE,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume<CHIP>,
   .pause = pipeline_stats_pause<CHIP>,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};

struct PACKED fd6_primitives_sample {
   struct fd_acc_query_sample base;

   /* VPC_SO_STREAM_COUNTS dest address must be 32b aligned: */
   uint64_t pad[3];

   struct {
      uint64_t emitted, generated;
   } start[4], stop[4], result;
};
FD_DEFINE_CAST(fd_acc_query_sample, fd6_primitives_sample);
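/* The FD_WRITE_PRIMITIVE_COUNTS event appears to dump one counter pair per
 * SO stream (matching this struct's layout), so start[]/stop[] have an entry
 * per stream (PIPE_MAX_SO_BUFFERS == 4), while 'result' accumulates a single
 * pair.
 */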

#define primitives_reloc(ring, aq, field) \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \
             __offsetof(struct fd6_primitives_sample, field), 0, 0);

static void
log_primitives_sample(struct fd6_primitives_sample *ps)
{
#ifdef DEBUG_COUNTERS
   mesa_logd("  so counts");
   for (int i = 0; i < ARRAY_SIZE(ps->start); i++) {
      mesa_logd("  CHANNEL %d emitted\t0x%016" PRIx64 "\t0x%016" PRIx64
                "\t%" PRIi64,
                i, ps->start[i].generated, ps->stop[i].generated,
                ps->stop[i].generated - ps->start[i].generated);
      mesa_logd("  CHANNEL %d generated\t0x%016" PRIx64 "\t0x%016" PRIx64
                "\t%" PRIi64,
                i, ps->start[i].emitted, ps->stop[i].emitted,
                ps->stop[i].emitted - ps->start[i].emitted);
   }

   mesa_logd("generated %" PRIu64 ", emitted %" PRIu64, ps->result.generated,
             ps->result.emitted);
#endif
}

template <chip CHIP>
static void
primitives_emitted_resume(struct fd_acc_query *aq,
                          struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   ASSERT_ALIGNED(struct fd6_primitives_sample, start[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, start[0]);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_WRITE_PRIMITIVE_COUNTS);
}

static void
accumulate_primitives_emitted(struct fd_acc_query *aq,
                              struct fd_ringbuffer *ring,
                              int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.emitted);
   primitives_reloc(ring, aq, result.emitted);
   primitives_reloc(ring, aq, stop[idx].emitted);
   primitives_reloc(ring, aq, start[idx].emitted);
}

static void
accumulate_primitives_generated(struct fd_acc_query *aq,
                                struct fd_ringbuffer *ring,
                                int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, stop[idx].generated);
   primitives_reloc(ring, aq, start[idx].generated);
}

template <chip CHIP>
static void
primitives_emitted_pause(struct fd_acc_query *aq,
                         struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   ASSERT_ALIGNED(struct fd6_primitives_sample, stop[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, stop[0]);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_WRITE_PRIMITIVE_COUNTS);
   fd6_event_write<CHIP>(batch->ctx, ring, FD_CACHE_CLEAN);

   if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      /* Need results from all channels: */
      for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
         accumulate_primitives_emitted(aq, ring, i);
         accumulate_primitives_generated(aq, ring, i);
      }
   } else {
      accumulate_primitives_emitted(aq, ring, aq->base.index);
      /* Only need primitives generated counts for the overflow queries: */
      if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
         accumulate_primitives_generated(aq, ring, aq->base.index);
   }
}

static void
primitives_emitted_result(struct fd_acc_query *aq,
                          struct fd_acc_query_sample *s,
                          union pipe_query_result *result)
{
   struct fd6_primitives_sample *ps = fd6_primitives_sample(s);

   log_primitives_sample(ps);

   result->u64 = ps->result.emitted;
}

static void
primitives_emitted_result_resource(struct fd_acc_query *aq,
                                   struct fd_ringbuffer *ring,
                                   enum pipe_query_value_type result_type,
                                   int index, struct fd_resource *dst,
                                   unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_primitives_sample, result.emitted));
}

static void
so_overflow_predicate_result(struct fd_acc_query *aq,
                             struct fd_acc_query_sample *s,
                             union pipe_query_result *result)
{
   struct fd6_primitives_sample *ps = fd6_primitives_sample(s);

   log_primitives_sample(ps);

   result->b = ps->result.emitted != ps->result.generated;
}

static void
so_overflow_predicate_result_resource(struct fd_acc_query *aq,
                                      struct fd_ringbuffer *ring,
                                      enum pipe_query_value_type result_type,
                                      int index, struct fd_resource *dst,
                                      unsigned offset)
{
   fd_ringbuffer_attach_bo(ring, dst->bo);
   fd_ringbuffer_attach_bo(ring, fd_resource(aq->prsc)->bo);

   /* result = generated - emitted: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 7);
   OUT_RING(ring, CP_MEM_TO_MEM_0_NEG_B |
                  COND(result_type >= PIPE_QUERY_TYPE_I64, CP_MEM_TO_MEM_0_DOUBLE));
   OUT_RELOC(ring, dst->bo, offset, 0, 0);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, result.emitted);

   /* This is a bit awkward, but glcts expects the result to be 1 or 0
    * rather than non-zero vs zero:
    */
   OUT_PKT7(ring, CP_COND_WRITE5, 9);
   OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
                  CP_COND_WRITE5_0_POLL(POLL_MEMORY) |
                  CP_COND_WRITE5_0_WRITE_MEMORY);
   OUT_RELOC(ring, dst->bo, offset, 0, 0); /* POLL_ADDR_LO/HI */
   OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
   OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
   OUT_RELOC(ring, dst->bo, offset, 0, 0); /* WRITE_ADDR_LO/HI */
   OUT_RING(ring, 1);
   OUT_RING(ring, 0);
}

template <chip CHIP>
static const struct fd_acc_sample_provider primitives_emitted = {
   .query_type = PIPE_QUERY_PRIMITIVES_EMITTED,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume<CHIP>,
   .pause = primitives_emitted_pause<CHIP>,
   .result = primitives_emitted_result,
   .result_resource = primitives_emitted_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider so_overflow_any_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume<CHIP>,
   .pause = primitives_emitted_pause<CHIP>,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider so_overflow_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume<CHIP>,
   .pause = primitives_emitted_pause<CHIP>,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};

/*
 * Performance Counter (batch) queries:
 *
 * Only one of these is active at a time, per design of the gallium
 * batch_query API. One perfcntr query tracks N query_types, each of
 * which has a 'fd_batch_query_entry' that maps it back to the
 * associated group and counter.
 */

struct fd_batch_query_entry {
   uint8_t gid; /* group-id */
   uint8_t cid; /* countable-id within the group */
};

struct fd_batch_query_data {
   struct fd_screen *screen;
   unsigned num_query_entries;
   struct fd_batch_query_entry query_entries[];
};
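/* For example (hypothetical values), a batch query tracking two countables
 * from group 1 and one from group 3 would have num_query_entries = 3 and
 * query_entries[] = { {1, 0}, {1, 3}, {3, 1} }; perfcntr_resume() then
 * assigns each entry the next free physical counter within its group.
 */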

static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   OUT_WFI5(ring);

   /* configure performance counters for the requested queries: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;

      assert(counter_idx < g->num_counters);

      OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
      OUT_RING(ring, g->countables[entry->cid].selector);
   }

   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* and snapshot the start values */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, start));
   }
}

static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   OUT_WFI5(ring);

   /* TODO do we need to bother to turn anything off? */

   /* snapshot the end values: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));
   }

   /* and compute the result: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      /* result += stop - start: */
      OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
      OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
      OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
   }
}

static void
perfcntr_accumulate_result(struct fd_acc_query *aq,
                           struct fd_acc_query_sample *s,
                           union pipe_query_result *result)
{
   struct fd_batch_query_data *data =
      (struct fd_batch_query_data *)aq->query_data;
   struct fd6_query_sample *sp = fd6_query_sample(s);

   for (unsigned i = 0; i < data->num_query_entries; i++) {
      result->batch[i].u64 = sp[i].result;
   }
}

static const struct fd_acc_sample_provider perfcntr = {
   .query_type = FD_QUERY_FIRST_PERFCNTR,
   .always = true,
   .resume = perfcntr_resume,
   .pause = perfcntr_pause,
   .result = perfcntr_accumulate_result,
};

static struct pipe_query *
fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
                       unsigned *query_types)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct fd_query *q;
   struct fd_acc_query *aq;
   struct fd_batch_query_data *data;

   data = CALLOC_VARIANT_LENGTH_STRUCT(
      fd_batch_query_data, num_queries * sizeof(data->query_entries[0]));

   data->screen = screen;
   data->num_query_entries = num_queries;

   /* validate the requested query_types and ensure we don't try
    * to request more query_types of a given group than we have
    * counters:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   for (unsigned i = 0; i < num_queries; i++) {
      unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;

      /* verify valid query_type, ie. is it actually a perfcntr? */
      if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
          (idx >= screen->num_perfcntr_queries)) {
         mesa_loge("invalid batch query query_type: %u", query_types[i]);
         goto error;
      }

      struct fd_batch_query_entry *entry = &data->query_entries[i];
      struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];

      entry->gid = pq->group_id;

      /* the perfcntr_queries[] table flattens all the countables
       * for each group in series, ie:
       *
       *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
       *
       * So to find the countable index just step back through the
       * table to find the first entry with the same group-id.
       */
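      /* E.g. with a hypothetical perfcntr_queries[] layout of
       *
       *   { (G0,C0), (G0,C1), (G1,C0), (G1,C1) }
       *
       * an idx pointing at (G1,C1) steps back over one other group-1
       * entry, yielding entry->cid = 1.
       */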
      while (pq > screen->perfcntr_queries) {
         pq--;
         if (pq->group_id == entry->gid)
            entry->cid++;
      }

      if (counters_per_group[entry->gid] >=
          screen->perfcntr_groups[entry->gid].num_counters) {
         mesa_loge("too many counters for group %u", entry->gid);
         goto error;
      }

      counters_per_group[entry->gid]++;
   }

   q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
   aq = fd_acc_query(q);

   /* sample buffer size is based on # of queries: */
   aq->size = num_queries * sizeof(struct fd6_query_sample);
   aq->query_data = data;

   return (struct pipe_query *)q;

error:
   free(data);
   return NULL;
}

template <chip CHIP>
void
fd6_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->create_query = fd_acc_create_query;
   ctx->query_update_batch = fd_acc_query_update_batch;

   ctx->record_timestamp = record_timestamp<CHIP>;
   ctx->ts_to_ns = ticks_to_ns;

   pctx->create_batch_query = fd6_create_batch_query;

   fd_acc_query_register_provider(pctx, &occlusion_counter<CHIP>);
   fd_acc_query_register_provider(pctx, &occlusion_predicate<CHIP>);
   fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative<CHIP>);

   fd_acc_query_register_provider(pctx, &time_elapsed<CHIP>);
   fd_acc_query_register_provider(pctx, &timestamp<CHIP>);

   fd_acc_query_register_provider(pctx, &primitives_generated<CHIP>);
   fd_acc_query_register_provider(pctx, &pipeline_statistics_single<CHIP>);

   fd_acc_query_register_provider(pctx, &primitives_emitted<CHIP>);
   fd_acc_query_register_provider(pctx, &so_overflow_any_predicate<CHIP>);
   fd_acc_query_register_provider(pctx, &so_overflow_predicate<CHIP>);
}
FD_GENX(fd6_query_context_init);