1 /*
2 * Copyright 2021 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ac_spm.h"
8
9 #include "util/bitscan.h"
10 #include "util/u_memory.h"
11 #include "ac_perfcounter.h"
12
/* SPM counters definition. */
/* GFX10+ */
/* Each descriptor is {GPU block, event selector ID}; the numeric value is the
 * block's PERF_SEL event index (see ac_spm_counter_descr). The names below
 * reflect what the event is presumed to count — confirm against the PERF_SEL
 * tables for each block.
 */
static struct ac_spm_counter_descr gfx10_num_l2_hits = {TCP, 0x9};
static struct ac_spm_counter_descr gfx10_num_l2_misses = {TCP, 0x12};
static struct ac_spm_counter_descr gfx10_num_scache_hits = {SQ, 0x14f};
static struct ac_spm_counter_descr gfx10_num_scache_misses = {SQ, 0x150};
static struct ac_spm_counter_descr gfx10_num_scache_misses_dup = {SQ, 0x151};
static struct ac_spm_counter_descr gfx10_num_icache_hits = {SQ, 0x12c};
static struct ac_spm_counter_descr gfx10_num_icache_misses = {SQ, 0x12d};
static struct ac_spm_counter_descr gfx10_num_icache_misses_dup = {SQ, 0x12e};
static struct ac_spm_counter_descr gfx10_num_gl1c_hits = {GL1C, 0xe};
static struct ac_spm_counter_descr gfx10_num_gl1c_misses = {GL1C, 0x12};
static struct ac_spm_counter_descr gfx10_num_gl2c_hits = {GL2C, 0x3};
static struct ac_spm_counter_descr gfx10_num_gl2c_misses = {GL2C, 0x23};

/* Default set of SPM counters enabled on GFX10; one counter is later
 * instantiated per global instance of each block (see ac_init_spm).
 */
static struct ac_spm_counter_create_info gfx10_spm_counters[] = {
   {&gfx10_num_l2_hits},
   {&gfx10_num_l2_misses},
   {&gfx10_num_scache_hits},
   {&gfx10_num_scache_misses},
   {&gfx10_num_scache_misses_dup},
   {&gfx10_num_icache_hits},
   {&gfx10_num_icache_misses},
   {&gfx10_num_icache_misses_dup},
   {&gfx10_num_gl1c_hits},
   {&gfx10_num_gl1c_misses},
   {&gfx10_num_gl2c_hits},
   {&gfx10_num_gl2c_misses},
};
42
/* GFX10.3+ */
/* GL2C miss event moved to a different selector on GFX10.3 (0x2b vs 0x23). */
static struct ac_spm_counter_descr gfx103_num_gl2c_misses = {GL2C, 0x2b};

/* Same default counter set as GFX10, with the GFX10.3 GL2C miss selector. */
static struct ac_spm_counter_create_info gfx103_spm_counters[] = {
   {&gfx10_num_l2_hits},
   {&gfx10_num_l2_misses},
   {&gfx10_num_scache_hits},
   {&gfx10_num_scache_misses},
   {&gfx10_num_scache_misses_dup},
   {&gfx10_num_icache_hits},
   {&gfx10_num_icache_misses},
   {&gfx10_num_icache_misses_dup},
   {&gfx10_num_gl1c_hits},
   {&gfx10_num_gl1c_misses},
   {&gfx10_num_gl2c_hits},
   {&gfx103_num_gl2c_misses},
};
60
/* GFX11+ */
/* On GFX11 the scalar/instruction cache events live in the SQ_WGP block
 * (per-WGP) instead of SQ, and the TCP miss selector changed.
 */
static struct ac_spm_counter_descr gfx11_num_l2_misses = {TCP, 0x11};
static struct ac_spm_counter_descr gfx11_num_scache_hits = {SQ_WGP, 0x126};
static struct ac_spm_counter_descr gfx11_num_scache_misses = {SQ_WGP, 0x127};
static struct ac_spm_counter_descr gfx11_num_scache_misses_dup = {SQ_WGP, 0x128};
static struct ac_spm_counter_descr gfx11_num_icache_hits = {SQ_WGP, 0x10e};
static struct ac_spm_counter_descr gfx11_num_icache_misses = {SQ_WGP, 0x10f};
static struct ac_spm_counter_descr gfx11_num_icache_misses_dup = {SQ_WGP, 0x110};

/* Default set of SPM counters enabled on GFX11/GFX11.5. */
static struct ac_spm_counter_create_info gfx11_spm_counters[] = {
   {&gfx10_num_l2_hits},
   {&gfx11_num_l2_misses},
   {&gfx11_num_scache_hits},
   {&gfx11_num_scache_misses},
   {&gfx11_num_scache_misses_dup},
   {&gfx11_num_icache_hits},
   {&gfx11_num_icache_misses},
   {&gfx11_num_icache_misses_dup},
   {&gfx10_num_gl1c_hits},
   {&gfx10_num_gl1c_misses},
   {&gfx10_num_gl2c_hits},
   {&gfx103_num_gl2c_misses},
};
84
85 static struct ac_spm_block_select *
ac_spm_get_block_select(struct ac_spm * spm,const struct ac_pc_block * block)86 ac_spm_get_block_select(struct ac_spm *spm, const struct ac_pc_block *block)
87 {
88 struct ac_spm_block_select *block_sel, *new_block_sel;
89 uint32_t num_block_sel;
90
91 for (uint32_t i = 0; i < spm->num_block_sel; i++) {
92 if (spm->block_sel[i].b->b->b->gpu_block == block->b->b->gpu_block)
93 return &spm->block_sel[i];
94 }
95
96 /* Allocate a new select block if it doesn't already exist. */
97 num_block_sel = spm->num_block_sel + 1;
98 block_sel = realloc(spm->block_sel, num_block_sel * sizeof(*block_sel));
99 if (!block_sel)
100 return NULL;
101
102 spm->num_block_sel = num_block_sel;
103 spm->block_sel = block_sel;
104
105 /* Initialize the new select block. */
106 new_block_sel = &spm->block_sel[spm->num_block_sel - 1];
107 memset(new_block_sel, 0, sizeof(*new_block_sel));
108
109 new_block_sel->b = block;
110 new_block_sel->instances =
111 calloc(block->num_global_instances, sizeof(*new_block_sel->instances));
112 if (!new_block_sel->instances)
113 return NULL;
114 new_block_sel->num_instances = block->num_global_instances;
115
116 for (unsigned i = 0; i < new_block_sel->num_instances; i++)
117 new_block_sel->instances[i].num_counters = block->b->b->num_spm_counters;
118
119 return new_block_sel;
120 }
121
/* Decomposition of a flat counter instance ID into the hardware hierarchy
 * used to program GRBM_GFX_INDEX and the muxsel (see
 * ac_spm_init_instance_mapping).
 */
struct ac_spm_instance_mapping {
   uint32_t se_index;       /* SE index or 0 if global */
   uint32_t sa_index;       /* SA index or 0 if global or per-SE */
   uint32_t instance_index; /* instance within the SE/SA (or global instance) */
};
127
128 static bool
ac_spm_init_instance_mapping(const struct radeon_info * info,const struct ac_pc_block * block,const struct ac_spm_counter_info * counter,struct ac_spm_instance_mapping * mapping)129 ac_spm_init_instance_mapping(const struct radeon_info *info,
130 const struct ac_pc_block *block,
131 const struct ac_spm_counter_info *counter,
132 struct ac_spm_instance_mapping *mapping)
133 {
134 uint32_t instance_index = 0, se_index = 0, sa_index = 0;
135
136 if (block->b->b->flags & AC_PC_BLOCK_SE) {
137 if (block->b->b->gpu_block == SQ) {
138 /* Per-SE blocks. */
139 se_index = counter->instance / block->num_instances;
140 instance_index = counter->instance % block->num_instances;
141 } else {
142 /* Per-SA blocks. */
143 assert(block->b->b->gpu_block == GL1C ||
144 block->b->b->gpu_block == TCP ||
145 block->b->b->gpu_block == SQ_WGP);
146 se_index = (counter->instance / block->num_instances) / info->max_sa_per_se;
147 sa_index = (counter->instance / block->num_instances) % info->max_sa_per_se;
148 instance_index = counter->instance % block->num_instances;
149 }
150 } else {
151 /* Global blocks. */
152 assert(block->b->b->gpu_block == GL2C);
153 instance_index = counter->instance;
154 }
155
156 if (se_index >= info->num_se ||
157 sa_index >= info->max_sa_per_se ||
158 instance_index >= block->num_instances)
159 return false;
160
161 mapping->se_index = se_index;
162 mapping->sa_index = sa_index;
163 mapping->instance_index = instance_index;
164
165 return true;
166 }
167
168 static void
ac_spm_init_muxsel(const struct radeon_info * info,const struct ac_pc_block * block,const struct ac_spm_instance_mapping * mapping,struct ac_spm_counter_info * counter,uint32_t spm_wire)169 ac_spm_init_muxsel(const struct radeon_info *info,
170 const struct ac_pc_block *block,
171 const struct ac_spm_instance_mapping *mapping,
172 struct ac_spm_counter_info *counter,
173 uint32_t spm_wire)
174 {
175 const uint16_t counter_idx = 2 * spm_wire + (counter->is_even ? 0 : 1);
176 union ac_spm_muxsel *muxsel = &counter->muxsel;
177
178 if (info->gfx_level >= GFX11) {
179 muxsel->gfx11.counter = counter_idx;
180 muxsel->gfx11.block = block->b->b->spm_block_select;
181 muxsel->gfx11.shader_array = mapping->sa_index;
182 muxsel->gfx11.instance = mapping->instance_index;
183 } else {
184 muxsel->gfx10.counter = counter_idx;
185 muxsel->gfx10.block = block->b->b->spm_block_select;
186 muxsel->gfx10.shader_array = mapping->sa_index;
187 muxsel->gfx10.instance = mapping->instance_index;
188 }
189 }
190
191 static uint32_t
ac_spm_init_grbm_gfx_index(const struct ac_pc_block * block,const struct ac_spm_instance_mapping * mapping)192 ac_spm_init_grbm_gfx_index(const struct ac_pc_block *block,
193 const struct ac_spm_instance_mapping *mapping)
194 {
195 uint32_t instance = mapping->instance_index;
196 uint32_t grbm_gfx_index = 0;
197
198 grbm_gfx_index |= S_030800_SE_INDEX(mapping->se_index) |
199 S_030800_SH_INDEX(mapping->sa_index);
200
201 switch (block->b->b->gpu_block) {
202 case GL2C:
203 /* Global blocks. */
204 grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);
205 break;
206 case SQ:
207 /* Per-SE blocks. */
208 grbm_gfx_index |= S_030800_SH_BROADCAST_WRITES(1);
209 break;
210 default:
211 /* Other blocks shouldn't broadcast. */
212 break;
213 }
214
215 if (block->b->b->gpu_block == SQ_WGP) {
216 union {
217 struct {
218 uint32_t block_index : 2; /* Block index withing WGP */
219 uint32_t wgp_index : 3;
220 uint32_t is_below_spi : 1; /* 0: lower WGP numbers, 1: higher WGP numbers */
221 uint32_t reserved : 26;
222 };
223
224 uint32_t value;
225 } instance_index = {0};
226
227 const uint32_t num_wgp_above_spi = 4;
228 const bool is_below_spi = mapping->instance_index >= num_wgp_above_spi;
229
230 instance_index.wgp_index =
231 is_below_spi ? (mapping->instance_index - num_wgp_above_spi) : mapping->instance_index;
232 instance_index.is_below_spi = is_below_spi;
233
234 instance = instance_index.value;
235 }
236
237 grbm_gfx_index |= S_030800_INSTANCE_INDEX(instance);
238
239 return grbm_gfx_index;
240 }
241
/* Maps a counter onto a free hardware counter select of its block instance
 * and reports the SPM wire index that will stream its value. Also decides
 * whether the counter lands in the even or odd 16-bit half of the wire
 * (counter->is_even). Returns false when no free slot is available.
 *
 * Three hardware layouts are handled:
 *  - SQ_WGP (GFX11+): per-WGP selects, one 32-bit wire shared by a pair of
 *    16-bit selects.
 *  - SQ: per-SE selects, 32-bit counters only, one wire per select.
 *  - Generic blocks: up to four 16-bit selects packed into two select
 *    registers, two counters per wire.
 */
static bool
ac_spm_map_counter(struct ac_spm *spm, struct ac_spm_block_select *block_sel,
                   struct ac_spm_counter_info *counter,
                   const struct ac_spm_instance_mapping *mapping,
                   uint32_t *spm_wire)
{
   uint32_t instance = counter->instance;

   if (block_sel->b->b->b->gpu_block == SQ_WGP) {
      /* Program GRBM_GFX_INDEX for this WGP once, on first use. */
      if (!spm->sq_wgp[instance].grbm_gfx_index) {
         spm->sq_wgp[instance].grbm_gfx_index =
            ac_spm_init_grbm_gfx_index(block_sel->b, mapping);
      }

      for (unsigned i = 0; i < ARRAY_SIZE(spm->sq_wgp[instance].counters); i++) {
         struct ac_spm_counter_select *cntr_sel = &spm->sq_wgp[instance].counters[i];

         /* Skip selects already in use; num_counters grows monotonically,
          * so the first free select is at index num_counters.
          */
         if (i < spm->sq_wgp[instance].num_counters)
            continue;

         cntr_sel->sel0 |= S_036700_PERF_SEL(counter->event_id) |
                           S_036700_SPM_MODE(1) | /* 16-bit clamp */
                           S_036700_PERF_MODE(0);

         /* Each SQ_WQP modules (GFX11+) share one 32-bit accumulator/wire
          * per pair of selects.
          */
         cntr_sel->active |= 1 << (i % 2);
         *spm_wire = i / 2;

         /* Even select indices land in the low (even) half of the wire. */
         if (cntr_sel->active & 0x1)
            counter->is_even = true;

         spm->sq_wgp[instance].num_counters++;
         return true;
      }
   } else if (block_sel->b->b->b->gpu_block == SQ) {
      for (unsigned i = 0; i < ARRAY_SIZE(spm->sqg[instance].counters); i++) {
         struct ac_spm_counter_select *cntr_sel = &spm->sqg[instance].counters[i];

         /* First free select is at index num_counters (see above). */
         if (i < spm->sqg[instance].num_counters)
            continue;

         /* SQ doesn't support 16-bit counters. */
         cntr_sel->sel0 |= S_036700_PERF_SEL(counter->event_id) |
                           S_036700_SPM_MODE(3) | /* 32-bit clamp */
                           S_036700_PERF_MODE(0);
         cntr_sel->active |= 0x3;

         /* 32-bits counter are always even. */
         counter->is_even = true;

         /* One wire per SQ module. */
         *spm_wire = i;

         spm->sqg[instance].num_counters++;
         return true;
      }
   } else {
      /* Generic blocks. */
      struct ac_spm_block_instance *block_instance =
         &block_sel->instances[instance];

      /* Program GRBM_GFX_INDEX for this instance once, on first use. */
      if (!block_instance->grbm_gfx_index) {
         block_instance->grbm_gfx_index =
            ac_spm_init_grbm_gfx_index(block_sel->b, mapping);
      }

      for (unsigned i = 0; i < block_instance->num_counters; i++) {
         struct ac_spm_counter_select *cntr_sel = &block_instance->counters[i];
         /* Find the first free 16-bit slot (bit not set in the active mask). */
         int index = ffs(~cntr_sel->active) - 1;

         switch (index) {
         case 0: /* use S_037004_PERF_SEL */
            cntr_sel->sel0 |= S_037004_PERF_SEL(counter->event_id) |
                              S_037004_CNTR_MODE(1) | /* 16-bit clamp */
                              S_037004_PERF_MODE(0); /* accum */
            break;
         case 1: /* use S_037004_PERF_SEL1 */
            cntr_sel->sel0 |= S_037004_PERF_SEL1(counter->event_id) |
                              S_037004_PERF_MODE1(0);
            break;
         case 2: /* use S_037004_PERF_SEL2 */
            cntr_sel->sel1 |= S_037008_PERF_SEL2(counter->event_id) |
                              S_037008_PERF_MODE2(0);
            break;
         case 3: /* use S_037004_PERF_SEL3 */
            cntr_sel->sel1 |= S_037008_PERF_SEL3(counter->event_id) |
                              S_037008_PERF_MODE3(0);
            break;
         default:
            return false;
         }

         /* Mark this 16-bit counter as used. */
         cntr_sel->active |= 1 << index;

         /* Determine if the counter is even or odd. */
         counter->is_even = !(index % 2);

         /* Determine the SPM wire (one wire holds two 16-bit counters). */
         *spm_wire = !!(index >= 2);

         return true;
      }
   }

   return false;
}
351
352 static bool
ac_spm_add_counter(const struct radeon_info * info,const struct ac_perfcounters * pc,struct ac_spm * spm,const struct ac_spm_counter_create_info * counter_info)353 ac_spm_add_counter(const struct radeon_info *info,
354 const struct ac_perfcounters *pc,
355 struct ac_spm *spm,
356 const struct ac_spm_counter_create_info *counter_info)
357 {
358 struct ac_spm_instance_mapping instance_mapping = {0};
359 struct ac_spm_counter_info *counter;
360 struct ac_spm_block_select *block_sel;
361 struct ac_pc_block *block;
362 uint32_t spm_wire;
363
364 /* Check if the GPU block is valid. */
365 block = ac_pc_get_block(pc, counter_info->b->gpu_block);
366 if (!block) {
367 fprintf(stderr, "ac/spm: Invalid GPU block.\n");
368 return false;
369 }
370
371 /* Check if the number of instances is valid. */
372 if (counter_info->instance > block->num_global_instances - 1) {
373 fprintf(stderr, "ac/spm: Invalid instance ID.\n");
374 return false;
375 }
376
377 /* Check if the event ID is valid. */
378 if (counter_info->b->event_id > block->b->selectors) {
379 fprintf(stderr, "ac/spm: Invalid event ID.\n");
380 return false;
381 }
382
383 counter = &spm->counters[spm->num_counters];
384 spm->num_counters++;
385
386 counter->gpu_block = counter_info->b->gpu_block;
387 counter->event_id = counter_info->b->event_id;
388 counter->instance = counter_info->instance;
389
390 /* Get the select block used to configure the counter. */
391 block_sel = ac_spm_get_block_select(spm, block);
392 if (!block_sel)
393 return false;
394
395 /* Initialize instance mapping for the counter. */
396 if (!ac_spm_init_instance_mapping(info, block, counter, &instance_mapping)) {
397 fprintf(stderr, "ac/spm: Failed to initialize instance mapping.\n");
398 return false;
399 }
400
401 /* Map the counter to the select block. */
402 if (!ac_spm_map_counter(spm, block_sel, counter, &instance_mapping, &spm_wire)) {
403 fprintf(stderr, "ac/spm: No free slots available!\n");
404 return false;
405 }
406
407 /* Determine the counter segment type. */
408 if (block->b->b->flags & AC_PC_BLOCK_SE) {
409 counter->segment_type = instance_mapping.se_index;
410 } else {
411 counter->segment_type = AC_SPM_SEGMENT_TYPE_GLOBAL;
412 }
413
414 /* Configure the muxsel for SPM. */
415 ac_spm_init_muxsel(info, block, &instance_mapping, counter, spm_wire);
416
417 return true;
418 }
419
/* Fills the muxsel RAM image for one segment: even counters are packed on
 * even lines (0, 2, 4, ...) and odd counters on odd lines (1, 3, 5, ...),
 * AC_SPM_NUM_COUNTER_PER_MUXSEL slots per line. 'offset' is this segment's
 * first line within the whole muxsel RAM and is used to compute each
 * counter's absolute 16-bit slot offset in the sample data.
 */
static void
ac_spm_fill_muxsel_ram(const struct radeon_info *info,
                       struct ac_spm *spm,
                       enum ac_spm_segment_type segment_type,
                       uint32_t offset)
{
   struct ac_spm_muxsel_line *mappings = spm->muxsel_lines[segment_type];
   uint32_t even_counter_idx = 0, even_line_idx = 0;
   uint32_t odd_counter_idx = 0, odd_line_idx = 1;

   /* Add the global timestamps first. */
   if (segment_type == AC_SPM_SEGMENT_TYPE_GLOBAL) {
      if (info->gfx_level >= GFX11) {
         /* GFX11 uses four dedicated muxsel values for the 64-bit timestamp. */
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf840;
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf841;
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf842;
         mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf843;
      } else {
         for (unsigned i = 0; i < 4; i++) {
            mappings[even_line_idx].muxsel[even_counter_idx++].value = 0xf0f0;
         }
      }
   }

   for (unsigned i = 0; i < spm->num_counters; i++) {
      struct ac_spm_counter_info *counter = &spm->counters[i];

      /* Only place counters belonging to this segment. */
      if (counter->segment_type != segment_type)
         continue;

      if (counter->is_even) {
         /* Absolute 16-bit slot index of this counter in the sample. */
         counter->offset =
            (offset + even_line_idx) * AC_SPM_NUM_COUNTER_PER_MUXSEL + even_counter_idx;

         mappings[even_line_idx].muxsel[even_counter_idx] = spm->counters[i].muxsel;
         /* Advance to the next even line once this one is full. */
         if (++even_counter_idx == AC_SPM_NUM_COUNTER_PER_MUXSEL) {
            even_counter_idx = 0;
            even_line_idx += 2;
         }
      } else {
         counter->offset =
            (offset + odd_line_idx) * AC_SPM_NUM_COUNTER_PER_MUXSEL + odd_counter_idx;

         mappings[odd_line_idx].muxsel[odd_counter_idx] = spm->counters[i].muxsel;
         /* Advance to the next odd line once this one is full. */
         if (++odd_counter_idx == AC_SPM_NUM_COUNTER_PER_MUXSEL) {
            odd_counter_idx = 0;
            odd_line_idx += 2;
         }
      }
   }
}
471
/* Initializes SPM state for the given GPU: selects the per-generation
 * default counter list, instantiates one counter per global block instance,
 * sizes the per-segment muxsel RAMs and fills them in RLC order
 * (Global, then SE0..SEn). Returns false on unsupported gfx level or
 * allocation/mapping failure (caller is expected to clean up with
 * ac_destroy_spm).
 */
bool ac_init_spm(const struct radeon_info *info,
                 const struct ac_perfcounters *pc,
                 struct ac_spm *spm)
{
   const struct ac_spm_counter_create_info *create_info;
   unsigned create_info_count;
   unsigned num_counters = 0;

   /* Pick the default counter list for this generation. */
   switch (info->gfx_level) {
   case GFX10:
      create_info_count = ARRAY_SIZE(gfx10_spm_counters);
      create_info = gfx10_spm_counters;
      break;
   case GFX10_3:
      create_info_count = ARRAY_SIZE(gfx103_spm_counters);
      create_info = gfx103_spm_counters;
      break;
   case GFX11:
   case GFX11_5:
      create_info_count = ARRAY_SIZE(gfx11_spm_counters);
      create_info = gfx11_spm_counters;
      break;
   default:
      return false; /* not implemented */
   }

   /* Count the total number of counters (one per global block instance). */
   for (unsigned i = 0; i < create_info_count; i++) {
      const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block);

      if (!block)
         return false;

      num_counters += block->num_global_instances;
   }

   spm->counters = CALLOC(num_counters, sizeof(*spm->counters));
   if (!spm->counters)
      return false;

   /* Add one counter per instance of each block. */
   for (unsigned i = 0; i < create_info_count; i++) {
      const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block);
      struct ac_spm_counter_create_info counter = create_info[i];

      for (unsigned j = 0; j < block->num_global_instances; j++) {
         counter.instance = j;

         if (!ac_spm_add_counter(info, pc, spm, &counter)) {
            fprintf(stderr, "ac/spm: Failed to add SPM counter (%d).\n", i);
            return false;
         }
      }
   }

   /* Determine the segment size and create a muxsel ram for every segment. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned num_even_counters = 0, num_odd_counters = 0;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         /* The global segment always start with a 64-bit timestamp. */
         num_even_counters += AC_SPM_GLOBAL_TIMESTAMP_COUNTERS;
      }

      /* Count the number of even/odd counters for this segment. */
      for (unsigned c = 0; c < spm->num_counters; c++) {
         struct ac_spm_counter_info *counter = &spm->counters[c];

         if (counter->segment_type != s)
            continue;

         if (counter->is_even) {
            num_even_counters++;
         } else {
            num_odd_counters++;
         }
      }

      /* Compute the number of lines. Even counters live on even lines and
       * odd counters on odd lines, so the total is interleaved:
       * 2*even-1 when even lines dominate, 2*odd otherwise.
       */
      unsigned even_lines =
         DIV_ROUND_UP(num_even_counters, AC_SPM_NUM_COUNTER_PER_MUXSEL);
      unsigned odd_lines =
         DIV_ROUND_UP(num_odd_counters, AC_SPM_NUM_COUNTER_PER_MUXSEL);
      unsigned num_lines = (even_lines > odd_lines) ? (2 * even_lines - 1) : (2 * odd_lines);

      spm->muxsel_lines[s] = CALLOC(num_lines, sizeof(*spm->muxsel_lines[s]));
      if (!spm->muxsel_lines[s])
         return false;
      spm->num_muxsel_lines[s] = num_lines;
   }

   /* Compute the maximum number of muxsel lines among all SEs. On GFX11,
    * there is only one SE segment size value and the highest value is used.
    */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_GLOBAL; s++) {
      spm->max_se_muxsel_lines =
         MAX2(spm->num_muxsel_lines[s], spm->max_se_muxsel_lines);
   }

   /* RLC uses the following order: Global, SE0, SE1, SE2, SE3, SE4, SE5. */
   ac_spm_fill_muxsel_ram(info, spm, AC_SPM_SEGMENT_TYPE_GLOBAL, 0);

   const uint32_t num_global_lines = spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL];

   if (info->gfx_level >= GFX11) {
      /* On GFX11, RLC uses one segment size for every single SE. */
      for (unsigned i = 0; i < info->num_se; i++) {
         assert(i < AC_SPM_SEGMENT_TYPE_GLOBAL);
         uint32_t offset = num_global_lines + i * spm->max_se_muxsel_lines;

         ac_spm_fill_muxsel_ram(info, spm, i, offset);
      }
   } else {
      /* Pre-GFX11, SE segments are packed back to back with their own sizes. */
      uint32_t offset = num_global_lines;

      for (unsigned i = 0; i < info->num_se; i++) {
         assert(i < AC_SPM_SEGMENT_TYPE_GLOBAL);

         ac_spm_fill_muxsel_ram(info, spm, i, offset);

         offset += spm->num_muxsel_lines[i];
      }
   }

   /* On GFX11, the data size written by the hw is in units of segment. */
   spm->ptr_granularity = info->gfx_level >= GFX11 ? 32 : 1;

   return true;
}
600
ac_destroy_spm(struct ac_spm * spm)601 void ac_destroy_spm(struct ac_spm *spm)
602 {
603 for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
604 FREE(spm->muxsel_lines[s]);
605 }
606
607 for (unsigned i = 0; i < spm->num_block_sel; i++) {
608 FREE(spm->block_sel[i].instances);
609 }
610
611 FREE(spm->block_sel);
612 FREE(spm->counters);
613 }
614
ac_spm_get_sample_size(const struct ac_spm * spm)615 static uint32_t ac_spm_get_sample_size(const struct ac_spm *spm)
616 {
617 uint32_t sample_size = 0; /* in bytes */
618
619 for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
620 sample_size += spm->num_muxsel_lines[s] * AC_SPM_MUXSEL_LINE_SIZE * 4;
621 }
622
623 return sample_size;
624 }
625
ac_spm_get_num_samples(const struct ac_spm * spm)626 static uint32_t ac_spm_get_num_samples(const struct ac_spm *spm)
627 {
628 uint32_t sample_size = ac_spm_get_sample_size(spm);
629 uint32_t *ptr = (uint32_t *)spm->ptr;
630 uint32_t data_size, num_lines_written;
631 uint32_t num_samples = 0;
632
633 /* Get the data size (in bytes) written by the hw to the ring buffer. */
634 data_size = ptr[0] * spm->ptr_granularity;
635
636 /* Compute the number of 256 bits (16 * 16-bits counters) lines written. */
637 num_lines_written = data_size / (2 * AC_SPM_NUM_COUNTER_PER_MUXSEL);
638
639 /* Check for overflow. */
640 if (num_lines_written % (sample_size / 32)) {
641 abort();
642 } else {
643 num_samples = num_lines_written / (sample_size / 32);
644 }
645
646 return num_samples;
647 }
648
ac_spm_get_trace(const struct ac_spm * spm,struct ac_spm_trace * trace)649 void ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace)
650 {
651 memset(trace, 0, sizeof(*trace));
652
653 trace->ptr = spm->ptr;
654 trace->sample_interval = spm->sample_interval;
655 trace->num_counters = spm->num_counters;
656 trace->counters = spm->counters;
657 trace->sample_size_in_bytes = ac_spm_get_sample_size(spm);
658 trace->num_samples = ac_spm_get_num_samples(spm);
659 }
660