/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "amd_family.h"
#include "si_build_pm4.h"
#include "si_pipe.h"

#include "tgsi/tgsi_from_mesa.h"
#include "util/hash_table.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "ac_rgp.h"
#include "ac_sqtt.h"

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable);

static bool si_sqtt_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->sqtt->buffer_size =
      align64(sctx->sqtt->buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->sqtt->buffer_size * (uint64_t)max_se;

   sctx->sqtt->bo =
      ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_GTT,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                           RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->sqtt->bo)
      return false;

   sctx->sqtt->buffer_va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);

   return true;
}

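/* Build and emit the PM4 packets that start SQ thread trace on the given
 * queue, using the shared ac_sqtt helpers.
 */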
static void si_emit_sqtt_start(struct si_context *sctx,
                               struct radeon_cmdbuf *cs,
                               enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_start(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_free_state(pm4);
}

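/* Build and emit the PM4 packets that stop SQ thread trace, then wait until
 * the trace data has been fully written to memory.
 */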
static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs,
                              enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_stop(&sscreen->info, pm4, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_clear_state(pm4, &sscreen->info, false, is_compute_queue);

   if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE
       * doesn't work. */
      sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB |
                             SI_BARRIER_SYNC_CS;
      sctx->emit_barrier(sctx, cs);
   }

   ac_sqtt_emit_wait(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin_again(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_free_state(pm4);
}

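/* Record the full start sequence into a command buffer: add the trace/SPM
 * buffers, wait for idle, disable clockgating, enable SQG events, set up SPM
 * if requested, and finally start thread trace.
 */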
static void si_sqtt_start(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
   case AMD_IP_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case AMD_IP_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   default:
      /* Unsupported. */
      assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);
   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable the SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   if (sctx->spm.bo) {
      si_pc_emit_spm_reset(cs);
      si_pc_emit_shaders(cs, ac_sqtt_get_shader_mask(&sctx->screen->info));
      si_emit_spm_setup(sctx, cs);
   }

   si_emit_sqtt_start(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_start(cs);
}

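/* Record the full stop sequence: wait for idle, stop SPM and thread trace,
 * then restore clockgating and the SQG event state.
 */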
static void si_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
   case AMD_IP_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case AMD_IP_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   default:
      /* Unsupported. */
      assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   if (sctx->spm.bo)
      si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
                          sctx->screen->info.never_send_perfcounter_stop);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   si_emit_sqtt_stop(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

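/* Pre-record one start and one stop command buffer per IP type so they can
 * simply be submitted around the frames that should be captured.
 */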
static void si_sqtt_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   for (unsigned i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sctx->sqtt->start_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->start_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         return;
      }
      si_sqtt_start(sctx, sctx->sqtt->start_cs[i]);

      sctx->sqtt->stop_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->stop_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         ws->cs_destroy(sctx->sqtt->start_cs[i]);
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         free(sctx->sqtt->stop_cs[i]);
         sctx->sqtt->stop_cs[i] = NULL;
         return;
      }

      si_sqtt_stop(sctx, sctx->sqtt->stop_cs[i]);
   }
}

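/* Submit the pre-recorded start (resp. stop) command buffer matching the IP
 * type of the given command stream. The stop path returns a fence in
 * sctx->last_sqtt_fence so the caller can wait before reading the trace back.
 */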
static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->start_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->stop_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

static bool
si_sqtt_resize_bo(struct si_context *sctx)
{
   /* Destroy the previous thread trace BO. */
   struct pb_buffer_lean *bo = sctx->sqtt->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   /* Double the size of the thread trace buffer per SE. */
   sctx->sqtt->buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           sctx->sqtt->buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return si_sqtt_init_bo(sctx);
}

static bool si_get_sqtt_trace(struct si_context *sctx,
                              struct ac_sqtt_trace *sqtt)
{
   memset(sqtt, 0, sizeof(*sqtt));

   sctx->sqtt->ptr =
      sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ);

   if (!sctx->sqtt->ptr)
      return false;

   if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) {
      if (!si_sqtt_resize_bo(sctx)) {
         fprintf(stderr, "radeonsi: Failed to resize the SQTT buffer.\n");
      } else {
         for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
            sctx->screen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
            sctx->screen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
         }
         si_sqtt_init_cs(sctx);
      }
      return false;
   }
   return true;
}

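/* One-time SQTT initialization: allocate the trace buffer, parse the
 * AMD_THREAD_TRACE_* environment options, optionally enable SPM counters and
 * pre-record the start/stop command buffers.
 */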
bool si_init_sqtt(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->sqtt = CALLOC_STRUCT(ac_sqtt);

   if (sctx->gfx_level < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      return false;
   }

   if (sctx->gfx_level > GFX11) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
                      "for that GPU!\n");
      return false;
   }

   /* Default buffer size set to 32MB per SE. */
   sctx->sqtt->buffer_size =
      debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
   sctx->sqtt->instruction_timing_enabled =
      debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true);
   sctx->sqtt->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->sqtt->start_frame = atoi(trigger);
      if (sctx->sqtt->start_frame <= 0) {
         /* This isn't a frame number, must be a file */
         sctx->sqtt->trigger_file = strdup(trigger);
         sctx->sqtt->start_frame = -1;
      }
   }

   if (!si_sqtt_init_bo(sctx))
      return false;

   sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL);

   ac_sqtt_init(sctx->sqtt);

   if (sctx->gfx_level >= GFX10 &&
       debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
      /* Limit SPM counters to GFX10 and GFX10_3 for now */
      ASSERTED bool r = si_spm_init(sctx);
      assert(r);
   }

   si_sqtt_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

void si_destroy_sqtt(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer_lean *bo = sctx->sqtt->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->sqtt->trigger_file)
      free(sctx->sqtt->trigger_file);

   for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sscreen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
      sscreen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
   }

   struct rgp_pso_correlation *pso_correlation =
      &sctx->sqtt->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->sqtt->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &pso_correlation->record, list) {
      list_del(&record->list);
      pso_correlation->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_loader_events_record, record,
                             &loader_events->record, list) {
      list_del(&record->list);
      loader_events->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_code_object_record, record,
                             &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
      code_object->record_count--;
   }

   ac_sqtt_finish(sctx->sqtt);

   hash_table_foreach (sctx->sqtt->pipeline_bos->table, entry) {
      struct si_sqtt_fake_pipeline *pipeline =
         (struct si_sqtt_fake_pipeline *)entry->data;
      si_resource_reference(&pipeline->bo, NULL);
      FREE(pipeline);
   }

   free(sctx->sqtt);
   sctx->sqtt = NULL;

   if (sctx->spm.bo)
      si_spm_finish(sctx);
}

static uint64_t num_frames = 0;

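/* Per-frame SQTT driver: decides whether to start the capture (frame-count or
 * trigger-file based) and, once enabled, stops it on the next call, waits for
 * completion and dumps the RGP capture.
 */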
void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->sqtt_enabled) {
      bool frame_trigger = num_frames == sctx->sqtt->start_frame;
      bool file_trigger = false;
      if (sctx->sqtt->trigger_file &&
          access(sctx->sqtt->trigger_file, W_OK) == 0) {
         if (unlink(sctx->sqtt->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread "
                            "trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence,
                              OS_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_sqtt(sctx, rcs);

         sctx->sqtt_enabled = true;
         sctx->sqtt->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
          * called for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_sqtt_trace sqtt_trace = {0};

      /* Stop SQTT */
      si_end_sqtt(sctx, rcs);
      sctx->sqtt_enabled = false;
      sctx->sqtt->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence,
                               OS_TIMEOUT_INFINITE) &&
          si_get_sqtt_trace(sctx, &sqtt_trace)) {
         struct ac_spm_trace spm_trace;

         /* Map the SPM counter buffer */
         if (sctx->spm.bo) {
            sctx->spm.ptr = sctx->ws->buffer_map(
               sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
            ac_spm_get_trace(&sctx->spm, &spm_trace);
         }

         ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace,
                             sctx->spm.bo ? &spm_trace : NULL);

         if (sctx->spm.ptr)
            sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
         if (!sctx->sqtt->trigger_file) {
            sctx->sqtt->start_frame = num_frames + 10;
         }
      }
   }

   num_frames++;
}

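/* Write marker dwords through the SQ_THREAD_TRACE_USERDATA_2/3 registers; the
 * SQ records these writes into the thread trace stream. Only two dwords can be
 * written per packet, so larger markers are split.
 */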
static void si_emit_sqtt_userdata(struct si_context *sctx,
                                  struct radeon_cmdbuf *cs, const void *data,
                                  uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_set_uconfig_perfctr_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                          S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

static uint32_t num_events = 0;

void si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t vertex_offset_user_data,
                                uint32_t instance_offset_user_data,
                                uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                     enum rgp_sqtt_marker_event_type api_type,
                                     uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                  unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_BARRIER_SYNC_VS)
      marker.vs_partial_flush = true;
   if (flags & SI_BARRIER_SYNC_PS)
      marker.ps_partial_flush = true;
   if (flags & SI_BARRIER_SYNC_CS)
      marker.cs_partial_flush = true;

   if (flags & SI_BARRIER_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_BARRIER_INV_VMEM)
      marker.inval_tcp = true;
   if (flags & SI_BARRIER_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_BARRIER_INV_SMEM)
      marker.inval_sqK = true;
   if (flags & SI_BARRIER_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_BARRIER_SYNC_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                         enum rgp_sqtt_marker_user_event_type type,
                         const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = {0};
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = {0};
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_sqtt_userdata(sctx, rcs, buffer,
                            sizeof(marker) / 4 + marker.length / 4);
   }
}

bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
                                    uint64_t pipeline_hash)
{
   simple_mtx_lock(&sqtt->rgp_pso_correlation.lock);
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &sqtt->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);

   return false;
}

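/* Map a gallium shader stage (plus the shader key, which tells us whether the
 * stage is merged or runs as NGG) to the RGP hardware stage enum.
 */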
static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
{
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->ge.as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}

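/* Snapshot the currently bound shaders (or the compute program when
 * gfx_sh_offsets is NULL) into an RGP code object record: a copy of the
 * uploaded binary plus register usage, LDS/scratch sizes and the GPU address
 * each shader was uploaded to.
 */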
static bool
si_sqtt_add_code_object(struct si_context *sctx,
                        struct si_sqtt_fake_pipeline *pipeline,
                        uint32_t *gfx_sh_offsets)
{
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   struct rgp_code_object_record *record;
   bool is_compute = gfx_sh_offsets == NULL;

   record = calloc(1, sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline->code_hash;
   record->pipeline_hash[1] = pipeline->code_hash;

   for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i <= PIPE_SHADER_FRAGMENT) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = pipeline->bo->gpu_address + (is_compute ? 0 : gfx_sh_offsets[i]);
      unsigned lds_increment = sctx->gfx_level >= GFX11 && i == MESA_SHADER_FRAGMENT ?
         1024 : sctx->screen->info.lds_encode_granularity;

      memset(record->shader_data[i].rt_shader_name, 0, sizeof(record->shader_data[i].rt_shader_name));
      record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[i].hash[1] = record->shader_data[i].hash[0];
      record->shader_data[i].code_size = shader->binary.uploaded_code_size;
      record->shader_data[i].code = code;
      record->shader_data[i].vgpr_count = shader->config.num_vgprs;
      record->shader_data[i].sgpr_count = shader->config.num_sgprs;
      record->shader_data[i].base_address = va & 0xffffffffffff;
      record->shader_data[i].elf_symbol_offset = 0;
      record->shader_data[i].hw_stage = hw_stage;
      record->shader_data[i].is_combined = false;
      record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[i].lds_size = shader->config.lds_size * lds_increment;
      record->shader_data[i].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << i;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

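/* Register a "pipeline" with the RGP capture: add a PSO correlation entry, a
 * code object loader event for its upload address, and the code object itself.
 */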
bool si_sqtt_register_pipeline(struct si_context *sctx, struct si_sqtt_fake_pipeline *pipeline,
                               uint32_t *gfx_sh_offsets)
{
   assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash));

   bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash, pipeline->code_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(
      sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline, gfx_sh_offsets);
}

void si_sqtt_describe_pipeline_bind(struct si_context *sctx,
                                    uint64_t pipeline_hash,
                                    int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->sqtt_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}