/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "amd_family.h"
#include "si_build_pm4.h"
#include "si_pipe.h"

#include "tgsi/tgsi_from_mesa.h"
#include "util/hash_table.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "ac_rgp.h"
#include "ac_sqtt.h"

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable);

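/* Allocate the thread trace BO: one ac_sqtt_data_info per shader engine,
 * followed by the per-SE trace data. Both the size and the address end up in
 * HW registers, hence the SQTT_BUFFER_ALIGN_SHIFT alignment. */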
static bool si_sqtt_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->sqtt->buffer_size =
      align64(sctx->sqtt->buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->sqtt->buffer_size * (uint64_t)max_se;

   sctx->sqtt->bo =
      ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_GTT,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                           RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->sqtt->bo)
      return false;

   sctx->sqtt->buffer_va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);

   return true;
}

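/* Build the PM4 packets that program and start SQTT and copy them into the
 * given command buffer. */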
static void si_emit_sqtt_start(struct si_context *sctx,
                               struct radeon_cmdbuf *cs,
                               enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_start(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_free_state(pm4);
}

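/* Emit the PM4 packets that stop SQTT, then wait until the trace has been
 * fully written back to memory. */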
static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs,
                              enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_stop(&sscreen->info, pm4, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_clear_state(pm4, &sscreen->info, false, is_compute_queue);

   if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE
       * doesn't work. */
      sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB |
                             SI_BARRIER_SYNC_CS;
      sctx->emit_barrier(sctx, cs);
   }

   ac_sqtt_emit_wait(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin_again(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_free_state(pm4);
}

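/* Record the whole start sequence: wait for idle, disable clockgating,
 * enable SQG events, optionally set up SPM, then start the trace. */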
static void si_sqtt_start(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
      case AMD_IP_GFX:
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
      default:
        /* Unsupported. */
        assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);
   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable the SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   if (sctx->spm.bo) {
      si_pc_emit_spm_reset(cs);
      si_pc_emit_shaders(cs, ac_sqtt_get_shader_mask(&sctx->screen->info));
      si_emit_spm_setup(sctx, cs);
   }

   si_emit_sqtt_start(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_start(cs);
}

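/* Counterpart of si_sqtt_start(): stop SPM and SQTT, then restore SQG events
 * and clockgating. */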
static void si_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
      case AMD_IP_GFX:
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
      default:
        /* Unsupported. */
        assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   if (sctx->spm.bo)
      si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
                          sctx->screen->info.never_send_perfcounter_stop);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   si_emit_sqtt_stop(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

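/* Pre-record one start and one stop command buffer per IP type (only GFX and
 * compute queues are supported); they are submitted as-is whenever a capture
 * begins or ends. */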
static void si_sqtt_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   for (unsigned i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sctx->sqtt->start_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->start_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         return;
      }
      si_sqtt_start(sctx, sctx->sqtt->start_cs[i]);

      sctx->sqtt->stop_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->stop_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         ws->cs_destroy(sctx->sqtt->start_cs[i]);
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         free(sctx->sqtt->stop_cs[i]);
         sctx->sqtt->stop_cs[i] = NULL;
         return;
      }

      si_sqtt_stop(sctx, sctx->sqtt->stop_cs[i]);
   }
}

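/* Submit the pre-recorded start/stop command buffer matching the IP type of
 * the caller's command stream. Stopping also installs a fence so the capture
 * can be waited upon. */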
static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->start_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->stop_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

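/* The trace failed because the buffer was too small: free the old BO, double
 * the per-SE buffer size and allocate a new one. */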
static bool
si_sqtt_resize_bo(struct si_context *sctx)
{
   /* Destroy the previous thread trace BO. */
   struct pb_buffer_lean *bo = sctx->sqtt->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   /* Double the size of the thread trace buffer per SE. */
   sctx->sqtt->buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           sctx->sqtt->buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return si_sqtt_init_bo(sctx);
}

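/* Map the trace BO and parse it. On failure, grow the BO and re-record the
 * start/stop command buffers so the next capture uses the new address and
 * size. */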
static bool si_get_sqtt_trace(struct si_context *sctx,
                              struct ac_sqtt_trace *sqtt)
{
   memset(sqtt, 0, sizeof(*sqtt));

   sctx->sqtt->ptr =
      sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ);

   if (!sctx->sqtt->ptr)
      return false;

   if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) {
      if (!si_sqtt_resize_bo(sctx)) {
         fprintf(stderr, "radeonsi: Failed to resize the SQTT buffer.\n");
      } else {
         for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
            sctx->screen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
            sctx->screen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
         }
         si_sqtt_init_cs(sctx);
      }
      return false;
   }
   return true;
}

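/* One-time SQTT initialization. Tunables read below:
 *   AMD_THREAD_TRACE_BUFFER_SIZE=<KB>       per-SE buffer size (default 32768)
 *   AMD_THREAD_TRACE_INSTRUCTION_TIMING=<b> instruction timing (default true)
 *   AMD_THREAD_TRACE_TRIGGER=<N|path>       capture frame N, or capture when
 *                                           the given trigger file appears
 *   AMD_THREAD_TRACE_SPM=<bool>             also capture SPM counters
 */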
bool si_init_sqtt(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->sqtt = CALLOC_STRUCT(ac_sqtt);

   if (sctx->gfx_level < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      return false;
   }

   if (sctx->gfx_level > GFX11) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
                      "for that GPU!\n");
      return false;
   }

   /* Default buffer size set to 32MB per SE. */
   sctx->sqtt->buffer_size =
      debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
   sctx->sqtt->instruction_timing_enabled =
      debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true);
   sctx->sqtt->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->sqtt->start_frame = atoi(trigger);
      if (sctx->sqtt->start_frame <= 0) {
         /* This isn't a frame number, must be a file */
         sctx->sqtt->trigger_file = strdup(trigger);
         sctx->sqtt->start_frame = -1;
      }
   }

   if (!si_sqtt_init_bo(sctx))
      return false;

   sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL);

   ac_sqtt_init(sctx->sqtt);

   if (sctx->gfx_level >= GFX10 &&
       debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
      /* Limit SPM counters to GFX10 and GFX10_3 for now */
      ASSERTED bool r = si_spm_init(sctx);
      assert(r);
   }

   si_sqtt_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

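/* Tear down SQTT: release the BO and command buffers, free the RGP records
 * (PSO correlations, loader events, code objects) and the fake pipeline BOs. */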
void si_destroy_sqtt(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer_lean *bo = sctx->sqtt->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->sqtt->trigger_file)
      free(sctx->sqtt->trigger_file);

   for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sscreen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
      sscreen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
   }

   struct rgp_pso_correlation *pso_correlation =
      &sctx->sqtt->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->sqtt->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &pso_correlation->record, list) {
      list_del(&record->list);
      pso_correlation->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_loader_events_record, record,
                             &loader_events->record, list) {
      list_del(&record->list);
      loader_events->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_code_object_record, record,
                             &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
      code_object->record_count--;
   }

   ac_sqtt_finish(sctx->sqtt);

   hash_table_foreach (sctx->sqtt->pipeline_bos->table, entry) {
      struct si_sqtt_fake_pipeline *pipeline =
         (struct si_sqtt_fake_pipeline *)entry->data;
      si_resource_reference(&pipeline->bo, NULL);
      FREE(pipeline);
   }

   free(sctx->sqtt);
   sctx->sqtt = NULL;

   if (sctx->spm.bo)
      si_spm_finish(sctx);
}

static uint64_t num_frames = 0;

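/* Per-frame entry point. While tracing is disabled, check the frame counter
 * and the optional trigger file to decide when to start. While tracing is
 * enabled, stop the trace, wait for it to complete and dump it (plus SPM
 * data, if any) as an RGP capture. */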
void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->sqtt_enabled) {
      bool frame_trigger = num_frames == sctx->sqtt->start_frame;
      bool file_trigger = false;
      if (sctx->sqtt->trigger_file &&
          access(sctx->sqtt->trigger_file, W_OK) == 0) {
         if (unlink(sctx->sqtt->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread "
                            "trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence,
                              OS_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_sqtt(sctx, rcs);

         sctx->sqtt_enabled = true;
         sctx->sqtt->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
          * called for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_sqtt_trace sqtt_trace = {0};

      /* Stop SQTT */
      si_end_sqtt(sctx, rcs);
      sctx->sqtt_enabled = false;
      sctx->sqtt->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence,
                               OS_TIMEOUT_INFINITE) &&
          si_get_sqtt_trace(sctx, &sqtt_trace)) {
         struct ac_spm_trace spm_trace;

         /* Map the SPM counter buffer */
         if (sctx->spm.bo) {
            sctx->spm.ptr = sctx->ws->buffer_map(
               sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
            ac_spm_get_trace(&sctx->spm, &spm_trace);
         }

         ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace,
                             sctx->spm.bo ? &spm_trace : NULL);

         if (sctx->spm.ptr)
            sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
         if (!sctx->sqtt->trigger_file) {
            sctx->sqtt->start_frame = num_frames + 10;
         }
      }
   }

   num_frames++;
}

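/* Write a marker into the trace by streaming its dwords through the
 * SQ_THREAD_TRACE_USERDATA_2/3 register pair, two dwords per packet. */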
static void si_emit_sqtt_userdata(struct si_context *sctx,
                                  struct radeon_cmdbuf *cs, const void *data,
                                  uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_set_uconfig_perfctr_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

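/* Toggle the SQG TOP/BOP events used by thread tracing. SPI_CONFIG_CNTL is a
 * uconfig register on GFX9+; on GFX6-GFX8 it is protected and must go through
 * the privileged-register path. */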
static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

static uint32_t num_events = 0;
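/* Emit an RGP event marker describing a draw. UINT_MAX user-data register
 * indices mean "unknown" and are normalized before the marker is written. */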
void si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t vertex_offset_user_data,
                                uint32_t instance_offset_user_data,
                                uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

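/* Same as si_sqtt_write_event_marker(), but the marker also carries the
 * x/y/z thread dimensions (used for dispatches). */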
void si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                     enum rgp_sqtt_marker_event_type api_type,
                                     uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

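/* Bracket an internal barrier with start/end markers; the end marker records
 * which flushes and cache invalidations the barrier performed. */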
void si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                  unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_BARRIER_SYNC_VS)
      marker.vs_partial_flush = true;
   if (flags & SI_BARRIER_SYNC_PS)
      marker.ps_partial_flush = true;
   if (flags & SI_BARRIER_SYNC_CS)
      marker.cs_partial_flush = true;

   if (flags & SI_BARRIER_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_BARRIER_INV_VMEM)
      marker.inval_tcp = true;
   if (flags & SI_BARRIER_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_BARRIER_INV_SMEM)
      marker.inval_sqK = true;
   if (flags & SI_BARRIER_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_BARRIER_SYNC_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

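/* Emit a user event marker (e.g. a debug label push/pop). Pop markers carry
 * no payload; the other types append a string capped at 1024 bytes and
 * padded to a dword multiple. */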
void si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                         enum rgp_sqtt_marker_user_event_type type,
                         const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = {0};
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = {0};
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_sqtt_userdata(sctx, rcs, buffer,
                            sizeof(marker) / 4 + marker.length / 4);
   }
}

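/* Check whether a pipeline hash already has a PSO correlation record. */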
bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
                                    uint64_t pipeline_hash)
{
   simple_mtx_lock(&sqtt->rgp_pso_correlation.lock);
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &sqtt->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);

   return false;
}

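/* Map a gallium shader stage to the RGP hardware stage it runs as; the
 * shader key is needed because VS/TES can run as LS, ES, NGG GS or VS
 * depending on the pipeline configuration. */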
static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
{
   switch (stage) {
      case PIPE_SHADER_VERTEX:
         if (key->ge.as_ls)
            return RGP_HW_STAGE_LS;
         else if (key->ge.as_es)
            return RGP_HW_STAGE_ES;
         else if (key->ge.as_ngg)
            return RGP_HW_STAGE_GS;
         else
            return RGP_HW_STAGE_VS;
      case PIPE_SHADER_TESS_CTRL:
         return RGP_HW_STAGE_HS;
      case PIPE_SHADER_TESS_EVAL:
         if (key->ge.as_es)
            return RGP_HW_STAGE_ES;
         else if (key->ge.as_ngg)
            return RGP_HW_STAGE_GS;
         else
            return RGP_HW_STAGE_VS;
      case PIPE_SHADER_GEOMETRY:
         return RGP_HW_STAGE_GS;
      case PIPE_SHADER_FRAGMENT:
         return RGP_HW_STAGE_PS;
      case PIPE_SHADER_COMPUTE:
         return RGP_HW_STAGE_CS;
      default:
         unreachable("invalid mesa shader stage");
   }
}

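/* Build an RGP code object record for the currently bound shaders: a copy of
 * each shader's uploaded machine code plus its register/LDS/scratch usage and
 * its address inside the pipeline BO. */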
static bool
si_sqtt_add_code_object(struct si_context *sctx,
                        struct si_sqtt_fake_pipeline *pipeline,
                        uint32_t *gfx_sh_offsets)
{
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   struct rgp_code_object_record *record;
   bool is_compute = gfx_sh_offsets == NULL;

   record = calloc(1, sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline->code_hash;
   record->pipeline_hash[1] = pipeline->code_hash;

   for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i <= PIPE_SHADER_FRAGMENT) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = pipeline->bo->gpu_address + (is_compute ? 0 : gfx_sh_offsets[i]);
      unsigned lds_increment = sctx->gfx_level >= GFX11 && i == MESA_SHADER_FRAGMENT ?
         1024 : sctx->screen->info.lds_encode_granularity;

      memset(record->shader_data[i].rt_shader_name, 0, sizeof(record->shader_data[i].rt_shader_name));
      record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[i].hash[1] = record->shader_data[i].hash[0];
      record->shader_data[i].code_size = shader->binary.uploaded_code_size;
      record->shader_data[i].code = code;
      record->shader_data[i].vgpr_count = shader->config.num_vgprs;
      record->shader_data[i].sgpr_count = shader->config.num_sgprs;
      record->shader_data[i].base_address = va & 0xffffffffffff;
      record->shader_data[i].elf_symbol_offset = 0;
      record->shader_data[i].hw_stage = hw_stage;
      record->shader_data[i].is_combined = false;
      record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[i].lds_size = shader->config.lds_size * lds_increment;
      record->shader_data[i].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << i;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

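/* radeonsi has no monolithic pipeline objects, so a fake pipeline (hashed by
 * the combined shader code) is registered with RGP: PSO correlation, loader
 * event and code object. */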
bool si_sqtt_register_pipeline(struct si_context *sctx, struct si_sqtt_fake_pipeline *pipeline,
                               uint32_t *gfx_sh_offsets)
{
   assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash));

   bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash, pipeline->code_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(
      sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline, gfx_sh_offsets);
}

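/* Emit a bind-pipeline marker. No-op unless a capture is in progress. */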
void si_sqtt_describe_pipeline_bind(struct si_context *sctx,
                                    uint64_t pipeline_hash,
                                    int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->sqtt_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}