/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#include "util/hash_table.h"
#include "util/u_process.h"

#include "si_pipe.h"
#include "si_perfetto.h"
#include "si_tracepoints.h"

#ifdef HAVE_PERFETTO

#include "util/perf/u_perfetto.h"
#include "util/perf/u_perfetto_renderpass.h"

#include "si_tracepoints_perfetto.h"

/* Just naming stages */
static const struct {
   const char *name;

   /* The perfetto UI requires that there is a parent-child relationship
    * within a row of elements, which means that all child elements must
    * end within the lifespan of their parent.
    *
    * Some elements like stalls and command buffers follow that relationship,
    * but not all. This tells us in which UI row the elements should live.
    */
   enum si_ds_queue_stage draw_stage;
} si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
   /* Order must match the enum! */
   {
      "queue",
      SI_DS_QUEUE_STAGE_QUEUE,
   },
   {
      "compute",
      SI_DS_QUEUE_STAGE_COMPUTE,
   },
   {
      "draw",
      SI_DS_QUEUE_STAGE_DRAW,
   },
};

struct SIRenderpassIncrementalState {
   bool was_cleared = true;
};

struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = SIRenderpassIncrementalState;
};

class SIRenderpassDataSource
   : public MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits> {
};

PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);

using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;

static void
sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
{
   uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
   uint64_t gpu_ts;

   struct si_context *sctx = container_of(device, struct si_context, ds);
   gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
   cpu_ts = perfetto::base::GetBootTimeNs().count();

   if (cpu_ts < device->next_clock_sync_ns)
      return;

   PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);

   device->sync_gpu_ts = gpu_ts;
   device->next_clock_sync_ns = cpu_ts + 1000000000ull;

   MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::EmitClockSync(
      ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
}

static void
send_descriptors(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   device->event_id = 0;
   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns[0] = 0;
      }
   }

   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case AMD_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
            break;
         case AMD_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IIDs picked at device/queue creation. */
      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are
                * ordered by si_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s",
                        util_get_process_name(), queue->name, s,
                        si_queue_stage_desc[s].name);

               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(si_queue_stage_desc[s].name);
            }
         }
      }
   }

   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}
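/* Per-tracepoint event handling. begin_event() pushes a start timestamp onto a
 * small per-stage stack and end_event() pops it and emits a GpuRenderStageEvent
 * covering the interval. Both bail out until the first GPU/CPU clock
 * calibration has happened (device->sync_gpu_ts != 0), since perfetto could not
 * place the events on its timeline otherwise.
 */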
typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *,
                                            const void *);

static void
begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
{
   PERFETTO_LOG("begin event called - ts_ns=%" PRIu64, ts_ns);
   uint32_t level = queue->stages[stage_id].level;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!queue->device->sync_gpu_ts) {
      queue->stages[stage_id].start_ns[level] = 0;
      return;
   }

   if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
      return;

   queue->stages[stage_id].start_ns[level] = ts_ns;
   queue->stages[stage_id].level++;
}

static void
end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
          uint32_t submission_id, const char *app_event, const void *payload = nullptr,
          trace_payload_as_extra_func payload_as_extra = nullptr)
{
   PERFETTO_LOG("end event called - ts_ns=%" PRIu64, ts_ns);
   struct si_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   if (queue->stages[stage_id].level == 0)
      return;

   uint32_t level = --queue->stages[stage_id].level;
   struct si_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns[level];
   PERFETTO_LOG("end event called - start_ns=%" PRIu64 " ts_ns=%" PRIu64, start_ns, ts_ns);
   if (!start_ns || start_ns > ts_ns)
      return;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      uint64_t evt_id = device->event_id++;

      /* If this is an application event, we might need to generate a new
       * stage_iid if not already seen. Otherwise, it's a driver event and we
       * use the internal stage_iid.
       */
      uint64_t stage_iid = app_event ?
         tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
         stage->stage_iid;

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      if (payload && payload_as_extra) {
         payload_as_extra(event, payload);
      }
   });

   stage->start_ns[level] = 0;
}

#endif /* HAVE_PERFETTO */

#ifdef __cplusplus
extern "C" {
#endif

#ifdef HAVE_PERFETTO

/*
 * Trace callbacks, called from u_trace once the timestamps from GPU have been
 * collected.
 */
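/* CREATE_DUAL_EVENT_CALLBACK(event_name, stage) expands into the
 * si_ds_begin_<event_name>()/si_ds_end_<event_name>() callback pair for one
 * traced event: the begin callback records the GPU timestamp via begin_event(),
 * the end callback emits the render stage event via end_event(), attaching the
 * tracepoint payload as extra data.
 */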
#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage)                                         \
   void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns,                 \
                                 uint16_t tp_idx, const void *flush_data,                     \
                                 const struct trace_si_begin_##event_name *payload,           \
                                 const void *indirect_data)                                   \
   {                                                                                          \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data;    \
      begin_event(flush->queue, ts_ns, stage);                                                 \
   }                                                                                          \
                                                                                               \
   void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns,                   \
                               uint16_t tp_idx, const void *flush_data,                       \
                               const struct trace_si_end_##event_name *payload,               \
                               const void *indirect_data)                                     \
   {                                                                                          \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data;    \
      end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload,              \
                (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name);    \
   }

CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)

uint64_t
si_ds_begin_submit(struct si_ds_queue *queue)
{
   return perfetto::base::GetBootTimeNs().count();
}

void
si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
{
   if (!u_trace_should_process(&queue->device->trace_context)) {
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_duration_ns(end_ts - start_ts);
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}

#endif /* HAVE_PERFETTO */

static void
si_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   dsd.set_name("gpu.renderstages.amd");
   SIRenderpassDataSource::Register(dsd);
#endif
}

static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
static uint64_t iid = 1;

static uint64_t
get_iid()
{
   return iid++;
}

static uint32_t
si_pps_clock_id(uint32_t gpu_id)
{
   char buf[40];
   snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);

   return _mesa_hash_string(buf) | 0x80000000;
}

void
si_driver_ds_init(void)
{
   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
   si_gpu_tracepoint_config_variable();
}

void
si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
                  uint32_t gpu_id, enum amd_ds_api api)
{
   device->gpu_id = gpu_id;
   device->gpu_clock_id = si_pps_clock_id(gpu_id);
   device->info = devinfo;
   device->iid = get_iid();
   device->api = api;
   list_inithead(&device->queues);
}

void
si_ds_device_fini(struct si_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
}
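/* Registers a queue on the device: gives it a formatted name and one
 * (queue_iid, stage_iid) pair per si_ds_queue_stage; those are the IIDs later
 * interned by send_descriptors(). A hypothetical call site could look like:
 *
 *    si_ds_device_init_queue(&sctx->ds, &sctx->ds_queue, "gfx");
 *
 * (illustrative only; the actual call sites live elsewhere in the driver).
 */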
struct si_ds_queue *
si_ds_device_init_queue(struct si_ds_device *device, struct si_ds_queue *queue,
                        const char *fmt_name, ...)
{
   va_list ap;

   queue->device = device;

   va_start(ap, fmt_name);
   vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
   va_end(ap);

   for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
      queue->stages[s].queue_iid = get_iid();
      queue->stages[s].stage_iid = get_iid();
   }

   list_add(&queue->link, &device->queues);

   return queue;
}

void
si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
                      uint64_t submission_id)
{
   memset(data, 0, sizeof(*data));

   data->queue = queue;
   data->submission_id = submission_id;

   u_trace_init(&data->trace, &queue->device->trace_context);
}

void
si_ds_flush_data_fini(struct si_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}

#ifdef __cplusplus
}
#endif