xref: /aosp_15_r20/external/mesa3d/src/intel/ds/intel_pps_driver.cc (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2020-2021 Collabora, Ltd.
3  * Author: Antonio Caggiano <[email protected]>
4  * Author: Corentin Noël <[email protected]>
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "intel_pps_driver.h"
10 
11 #include <dirent.h>
12 #include <fcntl.h>
13 #include <math.h>
14 #include <poll.h>
15 #include <strings.h>
16 #include <sys/ioctl.h>
17 #include <unistd.h>
18 
19 #include "common/intel_gem.h"
20 #include "dev/intel_device_info.h"
21 #include "perf/intel_perf.h"
22 #include "perf/intel_perf_query.h"
23 
24 #include <pps/pps.h>
25 #include <pps/pps_algorithm.h>
26 
27 #include "intel_pps_perf.h"
28 #include "intel_pps_priv.h"
29 
30 namespace pps
31 {
32 
33 // The HW sampling period is programmed using period_exponent following this
34 // formula:
35 //    sample_period = timestamp_period * 2^(period_exponent + 1)
36 // So our minimum sampling period is twice the timestamp period
37 
get_min_sampling_period_ns()38 uint64_t IntelDriver::get_min_sampling_period_ns()
39 {
40    return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
41 }
42 
IntelDriver()43 IntelDriver::IntelDriver()
44 {
45 }
46 
~IntelDriver()47 IntelDriver::~IntelDriver()
48 {
49 }
50 
enable_counter(uint32_t counter_id)51 void IntelDriver::enable_counter(uint32_t counter_id)
52 {
53    auto &counter = counters[counter_id];
54 
55    enabled_counters.emplace_back(counter);
56 }
57 
enable_all_counters()58 void IntelDriver::enable_all_counters()
59 {
60    // We should only have one group
61    assert(groups.size() == 1);
62    for (uint32_t counter_id : groups[0].counters) {
63       auto &counter = counters[counter_id];
64       enabled_counters.emplace_back(counter);
65    }
66 }
67 
init_perfcnt()68 bool IntelDriver::init_perfcnt()
69 {
70    /* Note: clock_id's below 128 are reserved.. for custom clock sources,
71     * using the hash of a namespaced string is the recommended approach.
72     * See: https://perfetto.dev/docs/concepts/clock-sync
73     */
74    this->clock_id = intel_pps_clock_id(drm_device.gpu_num);
75 
76    assert(!perf && "Intel perf should not be initialized at this point");
77 
78    perf = std::make_unique<IntelPerf>(drm_device.fd);
79 
80    const char *metric_set_name = getenv("INTEL_PERFETTO_METRIC_SET");
81 
82    struct intel_perf_query_info *default_query = nullptr;
83    selected_query = nullptr;
84    for (auto &query : perf->get_queries()) {
85       if (!strcmp(query->symbol_name, "RenderBasic"))
86          default_query = query;
87       if (metric_set_name && !strcmp(query->symbol_name, metric_set_name))
88          selected_query = query;
89    }
90 
91    assert(default_query);
92 
93    if (!selected_query) {
94       if (metric_set_name) {
95          PPS_LOG_ERROR("Available metric sets:");
96          for (auto &query : perf->get_queries())
97             PPS_LOG_ERROR("   %s", query->symbol_name);
98          PPS_LOG_FATAL("Metric set '%s' not available.", metric_set_name);
99       }
100       selected_query = default_query;
101    }
102 
103    PPS_LOG("Using metric set '%s': %s",
104            selected_query->symbol_name, selected_query->name);
105 
106    // Create group
107    CounterGroup group = {};
108    group.id = groups.size();
109    group.name = selected_query->symbol_name;
110 
111    for (int i = 0; i < selected_query->n_counters; ++i) {
112       intel_perf_query_counter &counter = selected_query->counters[i];
113 
114       // Create counter
115       Counter counter_desc = {};
116       counter_desc.id = counters.size();
117       counter_desc.name = counter.symbol_name;
118       counter_desc.group = group.id;
119       counter_desc.getter = [counter, this](
120          const Counter &c, const Driver &dri) -> Counter::Value {
121          switch (counter.data_type) {
122          case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
123          case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
124          case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
125             return (int64_t)counter.oa_counter_read_uint64(perf->cfg,
126                                                            selected_query,
127                                                            &perf->result);
128             break;
129          case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
130          case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
131             return counter.oa_counter_read_float(perf->cfg,
132                                                  selected_query,
133                                                  &perf->result);
134             break;
135          }
136 
137          return {};
138       };
139 
140       // Add counter id to the group
141       group.counters.emplace_back(counter_desc.id);
142 
143       // Store counter
144       counters.emplace_back(std::move(counter_desc));
145    }
146 
147    // Store group
148    groups.emplace_back(std::move(group));
149 
150    assert(counters.size() && "Failed to query counters");
151 
152    // Clear accumulations
153    intel_perf_query_result_clear(&perf->result);
154 
155    return true;
156 }
157 
enable_perfcnt(uint64_t sampling_period_ns)158 void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
159 {
160    this->sampling_period_ns = sampling_period_ns;
161 
162    intel_gem_read_render_timestamp(drm_device.fd, perf->devinfo.kmd_type,
163                                    &gpu_timestamp_udw);
164    gpu_timestamp_udw &= ~perf->cfg->oa_timestamp_mask;
165    if (!perf->open(sampling_period_ns, selected_query)) {
166       PPS_LOG_FATAL("Failed to open intel perf");
167    }
168 }
169 
disable_perfcnt()170 void IntelDriver::disable_perfcnt()
171 {
172    gpu_timestamp_udw = 0;
173    perf = nullptr;
174    groups.clear();
175    counters.clear();
176    enabled_counters.clear();
177 }
178 
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previous accepted record
/// @param sampling_period Requested sampling period, same unit as duration
/// @return True if the duration is at least close to the sampling period,
/// i.e. strictly greater than the sampling period minus a 100000 tolerance
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   constexpr uint64_t tolerance = 100000;

   // Both values are unsigned: subtracting the tolerance from a smaller
   // sampling period would wrap around to a huge threshold and reject every
   // record. A period below the tolerance is satisfied by any duration.
   if (sampling_period < tolerance)
      return true;

   return duration > sampling_period - tolerance;
}
185 
186 /// @brief Transforms the raw data received in from the driver into records
parse_perf_records(const std::vector<uint8_t> & data,const size_t byte_count)187 std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
188    const size_t byte_count)
189 {
190    std::vector<PerfRecord> records;
191    records.reserve(128);
192 
193    PerfRecord record;
194    record.data.reserve(512);
195 
196    const uint8_t *iter = data.data();
197    const uint8_t *end = iter + byte_count;
198 
199    uint64_t prev_gpu_timestamp = last_gpu_timestamp;
200 
201    while (iter < end) {
202       // Iterate a record at a time
203       auto header = reinterpret_cast<const intel_perf_record_header *>(iter);
204 
205       if (header->type == INTEL_PERF_RECORD_TYPE_SAMPLE) {
206          // Report is next to the header
207          const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
208          uint64_t gpu_timestamp_ldw =
209             intel_perf_report_timestamp(selected_query, &perf->devinfo, report);
210 
211          /* Our HW only provides us with the lower 32 bits of the 36bits
212           * timestamp counter value. If we haven't captured the top bits yet,
213           * do it now. If we see a roll over the lower 32bits capture it
214           * again.
215           */
216          if (gpu_timestamp_udw == 0 ||
217              (gpu_timestamp_udw | gpu_timestamp_ldw) < last_gpu_timestamp) {
218             intel_gem_read_render_timestamp(drm_device.fd,
219                                             perf->devinfo.kmd_type,
220                                             &gpu_timestamp_udw);
221             gpu_timestamp_udw &= ~perf->cfg->oa_timestamp_mask;
222          }
223 
224          uint64_t gpu_timestamp = gpu_timestamp_udw | gpu_timestamp_ldw;
225 
226          auto duration = intel_device_info_timebase_scale(&perf->devinfo,
227                                                           gpu_timestamp - prev_gpu_timestamp);
228 
229          // Skip perf-records that are too short by checking
230          // the distance between last report and this one
231          if (close_enough(duration, sampling_period_ns)) {
232             prev_gpu_timestamp = gpu_timestamp;
233 
234             // Add the new record to the list
235             record.timestamp = gpu_timestamp;
236             record.data.resize(header->size); // Possibly 264?
237             memcpy(record.data.data(), iter, header->size);
238             records.emplace_back(record);
239          }
240       }
241 
242       // Go to the next record
243       iter += header->size;
244    }
245 
246    return records;
247 }
248 
249 /// @brief Read all the available data from the metric set currently in use
read_data_from_metric_set()250 void IntelDriver::read_data_from_metric_set()
251 {
252    assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
253 
254    do {
255       ssize_t bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
256                                                 metric_buffer.size() - total_bytes_read);
257       if (bytes_read <= 0)
258          break;
259 
260       total_bytes_read += std::max(ssize_t(0), bytes_read);
261 
262       // Increase size of the buffer for the next read
263       if (metric_buffer.size() / 2 < total_bytes_read) {
264          metric_buffer.resize(metric_buffer.size() * 2);
265       }
266    } while (true);
267 
268    assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
269 }
270 
dump_perfcnt()271 bool IntelDriver::dump_perfcnt()
272 {
273    if (!perf->oa_stream_ready()) {
274       return false;
275    }
276 
277    read_data_from_metric_set();
278 
279    auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
280    if (new_records.empty()) {
281       // No new records from the GPU yet
282       return false;
283    } else {
284       // Records are parsed correctly, so we can reset the
285       // number of bytes read so far from the metric set
286       total_bytes_read = 0;
287    }
288 
289    APPEND(records, new_records);
290 
291    if (records.size() < 2) {
292       // Not enough records to accumulate
293       return false;
294    }
295 
296    return true;
297 }
298 
gpu_next()299 uint64_t IntelDriver::gpu_next()
300 {
301    if (records.size() < 2) {
302       // Not enough records to accumulate
303       return 0;
304    }
305 
306    // Get first and second
307    auto record_a = reinterpret_cast<const intel_perf_record_header *>(records[0].data.data());
308    auto record_b = reinterpret_cast<const intel_perf_record_header *>(records[1].data.data());
309 
310    intel_perf_query_result_accumulate_fields(&perf->result,
311                                              selected_query,
312                                              record_a + 1,
313                                              record_b + 1,
314                                              false /* no_oa_accumulate */);
315 
316    // Get last timestamp
317    auto gpu_timestamp = records[1].timestamp;
318 
319    // Consume first record
320    records.erase(std::begin(records), std::begin(records) + 1);
321 
322    return intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
323 }
324 
next()325 uint64_t IntelDriver::next()
326 {
327    // Reset accumulation
328    intel_perf_query_result_clear(&perf->result);
329    return gpu_next();
330 }
331 
gpu_clock_id() const332 uint32_t IntelDriver::gpu_clock_id() const
333 {
334    return this->clock_id;
335 }
336 
gpu_timestamp() const337 uint64_t IntelDriver::gpu_timestamp() const
338 {
339    uint64_t timestamp;
340    intel_gem_read_render_timestamp(drm_device.fd, perf->devinfo.kmd_type,
341                                    &timestamp);
342    return intel_device_info_timebase_scale(&perf->devinfo, timestamp);
343 }
344 
cpu_gpu_timestamp(uint64_t & cpu_timestamp,uint64_t & gpu_timestamp) const345 bool IntelDriver::cpu_gpu_timestamp(uint64_t &cpu_timestamp,
346                                     uint64_t &gpu_timestamp) const
347 {
348    if (!intel_gem_read_correlate_cpu_gpu_timestamp(drm_device.fd,
349                                                    perf->devinfo.kmd_type,
350                                                    INTEL_ENGINE_CLASS_RENDER, 0,
351                                                    CLOCK_BOOTTIME,
352                                                    &cpu_timestamp,
353                                                    &gpu_timestamp,
354                                                    NULL))
355       return false;
356 
357    gpu_timestamp =
358       intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
359    return true;
360 }
361 
362 } // namespace pps
363