/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/profiling/perf/event_reader.h"

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#include <atomic>
#include <cstring>
#include <optional>

#include "perfetto/ext/base/utils.h"
#include "src/profiling/perf/regs_parsing.h"

namespace perfetto {
namespace profiling {

namespace {

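// Helpers for reading fields out of a raw record's payload. memcpy is used
// rather than dereferencing a casted pointer so that unaligned and
// type-punned loads are avoided.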
template <typename T>
const char* ReadValue(T* value_out, const char* ptr) {
  memcpy(value_out, reinterpret_cast<const void*>(ptr), sizeof(T));
  return ptr + sizeof(T);
}

template <typename T>
const char* ReadValues(T* out, const char* ptr, size_t num_values) {
  size_t sz = sizeof(T) * num_values;
  memcpy(out, reinterpret_cast<const void*>(ptr), sz);
  return ptr + sz;
}

bool IsPowerOfTwo(size_t v) {
  return (v != 0 && ((v & (v - 1)) == 0));
}

static int perf_event_open(perf_event_attr* attr,
                           pid_t pid,
                           int cpu,
                           int group_fd,
                           unsigned long flags) {
  return static_cast<int>(
      syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags));
}

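// Opens the event on a single cpu: with pid = -1 and a non-negative cpu,
// perf_event_open(2) counts/samples all processes on that cpu.
// PERF_FLAG_FD_CLOEXEC prevents the fd from leaking across exec.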
base::ScopedFile PerfEventOpen(uint32_t cpu,
                               perf_event_attr* perf_attr,
                               int group_fd = -1) {
  base::ScopedFile perf_fd{perf_event_open(perf_attr, /*pid=*/-1,
                                           static_cast<int>(cpu), group_fd,
                                           PERF_FLAG_FD_CLOEXEC)};
  return perf_fd;
}

// If counting tracepoints, set an event filter if requested.
bool MaybeApplyTracepointFilter(int fd, const PerfCounter& event) {
  if (event.type != PerfCounter::Type::kTracepoint ||
      event.tracepoint_filter.empty()) {
    return true;
  }
  PERFETTO_DCHECK(event.attr_type == PERF_TYPE_TRACEPOINT);

  if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, event.tracepoint_filter.c_str())) {
    PERFETTO_PLOG("Failed ioctl to set event filter");
    return false;
  }
  return true;
}

}  // namespace

PerfRingBuffer::PerfRingBuffer(PerfRingBuffer&& other) noexcept
    : metadata_page_(other.metadata_page_),
      mmap_sz_(other.mmap_sz_),
      data_buf_(other.data_buf_),
      data_buf_sz_(other.data_buf_sz_) {
  other.metadata_page_ = nullptr;
  other.mmap_sz_ = 0;
  other.data_buf_ = nullptr;
  other.data_buf_sz_ = 0;
}

PerfRingBuffer& PerfRingBuffer::operator=(PerfRingBuffer&& other) noexcept {
  if (this == &other)
    return *this;

  this->~PerfRingBuffer();
  new (this) PerfRingBuffer(std::move(other));
  return *this;
}

PerfRingBuffer::~PerfRingBuffer() {
  if (!valid())
    return;

  if (munmap(reinterpret_cast<void*>(metadata_page_), mmap_sz_) != 0)
    PERFETTO_PLOG("failed munmap");
}

std::optional<PerfRingBuffer> PerfRingBuffer::Allocate(int perf_fd,
                                                       size_t data_page_count) {
  // perf_event_open requires the ring buffer to be a power of two in size.
  PERFETTO_DCHECK(IsPowerOfTwo(data_page_count));

  PerfRingBuffer ret;

  // mmap request is one page larger than the buffer size (for the metadata).
  ret.data_buf_sz_ = data_page_count * base::GetSysPageSize();
  ret.mmap_sz_ = ret.data_buf_sz_ + base::GetSysPageSize();

  // If PROT_WRITE, kernel won't overwrite unread samples.
  void* mmap_addr = mmap(nullptr, ret.mmap_sz_, PROT_READ | PROT_WRITE,
                         MAP_SHARED, perf_fd, 0);
  if (mmap_addr == MAP_FAILED) {
    PERFETTO_PLOG("failed mmap");
    return std::nullopt;
  }

  // Expected layout is [ metadata page ] [ data pages ... ]
  ret.metadata_page_ = reinterpret_cast<perf_event_mmap_page*>(mmap_addr);
  ret.data_buf_ = reinterpret_cast<char*>(mmap_addr) + base::GetSysPageSize();
  PERFETTO_CHECK(ret.metadata_page_->data_offset == base::GetSysPageSize());
  PERFETTO_CHECK(ret.metadata_page_->data_size == ret.data_buf_sz_);

  PERFETTO_DCHECK(IsPowerOfTwo(ret.data_buf_sz_));

  return std::make_optional(std::move(ret));
}

// See |perf_output_put_handle| for the necessary synchronization between the
// kernel and this userspace thread (which are using the same shared memory,
// but might be on different cores).
// TODO(rsavitski): is there false sharing between |data_tail| and |data_head|?
// Is there an argument for maintaining our own copy of |data_tail| instead of
// reloading it?
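// |data_head| and |data_tail| are free-running byte counters; the physical
// read position is obtained by masking with (data_buf_sz_ - 1), which is why
// the data buffer size must be a power of two.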
char* PerfRingBuffer::ReadRecordNonconsuming() {
  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "");

  PERFETTO_DCHECK(valid());

  // |data_tail| is written only by this userspace thread, so we can safely
  // read it without any synchronization.
  uint64_t read_offset = metadata_page_->data_tail;

  // |data_head| is written by the kernel, perform an acquiring load such that
  // the payload reads below are ordered after this load.
  uint64_t write_offset =
      reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_head)
          ->load(std::memory_order_acquire);

  PERFETTO_DCHECK(read_offset <= write_offset);
  if (write_offset == read_offset)
    return nullptr;  // no new data

  size_t read_pos = static_cast<size_t>(read_offset & (data_buf_sz_ - 1));

  // event header (64 bits) guaranteed to be contiguous
  PERFETTO_DCHECK(read_pos <= data_buf_sz_ - sizeof(perf_event_header));
  PERFETTO_DCHECK(0 == reinterpret_cast<size_t>(data_buf_ + read_pos) %
                           alignof(perf_event_header));

  perf_event_header* evt_header =
      reinterpret_cast<perf_event_header*>(data_buf_ + read_pos);
  uint16_t evt_size = evt_header->size;

  // event wrapped - reconstruct it, and return a pointer to the buffer
  if (read_pos + evt_size > data_buf_sz_) {
    PERFETTO_DLOG("PerfRingBuffer: returning reconstructed event");

    size_t prefix_sz = data_buf_sz_ - read_pos;
    memcpy(&reconstructed_record_[0], data_buf_ + read_pos, prefix_sz);
    memcpy(&reconstructed_record_[0] + prefix_sz, data_buf_,
           evt_size - prefix_sz);
    return &reconstructed_record_[0];
  } else {
    // usual case - contiguous sample
    return data_buf_ + read_pos;
  }
}

void PerfRingBuffer::Consume(size_t bytes) {
  PERFETTO_DCHECK(valid());

  // Advance |data_tail|, which is written only by this thread. The store of
  // the updated value needs to have release semantics such that the preceding
  // payload reads are ordered before it. The reader in this case is the
  // kernel, which reads |data_tail| to calculate the available ring buffer
  // capacity before trying to store a new record.
  uint64_t updated_tail = metadata_page_->data_tail + bytes;
  reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_tail)
      ->store(updated_tail, std::memory_order_release);
}

EventReader::EventReader(uint32_t cpu,
                         perf_event_attr event_attr,
                         base::ScopedFile perf_fd,
                         std::vector<base::ScopedFile> followers_fds,
                         PerfRingBuffer ring_buffer)
    : cpu_(cpu),
      event_attr_(event_attr),
      perf_fd_(std::move(perf_fd)),
      follower_fds_(std::move(followers_fds)),
      ring_buffer_(std::move(ring_buffer)) {}

EventReader& EventReader::operator=(EventReader&& other) noexcept {
  if (this == &other)
    return *this;

  this->~EventReader();
  new (this) EventReader(std::move(other));
  return *this;
}

std::optional<EventReader> EventReader::ConfigureEvents(
    uint32_t cpu,
    const EventConfig& event_cfg) {
  auto timebase_fd = PerfEventOpen(cpu, event_cfg.perf_attr());
  if (!timebase_fd) {
    PERFETTO_PLOG("Failed perf_event_open");
    return std::nullopt;
  }

  // Open followers.
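  // They are attached to the timebase's group (group_fd = timebase fd), so
  // the kernel schedules the whole group together and the follower counts can
  // be read out of the timebase's samples via PERF_SAMPLE_READ +
  // PERF_FORMAT_GROUP (see ParseSampleRecord).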
  std::vector<base::ScopedFile> follower_fds;
  for (auto follower_attr : event_cfg.perf_attr_followers()) {
    auto follower_fd = PerfEventOpen(cpu, &follower_attr, timebase_fd.get());
    if (!follower_fd) {
      PERFETTO_PLOG("Failed perf_event_open (follower)");
      return std::nullopt;
    }
    follower_fds.push_back(std::move(follower_fd));
  }

  // Apply the tracepoint filter to the timebase, if requested.
  if (!MaybeApplyTracepointFilter(timebase_fd.get(),
                                  event_cfg.timebase_event()))
    return std::nullopt;

  // Apply the tracepoint filters to the followers, if requested.
  if (follower_fds.size() != event_cfg.follower_events().size()) {
    return std::nullopt;
  }

  for (size_t i = 0; i < follower_fds.size(); ++i) {
    if (!MaybeApplyTracepointFilter(follower_fds[i].get(),
                                    event_cfg.follower_events()[i]))
      return std::nullopt;
  }

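  // The ring buffer is mapped over the timebase (group leader) fd; this is
  // the descriptor whose sample stream ReadUntilSample consumes.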
  auto ring_buffer = PerfRingBuffer::Allocate(timebase_fd.get(),
                                              event_cfg.ring_buffer_pages());
  if (!ring_buffer.has_value()) {
    return std::nullopt;
  }
  return EventReader(cpu, *event_cfg.perf_attr(), std::move(timebase_fd),
                     std::move(follower_fds), std::move(ring_buffer.value()));
}

std::optional<ParsedSample> EventReader::ReadUntilSample(
    std::function<void(uint64_t)> records_lost_callback) {
  for (;;) {
    char* event = ring_buffer_.ReadRecordNonconsuming();
    if (!event)
      return std::nullopt;  // caught up with the writer

    auto* event_hdr = reinterpret_cast<const perf_event_header*>(event);

    if (event_hdr->type == PERF_RECORD_SAMPLE) {
      ParsedSample sample = ParseSampleRecord(cpu_, event);
      ring_buffer_.Consume(event_hdr->size);
      return std::make_optional(std::move(sample));
    }

    if (event_hdr->type == PERF_RECORD_LOST) {
      /*
       * struct {
       *   struct perf_event_header header;
       *   u64 id;
       *   u64 lost;
       *   struct sample_id sample_id;
       * };
       */
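      // Skip the |id| field (u64) to read the |lost| count.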
      uint64_t records_lost = *reinterpret_cast<const uint64_t*>(
          event + sizeof(perf_event_header) + sizeof(uint64_t));

      records_lost_callback(records_lost);
      ring_buffer_.Consume(event_hdr->size);
      continue;  // keep looking for a sample
    }

    // Kernel had to throttle irqs.
    if (event_hdr->type == PERF_RECORD_THROTTLE ||
        event_hdr->type == PERF_RECORD_UNTHROTTLE) {
      ring_buffer_.Consume(event_hdr->size);
      continue;  // keep looking for a sample
    }

    PERFETTO_DFATAL_OR_ELOG("Unsupported event type [%zu]",
                            static_cast<size_t>(event_hdr->type));
    ring_buffer_.Consume(event_hdr->size);
  }
}

// Generally, samples can belong to any cpu (which can be recorded with
// PERF_SAMPLE_CPU). However, this producer uses only cpu-scoped events,
// therefore it is already known.
ParsedSample EventReader::ParseSampleRecord(uint32_t cpu,
                                            const char* record_start) {
  if (event_attr_.sample_type &
      (~uint64_t(PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_STACK_USER |
                 PERF_SAMPLE_REGS_USER | PERF_SAMPLE_CALLCHAIN |
                 PERF_SAMPLE_READ))) {
    PERFETTO_FATAL("Unsupported sampling option");
  }

  auto* event_hdr = reinterpret_cast<const perf_event_header*>(record_start);
  size_t sample_size = event_hdr->size;

  ParsedSample sample = {};
  sample.common.cpu = cpu;
  sample.common.cpu_mode = event_hdr->misc & PERF_RECORD_MISC_CPUMODE_MASK;

  // Parse the payload, which consists of concatenated data for each
  // |attr.sample_type| flag.
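  // Abridged PERF_RECORD_SAMPLE layout for the flags handled below, in the
  // field order documented in linux/perf_event.h:
  /*
   * struct {
   *   struct perf_event_header header;
   *   { u32 pid, tid;                    }  && PERF_SAMPLE_TID
   *   { u64 time;                        }  && PERF_SAMPLE_TIME
   *   { struct read_format values;       }  && PERF_SAMPLE_READ
   *   { u64 nr; u64 ips[nr];             }  && PERF_SAMPLE_CALLCHAIN
   *   { u64 abi; u64 regs[weight(mask)]; }  && PERF_SAMPLE_REGS_USER
   *   { u64 size; char data[size];
   *     u64 dyn_size;                    }  && PERF_SAMPLE_STACK_USER
   * };
   */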
  const char* parse_pos = record_start + sizeof(perf_event_header);

  if (event_attr_.sample_type & PERF_SAMPLE_TID) {
    uint32_t pid = 0;
    uint32_t tid = 0;
    parse_pos = ReadValue(&pid, parse_pos);
    parse_pos = ReadValue(&tid, parse_pos);
    sample.common.pid = static_cast<pid_t>(pid);
    sample.common.tid = static_cast<pid_t>(tid);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_TIME) {
    parse_pos = ReadValue(&sample.common.timestamp, parse_pos);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_READ) {
    if (event_attr_.read_format & PERF_FORMAT_GROUP) {
      // When PERF_FORMAT_GROUP is specified, the record starts with the
      // number of events it contains, followed by the per-event values. The
      // value list always starts with the timebase. In a ParsedSample, the
      // timebase value goes into timebase_count and the values of the
      // follower events go into follower_counts.
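      // read_format layout assumed here (PERF_FORMAT_GROUP without
      // PERF_FORMAT_TOTAL_TIME_* or PERF_FORMAT_ID):
      //   u64 nr;
      //   { u64 value; } cntr[nr];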
      uint64_t nr = 0;
      parse_pos = ReadValue(&nr, parse_pos);
      PERFETTO_CHECK(nr != 0);
      parse_pos = ReadValue(&sample.common.timebase_count, parse_pos);
      sample.common.follower_counts.resize(nr - 1);
      for (size_t i = 0; i < nr - 1; ++i) {
        parse_pos = ReadValue(&sample.common.follower_counts[i], parse_pos);
      }
    } else {
      parse_pos = ReadValue(&sample.common.timebase_count, parse_pos);
    }
  }

  if (event_attr_.sample_type & PERF_SAMPLE_CALLCHAIN) {
    uint64_t chain_len = 0;
    parse_pos = ReadValue(&chain_len, parse_pos);
    sample.kernel_ips.resize(static_cast<size_t>(chain_len));
    parse_pos = ReadValues<uint64_t>(sample.kernel_ips.data(), parse_pos,
                                     static_cast<size_t>(chain_len));
  }

  if (event_attr_.sample_type & PERF_SAMPLE_REGS_USER) {
    // Can be empty, e.g. if we sampled a kernel thread.
    sample.regs = ReadPerfUserRegsData(&parse_pos);
  }

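  // Layout: u64 size; char data[size]; u64 dyn_size. The trailing |dyn_size|
  // (number of bytes the kernel actually wrote) is present only if size != 0.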
  if (event_attr_.sample_type & PERF_SAMPLE_STACK_USER) {
    // Maximum possible sampled stack size for this sample. Can be lower than
    // the requested size if there wasn't enough room in the sample (which is
    // limited to 64k).
    uint64_t max_stack_size;
    parse_pos = ReadValue(&max_stack_size, parse_pos);

    const char* stack_start = parse_pos;
    parse_pos += max_stack_size;  // skip to dyn_size

    // Payload written conditionally, e.g. kernel threads don't have a
    // user stack.
    if (max_stack_size > 0) {
      uint64_t filled_stack_size;
      parse_pos = ReadValue(&filled_stack_size, parse_pos);

      // copy stack bytes into a vector
      size_t payload_sz = static_cast<size_t>(filled_stack_size);
      sample.stack.resize(payload_sz);
      memcpy(sample.stack.data(), stack_start, payload_sz);

      // remember whether the stack sample is (most likely) truncated
      sample.stack_maxed = (filled_stack_size == max_stack_size);
    }
  }

  PERFETTO_CHECK(parse_pos == record_start + sample_size);
  return sample;
}

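// Note: only the timebase (group leader) fd is toggled; follower events
// attached to its group are scheduled on the PMU together with the leader,
// so in effect they start and stop with it.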
void EventReader::EnableEvents() {
  int ret = ioctl(perf_fd_.get(), PERF_EVENT_IOC_ENABLE);
  PERFETTO_CHECK(ret == 0);
}

void EventReader::DisableEvents() {
  int ret = ioctl(perf_fd_.get(), PERF_EVENT_IOC_DISABLE);
  PERFETTO_CHECK(ret == 0);
}

}  // namespace profiling
}  // namespace perfetto