1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Arm Statistical Profiling Extensions (SPE) support
4 * Copyright (c) 2017-2018, Arm Ltd.
5 */
6
7 #include <byteswap.h>
8 #include <endian.h>
9 #include <errno.h>
10 #include <inttypes.h>
11 #include <linux/bitops.h>
12 #include <linux/kernel.h>
13 #include <linux/log2.h>
14 #include <linux/types.h>
15 #include <linux/zalloc.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18
19 #include "auxtrace.h"
20 #include "color.h"
21 #include "debug.h"
22 #include "evlist.h"
23 #include "evsel.h"
24 #include "machine.h"
25 #include "session.h"
26 #include "symbol.h"
27 #include "thread.h"
28 #include "thread-stack.h"
29 #include "tsc.h"
30 #include "tool.h"
31 #include "util/synthetic-events.h"
32
33 #include "arm-spe.h"
34 #include "arm-spe-decoder/arm-spe-decoder.h"
35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
36
37 #include "../../arch/arm64/include/asm/cputype.h"
38 #define MAX_TIMESTAMP (~0ULL)
39
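/* True when the record's operation type includes a load/store (memory) access. */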
40 #define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST))
41
42 struct arm_spe {
43 struct auxtrace auxtrace;
44 struct auxtrace_queues queues;
45 struct auxtrace_heap heap;
46 struct itrace_synth_opts synth_opts;
47 u32 auxtrace_type;
48 struct perf_session *session;
49 struct machine *machine;
50 u32 pmu_type;
51
52 struct perf_tsc_conversion tc;
53
54 u8 timeless_decoding;
55 u8 data_queued;
56
57 u64 sample_type;
58 u8 sample_flc;
59 u8 sample_llc;
60 u8 sample_tlb;
61 u8 sample_branch;
62 u8 sample_remote_access;
63 u8 sample_memory;
64 u8 sample_instructions;
65 u64 instructions_sample_period;
66
67 u64 l1d_miss_id;
68 u64 l1d_access_id;
69 u64 llc_miss_id;
70 u64 llc_access_id;
71 u64 tlb_miss_id;
72 u64 tlb_access_id;
73 u64 branch_id;
74 u64 remote_access_id;
75 u64 memory_id;
76 u64 instructions_id;
77
78 u64 kernel_start;
79
80 unsigned long num_events;
81 u8 use_ctx_pkt_for_pid;
82
83 u64 **metadata;
84 u64 metadata_ver;
85 u64 metadata_nr_cpu;
86 bool is_homogeneous;
87 };
88
89 struct arm_spe_queue {
90 struct arm_spe *spe;
91 unsigned int queue_nr;
92 struct auxtrace_buffer *buffer;
93 struct auxtrace_buffer *old_buffer;
94 union perf_event *event_buf;
95 bool on_heap;
96 bool done;
97 pid_t pid;
98 pid_t tid;
99 int cpu;
100 struct arm_spe_decoder *decoder;
101 u64 time;
102 u64 timestamp;
103 struct thread *thread;
104 u64 period_instructions;
105 u32 flags;
106 };
107
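/*
 * Maps a set of CPU MIDR ranges to the callback used to synthesize
 * perf_mem_data_src information from an SPE record on those CPUs.
 */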
108 struct data_source_handle {
109 const struct midr_range *midr_ranges;
110 void (*ds_synth)(const struct arm_spe_record *record,
111 union perf_mem_data_src *data_src);
112 };
113
114 #define DS(range, func) \
115 { \
116 .midr_ranges = range, \
117 .ds_synth = arm_spe__synth_##func, \
118 }
119
120 static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
121 unsigned char *buf, size_t len)
122 {
123 struct arm_spe_pkt packet;
124 size_t pos = 0;
125 int ret, pkt_len, i;
126 char desc[ARM_SPE_PKT_DESC_MAX];
127 const char *color = PERF_COLOR_BLUE;
128
129 color_fprintf(stdout, color,
130 ". ... ARM SPE data: size %#zx bytes\n",
131 len);
132
133 while (len) {
134 ret = arm_spe_get_packet(buf, len, &packet);
135 if (ret > 0)
136 pkt_len = ret;
137 else
138 pkt_len = 1;
139 printf(".");
140 color_fprintf(stdout, color, " %08zx: ", pos);
141 for (i = 0; i < pkt_len; i++)
142 color_fprintf(stdout, color, " %02x", buf[i]);
143 for (; i < 16; i++)
144 color_fprintf(stdout, color, " ");
145 if (ret > 0) {
146 ret = arm_spe_pkt_desc(&packet, desc,
147 ARM_SPE_PKT_DESC_MAX);
148 if (!ret)
149 color_fprintf(stdout, color, " %s\n", desc);
150 } else {
151 color_fprintf(stdout, color, " Bad packet!\n");
152 }
153 pos += pkt_len;
154 buf += pkt_len;
155 len -= pkt_len;
156 }
157 }
158
159 static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
160 size_t len)
161 {
162 printf(".\n");
163 arm_spe_dump(spe, buf, len);
164 }
165
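/*
 * Decoder callback: hand the next AUX trace buffer for this queue to the
 * SPE decoder, dropping the previously used buffer once it is no longer
 * needed.
 */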
166 static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
167 {
168 struct arm_spe_queue *speq = data;
169 struct auxtrace_buffer *buffer = speq->buffer;
170 struct auxtrace_buffer *old_buffer = speq->old_buffer;
171 struct auxtrace_queue *queue;
172
173 queue = &speq->spe->queues.queue_array[speq->queue_nr];
174
175 buffer = auxtrace_buffer__next(queue, buffer);
176 /* If no more data, drop the previous auxtrace_buffer and return */
177 if (!buffer) {
178 if (old_buffer)
179 auxtrace_buffer__drop_data(old_buffer);
180 b->len = 0;
181 return 0;
182 }
183
184 speq->buffer = buffer;
185
186 /* If the aux_buffer doesn't have data associated, try to load it */
187 if (!buffer->data) {
188 /* get the file desc associated with the perf data file */
189 int fd = perf_data__fd(speq->spe->session->data);
190
191 buffer->data = auxtrace_buffer__get_data(buffer, fd);
192 if (!buffer->data)
193 return -ENOMEM;
194 }
195
196 b->len = buffer->size;
197 b->buf = buffer->data;
198
199 if (b->len) {
200 if (old_buffer)
201 auxtrace_buffer__drop_data(old_buffer);
202 speq->old_buffer = buffer;
203 } else {
204 auxtrace_buffer__drop_data(buffer);
205 return arm_spe_get_trace(b, data);
206 }
207
208 return 0;
209 }
210
211 static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
212 unsigned int queue_nr)
213 {
214 struct arm_spe_params params = { .get_trace = 0, };
215 struct arm_spe_queue *speq;
216
217 speq = zalloc(sizeof(*speq));
218 if (!speq)
219 return NULL;
220
221 speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
222 if (!speq->event_buf)
223 goto out_free;
224
225 speq->spe = spe;
226 speq->queue_nr = queue_nr;
227 speq->pid = -1;
228 speq->tid = -1;
229 speq->cpu = -1;
230 speq->period_instructions = 0;
231
232 /* params set */
233 params.get_trace = arm_spe_get_trace;
234 params.data = speq;
235
236 /* create new decoder */
237 speq->decoder = arm_spe_decoder_new(&params);
238 if (!speq->decoder)
239 goto out_free;
240
241 return speq;
242
243 out_free:
244 zfree(&speq->event_buf);
245 free(speq);
246
247 return NULL;
248 }
249
250 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
251 {
252 return ip >= spe->kernel_start ?
253 PERF_RECORD_MISC_KERNEL :
254 PERF_RECORD_MISC_USER;
255 }
256
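/*
 * Resolve pid/tid/cpu for a queue, preferring the tid recorded in the
 * machine state by context switch events over the queue's own tid.
 */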
257 static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
258 struct auxtrace_queue *queue)
259 {
260 struct arm_spe_queue *speq = queue->priv;
261 pid_t tid;
262
263 tid = machine__get_current_tid(spe->machine, speq->cpu);
264 if (tid != -1) {
265 speq->tid = tid;
266 thread__zput(speq->thread);
267 } else
268 speq->tid = queue->tid;
269
270 if ((!speq->thread) && (speq->tid != -1)) {
271 speq->thread = machine__find_thread(spe->machine, -1,
272 speq->tid);
273 }
274
275 if (speq->thread) {
276 speq->pid = thread__pid(speq->thread);
277 if (queue->cpu == -1)
278 speq->cpu = thread__cpu(speq->thread);
279 }
280 }
281
282 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
283 {
284 struct arm_spe *spe = speq->spe;
285 int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
286
287 if (err)
288 return err;
289
290 arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
291
292 return 0;
293 }
294
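/* Find the per-CPU metadata entry whose ARM_SPE_CPU field matches @cpu. */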
295 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu)
296 {
297 u64 i;
298
299 if (!spe->metadata)
300 return NULL;
301
302 for (i = 0; i < spe->metadata_nr_cpu; i++)
303 if (spe->metadata[i][ARM_SPE_CPU] == cpu)
304 return spe->metadata[i];
305
306 return NULL;
307 }
308
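/* Derive SIMD (SVE) operation and predication flags from the record. */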
309 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
310 {
311 struct simd_flags simd_flags = {};
312
313 if ((record->op & ARM_SPE_OP_LDST) && (record->op & ARM_SPE_OP_SVE_LDST))
314 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
315
316 if ((record->op & ARM_SPE_OP_OTHER) && (record->op & ARM_SPE_OP_SVE_OTHER))
317 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
318
319 if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
320 simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;
321
322 if (record->type & ARM_SPE_SVE_EMPTY_PRED)
323 simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;
324
325 return simd_flags;
326 }
327
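/*
 * Fill in the sample fields common to all synthesized events: timestamp
 * (unless decoding is timeless), ip, cpumode, pid/tid, cpu and SIMD flags.
 */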
328 static void arm_spe_prep_sample(struct arm_spe *spe,
329 struct arm_spe_queue *speq,
330 union perf_event *event,
331 struct perf_sample *sample)
332 {
333 struct arm_spe_record *record = &speq->decoder->record;
334
335 if (!spe->timeless_decoding)
336 sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);
337
338 sample->ip = record->from_ip;
339 sample->cpumode = arm_spe_cpumode(spe, sample->ip);
340 sample->pid = speq->pid;
341 sample->tid = speq->tid;
342 sample->period = 1;
343 sample->cpu = speq->cpu;
344 sample->simd_flags = arm_spe__synth_simd_flags(record);
345
346 event->sample.header.type = PERF_RECORD_SAMPLE;
347 event->sample.header.misc = sample->cpumode;
348 event->sample.header.size = sizeof(struct perf_event_header);
349 }
350
351 static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
352 {
353 event->header.size = perf_event__sample_event_size(sample, type, 0);
354 return perf_event__synthesize_sample(event, type, 0, sample);
355 }
356
357 static inline int
358 arm_spe_deliver_synth_event(struct arm_spe *spe,
359 struct arm_spe_queue *speq __maybe_unused,
360 union perf_event *event,
361 struct perf_sample *sample)
362 {
363 int ret;
364
365 if (spe->synth_opts.inject) {
366 ret = arm_spe__inject_event(event, sample, spe->sample_type);
367 if (ret)
368 return ret;
369 }
370
371 ret = perf_session__deliver_synth_event(spe->session, event, sample);
372 if (ret)
373 pr_err("ARM SPE: failed to deliver event, error %d\n", ret);
374
375 return ret;
376 }
377
378 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
379 u64 spe_events_id, u64 data_src)
380 {
381 struct arm_spe *spe = speq->spe;
382 struct arm_spe_record *record = &speq->decoder->record;
383 union perf_event *event = speq->event_buf;
384 struct perf_sample sample = { .ip = 0, };
385
386 arm_spe_prep_sample(spe, speq, event, &sample);
387
388 sample.id = spe_events_id;
389 sample.stream_id = spe_events_id;
390 sample.addr = record->virt_addr;
391 sample.phys_addr = record->phys_addr;
392 sample.data_src = data_src;
393 sample.weight = record->latency;
394
395 return arm_spe_deliver_synth_event(spe, speq, event, &sample);
396 }
397
398 static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
399 u64 spe_events_id)
400 {
401 struct arm_spe *spe = speq->spe;
402 struct arm_spe_record *record = &speq->decoder->record;
403 union perf_event *event = speq->event_buf;
404 struct perf_sample sample = { .ip = 0, };
405
406 arm_spe_prep_sample(spe, speq, event, &sample);
407
408 sample.id = spe_events_id;
409 sample.stream_id = spe_events_id;
410 sample.addr = record->to_ip;
411 sample.weight = record->latency;
412 sample.flags = speq->flags;
413
414 return arm_spe_deliver_synth_event(spe, speq, event, &sample);
415 }
416
417 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
418 u64 spe_events_id, u64 data_src)
419 {
420 struct arm_spe *spe = speq->spe;
421 struct arm_spe_record *record = &speq->decoder->record;
422 union perf_event *event = speq->event_buf;
423 struct perf_sample sample = { .ip = 0, };
424
425 /*
426 * Handles perf instruction sampling period.
427 */
428 speq->period_instructions++;
429 if (speq->period_instructions < spe->instructions_sample_period)
430 return 0;
431 speq->period_instructions = 0;
432
433 arm_spe_prep_sample(spe, speq, event, &sample);
434
435 sample.id = spe_events_id;
436 sample.stream_id = spe_events_id;
437 sample.addr = record->to_ip;
438 sample.phys_addr = record->phys_addr;
439 sample.data_src = data_src;
440 sample.period = spe->instructions_sample_period;
441 sample.weight = record->latency;
442 sample.flags = speq->flags;
443
444 return arm_spe_deliver_synth_event(spe, speq, event, &sample);
445 }
446
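/* CPUs known to use the common (Neoverse/Cortex) data source encoding. */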
447 static const struct midr_range common_ds_encoding_cpus[] = {
448 MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
449 MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
450 MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
451 MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
452 MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
453 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
454 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
455 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
456 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
457 {},
458 };
459
460 static const struct midr_range ampereone_ds_encoding_cpus[] = {
461 MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
462 {},
463 };
464
465 static void arm_spe__sample_flags(struct arm_spe_queue *speq)
466 {
467 const struct arm_spe_record *record = &speq->decoder->record;
468
469 speq->flags = 0;
470 if (record->op & ARM_SPE_OP_BRANCH_ERET) {
471 speq->flags = PERF_IP_FLAG_BRANCH;
472
473 if (record->type & ARM_SPE_BRANCH_MISS)
474 speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
475 }
476 }
477
478 static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
479 union perf_mem_data_src *data_src)
480 {
481 /*
482 * Even though four levels of cache hierarchy are possible, no known
483 * production Neoverse systems currently include more than three levels
484 * so for the time being we assume three exist. If a production system
485 * is built with four, this function would have to be changed to
486 * detect the number of levels for reporting.
487 */
488
489 /*
490 * We have no data on the hit level or data source for stores in the
491 * Neoverse SPE records.
492 */
493 if (record->op & ARM_SPE_OP_ST) {
494 data_src->mem_lvl = PERF_MEM_LVL_NA;
495 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
496 data_src->mem_snoop = PERF_MEM_SNOOP_NA;
497 return;
498 }
499
500 switch (record->source) {
501 case ARM_SPE_COMMON_DS_L1D:
502 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
503 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
504 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
505 break;
506 case ARM_SPE_COMMON_DS_L2:
507 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
508 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
509 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
510 break;
511 case ARM_SPE_COMMON_DS_PEER_CORE:
512 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
513 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
514 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
515 break;
516 /*
517 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
518 * transfer, so set SNOOPX_PEER
519 */
520 case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
521 case ARM_SPE_COMMON_DS_PEER_CLUSTER:
522 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
523 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
524 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
525 break;
526 /*
527 * System cache is assumed to be L3
528 */
529 case ARM_SPE_COMMON_DS_SYS_CACHE:
530 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
531 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
532 data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
533 break;
534 /*
535 * We don't know what level it hit in, except it came from the other
536 * socket
537 */
538 case ARM_SPE_COMMON_DS_REMOTE:
539 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
540 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
541 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
542 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
543 break;
544 case ARM_SPE_COMMON_DS_DRAM:
545 data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
546 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
547 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
548 break;
549 default:
550 break;
551 }
552 }
553
554 /*
555 * Source is IMPDEF. Here we convert the source encoding used on AmpereOne
556 * cores to the common (Neoverse, Cortex) encoding to avoid duplicating the decoding code.
557 */
558 static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
559 union perf_mem_data_src *data_src)
560 {
561 struct arm_spe_record common_record;
562
563 switch (record->source) {
564 case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
565 common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
566 break;
567 case ARM_SPE_AMPEREONE_SLC:
568 common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
569 break;
570 case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
571 common_record.source = ARM_SPE_COMMON_DS_REMOTE;
572 break;
573 case ARM_SPE_AMPEREONE_DDR:
574 common_record.source = ARM_SPE_COMMON_DS_DRAM;
575 break;
576 case ARM_SPE_AMPEREONE_L1D:
577 common_record.source = ARM_SPE_COMMON_DS_L1D;
578 break;
579 case ARM_SPE_AMPEREONE_L2D:
580 common_record.source = ARM_SPE_COMMON_DS_L2;
581 break;
582 default:
583 pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
584 record->source);
585 return;
586 }
587
588 common_record.op = record->op;
589 arm_spe__synth_data_source_common(&common_record, data_src);
590 }
591
592 static const struct data_source_handle data_source_handles[] = {
593 DS(common_ds_encoding_cpus, data_source_common),
594 DS(ampereone_ds_encoding_cpus, data_source_ampereone),
595 };
596
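/*
 * Fallback used when the data source packet cannot be decoded for this
 * CPU: derive a coarse memory level from the event type bits alone.
 */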
597 static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
598 union perf_mem_data_src *data_src)
599 {
600 if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
601 data_src->mem_lvl = PERF_MEM_LVL_L3;
602
603 if (record->type & ARM_SPE_LLC_MISS)
604 data_src->mem_lvl |= PERF_MEM_LVL_MISS;
605 else
606 data_src->mem_lvl |= PERF_MEM_LVL_HIT;
607 } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
608 data_src->mem_lvl = PERF_MEM_LVL_L1;
609
610 if (record->type & ARM_SPE_L1D_MISS)
611 data_src->mem_lvl |= PERF_MEM_LVL_MISS;
612 else
613 data_src->mem_lvl |= PERF_MEM_LVL_HIT;
614 }
615
616 if (record->type & ARM_SPE_REMOTE_ACCESS)
617 data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
618 }
619
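/*
 * Select the data source decoder matching this record's CPU by MIDR,
 * using the per-CPU metadata (or the session cpuid for v1 metadata).
 * Returns false when no matching decoder is known.
 */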
620 static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
621 const struct arm_spe_record *record,
622 union perf_mem_data_src *data_src)
623 {
624 struct arm_spe *spe = speq->spe;
625 u64 *metadata = NULL;
626 u64 midr;
627 unsigned int i;
628
629 /* Metadata version 1 assumes all CPUs are the same (old behavior) */
630 if (spe->metadata_ver == 1) {
631 const char *cpuid;
632
633 pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
634 cpuid = perf_env__cpuid(spe->session->evlist->env);
635 midr = strtol(cpuid, NULL, 16);
636 } else {
637 /* CPU ID is -1 for per-thread mode */
638 if (speq->cpu < 0) {
639 /*
640 * On a heterogeneous system the CPU ID is -1, so we cannot
641 * confirm whether the data source packet is supported.
642 */
643 if (!spe->is_homogeneous)
644 return false;
645
646 /* On a homogeneous system, simply use CPU0's metadata */
647 if (spe->metadata)
648 metadata = spe->metadata[0];
649 } else {
650 metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
651 }
652
653 if (!metadata)
654 return false;
655
656 midr = metadata[ARM_SPE_CPU_MIDR];
657 }
658
659 for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
660 if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
661 data_source_handles[i].ds_synth(record, data_src);
662 return true;
663 }
664 }
665
666 return false;
667 }
668
669 static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
670 const struct arm_spe_record *record)
671 {
672 union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA };
673
674 /* Only synthesize data source for LDST operations */
675 if (!is_ldst_op(record->op))
676 return 0;
677
678 if (record->op & ARM_SPE_OP_LD)
679 data_src.mem_op = PERF_MEM_OP_LOAD;
680 else if (record->op & ARM_SPE_OP_ST)
681 data_src.mem_op = PERF_MEM_OP_STORE;
682 else
683 return 0;
684
685 if (!arm_spe__synth_ds(speq, record, &data_src))
686 arm_spe__synth_memory_level(record, &data_src);
687
688 if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
689 data_src.mem_dtlb = PERF_MEM_TLB_WK;
690
691 if (record->type & ARM_SPE_TLB_MISS)
692 data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
693 else
694 data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
695 }
696
697 return data_src.val;
698 }
699
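/*
 * Synthesize every enabled sample type (cache, TLB, branch, remote
 * access, memory, instructions) for the record currently held by the
 * decoder.
 */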
700 static int arm_spe_sample(struct arm_spe_queue *speq)
701 {
702 const struct arm_spe_record *record = &speq->decoder->record;
703 struct arm_spe *spe = speq->spe;
704 u64 data_src;
705 int err;
706
707 arm_spe__sample_flags(speq);
708 data_src = arm_spe__synth_data_source(speq, record);
709
710 if (spe->sample_flc) {
711 if (record->type & ARM_SPE_L1D_MISS) {
712 err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
713 data_src);
714 if (err)
715 return err;
716 }
717
718 if (record->type & ARM_SPE_L1D_ACCESS) {
719 err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
720 data_src);
721 if (err)
722 return err;
723 }
724 }
725
726 if (spe->sample_llc) {
727 if (record->type & ARM_SPE_LLC_MISS) {
728 err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
729 data_src);
730 if (err)
731 return err;
732 }
733
734 if (record->type & ARM_SPE_LLC_ACCESS) {
735 err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
736 data_src);
737 if (err)
738 return err;
739 }
740 }
741
742 if (spe->sample_tlb) {
743 if (record->type & ARM_SPE_TLB_MISS) {
744 err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
745 data_src);
746 if (err)
747 return err;
748 }
749
750 if (record->type & ARM_SPE_TLB_ACCESS) {
751 err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
752 data_src);
753 if (err)
754 return err;
755 }
756 }
757
758 if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
759 err = arm_spe__synth_branch_sample(speq, spe->branch_id);
760 if (err)
761 return err;
762 }
763
764 if (spe->sample_remote_access &&
765 (record->type & ARM_SPE_REMOTE_ACCESS)) {
766 err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
767 data_src);
768 if (err)
769 return err;
770 }
771
772 /*
773 * Synthesize a memory sample only for load/store operations; for
774 * other operations data_src is zero and the sample is skipped.
775 */
776 if (spe->sample_memory && is_ldst_op(record->op)) {
777 err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
778 if (err)
779 return err;
780 }
781
782 if (spe->sample_instructions) {
783 err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
784 if (err)
785 return err;
786 }
787
788 return 0;
789 }
790
791 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
792 {
793 struct arm_spe *spe = speq->spe;
794 struct arm_spe_record *record;
795 int ret;
796
797 if (!spe->kernel_start)
798 spe->kernel_start = machine__kernel_start(spe->machine);
799
800 while (1) {
801 /*
802 * The usual logic is first to decode the packets and then to
803 * synthesize samples based on the records; but here the flow is
804 * reversed: arm_spe_sample() is called to synthesize samples
805 * prior to arm_spe_decode().
806 *
807 * There are two reasons for this code logic:
808 * 1. When the queue is set up in arm_spe__setup_queue(), it has
809 * already decoded trace data and generated a record, but the record
810 * is left to generate a sample until the flow reaches here, so it
811 * is correct to synthesize a sample for the leftover record.
812 * 2. After decoding trace data, the record timestamp needs to be
813 * compared with the coming perf event; if the record timestamp is
814 * later than the perf event, bail out and push the record into the
815 * auxtrace heap, so that synthesizing its sample is deferred until
816 * the flow reaches here the next time. This correlates samples
817 * between Arm SPE trace data and other perf events with correct
818 * time ordering.
819 */
820
821 /*
822 * Update pid/tid info.
823 */
824 record = &speq->decoder->record;
825 if (!spe->timeless_decoding && record->context_id != (u64)-1) {
826 ret = arm_spe_set_tid(speq, record->context_id);
827 if (ret)
828 return ret;
829
830 spe->use_ctx_pkt_for_pid = true;
831 }
832
833 ret = arm_spe_sample(speq);
834 if (ret)
835 return ret;
836
837 ret = arm_spe_decode(speq->decoder);
838 if (!ret) {
839 pr_debug("No data or all data has been processed.\n");
840 return 1;
841 }
842
843 /*
844 * If an error is detected while decoding SPE trace data, continue
845 * to the next trace data to find more records.
846 */
847 if (ret < 0)
848 continue;
849
850 record = &speq->decoder->record;
851
852 /* Update timestamp for the last record */
853 if (record->timestamp > speq->timestamp)
854 speq->timestamp = record->timestamp;
855
856 /*
857 * If the timestamp of the queue is later than the timestamp of the
858 * coming perf event, bail out so the perf event can be processed
859 * first.
860 */
861 if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
862 *timestamp = speq->timestamp;
863 return 0;
864 }
865 }
866
867 return 0;
868 }
869
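/*
 * Allocate per-queue decoder state and, for timed decoding, decode up to
 * the first record so the queue can be placed on the timestamp heap.
 */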
870 static int arm_spe__setup_queue(struct arm_spe *spe,
871 struct auxtrace_queue *queue,
872 unsigned int queue_nr)
873 {
874 struct arm_spe_queue *speq = queue->priv;
875 struct arm_spe_record *record;
876
877 if (list_empty(&queue->head) || speq)
878 return 0;
879
880 speq = arm_spe__alloc_queue(spe, queue_nr);
881
882 if (!speq)
883 return -ENOMEM;
884
885 queue->priv = speq;
886
887 if (queue->cpu != -1)
888 speq->cpu = queue->cpu;
889
890 if (!speq->on_heap) {
891 int ret;
892
893 if (spe->timeless_decoding)
894 return 0;
895
896 retry:
897 ret = arm_spe_decode(speq->decoder);
898
899 if (!ret)
900 return 0;
901
902 if (ret < 0)
903 goto retry;
904
905 record = &speq->decoder->record;
906
907 speq->timestamp = record->timestamp;
908 ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
909 if (ret)
910 return ret;
911 speq->on_heap = true;
912 }
913
914 return 0;
915 }
916
917 static int arm_spe__setup_queues(struct arm_spe *spe)
918 {
919 unsigned int i;
920 int ret;
921
922 for (i = 0; i < spe->queues.nr_queues; i++) {
923 ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
924 if (ret)
925 return ret;
926 }
927
928 return 0;
929 }
930
931 static int arm_spe__update_queues(struct arm_spe *spe)
932 {
933 if (spe->queues.new_data) {
934 spe->queues.new_data = false;
935 return arm_spe__setup_queues(spe);
936 }
937
938 return 0;
939 }
940
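/*
 * Decoding is treated as "timeless" when no event in the session has
 * PERF_SAMPLE_TIME set.
 */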
941 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
942 {
943 struct evsel *evsel;
944 struct evlist *evlist = spe->session->evlist;
945 bool timeless_decoding = true;
946
947 /*
948 * Loop through the list of events; decoding is timeless only if
949 * none of them has the time bit set.
950 */
951 evlist__for_each_entry(evlist, evsel) {
952 if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
953 timeless_decoding = false;
954 }
955
956 return timeless_decoding;
957 }
958
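/*
 * Process queues in timestamp order: repeatedly take the queue with the
 * oldest record from the heap, run its decoder up to the given timestamp
 * and re-add the queue to the heap with its updated timestamp.
 */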
959 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
960 {
961 unsigned int queue_nr;
962 u64 ts;
963 int ret;
964
965 while (1) {
966 struct auxtrace_queue *queue;
967 struct arm_spe_queue *speq;
968
969 if (!spe->heap.heap_cnt)
970 return 0;
971
972 if (spe->heap.heap_array[0].ordinal >= timestamp)
973 return 0;
974
975 queue_nr = spe->heap.heap_array[0].queue_nr;
976 queue = &spe->queues.queue_array[queue_nr];
977 speq = queue->priv;
978
979 auxtrace_heap__pop(&spe->heap);
980
981 if (spe->heap.heap_cnt) {
982 ts = spe->heap.heap_array[0].ordinal + 1;
983 if (ts > timestamp)
984 ts = timestamp;
985 } else {
986 ts = timestamp;
987 }
988
989 /*
990 * A previous context-switch event has set pid/tid in the machine's context, so
991 * here we need to update the pid/tid in the thread and SPE queue.
992 */
993 if (!spe->use_ctx_pkt_for_pid)
994 arm_spe_set_pid_tid_cpu(spe, queue);
995
996 ret = arm_spe_run_decoder(speq, &ts);
997 if (ret < 0) {
998 auxtrace_heap__add(&spe->heap, queue_nr, ts);
999 return ret;
1000 }
1001
1002 if (!ret) {
1003 ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
1004 if (ret < 0)
1005 return ret;
1006 } else {
1007 speq->on_heap = false;
1008 }
1009 }
1010
1011 return 0;
1012 }
1013
1014 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
1015 u64 time_)
1016 {
1017 struct auxtrace_queues *queues = &spe->queues;
1018 unsigned int i;
1019 u64 ts = 0;
1020
1021 for (i = 0; i < queues->nr_queues; i++) {
1022 struct auxtrace_queue *queue = &spe->queues.queue_array[i];
1023 struct arm_spe_queue *speq = queue->priv;
1024
1025 if (speq && (tid == -1 || speq->tid == tid)) {
1026 speq->time = time_;
1027 arm_spe_set_pid_tid_cpu(spe, queue);
1028 arm_spe_run_decoder(speq, &ts);
1029 }
1030 }
1031 return 0;
1032 }
1033
1034 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
1035 struct perf_sample *sample)
1036 {
1037 pid_t pid, tid;
1038 int cpu;
1039
1040 if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
1041 return 0;
1042
1043 pid = event->context_switch.next_prev_pid;
1044 tid = event->context_switch.next_prev_tid;
1045 cpu = sample->cpu;
1046
1047 if (tid == -1)
1048 pr_warning("context_switch event has no tid\n");
1049
1050 return machine__set_current_tid(spe->machine, cpu, pid, tid);
1051 }
1052
1053 static int arm_spe_process_event(struct perf_session *session,
1054 union perf_event *event,
1055 struct perf_sample *sample,
1056 const struct perf_tool *tool)
1057 {
1058 int err = 0;
1059 u64 timestamp;
1060 struct arm_spe *spe = container_of(session->auxtrace,
1061 struct arm_spe, auxtrace);
1062
1063 if (dump_trace)
1064 return 0;
1065
1066 if (!tool->ordered_events) {
1067 pr_err("SPE trace requires ordered events\n");
1068 return -EINVAL;
1069 }
1070
1071 if (sample->time && (sample->time != (u64) -1))
1072 timestamp = perf_time_to_tsc(sample->time, &spe->tc);
1073 else
1074 timestamp = 0;
1075
1076 if (timestamp || spe->timeless_decoding) {
1077 err = arm_spe__update_queues(spe);
1078 if (err)
1079 return err;
1080 }
1081
1082 if (spe->timeless_decoding) {
1083 if (event->header.type == PERF_RECORD_EXIT) {
1084 err = arm_spe_process_timeless_queues(spe,
1085 event->fork.tid,
1086 sample->time);
1087 }
1088 } else if (timestamp) {
1089 err = arm_spe_process_queues(spe, timestamp);
1090 if (err)
1091 return err;
1092
1093 if (!spe->use_ctx_pkt_for_pid &&
1094 (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
1095 event->header.type == PERF_RECORD_SWITCH))
1096 err = arm_spe_context_switch(spe, event, sample);
1097 }
1098
1099 return err;
1100 }
1101
1102 static int arm_spe_process_auxtrace_event(struct perf_session *session,
1103 union perf_event *event,
1104 const struct perf_tool *tool __maybe_unused)
1105 {
1106 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1107 auxtrace);
1108
1109 if (!spe->data_queued) {
1110 struct auxtrace_buffer *buffer;
1111 off_t data_offset;
1112 int fd = perf_data__fd(session->data);
1113 int err;
1114
1115 if (perf_data__is_pipe(session->data)) {
1116 data_offset = 0;
1117 } else {
1118 data_offset = lseek(fd, 0, SEEK_CUR);
1119 if (data_offset == -1)
1120 return -errno;
1121 }
1122
1123 err = auxtrace_queues__add_event(&spe->queues, session, event,
1124 data_offset, &buffer);
1125 if (err)
1126 return err;
1127
1128 /* Dump here now we have copied a piped trace out of the pipe */
1129 if (dump_trace) {
1130 if (auxtrace_buffer__get_data(buffer, fd)) {
1131 arm_spe_dump_event(spe, buffer->data,
1132 buffer->size);
1133 auxtrace_buffer__put_data(buffer);
1134 }
1135 }
1136 }
1137
1138 return 0;
1139 }
1140
1141 static int arm_spe_flush(struct perf_session *session __maybe_unused,
1142 const struct perf_tool *tool __maybe_unused)
1143 {
1144 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1145 auxtrace);
1146 int ret;
1147
1148 if (dump_trace)
1149 return 0;
1150
1151 if (!tool->ordered_events)
1152 return -EINVAL;
1153
1154 ret = arm_spe__update_queues(spe);
1155 if (ret < 0)
1156 return ret;
1157
1158 if (spe->timeless_decoding)
1159 return arm_spe_process_timeless_queues(spe, -1,
1160 MAX_TIMESTAMP - 1);
1161
1162 ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
1163 if (ret)
1164 return ret;
1165
1166 if (!spe->use_ctx_pkt_for_pid)
1167 ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
1168 "Matching of TIDs to SPE events could be inaccurate.\n");
1169
1170 return 0;
1171 }
1172
1173 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
1174 {
1175 u64 *metadata;
1176
1177 metadata = zalloc(per_cpu_size);
1178 if (!metadata)
1179 return NULL;
1180
1181 memcpy(metadata, buf, per_cpu_size);
1182 return metadata;
1183 }
1184
1185 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
1186 {
1187 int i;
1188
1189 for (i = 0; i < nr_cpu; i++)
1190 zfree(&metadata[i]);
1191 free(metadata);
1192 }
1193
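/*
 * Parse the auxtrace_info private data: a header (version, header size,
 * PMU type, CPU count) followed by one metadata block per CPU. Version 1
 * data carries no per-CPU blocks, so NULL is returned with *ver set to 1.
 */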
1194 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
1195 u64 *ver, int *nr_cpu)
1196 {
1197 u64 *ptr = (u64 *)info->priv;
1198 u64 metadata_size;
1199 u64 **metadata = NULL;
1200 int hdr_sz, per_cpu_sz, i;
1201
1202 metadata_size = info->header.size -
1203 sizeof(struct perf_record_auxtrace_info);
1204
1205 /* Metadata version 1 */
1206 if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
1207 *ver = 1;
1208 *nr_cpu = 0;
1209 /* No per CPU metadata */
1210 return NULL;
1211 }
1212
1213 *ver = ptr[ARM_SPE_HEADER_VERSION];
1214 hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
1215 *nr_cpu = ptr[ARM_SPE_CPUS_NUM];
1216
1217 metadata = calloc(*nr_cpu, sizeof(*metadata));
1218 if (!metadata)
1219 return NULL;
1220
1221 /* Locate the start address of per CPU metadata */
1222 ptr += hdr_sz;
1223 per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
1224
1225 for (i = 0; i < *nr_cpu; i++) {
1226 metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
1227 if (!metadata[i])
1228 goto err_per_cpu_metadata;
1229
1230 ptr += per_cpu_sz / sizeof(u64);
1231 }
1232
1233 return metadata;
1234
1235 err_per_cpu_metadata:
1236 arm_spe__free_metadata(metadata, *nr_cpu);
1237 return NULL;
1238 }
1239
1240 static void arm_spe_free_queue(void *priv)
1241 {
1242 struct arm_spe_queue *speq = priv;
1243
1244 if (!speq)
1245 return;
1246 thread__zput(speq->thread);
1247 arm_spe_decoder_free(speq->decoder);
1248 zfree(&speq->event_buf);
1249 free(speq);
1250 }
1251
1252 static void arm_spe_free_events(struct perf_session *session)
1253 {
1254 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1255 auxtrace);
1256 struct auxtrace_queues *queues = &spe->queues;
1257 unsigned int i;
1258
1259 for (i = 0; i < queues->nr_queues; i++) {
1260 arm_spe_free_queue(queues->queue_array[i].priv);
1261 queues->queue_array[i].priv = NULL;
1262 }
1263 auxtrace_queues__free(queues);
1264 }
1265
1266 static void arm_spe_free(struct perf_session *session)
1267 {
1268 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1269 auxtrace);
1270
1271 auxtrace_heap__free(&spe->heap);
1272 arm_spe_free_events(session);
1273 session->auxtrace = NULL;
1274 arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
1275 free(spe);
1276 }
1277
1278 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
1279 struct evsel *evsel)
1280 {
1281 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
1282
1283 return evsel->core.attr.type == spe->pmu_type;
1284 }
1285
1286 static const char * const metadata_hdr_v1_fmts[] = {
1287 [ARM_SPE_PMU_TYPE] = " PMU Type :%"PRId64"\n",
1288 [ARM_SPE_PER_CPU_MMAPS] = " Per CPU mmaps :%"PRId64"\n",
1289 };
1290
1291 static const char * const metadata_hdr_fmts[] = {
1292 [ARM_SPE_HEADER_VERSION] = " Header version :%"PRId64"\n",
1293 [ARM_SPE_HEADER_SIZE] = " Header size :%"PRId64"\n",
1294 [ARM_SPE_PMU_TYPE_V2] = " PMU type v2 :%"PRId64"\n",
1295 [ARM_SPE_CPUS_NUM] = " CPU number :%"PRId64"\n",
1296 };
1297
1298 static const char * const metadata_per_cpu_fmts[] = {
1299 [ARM_SPE_MAGIC] = " Magic :0x%"PRIx64"\n",
1300 [ARM_SPE_CPU] = " CPU # :%"PRId64"\n",
1301 [ARM_SPE_CPU_NR_PARAMS] = " Num of params :%"PRId64"\n",
1302 [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n",
1303 [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n",
1304 [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n",
1305 };
1306
1307 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
1308 {
1309 unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
1310 const char * const *hdr_fmts;
1311
1312 if (!dump_trace)
1313 return;
1314
1315 if (spe->metadata_ver == 1) {
1316 cpu_num = 0;
1317 hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
1318 hdr_fmts = metadata_hdr_v1_fmts;
1319 } else {
1320 cpu_num = arr[ARM_SPE_CPUS_NUM];
1321 hdr_size = arr[ARM_SPE_HEADER_SIZE];
1322 hdr_fmts = metadata_hdr_fmts;
1323 }
1324
1325 for (i = 0; i < hdr_size; i++)
1326 fprintf(stdout, hdr_fmts[i], arr[i]);
1327
1328 arr += hdr_size;
1329 for (cpu = 0; cpu < cpu_num; cpu++) {
1330 /*
1331 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
1332 * are fixed. The number of the following parameters is given by
1333 * the field 'ARM_SPE_CPU_NR_PARAMS'.
1334 */
1335 cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
1336 for (i = 0; i < cpu_size; i++)
1337 fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
1338 arr += cpu_size;
1339 }
1340 }
1341
1342 static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
1343 const char *name)
1344 {
1345 struct evsel *evsel;
1346
1347 evlist__for_each_entry(evlist, evsel) {
1348 if (evsel->core.id && evsel->core.id[0] == id) {
1349 if (evsel->name)
1350 zfree(&evsel->name);
1351 evsel->name = strdup(name);
1352 break;
1353 }
1354 }
1355 }
1356
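/*
 * Create the synthetic event ids and attributes requested via --itrace
 * options, using the SPE evsel's attributes as the template.
 */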
1357 static int
1358 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
1359 {
1360 struct evlist *evlist = session->evlist;
1361 struct evsel *evsel;
1362 struct perf_event_attr attr;
1363 bool found = false;
1364 u64 id;
1365 int err;
1366
1367 evlist__for_each_entry(evlist, evsel) {
1368 if (evsel->core.attr.type == spe->pmu_type) {
1369 found = true;
1370 break;
1371 }
1372 }
1373
1374 if (!found) {
1375 pr_debug("No selected events with SPE trace data\n");
1376 return 0;
1377 }
1378
1379 memset(&attr, 0, sizeof(struct perf_event_attr));
1380 attr.size = sizeof(struct perf_event_attr);
1381 attr.type = PERF_TYPE_HARDWARE;
1382 attr.sample_type = evsel->core.attr.sample_type &
1383 (PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
1384 attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
1385 PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
1386 PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
1387 if (spe->timeless_decoding)
1388 attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
1389 else
1390 attr.sample_type |= PERF_SAMPLE_TIME;
1391
1392 spe->sample_type = attr.sample_type;
1393
1394 attr.exclude_user = evsel->core.attr.exclude_user;
1395 attr.exclude_kernel = evsel->core.attr.exclude_kernel;
1396 attr.exclude_hv = evsel->core.attr.exclude_hv;
1397 attr.exclude_host = evsel->core.attr.exclude_host;
1398 attr.exclude_guest = evsel->core.attr.exclude_guest;
1399 attr.sample_id_all = evsel->core.attr.sample_id_all;
1400 attr.read_format = evsel->core.attr.read_format;
1401
1402 /* create new id val to be a fixed offset from evsel id */
1403 id = evsel->core.id[0] + 1000000000;
1404
1405 if (!id)
1406 id = 1;
1407
1408 if (spe->synth_opts.flc) {
1409 spe->sample_flc = true;
1410
1411 /* Level 1 data cache miss */
1412 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1413 if (err)
1414 return err;
1415 spe->l1d_miss_id = id;
1416 arm_spe_set_event_name(evlist, id, "l1d-miss");
1417 id += 1;
1418
1419 /* Level 1 data cache access */
1420 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1421 if (err)
1422 return err;
1423 spe->l1d_access_id = id;
1424 arm_spe_set_event_name(evlist, id, "l1d-access");
1425 id += 1;
1426 }
1427
1428 if (spe->synth_opts.llc) {
1429 spe->sample_llc = true;
1430
1431 /* Last level cache miss */
1432 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1433 if (err)
1434 return err;
1435 spe->llc_miss_id = id;
1436 arm_spe_set_event_name(evlist, id, "llc-miss");
1437 id += 1;
1438
1439 /* Last level cache access */
1440 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1441 if (err)
1442 return err;
1443 spe->llc_access_id = id;
1444 arm_spe_set_event_name(evlist, id, "llc-access");
1445 id += 1;
1446 }
1447
1448 if (spe->synth_opts.tlb) {
1449 spe->sample_tlb = true;
1450
1451 /* TLB miss */
1452 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1453 if (err)
1454 return err;
1455 spe->tlb_miss_id = id;
1456 arm_spe_set_event_name(evlist, id, "tlb-miss");
1457 id += 1;
1458
1459 /* TLB access */
1460 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1461 if (err)
1462 return err;
1463 spe->tlb_access_id = id;
1464 arm_spe_set_event_name(evlist, id, "tlb-access");
1465 id += 1;
1466 }
1467
1468 if (spe->synth_opts.branches) {
1469 spe->sample_branch = true;
1470
1471 /* Branch */
1472 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1473 if (err)
1474 return err;
1475 spe->branch_id = id;
1476 arm_spe_set_event_name(evlist, id, "branch");
1477 id += 1;
1478 }
1479
1480 if (spe->synth_opts.remote_access) {
1481 spe->sample_remote_access = true;
1482
1483 /* Remote access */
1484 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1485 if (err)
1486 return err;
1487 spe->remote_access_id = id;
1488 arm_spe_set_event_name(evlist, id, "remote-access");
1489 id += 1;
1490 }
1491
1492 if (spe->synth_opts.mem) {
1493 spe->sample_memory = true;
1494
1495 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1496 if (err)
1497 return err;
1498 spe->memory_id = id;
1499 arm_spe_set_event_name(evlist, id, "memory");
1500 id += 1;
1501 }
1502
1503 if (spe->synth_opts.instructions) {
1504 if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
1505 pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n");
1506 goto synth_instructions_out;
1507 }
1508 if (spe->synth_opts.period > 1)
1509 pr_warning("Arm SPE has a hardware-based sample period.\n"
1510 "Additional instruction events will be discarded by --itrace\n");
1511
1512 spe->sample_instructions = true;
1513 attr.config = PERF_COUNT_HW_INSTRUCTIONS;
1514 attr.sample_period = spe->synth_opts.period;
1515 spe->instructions_sample_period = attr.sample_period;
1516 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1517 if (err)
1518 return err;
1519 spe->instructions_id = id;
1520 arm_spe_set_event_name(evlist, id, "instructions");
1521 }
1522 synth_instructions_out:
1523
1524 return 0;
1525 }
1526
1527 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
1528 {
1529 u64 midr;
1530 int i;
1531
1532 if (!nr_cpu)
1533 return false;
1534
1535 for (i = 0; i < nr_cpu; i++) {
1536 if (!metadata[i])
1537 return false;
1538
1539 if (i == 0) {
1540 midr = metadata[i][ARM_SPE_CPU_MIDR];
1541 continue;
1542 }
1543
1544 if (midr != metadata[i][ARM_SPE_CPU_MIDR])
1545 return false;
1546 }
1547
1548 return true;
1549 }
1550
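/*
 * Entry point for PERF_RECORD_AUXTRACE_INFO: parse the SPE metadata, set
 * up the decoding context and register the auxtrace callbacks on the
 * session.
 */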
1551 int arm_spe_process_auxtrace_info(union perf_event *event,
1552 struct perf_session *session)
1553 {
1554 struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1555 size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
1556 struct perf_record_time_conv *tc = &session->time_conv;
1557 struct arm_spe *spe;
1558 u64 **metadata = NULL;
1559 u64 metadata_ver;
1560 int nr_cpu, err;
1561
1562 if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
1563 min_sz)
1564 return -EINVAL;
1565
1566 metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
1567 &nr_cpu);
1568 if (!metadata && metadata_ver != 1) {
1569 pr_err("Failed to parse Arm SPE metadata.\n");
1570 return -EINVAL;
1571 }
1572
1573 spe = zalloc(sizeof(struct arm_spe));
1574 if (!spe) {
1575 err = -ENOMEM;
1576 goto err_free_metadata;
1577 }
1578
1579 err = auxtrace_queues__init(&spe->queues);
1580 if (err)
1581 goto err_free;
1582
1583 spe->session = session;
1584 spe->machine = &session->machines.host; /* No kvm support */
1585 spe->auxtrace_type = auxtrace_info->type;
1586 if (metadata_ver == 1)
1587 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
1588 else
1589 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
1590 spe->metadata = metadata;
1591 spe->metadata_ver = metadata_ver;
1592 spe->metadata_nr_cpu = nr_cpu;
1593 spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);
1594
1595 spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
1596
1597 /*
1598 * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead
1599 * and the parameters for hardware clock are stored in the session
1600 * context. Passes these parameters to the struct perf_tsc_conversion
1601 * in "spe->tc", which is used for later conversion between clock
1602 * counter and timestamp.
1603 *
1604 * For backward compatibility, copies the fields starting from
1605 * "time_cycles" only if they are contained in the event.
1606 */
1607 spe->tc.time_shift = tc->time_shift;
1608 spe->tc.time_mult = tc->time_mult;
1609 spe->tc.time_zero = tc->time_zero;
1610
1611 if (event_contains(*tc, time_cycles)) {
1612 spe->tc.time_cycles = tc->time_cycles;
1613 spe->tc.time_mask = tc->time_mask;
1614 spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
1615 spe->tc.cap_user_time_short = tc->cap_user_time_short;
1616 }
1617
1618 spe->auxtrace.process_event = arm_spe_process_event;
1619 spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
1620 spe->auxtrace.flush_events = arm_spe_flush;
1621 spe->auxtrace.free_events = arm_spe_free_events;
1622 spe->auxtrace.free = arm_spe_free;
1623 spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
1624 session->auxtrace = &spe->auxtrace;
1625
1626 arm_spe_print_info(spe, &auxtrace_info->priv[0]);
1627
1628 if (dump_trace)
1629 return 0;
1630
1631 if (session->itrace_synth_opts && session->itrace_synth_opts->set)
1632 spe->synth_opts = *session->itrace_synth_opts;
1633 else
1634 itrace_synth_opts__set_default(&spe->synth_opts, false);
1635
1636 err = arm_spe_synth_events(spe, session);
1637 if (err)
1638 goto err_free_queues;
1639
1640 err = auxtrace_queues__process_index(&spe->queues, session);
1641 if (err)
1642 goto err_free_queues;
1643
1644 if (spe->queues.populated)
1645 spe->data_queued = true;
1646
1647 return 0;
1648
1649 err_free_queues:
1650 auxtrace_queues__free(&spe->queues);
1651 session->auxtrace = NULL;
1652 err_free:
1653 free(spe);
1654 err_free_metadata:
1655 arm_spe__free_metadata(metadata, nr_cpu);
1656 return err;
1657 }
1658