1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Arm Statistical Profiling Extensions (SPE) support
4 * Copyright (c) 2017-2018, Arm Ltd.
5 */
6
7 #include <byteswap.h>
8 #include <endian.h>
9 #include <errno.h>
10 #include <inttypes.h>
11 #include <linux/bitops.h>
12 #include <linux/kernel.h>
13 #include <linux/log2.h>
14 #include <linux/types.h>
15 #include <linux/zalloc.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18
19 #include "auxtrace.h"
20 #include "color.h"
21 #include "debug.h"
22 #include "evlist.h"
23 #include "evsel.h"
24 #include "machine.h"
25 #include "session.h"
26 #include "symbol.h"
27 #include "thread.h"
28 #include "thread-stack.h"
29 #include "tsc.h"
30 #include "tool.h"
31 #include "util/synthetic-events.h"
32
33 #include "arm-spe.h"
34 #include "arm-spe-decoder/arm-spe-decoder.h"
35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
36
37 #include "../../arch/arm64/include/asm/cputype.h"
38 #define MAX_TIMESTAMP (~0ULL)
39
40 #define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST))
41
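/*
 * Helpers for testing a record's event type against the access/miss bits of
 * a given cache level (L1D, L2D, LLC); a "hit" means the access bit is set
 * while the corresponding miss bit is clear.
 */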
42 #define ARM_SPE_CACHE_EVENT(lvl) \
43 (ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS)
44
45 #define arm_spe_is_cache_level(type, lvl) \
46 ((type) & ARM_SPE_CACHE_EVENT(lvl))
47
48 #define arm_spe_is_cache_hit(type, lvl) \
49 (((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS)
50
51 #define arm_spe_is_cache_miss(type, lvl) \
52 ((type) & ARM_SPE_##lvl##_MISS)
53
54 struct arm_spe {
55 struct auxtrace auxtrace;
56 struct auxtrace_queues queues;
57 struct auxtrace_heap heap;
58 struct itrace_synth_opts synth_opts;
59 u32 auxtrace_type;
60 struct perf_session *session;
61 struct machine *machine;
62 u32 pmu_type;
63
64 struct perf_tsc_conversion tc;
65
66 u8 timeless_decoding;
67 u8 data_queued;
68
69 u64 sample_type;
70 u8 sample_flc;
71 u8 sample_llc;
72 u8 sample_tlb;
73 u8 sample_branch;
74 u8 sample_remote_access;
75 u8 sample_memory;
76 u8 sample_instructions;
77
78 u64 l1d_miss_id;
79 u64 l1d_access_id;
80 u64 llc_miss_id;
81 u64 llc_access_id;
82 u64 tlb_miss_id;
83 u64 tlb_access_id;
84 u64 branch_id;
85 u64 remote_access_id;
86 u64 memory_id;
87 u64 instructions_id;
88
89 u64 kernel_start;
90
91 unsigned long num_events;
92 u8 use_ctx_pkt_for_pid;
93
94 u64 **metadata;
95 u64 metadata_ver;
96 u64 metadata_nr_cpu;
97 bool is_homogeneous;
98 };
99
100 struct arm_spe_queue {
101 struct arm_spe *spe;
102 unsigned int queue_nr;
103 struct auxtrace_buffer *buffer;
104 struct auxtrace_buffer *old_buffer;
105 union perf_event *event_buf;
106 bool on_heap;
107 bool done;
108 pid_t pid;
109 pid_t tid;
110 int cpu;
111 struct arm_spe_decoder *decoder;
112 u64 time;
113 u64 timestamp;
114 struct thread *thread;
115 u64 sample_count;
116 u32 flags;
117 struct branch_stack *last_branch;
118 };
119
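/*
 * Maps a set of CPU MIDR ranges to the callback that decodes the
 * implementation-defined data source field for those CPUs.
 */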
120 struct data_source_handle {
121 const struct midr_range *midr_ranges;
122 void (*ds_synth)(const struct arm_spe_record *record,
123 union perf_mem_data_src *data_src);
124 };
125
126 #define DS(range, func) \
127 { \
128 .midr_ranges = range, \
129 .ds_synth = arm_spe__synth_##func, \
130 }
131
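/* Hex-dump raw SPE trace data with a decoded description per packet. */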
static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
			 unsigned char *buf, size_t len)
134 {
135 struct arm_spe_pkt packet;
136 size_t pos = 0;
137 int ret, pkt_len, i;
138 char desc[ARM_SPE_PKT_DESC_MAX];
139 const char *color = PERF_COLOR_BLUE;
140
141 color_fprintf(stdout, color,
142 ". ... ARM SPE data: size %#zx bytes\n",
143 len);
144
145 while (len) {
146 ret = arm_spe_get_packet(buf, len, &packet);
147 if (ret > 0)
148 pkt_len = ret;
149 else
150 pkt_len = 1;
151 printf(".");
152 color_fprintf(stdout, color, " %08zx: ", pos);
153 for (i = 0; i < pkt_len; i++)
154 color_fprintf(stdout, color, " %02x", buf[i]);
155 for (; i < 16; i++)
156 color_fprintf(stdout, color, " ");
157 if (ret > 0) {
158 ret = arm_spe_pkt_desc(&packet, desc,
159 ARM_SPE_PKT_DESC_MAX);
160 if (!ret)
161 color_fprintf(stdout, color, " %s\n", desc);
162 } else {
163 color_fprintf(stdout, color, " Bad packet!\n");
164 }
165 pos += pkt_len;
166 buf += pkt_len;
167 len -= pkt_len;
168 }
169 }
170
static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
			       size_t len)
173 {
174 printf(".\n");
175 arm_spe_dump(spe, buf, len);
176 }
177
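/*
 * Decoder callback: fetch the next auxtrace buffer for this queue and hand
 * its data to the SPE decoder; empty buffers are dropped and skipped.
 */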
static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
179 {
180 struct arm_spe_queue *speq = data;
181 struct auxtrace_buffer *buffer = speq->buffer;
182 struct auxtrace_buffer *old_buffer = speq->old_buffer;
183 struct auxtrace_queue *queue;
184
185 queue = &speq->spe->queues.queue_array[speq->queue_nr];
186
187 buffer = auxtrace_buffer__next(queue, buffer);
188 /* If no more data, drop the previous auxtrace_buffer and return */
189 if (!buffer) {
190 if (old_buffer)
191 auxtrace_buffer__drop_data(old_buffer);
192 b->len = 0;
193 return 0;
194 }
195
196 speq->buffer = buffer;
197
198 /* If the aux_buffer doesn't have data associated, try to load it */
199 if (!buffer->data) {
200 /* get the file desc associated with the perf data file */
201 int fd = perf_data__fd(speq->spe->session->data);
202
203 buffer->data = auxtrace_buffer__get_data(buffer, fd);
204 if (!buffer->data)
205 return -ENOMEM;
206 }
207
208 b->len = buffer->size;
209 b->buf = buffer->data;
210
211 if (b->len) {
212 if (old_buffer)
213 auxtrace_buffer__drop_data(old_buffer);
214 speq->old_buffer = buffer;
215 } else {
216 auxtrace_buffer__drop_data(buffer);
217 return arm_spe_get_trace(b, data);
218 }
219
220 return 0;
221 }
222
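/* Allocate and initialise a per-queue decode context and its decoder. */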
static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
						  unsigned int queue_nr)
225 {
226 struct arm_spe_params params = { .get_trace = 0, };
227 struct arm_spe_queue *speq;
228
229 speq = zalloc(sizeof(*speq));
230 if (!speq)
231 return NULL;
232
233 speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
234 if (!speq->event_buf)
235 goto out_free;
236
237 speq->spe = spe;
238 speq->queue_nr = queue_nr;
239 speq->pid = -1;
240 speq->tid = -1;
241 speq->cpu = -1;
242
	/* Set up the decoder callback parameters */
244 params.get_trace = arm_spe_get_trace;
245 params.data = speq;
246
247 if (spe->synth_opts.last_branch) {
248 size_t sz = sizeof(struct branch_stack);
249
250 /* Allocate up to two entries for PBT + TGT */
251 sz += sizeof(struct branch_entry) *
252 min(spe->synth_opts.last_branch_sz, 2U);
253 speq->last_branch = zalloc(sz);
254 if (!speq->last_branch)
255 goto out_free;
256 }
257
258 /* create new decoder */
	speq->decoder = arm_spe_decoder_new(&params);
260 if (!speq->decoder)
261 goto out_free;
262
263 return speq;
264
265 out_free:
266 zfree(&speq->event_buf);
267 zfree(&speq->last_branch);
268 free(speq);
269
270 return NULL;
271 }
272
static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
274 {
275 return ip >= spe->kernel_start ?
276 PERF_RECORD_MISC_KERNEL :
277 PERF_RECORD_MISC_USER;
278 }
279
static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
				    struct auxtrace_queue *queue)
282 {
283 struct arm_spe_queue *speq = queue->priv;
284 pid_t tid;
285
286 tid = machine__get_current_tid(spe->machine, speq->cpu);
287 if (tid != -1) {
288 speq->tid = tid;
289 thread__zput(speq->thread);
290 } else
291 speq->tid = queue->tid;
292
293 if ((!speq->thread) && (speq->tid != -1)) {
294 speq->thread = machine__find_thread(spe->machine, -1,
295 speq->tid);
296 }
297
298 if (speq->thread) {
299 speq->pid = thread__pid(speq->thread);
300 if (queue->cpu == -1)
301 speq->cpu = thread__cpu(speq->thread);
302 }
303 }
304
static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
306 {
307 struct arm_spe *spe = speq->spe;
308 int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
309
310 if (err)
311 return err;
312
313 arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
314
315 return 0;
316 }
317
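/* Look up the per-CPU metadata recorded for the given CPU, if any. */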
static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu)
319 {
320 u64 i;
321
322 if (!spe->metadata)
323 return NULL;
324
325 /* CPU ID is -1 for per-thread mode */
326 if (cpu < 0) {
		/*
		 * On a heterogeneous system, a CPU ID of -1 means we cannot
		 * confirm whether the data source packet is supported.
		 */
331 if (!spe->is_homogeneous)
332 return NULL;
333
334 /* In homogeneous system, simply use CPU0's metadata */
335 return spe->metadata[0];
336 }
337
338 for (i = 0; i < spe->metadata_nr_cpu; i++)
339 if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu)
340 return spe->metadata[i];
341
342 return NULL;
343 }
344
static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
346 {
347 struct simd_flags simd_flags = {};
348
349 if ((record->op & ARM_SPE_OP_LDST) && (record->op & ARM_SPE_OP_SVE_LDST))
350 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
351
352 if ((record->op & ARM_SPE_OP_OTHER) && (record->op & ARM_SPE_OP_SVE_OTHER))
353 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
354
355 if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
356 simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;
357
358 if (record->type & ARM_SPE_SVE_EMPTY_PRED)
359 simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;
360
361 return simd_flags;
362 }
363
static void arm_spe_prep_sample(struct arm_spe *spe,
				struct arm_spe_queue *speq,
				union perf_event *event,
				struct perf_sample *sample)
368 {
369 struct arm_spe_record *record = &speq->decoder->record;
370
371 if (!spe->timeless_decoding)
372 sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);
373
374 sample->ip = record->from_ip;
375 sample->cpumode = arm_spe_cpumode(spe, sample->ip);
376 sample->pid = speq->pid;
377 sample->tid = speq->tid;
378 sample->period = spe->synth_opts.period;
379 sample->cpu = speq->cpu;
380 sample->simd_flags = arm_spe__synth_simd_flags(record);
381
382 event->sample.header.type = PERF_RECORD_SAMPLE;
383 event->sample.header.misc = sample->cpumode;
384 event->sample.header.size = sizeof(struct perf_event_header);
385 }
386
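/*
 * Fill the synthetic branch stack with up to two entries: the sampled
 * branch itself (TGT) and the previous branch target (PBT), if recorded.
 */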
static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
388 {
389 struct arm_spe *spe = speq->spe;
390 struct arm_spe_record *record = &speq->decoder->record;
391 struct branch_stack *bstack = speq->last_branch;
392 struct branch_flags *bs_flags;
393 unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
394 bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
395 bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
396 size_t sz = sizeof(struct branch_stack) +
397 sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
398 int i = 0;
399
400 /* Clean up branch stack */
401 memset(bstack, 0x0, sz);
402
403 if (!have_tgt && !have_pbt)
404 return;
405
406 if (have_tgt) {
407 bstack->entries[i].from = record->from_ip;
408 bstack->entries[i].to = record->to_ip;
409
410 bs_flags = &bstack->entries[i].flags;
411 bs_flags->value = 0;
412
413 if (record->op & ARM_SPE_OP_BR_CR_BL) {
414 if (record->op & ARM_SPE_OP_BR_COND)
415 bs_flags->type |= PERF_BR_COND_CALL;
416 else
417 bs_flags->type |= PERF_BR_CALL;
		/*
		 * An indirect branch instruction without link (e.g. BR) is
		 * treated as a function return.
		 */
422 } else if (record->op & ARM_SPE_OP_BR_CR_RET ||
423 record->op & ARM_SPE_OP_BR_INDIRECT) {
424 if (record->op & ARM_SPE_OP_BR_COND)
425 bs_flags->type |= PERF_BR_COND_RET;
426 else
427 bs_flags->type |= PERF_BR_RET;
428 } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
429 if (record->op & ARM_SPE_OP_BR_COND)
430 bs_flags->type |= PERF_BR_COND;
431 else
432 bs_flags->type |= PERF_BR_UNCOND;
433 } else {
434 if (record->op & ARM_SPE_OP_BR_COND)
435 bs_flags->type |= PERF_BR_COND;
436 else
437 bs_flags->type |= PERF_BR_UNKNOWN;
438 }
439
440 if (record->type & ARM_SPE_BRANCH_MISS) {
441 bs_flags->mispred = 1;
442 bs_flags->predicted = 0;
443 } else {
444 bs_flags->mispred = 0;
445 bs_flags->predicted = 1;
446 }
447
448 if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
449 bs_flags->not_taken = 1;
450
451 if (record->type & ARM_SPE_IN_TXN)
452 bs_flags->in_tx = 1;
453
454 bs_flags->cycles = min(record->latency, 0xFFFFU);
455 i++;
456 }
457
458 if (have_pbt) {
459 bs_flags = &bstack->entries[i].flags;
460 bs_flags->type |= PERF_BR_UNKNOWN;
461 bstack->entries[i].to = record->prev_br_tgt;
462 i++;
463 }
464
465 bstack->nr = i;
466 bstack->hw_idx = -1ULL;
467 }
468
static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
470 {
471 event->header.size = perf_event__sample_event_size(sample, type, 0);
472 return perf_event__synthesize_sample(event, type, 0, sample);
473 }
474
475 static inline int
arm_spe_deliver_synth_event(struct arm_spe *spe,
			    struct arm_spe_queue *speq __maybe_unused,
			    union perf_event *event,
			    struct perf_sample *sample)
480 {
481 int ret;
482
483 if (spe->synth_opts.inject) {
484 ret = arm_spe__inject_event(event, sample, spe->sample_type);
485 if (ret)
486 return ret;
487 }
488
489 ret = perf_session__deliver_synth_event(spe->session, event, sample);
490 if (ret)
491 pr_err("ARM SPE: failed to deliver event, error %d\n", ret);
492
493 return ret;
494 }
495
static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
				     u64 spe_events_id,
				     union perf_mem_data_src data_src)
499 {
500 struct arm_spe *spe = speq->spe;
501 struct arm_spe_record *record = &speq->decoder->record;
502 union perf_event *event = speq->event_buf;
503 struct perf_sample sample;
504 int ret;
505
506 perf_sample__init(&sample, /*all=*/true);
507 arm_spe_prep_sample(spe, speq, event, &sample);
508
509 sample.id = spe_events_id;
510 sample.stream_id = spe_events_id;
511 sample.addr = record->virt_addr;
512 sample.phys_addr = record->phys_addr;
513 sample.data_src = data_src.val;
514 sample.weight = record->latency;
515
516 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
517 perf_sample__exit(&sample);
518 return ret;
519 }
520
static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
					u64 spe_events_id)
523 {
524 struct arm_spe *spe = speq->spe;
525 struct arm_spe_record *record = &speq->decoder->record;
526 union perf_event *event = speq->event_buf;
527 struct perf_sample sample;
528 int ret;
529
530 perf_sample__init(&sample, /*all=*/true);
531 arm_spe_prep_sample(spe, speq, event, &sample);
532
533 sample.id = spe_events_id;
534 sample.stream_id = spe_events_id;
535 sample.addr = record->to_ip;
536 sample.weight = record->latency;
537 sample.flags = speq->flags;
538 sample.branch_stack = speq->last_branch;
539
540 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
541 perf_sample__exit(&sample);
542 return ret;
543 }
544
static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
					     u64 spe_events_id,
					     union perf_mem_data_src data_src)
548 {
549 struct arm_spe *spe = speq->spe;
550 struct arm_spe_record *record = &speq->decoder->record;
551 union perf_event *event = speq->event_buf;
552 struct perf_sample sample;
553 int ret;
554
555 perf_sample__init(&sample, /*all=*/true);
556 arm_spe_prep_sample(spe, speq, event, &sample);
557
558 sample.id = spe_events_id;
559 sample.stream_id = spe_events_id;
560 sample.addr = record->to_ip;
561 sample.phys_addr = record->phys_addr;
562 sample.data_src = data_src.val;
563 sample.weight = record->latency;
564 sample.flags = speq->flags;
565 sample.branch_stack = speq->last_branch;
566
567 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
568 perf_sample__exit(&sample);
569 return ret;
570 }
571
572 static const struct midr_range common_ds_encoding_cpus[] = {
573 MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
574 MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
575 MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
576 MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
577 MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
578 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
579 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
580 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
581 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
582 {},
583 };
584
585 static const struct midr_range ampereone_ds_encoding_cpus[] = {
586 MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
587 {},
588 };
589
590 static const struct midr_range hisi_hip_ds_encoding_cpus[] = {
591 MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
592 {},
593 };
594
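/* Translate the record's branch operation and event bits into perf IP flags. */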
static void arm_spe__sample_flags(struct arm_spe_queue *speq)
596 {
597 const struct arm_spe_record *record = &speq->decoder->record;
598
599 speq->flags = 0;
600 if (record->op & ARM_SPE_OP_BRANCH_ERET) {
601 speq->flags = PERF_IP_FLAG_BRANCH;
602
603 if (record->type & ARM_SPE_BRANCH_MISS)
604 speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
605
606 if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
607 speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
608
609 if (record->type & ARM_SPE_IN_TXN)
610 speq->flags |= PERF_IP_FLAG_IN_TX;
611
612 if (record->op & ARM_SPE_OP_BR_COND)
613 speq->flags |= PERF_IP_FLAG_CONDITIONAL;
614
615 if (record->op & ARM_SPE_OP_BR_CR_BL)
616 speq->flags |= PERF_IP_FLAG_CALL;
617 else if (record->op & ARM_SPE_OP_BR_CR_RET)
618 speq->flags |= PERF_IP_FLAG_RETURN;
		/*
		 * An indirect branch instruction without link (e.g. BR) is
		 * taken as a function return.
		 */
623 else if (record->op & ARM_SPE_OP_BR_INDIRECT)
624 speq->flags |= PERF_IP_FLAG_RETURN;
625 }
626 }
627
static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
					      union perf_mem_data_src *data_src)
630 {
	/*
	 * Even though four levels of cache hierarchy are possible, no known
	 * production Neoverse systems currently include more than three
	 * levels, so for the time being we assume three exist. If a
	 * production system is built with four, this function would have to
	 * be changed to detect the number of levels for reporting.
	 */
638
639 /*
640 * We have no data on the hit level or data source for stores in the
641 * Neoverse SPE records.
642 */
643 if (record->op & ARM_SPE_OP_ST) {
644 data_src->mem_lvl = PERF_MEM_LVL_NA;
645 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
646 data_src->mem_snoop = PERF_MEM_SNOOP_NA;
647 return;
648 }
649
650 switch (record->source) {
651 case ARM_SPE_COMMON_DS_L1D:
652 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
653 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
654 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
655 break;
656 case ARM_SPE_COMMON_DS_L2:
657 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
658 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
659 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
660 break;
661 case ARM_SPE_COMMON_DS_PEER_CORE:
662 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
663 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
664 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
665 break;
	/*
	 * We don't know if this hit in L1 or L2, but we do know it was a
	 * cache-to-cache transfer, so set SNOOPX_PEER.
	 */
670 case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
671 case ARM_SPE_COMMON_DS_PEER_CLUSTER:
672 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
673 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
674 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
675 break;
676 /*
677 * System cache is assumed to be L3
678 */
679 case ARM_SPE_COMMON_DS_SYS_CACHE:
680 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
681 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
682 data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
683 break;
684 /*
685 * We don't know what level it hit in, except it came from the other
686 * socket
687 */
688 case ARM_SPE_COMMON_DS_REMOTE:
689 data_src->mem_lvl = PERF_MEM_LVL_NA;
690 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
691 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
692 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
693 break;
694 case ARM_SPE_COMMON_DS_DRAM:
695 data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
696 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
697 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
698 break;
699 default:
700 break;
701 }
702 }
703
/*
 * Source is IMPDEF. Here we convert the source encoding used on AmpereOne
 * cores to the common encoding (Neoverse, Cortex) to avoid duplicating the
 * decoding code.
 */
static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
						 union perf_mem_data_src *data_src)
710 {
711 struct arm_spe_record common_record;
712
713 switch (record->source) {
714 case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
715 common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
716 break;
717 case ARM_SPE_AMPEREONE_SLC:
718 common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
719 break;
720 case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
721 common_record.source = ARM_SPE_COMMON_DS_REMOTE;
722 break;
723 case ARM_SPE_AMPEREONE_DDR:
724 common_record.source = ARM_SPE_COMMON_DS_DRAM;
725 break;
726 case ARM_SPE_AMPEREONE_L1D:
727 common_record.source = ARM_SPE_COMMON_DS_L1D;
728 break;
729 case ARM_SPE_AMPEREONE_L2D:
730 common_record.source = ARM_SPE_COMMON_DS_L2;
731 break;
732 default:
733 pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
734 record->source);
735 return;
736 }
737
738 common_record.op = record->op;
739 arm_spe__synth_data_source_common(&common_record, data_src);
740 }
741
static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record,
						union perf_mem_data_src *data_src)
744 {
745 /* Use common synthesis method to handle store operations */
746 if (record->op & ARM_SPE_OP_ST) {
747 arm_spe__synth_data_source_common(record, data_src);
748 return;
749 }
750
751 switch (record->source) {
752 case ARM_SPE_HISI_HIP_PEER_CPU:
753 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
754 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
755 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
756 break;
757 case ARM_SPE_HISI_HIP_PEER_CPU_HITM:
758 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
759 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
760 data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
761 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
762 break;
763 case ARM_SPE_HISI_HIP_L3:
764 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
765 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
766 data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
767 break;
768 case ARM_SPE_HISI_HIP_L3_HITM:
769 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
770 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
771 data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
772 break;
773 case ARM_SPE_HISI_HIP_PEER_CLUSTER:
774 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
775 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
776 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
777 break;
778 case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM:
779 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
780 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
781 data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
782 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
783 break;
784 case ARM_SPE_HISI_HIP_REMOTE_SOCKET:
785 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
786 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
787 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
788 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
789 break;
790 case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM:
791 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
792 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
793 data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
794 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
795 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
796 break;
797 case ARM_SPE_HISI_HIP_LOCAL_MEM:
798 data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
799 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
800 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
801 break;
802 case ARM_SPE_HISI_HIP_REMOTE_MEM:
803 data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
804 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
805 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
806 break;
807 case ARM_SPE_HISI_HIP_NC_DEV:
808 data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT;
809 data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
810 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
811 break;
812 case ARM_SPE_HISI_HIP_L2:
813 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
814 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
815 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
816 break;
817 case ARM_SPE_HISI_HIP_L2_HITM:
818 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
819 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
820 data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
821 break;
822 case ARM_SPE_HISI_HIP_L1:
823 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
824 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
825 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
826 break;
827 default:
828 break;
829 }
830 }
831
832 static const struct data_source_handle data_source_handles[] = {
833 DS(common_ds_encoding_cpus, data_source_common),
834 DS(ampereone_ds_encoding_cpus, data_source_ampereone),
835 DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
836 };
837
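/* Derive the cache level for a load from the L1D/L2D/LLC access and miss events. */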
static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record,
					   union perf_mem_data_src *data_src)
840 {
841 /*
842 * To find a cache hit, search in ascending order from the lower level
843 * caches to the higher level caches. This reflects the best scenario
844 * for a cache hit.
845 */
846 if (arm_spe_is_cache_hit(record->type, L1D)) {
847 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
848 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
849 } else if (record->type & ARM_SPE_RECENTLY_FETCHED) {
850 data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
851 data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB;
852 } else if (arm_spe_is_cache_hit(record->type, L2D)) {
853 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
854 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
855 } else if (arm_spe_is_cache_hit(record->type, LLC)) {
856 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
857 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
858 /*
859 * To find a cache miss, search in descending order from the higher
860 * level cache to the lower level cache. This represents the worst
861 * scenario for a cache miss.
862 */
863 } else if (arm_spe_is_cache_miss(record->type, LLC)) {
864 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS;
865 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
866 } else if (arm_spe_is_cache_miss(record->type, L2D)) {
867 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS;
868 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
869 } else if (arm_spe_is_cache_miss(record->type, L1D)) {
870 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
871 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
872 }
873 }
874
static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record,
					   union perf_mem_data_src *data_src)
877 {
878 /* Record the greatest level info for a store operation. */
879 if (arm_spe_is_cache_level(record->type, LLC)) {
880 data_src->mem_lvl = PERF_MEM_LVL_L3;
881 data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ?
882 PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
883 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
884 } else if (arm_spe_is_cache_level(record->type, L2D)) {
885 data_src->mem_lvl = PERF_MEM_LVL_L2;
886 data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ?
887 PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
888 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
889 } else if (arm_spe_is_cache_level(record->type, L1D)) {
890 data_src->mem_lvl = PERF_MEM_LVL_L1;
891 data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ?
892 PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
893 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
894 }
895 }
896
static void arm_spe__synth_memory_level(struct arm_spe_queue *speq,
					const struct arm_spe_record *record,
					union perf_mem_data_src *data_src)
900 {
901 struct arm_spe *spe = speq->spe;
902
	/*
	 * The data source packet contains more detailed cache level info for
	 * peer snooping, so respect the memory level if it has already been
	 * set by data source parsing.
	 */
908 if (!data_src->mem_lvl) {
909 if (data_src->mem_op == PERF_MEM_OP_LOAD)
910 arm_spe__synth_ld_memory_level(record, data_src);
911 if (data_src->mem_op == PERF_MEM_OP_STORE)
912 arm_spe__synth_st_memory_level(record, data_src);
913 }
914
915 if (!data_src->mem_lvl) {
916 data_src->mem_lvl = PERF_MEM_LVL_NA;
917 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
918 }
919
	/*
	 * If 'mem_snoop' has already been set by the data source packet,
	 * don't override it here.
	 */
924 if (!data_src->mem_snoop) {
925 if (record->type & ARM_SPE_DATA_SNOOPED) {
926 if (record->type & ARM_SPE_HITM)
927 data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
928 else
929 data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
930 } else {
931 u64 *metadata =
932 arm_spe__get_metadata_by_cpu(spe, speq->cpu);
933
			/*
			 * Set NA ("Not available") mode if there is no
			 * metadata or the SNOOPED event is not supported.
			 */
938 if (!metadata ||
939 !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED))
940 data_src->mem_snoop = PERF_MEM_SNOOP_NA;
941 else
942 data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
943 }
944 }
945
946 if (!data_src->mem_remote) {
947 if (record->type & ARM_SPE_REMOTE_ACCESS)
948 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
949 }
950 }
951
static void arm_spe__synth_ds(struct arm_spe_queue *speq,
			      const struct arm_spe_record *record,
			      union perf_mem_data_src *data_src)
955 {
956 struct arm_spe *spe = speq->spe;
957 u64 *metadata = NULL;
958 u64 midr;
959 unsigned int i;
960
961 /* Metadata version 1 assumes all CPUs are the same (old behavior) */
962 if (spe->metadata_ver == 1) {
963 const char *cpuid;
964
965 pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
966 cpuid = perf_env__cpuid(perf_session__env(spe->session));
967 midr = strtol(cpuid, NULL, 16);
968 } else {
969 metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
970 if (!metadata)
971 return;
972
973 midr = metadata[ARM_SPE_CPU_MIDR];
974 }
975
976 for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
977 if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
978 return data_source_handles[i].ds_synth(record, data_src);
979 }
980 }
981
982 return;
983 }
984
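/*
 * Build the perf_mem_data_src descriptor for a load/store record from the
 * data source packet (if decodable for this CPU), the cache events and the
 * TLB events.
 */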
985 static union perf_mem_data_src
arm_spe__synth_data_source(struct arm_spe_queue *speq,
			   const struct arm_spe_record *record)
988 {
989 union perf_mem_data_src data_src = {};
990
991 /* Only synthesize data source for LDST operations */
992 if (!is_ldst_op(record->op))
993 return data_src;
994
995 if (record->op & ARM_SPE_OP_LD)
996 data_src.mem_op = PERF_MEM_OP_LOAD;
997 else if (record->op & ARM_SPE_OP_ST)
998 data_src.mem_op = PERF_MEM_OP_STORE;
999 else
1000 return data_src;
1001
1002 arm_spe__synth_ds(speq, record, &data_src);
1003 arm_spe__synth_memory_level(speq, record, &data_src);
1004
1005 if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
1006 data_src.mem_dtlb = PERF_MEM_TLB_WK;
1007
1008 if (record->type & ARM_SPE_TLB_MISS)
1009 data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
1010 else
1011 data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
1012 }
1013
1014 return data_src;
1015 }
1016
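/*
 * Synthesize the requested perf samples (cache, TLB, branch, memory,
 * instruction) for the current decoded record, honouring the --itrace
 * period by counting records.
 */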
static int arm_spe_sample(struct arm_spe_queue *speq)
1018 {
1019 const struct arm_spe_record *record = &speq->decoder->record;
1020 struct arm_spe *spe = speq->spe;
1021 union perf_mem_data_src data_src;
1022 int err;
1023
1024 /*
1025 * Discard all samples until period is reached
1026 */
1027 speq->sample_count++;
1028 if (speq->sample_count < spe->synth_opts.period)
1029 return 0;
1030 speq->sample_count = 0;
1031
1032 arm_spe__sample_flags(speq);
1033 data_src = arm_spe__synth_data_source(speq, record);
1034
1035 if (spe->sample_flc) {
1036 if (record->type & ARM_SPE_L1D_MISS) {
1037 err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
1038 data_src);
1039 if (err)
1040 return err;
1041 }
1042
1043 if (record->type & ARM_SPE_L1D_ACCESS) {
1044 err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
1045 data_src);
1046 if (err)
1047 return err;
1048 }
1049 }
1050
1051 if (spe->sample_llc) {
1052 if (record->type & ARM_SPE_LLC_MISS) {
1053 err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
1054 data_src);
1055 if (err)
1056 return err;
1057 }
1058
1059 if (record->type & ARM_SPE_LLC_ACCESS) {
1060 err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
1061 data_src);
1062 if (err)
1063 return err;
1064 }
1065 }
1066
1067 if (spe->sample_tlb) {
1068 if (record->type & ARM_SPE_TLB_MISS) {
1069 err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
1070 data_src);
1071 if (err)
1072 return err;
1073 }
1074
1075 if (record->type & ARM_SPE_TLB_ACCESS) {
1076 err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
1077 data_src);
1078 if (err)
1079 return err;
1080 }
1081 }
1082
1083 if (spe->synth_opts.last_branch &&
1084 (spe->sample_branch || spe->sample_instructions))
1085 arm_spe__prep_branch_stack(speq);
1086
1087 if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
1088 err = arm_spe__synth_branch_sample(speq, spe->branch_id);
1089 if (err)
1090 return err;
1091 }
1092
1093 if (spe->sample_remote_access &&
1094 (record->type & ARM_SPE_REMOTE_ACCESS)) {
1095 err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
1096 data_src);
1097 if (err)
1098 return err;
1099 }
1100
	/*
	 * When data_src is zero, the record is not a memory operation, so
	 * skip synthesizing a memory sample in that case.
	 */
1105 if (spe->sample_memory && is_ldst_op(record->op)) {
1106 err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
1107 if (err)
1108 return err;
1109 }
1110
1111 if (spe->sample_instructions) {
1112 err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
1113 if (err)
1114 return err;
1115 }
1116
1117 return 0;
1118 }
1119
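/*
 * Decode records for one queue, synthesizing samples as it goes, until the
 * data is exhausted or the queue's timestamp passes the given timestamp.
 */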
static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
1121 {
1122 struct arm_spe *spe = speq->spe;
1123 struct arm_spe_record *record;
1124 int ret;
1125
1126 if (!spe->kernel_start)
1127 spe->kernel_start = machine__kernel_start(spe->machine);
1128
1129 while (1) {
		/*
		 * The usual logic is first to decode the packets, and then
		 * based on the record to synthesize a sample; but here the
		 * flow is reversed: arm_spe_sample() is called to synthesize
		 * samples prior to arm_spe_decode().
		 *
		 * Two reasons for this code logic:
		 * 1. When setting up a queue in arm_spe__setup_queue(), the
		 * trace data has already been decoded and a record generated,
		 * but that record is left to generate a sample until running
		 * to here, so it's correct to synthesize a sample for the
		 * leftover record.
		 * 2. After decoding trace data, the record timestamp needs to
		 * be compared with the coming perf event; if the record
		 * timestamp is later than the perf event, bail out and push
		 * the record into the auxtrace heap, so sample synthesis for
		 * that record is deferred until the next time this function
		 * runs; this correlates samples between Arm SPE trace data
		 * and other perf events with correct time ordering.
		 */
1149
1150 /*
1151 * Update pid/tid info.
1152 */
1153 record = &speq->decoder->record;
1154 if (!spe->timeless_decoding && record->context_id != (u64)-1) {
1155 ret = arm_spe_set_tid(speq, record->context_id);
1156 if (ret)
1157 return ret;
1158
1159 spe->use_ctx_pkt_for_pid = true;
1160 }
1161
1162 ret = arm_spe_sample(speq);
1163 if (ret)
1164 return ret;
1165
1166 ret = arm_spe_decode(speq->decoder);
1167 if (!ret) {
1168 pr_debug("No data or all data has been processed.\n");
1169 return 1;
1170 }
1171
		/*
		 * If an error is detected while decoding the SPE trace data,
		 * continue with the next chunk of trace data to find more
		 * records.
		 */
1176 if (ret < 0)
1177 continue;
1178
1179 record = &speq->decoder->record;
1180
1181 /* Update timestamp for the last record */
1182 if (record->timestamp > speq->timestamp)
1183 speq->timestamp = record->timestamp;
1184
		/*
		 * If the timestamp of the queue is later than the timestamp
		 * of the coming perf event, bail out so that the perf event
		 * can be processed first.
		 */
1190 if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
1191 *timestamp = speq->timestamp;
1192 return 0;
1193 }
1194 }
1195
1196 return 0;
1197 }
1198
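/*
 * Create the decode context for a queue that has data and, for timed
 * decoding, prime the auxtrace heap with the first record's timestamp.
 */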
static int arm_spe__setup_queue(struct arm_spe *spe,
				struct auxtrace_queue *queue,
				unsigned int queue_nr)
1202 {
1203 struct arm_spe_queue *speq = queue->priv;
1204 struct arm_spe_record *record;
1205
1206 if (list_empty(&queue->head) || speq)
1207 return 0;
1208
1209 speq = arm_spe__alloc_queue(spe, queue_nr);
1210
1211 if (!speq)
1212 return -ENOMEM;
1213
1214 queue->priv = speq;
1215
1216 if (queue->cpu != -1)
1217 speq->cpu = queue->cpu;
1218
1219 if (!speq->on_heap) {
1220 int ret;
1221
1222 if (spe->timeless_decoding)
1223 return 0;
1224
1225 retry:
1226 ret = arm_spe_decode(speq->decoder);
1227
1228 if (!ret)
1229 return 0;
1230
1231 if (ret < 0)
1232 goto retry;
1233
1234 record = &speq->decoder->record;
1235
1236 speq->timestamp = record->timestamp;
1237 ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
1238 if (ret)
1239 return ret;
1240 speq->on_heap = true;
1241 }
1242
1243 return 0;
1244 }
1245
static int arm_spe__setup_queues(struct arm_spe *spe)
1247 {
1248 unsigned int i;
1249 int ret;
1250
1251 for (i = 0; i < spe->queues.nr_queues; i++) {
1252 ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
1253 if (ret)
1254 return ret;
1255 }
1256
1257 return 0;
1258 }
1259
static int arm_spe__update_queues(struct arm_spe *spe)
1261 {
1262 if (spe->queues.new_data) {
1263 spe->queues.new_data = false;
1264 return arm_spe__setup_queues(spe);
1265 }
1266
1267 return 0;
1268 }
1269
static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
1271 {
1272 struct evsel *evsel;
1273 struct evlist *evlist = spe->session->evlist;
1274 bool timeless_decoding = true;
1275
	/*
	 * Loop through the list of events and check whether any of them has
	 * the time bit set; timeless decoding can only be used if none do.
	 */
1280 evlist__for_each_entry(evlist, evsel) {
1281 if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
1282 timeless_decoding = false;
1283 }
1284
1285 return timeless_decoding;
1286 }
1287
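/* Process queued trace in timestamp order up to the given timestamp. */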
static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
1289 {
1290 unsigned int queue_nr;
1291 u64 ts;
1292 int ret;
1293
1294 while (1) {
1295 struct auxtrace_queue *queue;
1296 struct arm_spe_queue *speq;
1297
1298 if (!spe->heap.heap_cnt)
1299 return 0;
1300
1301 if (spe->heap.heap_array[0].ordinal >= timestamp)
1302 return 0;
1303
1304 queue_nr = spe->heap.heap_array[0].queue_nr;
1305 queue = &spe->queues.queue_array[queue_nr];
1306 speq = queue->priv;
1307
1308 auxtrace_heap__pop(&spe->heap);
1309
1310 if (spe->heap.heap_cnt) {
1311 ts = spe->heap.heap_array[0].ordinal + 1;
1312 if (ts > timestamp)
1313 ts = timestamp;
1314 } else {
1315 ts = timestamp;
1316 }
1317
1318 /*
1319 * A previous context-switch event has set pid/tid in the machine's context, so
1320 * here we need to update the pid/tid in the thread and SPE queue.
1321 */
1322 if (!spe->use_ctx_pkt_for_pid)
1323 arm_spe_set_pid_tid_cpu(spe, queue);
1324
1325 ret = arm_spe_run_decoder(speq, &ts);
1326 if (ret < 0) {
1327 auxtrace_heap__add(&spe->heap, queue_nr, ts);
1328 return ret;
1329 }
1330
1331 if (!ret) {
1332 ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
1333 if (ret < 0)
1334 return ret;
1335 } else {
1336 speq->on_heap = false;
1337 }
1338 }
1339
1340 return 0;
1341 }
1342
static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
					   u64 time_)
1345 {
1346 struct auxtrace_queues *queues = &spe->queues;
1347 unsigned int i;
1348 u64 ts = 0;
1349
1350 for (i = 0; i < queues->nr_queues; i++) {
1351 struct auxtrace_queue *queue = &spe->queues.queue_array[i];
1352 struct arm_spe_queue *speq = queue->priv;
1353
1354 if (speq && (tid == -1 || speq->tid == tid)) {
1355 speq->time = time_;
1356 arm_spe_set_pid_tid_cpu(spe, queue);
1357 arm_spe_run_decoder(speq, &ts);
1358 }
1359 }
1360 return 0;
1361 }
1362
static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
				  struct perf_sample *sample)
1365 {
1366 pid_t pid, tid;
1367 int cpu;
1368
1369 if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
1370 return 0;
1371
1372 pid = event->context_switch.next_prev_pid;
1373 tid = event->context_switch.next_prev_tid;
1374 cpu = sample->cpu;
1375
1376 if (tid == -1)
1377 pr_warning("context_switch event has no tid\n");
1378
1379 return machine__set_current_tid(spe->machine, cpu, pid, tid);
1380 }
1381
static int arm_spe_process_event(struct perf_session *session,
				 union perf_event *event,
				 struct perf_sample *sample,
				 const struct perf_tool *tool)
1386 {
1387 int err = 0;
1388 u64 timestamp;
1389 struct arm_spe *spe = container_of(session->auxtrace,
1390 struct arm_spe, auxtrace);
1391
1392 if (dump_trace)
1393 return 0;
1394
1395 if (!tool->ordered_events) {
1396 pr_err("SPE trace requires ordered events\n");
1397 return -EINVAL;
1398 }
1399
1400 if (sample->time && (sample->time != (u64) -1))
1401 timestamp = perf_time_to_tsc(sample->time, &spe->tc);
1402 else
1403 timestamp = 0;
1404
1405 if (timestamp || spe->timeless_decoding) {
1406 err = arm_spe__update_queues(spe);
1407 if (err)
1408 return err;
1409 }
1410
1411 if (spe->timeless_decoding) {
1412 if (event->header.type == PERF_RECORD_EXIT) {
1413 err = arm_spe_process_timeless_queues(spe,
1414 event->fork.tid,
1415 sample->time);
1416 }
1417 } else if (timestamp) {
1418 err = arm_spe_process_queues(spe, timestamp);
1419 if (err)
1420 return err;
1421
1422 if (!spe->use_ctx_pkt_for_pid &&
1423 (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
1424 event->header.type == PERF_RECORD_SWITCH))
1425 err = arm_spe_context_switch(spe, event, sample);
1426 }
1427
1428 return err;
1429 }
1430
static int arm_spe_process_auxtrace_event(struct perf_session *session,
					  union perf_event *event,
					  const struct perf_tool *tool __maybe_unused)
1434 {
1435 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1436 auxtrace);
1437
1438 if (!spe->data_queued) {
1439 struct auxtrace_buffer *buffer;
1440 off_t data_offset;
1441 int fd = perf_data__fd(session->data);
1442 int err;
1443
1444 if (perf_data__is_pipe(session->data)) {
1445 data_offset = 0;
1446 } else {
1447 data_offset = lseek(fd, 0, SEEK_CUR);
1448 if (data_offset == -1)
1449 return -errno;
1450 }
1451
1452 err = auxtrace_queues__add_event(&spe->queues, session, event,
1453 data_offset, &buffer);
1454 if (err)
1455 return err;
1456
		/* Dump here, now that we have copied a piped trace out of the pipe */
1458 if (dump_trace) {
1459 if (auxtrace_buffer__get_data(buffer, fd)) {
1460 arm_spe_dump_event(spe, buffer->data,
1461 buffer->size);
1462 auxtrace_buffer__put_data(buffer);
1463 }
1464 }
1465 }
1466
1467 return 0;
1468 }
1469
static int arm_spe_flush(struct perf_session *session __maybe_unused,
			 const struct perf_tool *tool __maybe_unused)
1472 {
1473 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1474 auxtrace);
1475 int ret;
1476
1477 if (dump_trace)
1478 return 0;
1479
1480 if (!tool->ordered_events)
1481 return -EINVAL;
1482
1483 ret = arm_spe__update_queues(spe);
1484 if (ret < 0)
1485 return ret;
1486
1487 if (spe->timeless_decoding)
1488 return arm_spe_process_timeless_queues(spe, -1,
1489 MAX_TIMESTAMP - 1);
1490
1491 ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
1492 if (ret)
1493 return ret;
1494
1495 if (!spe->use_ctx_pkt_for_pid)
1496 ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
1497 "Matching of TIDs to SPE events could be inaccurate.\n");
1498
1499 return 0;
1500 }
1501
static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
1503 {
1504 u64 *metadata;
1505
1506 metadata = zalloc(per_cpu_size);
1507 if (!metadata)
1508 return NULL;
1509
1510 memcpy(metadata, buf, per_cpu_size);
1511 return metadata;
1512 }
1513
static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
1515 {
1516 int i;
1517
1518 for (i = 0; i < nr_cpu; i++)
1519 zfree(&metadata[i]);
1520 free(metadata);
1521 }
1522
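/*
 * Parse the auxtrace info private data: version 1 has no per-CPU metadata;
 * later versions carry a header followed by a metadata block per CPU.
 */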
static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
				     u64 *ver, int *nr_cpu)
1525 {
1526 u64 *ptr = (u64 *)info->priv;
1527 u64 metadata_size;
1528 u64 **metadata = NULL;
1529 int hdr_sz, per_cpu_sz, i;
1530
1531 metadata_size = info->header.size -
1532 sizeof(struct perf_record_auxtrace_info);
1533
1534 /* Metadata version 1 */
1535 if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
1536 *ver = 1;
1537 *nr_cpu = 0;
1538 /* No per CPU metadata */
1539 return NULL;
1540 }
1541
1542 *ver = ptr[ARM_SPE_HEADER_VERSION];
1543 hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
1544 *nr_cpu = ptr[ARM_SPE_CPUS_NUM];
1545
1546 metadata = calloc(*nr_cpu, sizeof(*metadata));
1547 if (!metadata)
1548 return NULL;
1549
1550 /* Locate the start address of per CPU metadata */
1551 ptr += hdr_sz;
1552 per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
1553
1554 for (i = 0; i < *nr_cpu; i++) {
1555 metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
1556 if (!metadata[i])
1557 goto err_per_cpu_metadata;
1558
1559 ptr += per_cpu_sz / sizeof(u64);
1560 }
1561
1562 return metadata;
1563
1564 err_per_cpu_metadata:
1565 arm_spe__free_metadata(metadata, *nr_cpu);
1566 return NULL;
1567 }
1568
static void arm_spe_free_queue(void *priv)
1570 {
1571 struct arm_spe_queue *speq = priv;
1572
1573 if (!speq)
1574 return;
1575 thread__zput(speq->thread);
1576 arm_spe_decoder_free(speq->decoder);
1577 zfree(&speq->event_buf);
1578 zfree(&speq->last_branch);
1579 free(speq);
1580 }
1581
static void arm_spe_free_events(struct perf_session *session)
1583 {
1584 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1585 auxtrace);
1586 struct auxtrace_queues *queues = &spe->queues;
1587 unsigned int i;
1588
1589 for (i = 0; i < queues->nr_queues; i++) {
1590 arm_spe_free_queue(queues->queue_array[i].priv);
1591 queues->queue_array[i].priv = NULL;
1592 }
1593 auxtrace_queues__free(queues);
1594 }
1595
static void arm_spe_free(struct perf_session *session)
1597 {
1598 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1599 auxtrace);
1600
1601 auxtrace_heap__free(&spe->heap);
1602 arm_spe_free_events(session);
1603 session->auxtrace = NULL;
1604 arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
1605 free(spe);
1606 }
1607
static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
				      struct evsel *evsel)
1610 {
1611 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
1612
1613 return evsel->core.attr.type == spe->pmu_type;
1614 }
1615
1616 static const char * const metadata_hdr_v1_fmts[] = {
1617 [ARM_SPE_PMU_TYPE] = " PMU Type :%"PRId64"\n",
1618 [ARM_SPE_PER_CPU_MMAPS] = " Per CPU mmaps :%"PRId64"\n",
1619 };
1620
1621 static const char * const metadata_hdr_fmts[] = {
1622 [ARM_SPE_HEADER_VERSION] = " Header version :%"PRId64"\n",
1623 [ARM_SPE_HEADER_SIZE] = " Header size :%"PRId64"\n",
1624 [ARM_SPE_PMU_TYPE_V2] = " PMU type v2 :%"PRId64"\n",
1625 [ARM_SPE_CPUS_NUM] = " CPU number :%"PRId64"\n",
1626 };
1627
1628 static const char * const metadata_per_cpu_fmts[] = {
1629 [ARM_SPE_MAGIC] = " Magic :0x%"PRIx64"\n",
1630 [ARM_SPE_CPU] = " CPU # :%"PRId64"\n",
1631 [ARM_SPE_CPU_NR_PARAMS] = " Num of params :%"PRId64"\n",
1632 [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n",
1633 [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n",
1634 [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n",
1635 [ARM_SPE_CAP_EVENT_FILTER] = " Event Filter :0x%"PRIx64"\n",
1636 };
1637
static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
1639 {
1640 unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
1641 const char * const *hdr_fmts;
1642
1643 if (!dump_trace)
1644 return;
1645
1646 if (spe->metadata_ver == 1) {
1647 cpu_num = 0;
1648 hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
1649 hdr_fmts = metadata_hdr_v1_fmts;
1650 } else {
1651 cpu_num = arr[ARM_SPE_CPUS_NUM];
1652 hdr_size = arr[ARM_SPE_HEADER_SIZE];
1653 hdr_fmts = metadata_hdr_fmts;
1654 }
1655
1656 for (i = 0; i < hdr_size; i++)
1657 fprintf(stdout, hdr_fmts[i], arr[i]);
1658
1659 arr += hdr_size;
1660 for (cpu = 0; cpu < cpu_num; cpu++) {
		/*
		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
		 * are fixed. The number of the following parameters is given
		 * by the field 'ARM_SPE_CPU_NR_PARAMS'.
		 */
1666 cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
1667 for (i = 0; i < cpu_size; i++)
1668 fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
1669 arr += cpu_size;
1670 }
1671 }
1672
static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
				   const char *name)
1675 {
1676 struct evsel *evsel;
1677
1678 evlist__for_each_entry(evlist, evsel) {
1679 if (evsel->core.id && evsel->core.id[0] == id) {
1680 if (evsel->name)
1681 zfree(&evsel->name);
1682 evsel->name = strdup(name);
1683 break;
1684 }
1685 }
1686 }
1687
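/*
 * Create synthetic event attributes and IDs for the sample types selected
 * via the --itrace options, naming each synthesized event in the evlist.
 */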
1688 static int
arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
1690 {
1691 struct evlist *evlist = session->evlist;
1692 struct evsel *evsel;
1693 struct perf_event_attr attr;
1694 bool found = false;
1695 u64 id;
1696 int err;
1697
1698 evlist__for_each_entry(evlist, evsel) {
1699 if (evsel->core.attr.type == spe->pmu_type) {
1700 found = true;
1701 break;
1702 }
1703 }
1704
1705 if (!found) {
1706 pr_debug("No selected events with SPE trace data\n");
1707 return 0;
1708 }
1709
1710 memset(&attr, 0, sizeof(struct perf_event_attr));
1711 attr.size = sizeof(struct perf_event_attr);
1712 attr.type = PERF_TYPE_HARDWARE;
1713 attr.sample_type = evsel->core.attr.sample_type &
1714 (PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
1715 attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
1716 PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
1717 PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
1718 if (spe->timeless_decoding)
1719 attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
1720 else
1721 attr.sample_type |= PERF_SAMPLE_TIME;
1722
1723 spe->sample_type = attr.sample_type;
1724
1725 attr.exclude_user = evsel->core.attr.exclude_user;
1726 attr.exclude_kernel = evsel->core.attr.exclude_kernel;
1727 attr.exclude_hv = evsel->core.attr.exclude_hv;
1728 attr.exclude_host = evsel->core.attr.exclude_host;
1729 attr.exclude_guest = evsel->core.attr.exclude_guest;
1730 attr.sample_id_all = evsel->core.attr.sample_id_all;
1731 attr.read_format = evsel->core.attr.read_format;
1732 attr.sample_period = spe->synth_opts.period;
1733
1734 /* create new id val to be a fixed offset from evsel id */
1735 id = evsel->core.id[0] + 1000000000;
1736
1737 if (!id)
1738 id = 1;
1739
1740 if (spe->synth_opts.flc) {
1741 spe->sample_flc = true;
1742
1743 /* Level 1 data cache miss */
1744 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1745 if (err)
1746 return err;
1747 spe->l1d_miss_id = id;
1748 arm_spe_set_event_name(evlist, id, "l1d-miss");
1749 id += 1;
1750
1751 /* Level 1 data cache access */
1752 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1753 if (err)
1754 return err;
1755 spe->l1d_access_id = id;
1756 arm_spe_set_event_name(evlist, id, "l1d-access");
1757 id += 1;
1758 }
1759
1760 if (spe->synth_opts.llc) {
1761 spe->sample_llc = true;
1762
1763 /* Last level cache miss */
1764 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1765 if (err)
1766 return err;
1767 spe->llc_miss_id = id;
1768 arm_spe_set_event_name(evlist, id, "llc-miss");
1769 id += 1;
1770
1771 /* Last level cache access */
1772 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1773 if (err)
1774 return err;
1775 spe->llc_access_id = id;
1776 arm_spe_set_event_name(evlist, id, "llc-access");
1777 id += 1;
1778 }
1779
1780 if (spe->synth_opts.tlb) {
1781 spe->sample_tlb = true;
1782
1783 /* TLB miss */
1784 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1785 if (err)
1786 return err;
1787 spe->tlb_miss_id = id;
1788 arm_spe_set_event_name(evlist, id, "tlb-miss");
1789 id += 1;
1790
1791 /* TLB access */
1792 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1793 if (err)
1794 return err;
1795 spe->tlb_access_id = id;
1796 arm_spe_set_event_name(evlist, id, "tlb-access");
1797 id += 1;
1798 }
1799
1800 if (spe->synth_opts.last_branch) {
1801 if (spe->synth_opts.last_branch_sz > 2)
1802 pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
1803
1804 attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
1805 /*
1806 * We don't use the hardware index, but the sample generation
1807 * code uses the new format branch_stack with this field,
1808 * so the event attributes must indicate that it's present.
1809 */
1810 attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
1811 }
1812
1813 if (spe->synth_opts.branches) {
1814 spe->sample_branch = true;
1815
1816 /* Branch */
1817 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1818 if (err)
1819 return err;
1820 spe->branch_id = id;
1821 arm_spe_set_event_name(evlist, id, "branch");
1822 id += 1;
1823 }
1824
1825 if (spe->synth_opts.remote_access) {
1826 spe->sample_remote_access = true;
1827
1828 /* Remote access */
1829 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1830 if (err)
1831 return err;
1832 spe->remote_access_id = id;
1833 arm_spe_set_event_name(evlist, id, "remote-access");
1834 id += 1;
1835 }
1836
1837 if (spe->synth_opts.mem) {
1838 spe->sample_memory = true;
1839
1840 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1841 if (err)
1842 return err;
1843 spe->memory_id = id;
1844 arm_spe_set_event_name(evlist, id, "memory");
1845 id += 1;
1846 }
1847
1848 if (spe->synth_opts.instructions) {
1849 spe->sample_instructions = true;
1850 attr.config = PERF_COUNT_HW_INSTRUCTIONS;
1851
1852 err = perf_session__deliver_synth_attr_event(session, &attr, id);
1853 if (err)
1854 return err;
1855 spe->instructions_id = id;
1856 arm_spe_set_event_name(evlist, id, "instructions");
1857 }
1858
1859 return 0;
1860 }
1861
static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
1863 {
1864 u64 midr;
1865 int i;
1866
1867 if (!nr_cpu)
1868 return false;
1869
1870 for (i = 0; i < nr_cpu; i++) {
1871 if (!metadata[i])
1872 return false;
1873
1874 if (i == 0) {
1875 midr = metadata[i][ARM_SPE_CPU_MIDR];
1876 continue;
1877 }
1878
1879 if (midr != metadata[i][ARM_SPE_CPU_MIDR])
1880 return false;
1881 }
1882
1883 return true;
1884 }
1885
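/*
 * Entry point for PERF_RECORD_AUXTRACE_INFO: parse the SPE metadata, set up
 * the arm_spe context and its auxtrace callbacks, and queue the trace data.
 *
 * Illustrative flow (the SPE PMU instance name can vary, e.g. arm_spe_0):
 *
 *   perf record -e arm_spe_0// -- <workload>
 *   perf report --itrace=i1i
 */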
int arm_spe_process_auxtrace_info(union perf_event *event,
				  struct perf_session *session)
1888 {
1889 struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1890 size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
1891 struct perf_record_time_conv *tc = &session->time_conv;
1892 struct arm_spe *spe;
1893 u64 **metadata = NULL;
1894 u64 metadata_ver;
1895 int nr_cpu, err;
1896
1897 if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
1898 min_sz)
1899 return -EINVAL;
1900
1901 metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
1902 &nr_cpu);
1903 if (!metadata && metadata_ver != 1) {
1904 pr_err("Failed to parse Arm SPE metadata.\n");
1905 return -EINVAL;
1906 }
1907
1908 spe = zalloc(sizeof(struct arm_spe));
1909 if (!spe) {
1910 err = -ENOMEM;
1911 goto err_free_metadata;
1912 }
1913
1914 err = auxtrace_queues__init(&spe->queues);
1915 if (err)
1916 goto err_free;
1917
1918 spe->session = session;
1919 spe->machine = &session->machines.host; /* No kvm support */
1920 spe->auxtrace_type = auxtrace_info->type;
1921 if (metadata_ver == 1)
1922 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
1923 else
1924 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
1925 spe->metadata = metadata;
1926 spe->metadata_ver = metadata_ver;
1927 spe->metadata_nr_cpu = nr_cpu;
1928 spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);
1929
1930 spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
1931
	/*
	 * The synthesized event PERF_RECORD_TIME_CONV has been handled
	 * earlier and the parameters for the hardware clock are stored in
	 * the session context. Pass these parameters to the struct
	 * perf_tsc_conversion in "spe->tc", which is used later for
	 * conversion between the clock counter and timestamps.
	 *
	 * For backward compatibility, copy the fields starting from
	 * "time_cycles" only if they are contained in the event.
	 */
1942 spe->tc.time_shift = tc->time_shift;
1943 spe->tc.time_mult = tc->time_mult;
1944 spe->tc.time_zero = tc->time_zero;
1945
1946 if (event_contains(*tc, time_cycles)) {
1947 spe->tc.time_cycles = tc->time_cycles;
1948 spe->tc.time_mask = tc->time_mask;
1949 spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
1950 spe->tc.cap_user_time_short = tc->cap_user_time_short;
1951 }
1952
1953 spe->auxtrace.process_event = arm_spe_process_event;
1954 spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
1955 spe->auxtrace.flush_events = arm_spe_flush;
1956 spe->auxtrace.free_events = arm_spe_free_events;
1957 spe->auxtrace.free = arm_spe_free;
1958 spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
1959 session->auxtrace = &spe->auxtrace;
1960
1961 arm_spe_print_info(spe, &auxtrace_info->priv[0]);
1962
1963 if (dump_trace)
1964 return 0;
1965
1966 if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
1967 spe->synth_opts = *session->itrace_synth_opts;
1968 } else {
1969 itrace_synth_opts__set_default(&spe->synth_opts, false);
1970 /* Default nanoseconds period not supported */
1971 spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS;
1972 spe->synth_opts.period = 1;
1973 }
1974
1975 if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
		ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g. --itrace=i1i\n");
1977 err = -EINVAL;
1978 goto err_free_queues;
1979 }
1980 if (spe->synth_opts.period > 1)
1981 ui__warning("Arm SPE has a hardware-based sampling period.\n\n"
1982 "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n");
1983
1984 err = arm_spe_synth_events(spe, session);
1985 if (err)
1986 goto err_free_queues;
1987
1988 err = auxtrace_queues__process_index(&spe->queues, session);
1989 if (err)
1990 goto err_free_queues;
1991
1992 if (spe->queues.populated)
1993 spe->data_queued = true;
1994
1995 return 0;
1996
1997 err_free_queues:
1998 auxtrace_queues__free(&spe->queues);
1999 session->auxtrace = NULL;
2000 err_free:
2001 free(spe);
2002 err_free_metadata:
2003 arm_spe__free_metadata(metadata, nr_cpu);
2004 return err;
2005 }
2006