1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Arm Statistical Profiling Extensions (SPE) support
4  * Copyright (c) 2017-2018, Arm Ltd.
5  */
6 
7 #include <byteswap.h>
8 #include <endian.h>
9 #include <errno.h>
10 #include <inttypes.h>
11 #include <linux/bitops.h>
12 #include <linux/kernel.h>
13 #include <linux/log2.h>
14 #include <linux/types.h>
15 #include <linux/zalloc.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18 
19 #include "auxtrace.h"
20 #include "color.h"
21 #include "debug.h"
22 #include "evlist.h"
23 #include "evsel.h"
24 #include "machine.h"
25 #include "session.h"
26 #include "symbol.h"
27 #include "thread.h"
28 #include "thread-stack.h"
29 #include "tsc.h"
30 #include "tool.h"
31 #include "util/synthetic-events.h"
32 
33 #include "arm-spe.h"
34 #include "arm-spe-decoder/arm-spe-decoder.h"
35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
36 
37 #include "../../arch/arm64/include/asm/cputype.h"
38 #define MAX_TIMESTAMP (~0ULL)
39 
40 #define is_ldst_op(op)		(!!((op) & ARM_SPE_OP_LDST))
41 
42 #define is_simd_op(op)		(!!((op) & (ARM_SPE_OP_SIMD_FP | ARM_SPE_OP_SVE | \
43 					    ARM_SPE_OP_SME | ARM_SPE_OP_ASE)))
44 
45 #define is_mem_op(op)		(is_ldst_op(op) || is_simd_op(op))
46 
47 #define ARM_SPE_CACHE_EVENT(lvl) \
48 	(ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS)
49 
50 #define arm_spe_is_cache_level(type, lvl) \
51 	((type) & ARM_SPE_CACHE_EVENT(lvl))
52 
53 #define arm_spe_is_cache_hit(type, lvl) \
54 	(((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS)
55 
56 #define arm_spe_is_cache_miss(type, lvl) \
57 	((type) & ARM_SPE_##lvl##_MISS)
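/*
 * Illustrative expansion: arm_spe_is_cache_hit(type, L1D) becomes
 * ((type) & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) == ARM_SPE_L1D_ACCESS,
 * i.e. the access bit is set while the corresponding miss bit is clear.
 */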
58 
59 struct arm_spe {
60 	struct auxtrace			auxtrace;
61 	struct auxtrace_queues		queues;
62 	struct auxtrace_heap		heap;
63 	struct itrace_synth_opts        synth_opts;
64 	u32				auxtrace_type;
65 	struct perf_session		*session;
66 	struct machine			*machine;
67 	u32				pmu_type;
68 
69 	struct perf_tsc_conversion	tc;
70 
71 	u8				timeless_decoding;
72 	u8				data_queued;
73 
74 	u64				sample_type;
75 	u8				sample_flc;
76 	u8				sample_llc;
77 	u8				sample_tlb;
78 	u8				sample_branch;
79 	u8				sample_remote_access;
80 	u8				sample_memory;
81 	u8				sample_instructions;
82 
83 	u64				l1d_miss_id;
84 	u64				l1d_access_id;
85 	u64				llc_miss_id;
86 	u64				llc_access_id;
87 	u64				tlb_miss_id;
88 	u64				tlb_access_id;
89 	u64				branch_id;
90 	u64				remote_access_id;
91 	u64				memory_id;
92 	u64				instructions_id;
93 
94 	u64				kernel_start;
95 
96 	unsigned long			num_events;
97 	u8				use_ctx_pkt_for_pid;
98 
99 	u64				**metadata;
100 	u64				metadata_ver;
101 	u64				metadata_nr_cpu;
102 	bool				is_homogeneous;
103 };
104 
105 struct arm_spe_queue {
106 	struct arm_spe			*spe;
107 	unsigned int			queue_nr;
108 	struct auxtrace_buffer		*buffer;
109 	struct auxtrace_buffer		*old_buffer;
110 	union perf_event		*event_buf;
111 	bool				on_heap;
112 	bool				done;
113 	pid_t				pid;
114 	pid_t				tid;
115 	int				cpu;
116 	struct arm_spe_decoder		*decoder;
117 	u64				time;
118 	u64				timestamp;
119 	struct thread			*thread;
120 	u64				sample_count;
121 	u32				flags;
122 	struct branch_stack		*last_branch;
123 };
124 
125 struct data_source_handle {
126 	const struct midr_range *midr_ranges;
127 	void (*ds_synth)(const struct arm_spe_record *record,
128 			 union perf_mem_data_src *data_src);
129 };
130 
131 #define DS(range, func)					\
132 	{						\
133 		.midr_ranges = range,			\
134 		.ds_synth = arm_spe__synth_##func,	\
135 	}
136 
136 
137 static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
138 			 unsigned char *buf, size_t len)
139 {
140 	struct arm_spe_pkt packet;
141 	size_t pos = 0;
142 	int ret, pkt_len, i;
143 	char desc[ARM_SPE_PKT_DESC_MAX];
144 	const char *color = PERF_COLOR_BLUE;
145 
146 	color_fprintf(stdout, color,
147 		      ". ... ARM SPE data: size %#zx bytes\n",
148 		      len);
149 
150 	while (len) {
151 		ret = arm_spe_get_packet(buf, len, &packet);
152 		if (ret > 0)
153 			pkt_len = ret;
154 		else
155 			pkt_len = 1;
156 		printf(".");
157 		color_fprintf(stdout, color, "  %08zx: ", pos);
158 		for (i = 0; i < pkt_len; i++)
159 			color_fprintf(stdout, color, " %02x", buf[i]);
160 		for (; i < 16; i++)
161 			color_fprintf(stdout, color, "   ");
162 		if (ret > 0) {
163 			ret = arm_spe_pkt_desc(&packet, desc,
164 					       ARM_SPE_PKT_DESC_MAX);
165 			if (!ret)
166 				color_fprintf(stdout, color, " %s\n", desc);
167 		} else {
168 			color_fprintf(stdout, color, " Bad packet!\n");
169 		}
170 		pos += pkt_len;
171 		buf += pkt_len;
172 		len -= pkt_len;
173 	}
174 }
175 
176 static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
177 			       size_t len)
178 {
179 	printf(".\n");
180 	arm_spe_dump(spe, buf, len);
181 }
182 
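/*
 * Fetch the next AUX buffer for this queue. Note (from the flow below): a
 * zero-length buffer is dropped and the function recurses, so the decoder
 * either gets real data or b->len == 0, meaning the queue is exhausted.
 */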
183 static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
184 {
185 	struct arm_spe_queue *speq = data;
186 	struct auxtrace_buffer *buffer = speq->buffer;
187 	struct auxtrace_buffer *old_buffer = speq->old_buffer;
188 	struct auxtrace_queue *queue;
189 
190 	queue = &speq->spe->queues.queue_array[speq->queue_nr];
191 
192 	buffer = auxtrace_buffer__next(queue, buffer);
193 	/* If no more data, drop the previous auxtrace_buffer and return */
194 	if (!buffer) {
195 		if (old_buffer)
196 			auxtrace_buffer__drop_data(old_buffer);
197 		b->len = 0;
198 		return 0;
199 	}
200 
201 	speq->buffer = buffer;
202 
203 	/* If the aux_buffer doesn't have data associated, try to load it */
204 	if (!buffer->data) {
205 		/* get the file desc associated with the perf data file */
206 		int fd = perf_data__fd(speq->spe->session->data);
207 
208 		buffer->data = auxtrace_buffer__get_data(buffer, fd);
209 		if (!buffer->data)
210 			return -ENOMEM;
211 	}
212 
213 	b->len = buffer->size;
214 	b->buf = buffer->data;
215 
216 	if (b->len) {
217 		if (old_buffer)
218 			auxtrace_buffer__drop_data(old_buffer);
219 		speq->old_buffer = buffer;
220 	} else {
221 		auxtrace_buffer__drop_data(buffer);
222 		return arm_spe_get_trace(b, data);
223 	}
224 
225 	return 0;
226 }
227 
228 static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
229 		unsigned int queue_nr)
230 {
231 	struct arm_spe_params params = { .get_trace = 0, };
232 	struct arm_spe_queue *speq;
233 
234 	speq = zalloc(sizeof(*speq));
235 	if (!speq)
236 		return NULL;
237 
238 	speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
239 	if (!speq->event_buf)
240 		goto out_free;
241 
242 	speq->spe = spe;
243 	speq->queue_nr = queue_nr;
244 	speq->pid = -1;
245 	speq->tid = -1;
246 	speq->cpu = -1;
247 
248 	/* params set */
249 	params.get_trace = arm_spe_get_trace;
250 	params.data = speq;
251 
252 	if (spe->synth_opts.last_branch) {
253 		size_t sz = sizeof(struct branch_stack);
254 
255 		/* Allocate up to two entries for PBT + TGT */
256 		sz += sizeof(struct branch_entry) *
257 			min(spe->synth_opts.last_branch_sz, 2U);
258 		speq->last_branch = zalloc(sz);
259 		if (!speq->last_branch)
260 			goto out_free;
261 	}
262 
263 	/* create new decoder */
264 	speq->decoder = arm_spe_decoder_new(&params);
265 	if (!speq->decoder)
266 		goto out_free;
267 
268 	return speq;
269 
270 out_free:
271 	zfree(&speq->event_buf);
272 	zfree(&speq->last_branch);
273 	free(speq);
274 
275 	return NULL;
276 }
277 
278 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
279 {
280 	return ip >= spe->kernel_start ?
281 		PERF_RECORD_MISC_KERNEL :
282 		PERF_RECORD_MISC_USER;
283 }
284 
285 static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
286 				    struct auxtrace_queue *queue)
287 {
288 	struct arm_spe_queue *speq = queue->priv;
289 	pid_t tid;
290 
291 	tid = machine__get_current_tid(spe->machine, speq->cpu);
292 	if (tid != -1) {
293 		speq->tid = tid;
294 		thread__zput(speq->thread);
295 	} else
296 		speq->tid = queue->tid;
297 
298 	if ((!speq->thread) && (speq->tid != -1)) {
299 		speq->thread = machine__find_thread(spe->machine, -1,
300 						    speq->tid);
301 	}
302 
303 	if (speq->thread) {
304 		speq->pid = thread__pid(speq->thread);
305 		if (queue->cpu == -1)
306 			speq->cpu = thread__cpu(speq->thread);
307 	}
308 }
309 
310 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
311 {
312 	struct arm_spe *spe = speq->spe;
313 	int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
314 
315 	if (err)
316 		return err;
317 
318 	arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
319 
320 	return 0;
321 }
322 
323 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu)
324 {
325 	u64 i;
326 
327 	if (!spe->metadata)
328 		return NULL;
329 
330 	/* CPU ID is -1 for per-thread mode */
331 	if (cpu < 0) {
332 		/*
333 		 * On a heterogeneous system the CPU ID is -1, so we
334 		 * cannot confirm that the data source packet is supported.
335 		 */
336 		if (!spe->is_homogeneous)
337 			return NULL;
338 
339 		/* In a homogeneous system, simply use CPU0's metadata */
340 		return spe->metadata[0];
341 	}
342 
343 	for (i = 0; i < spe->metadata_nr_cpu; i++)
344 		if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu)
345 			return spe->metadata[i];
346 
347 	return NULL;
348 }
349 
350 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
351 {
352 	struct simd_flags simd_flags = {};
353 
354 	if (record->op & ARM_SPE_OP_SVE)
355 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
356 
357 	if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
358 		simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;
359 
360 	if (record->type & ARM_SPE_SVE_EMPTY_PRED)
361 		simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;
362 
363 	return simd_flags;
364 }
365 
366 static void arm_spe_prep_sample(struct arm_spe *spe,
367 				struct arm_spe_queue *speq,
368 				union perf_event *event,
369 				struct perf_sample *sample)
370 {
371 	struct arm_spe_record *record = &speq->decoder->record;
372 
373 	if (!spe->timeless_decoding)
374 		sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);
375 
376 	sample->ip = record->from_ip;
377 	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
378 	sample->pid = speq->pid;
379 	sample->tid = speq->tid;
380 	sample->period = spe->synth_opts.period;
381 	sample->cpu = speq->cpu;
382 	sample->simd_flags = arm_spe__synth_simd_flags(record);
383 
384 	event->sample.header.type = PERF_RECORD_SAMPLE;
385 	event->sample.header.misc = sample->cpumode;
386 	event->sample.header.size = sizeof(struct perf_event_header);
387 }
388 
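/*
 * Sketch of the synthesized branch stack (at most two entries): one entry
 * describes the sampled branch itself (from_ip -> to_ip) when the record is
 * a branch, and another carries only the previous branch target (PBT) from
 * the record; the PBT source address is unknown, so that entry is typed
 * PERF_BR_UNKNOWN with an empty 'from'.
 */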
389 static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
390 {
391 	struct arm_spe *spe = speq->spe;
392 	struct arm_spe_record *record = &speq->decoder->record;
393 	struct branch_stack *bstack = speq->last_branch;
394 	struct branch_flags *bs_flags;
395 	unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
396 	bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
397 	bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
398 	size_t sz = sizeof(struct branch_stack) +
399 		    sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
400 	int i = 0;
401 
402 	/* Clean up branch stack */
403 	memset(bstack, 0x0, sz);
404 
405 	if (!have_tgt && !have_pbt)
406 		return;
407 
408 	if (have_tgt) {
409 		bstack->entries[i].from = record->from_ip;
410 		bstack->entries[i].to = record->to_ip;
411 
412 		bs_flags = &bstack->entries[i].flags;
413 		bs_flags->value = 0;
414 
415 		if (record->op & ARM_SPE_OP_BR_CR_BL) {
416 			if (record->op & ARM_SPE_OP_BR_COND)
417 				bs_flags->type |= PERF_BR_COND_CALL;
418 			else
419 				bs_flags->type |= PERF_BR_CALL;
420 		/*
421 		 * Indirect branch instruction without link (e.g. BR),
422 		 * take this case as function return.
423 		 */
424 		} else if (record->op & ARM_SPE_OP_BR_CR_RET ||
425 			   record->op & ARM_SPE_OP_BR_INDIRECT) {
426 			if (record->op & ARM_SPE_OP_BR_COND)
427 				bs_flags->type |= PERF_BR_COND_RET;
428 			else
429 				bs_flags->type |= PERF_BR_RET;
430 		} else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
431 			if (record->op & ARM_SPE_OP_BR_COND)
432 				bs_flags->type |= PERF_BR_COND;
433 			else
434 				bs_flags->type |= PERF_BR_UNCOND;
435 		} else {
436 			if (record->op & ARM_SPE_OP_BR_COND)
437 				bs_flags->type |= PERF_BR_COND;
438 			else
439 				bs_flags->type |= PERF_BR_UNKNOWN;
440 		}
441 
442 		if (record->type & ARM_SPE_BRANCH_MISS) {
443 			bs_flags->mispred = 1;
444 			bs_flags->predicted = 0;
445 		} else {
446 			bs_flags->mispred = 0;
447 			bs_flags->predicted = 1;
448 		}
449 
450 		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
451 			bs_flags->not_taken = 1;
452 
453 		if (record->type & ARM_SPE_IN_TXN)
454 			bs_flags->in_tx = 1;
455 
456 		bs_flags->cycles = min(record->latency, 0xFFFFU);
457 		i++;
458 	}
459 
460 	if (have_pbt) {
461 		bs_flags = &bstack->entries[i].flags;
462 		bs_flags->type |= PERF_BR_UNKNOWN;
463 		bstack->entries[i].to = record->prev_br_tgt;
464 		i++;
465 	}
466 
467 	bstack->nr = i;
468 	bstack->hw_idx = -1ULL;
469 }
470 
471 static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
472 {
473 	event->header.size = perf_event__sample_event_size(sample, type, 0);
474 	return perf_event__synthesize_sample(event, type, 0, sample);
475 }
476 
477 static inline int
478 arm_spe_deliver_synth_event(struct arm_spe *spe,
479 			    struct arm_spe_queue *speq __maybe_unused,
480 			    union perf_event *event,
481 			    struct perf_sample *sample)
482 {
483 	int ret;
484 
485 	if (spe->synth_opts.inject) {
486 		ret = arm_spe__inject_event(event, sample, spe->sample_type);
487 		if (ret)
488 			return ret;
489 	}
490 
491 	ret = perf_session__deliver_synth_event(spe->session, event, sample);
492 	if (ret)
493 		pr_err("ARM SPE: failed to deliver event, error %d\n", ret);
494 
495 	return ret;
496 }
497 
498 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
499 				     u64 spe_events_id,
500 				     union perf_mem_data_src data_src)
501 {
502 	struct arm_spe *spe = speq->spe;
503 	struct arm_spe_record *record = &speq->decoder->record;
504 	union perf_event *event = speq->event_buf;
505 	struct perf_sample sample;
506 	int ret;
507 
508 	perf_sample__init(&sample, /*all=*/true);
509 	arm_spe_prep_sample(spe, speq, event, &sample);
510 
511 	sample.id = spe_events_id;
512 	sample.stream_id = spe_events_id;
513 	sample.addr = record->virt_addr;
514 	sample.phys_addr = record->phys_addr;
515 	sample.data_src = data_src.val;
516 	sample.weight = record->latency;
517 
518 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
519 	perf_sample__exit(&sample);
520 	return ret;
521 }
522 
523 static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
524 					u64 spe_events_id)
525 {
526 	struct arm_spe *spe = speq->spe;
527 	struct arm_spe_record *record = &speq->decoder->record;
528 	union perf_event *event = speq->event_buf;
529 	struct perf_sample sample;
530 	int ret;
531 
532 	perf_sample__init(&sample, /*all=*/true);
533 	arm_spe_prep_sample(spe, speq, event, &sample);
534 
535 	sample.id = spe_events_id;
536 	sample.stream_id = spe_events_id;
537 	sample.addr = record->to_ip;
538 	sample.weight = record->latency;
539 	sample.flags = speq->flags;
540 	sample.branch_stack = speq->last_branch;
541 
542 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
543 	perf_sample__exit(&sample);
544 	return ret;
545 }
546 
547 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
548 					     u64 spe_events_id,
549 					     union perf_mem_data_src data_src)
550 {
551 	struct arm_spe *spe = speq->spe;
552 	struct arm_spe_record *record = &speq->decoder->record;
553 	union perf_event *event = speq->event_buf;
554 	struct perf_sample sample;
555 	int ret;
556 
557 	perf_sample__init(&sample, /*all=*/true);
558 	arm_spe_prep_sample(spe, speq, event, &sample);
559 
560 	sample.id = spe_events_id;
561 	sample.stream_id = spe_events_id;
562 	sample.addr = record->to_ip;
563 	sample.phys_addr = record->phys_addr;
564 	sample.data_src = data_src.val;
565 	sample.weight = record->latency;
566 	sample.flags = speq->flags;
567 	sample.branch_stack = speq->last_branch;
568 
569 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
570 	perf_sample__exit(&sample);
571 	return ret;
572 }
573 
574 static const struct midr_range common_ds_encoding_cpus[] = {
575 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
576 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
577 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE),
578 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
579 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
580 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
581 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
582 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
583 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X4),
584 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
585 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
586 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
587 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
588 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
589 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
590 	MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
591 	{},
592 };
593 
594 static const struct midr_range ampereone_ds_encoding_cpus[] = {
595 	MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
596 	{},
597 };
598 
599 static const struct midr_range hisi_hip_ds_encoding_cpus[] = {
600 	MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
601 	{},
602 };
603 
603 
604 static void arm_spe__sample_flags(struct arm_spe_queue *speq)
605 {
606 	const struct arm_spe_record *record = &speq->decoder->record;
607 
608 	speq->flags = 0;
609 	if (record->op & ARM_SPE_OP_BRANCH_ERET) {
610 		speq->flags = PERF_IP_FLAG_BRANCH;
611 
612 		if (record->type & ARM_SPE_BRANCH_MISS)
613 			speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
614 
615 		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
616 			speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
617 
618 		if (record->type & ARM_SPE_IN_TXN)
619 			speq->flags |= PERF_IP_FLAG_IN_TX;
620 
621 		if (record->op & ARM_SPE_OP_BR_COND)
622 			speq->flags |= PERF_IP_FLAG_CONDITIONAL;
623 
624 		if (record->op & ARM_SPE_OP_BR_CR_BL)
625 			speq->flags |= PERF_IP_FLAG_CALL;
626 		else if (record->op & ARM_SPE_OP_BR_CR_RET)
627 			speq->flags |= PERF_IP_FLAG_RETURN;
628 		/*
629 		 * Indirect branch instruction without link (e.g. BR),
630 		 * take it as a function return.
631 		 */
632 		else if (record->op & ARM_SPE_OP_BR_INDIRECT)
633 			speq->flags |= PERF_IP_FLAG_RETURN;
634 	}
635 }
636 
637 static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
638 					      union perf_mem_data_src *data_src)
639 {
640 	/*
641 	 * Even though four levels of cache hierarchy are possible, no known
642 	 * production Neoverse systems currently include more than three levels
643 	 * so for the time being we assume three exist. If a production system
644 	 * is built with four, then this function would have to be changed to
645 	 * detect the number of levels for reporting.
646 	 */
647 
648 	/*
649 	 * We have no data on the hit level or data source for stores in the
650 	 * Neoverse SPE records.
651 	 */
652 	if (record->op & ARM_SPE_OP_ST) {
653 		data_src->mem_lvl = PERF_MEM_LVL_NA;
654 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
655 		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
656 		return;
657 	}
658 
659 	switch (record->source) {
660 	case ARM_SPE_COMMON_DS_L1D:
661 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
662 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
663 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
664 		break;
665 	case ARM_SPE_COMMON_DS_L2:
666 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
667 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
668 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
669 		break;
670 	case ARM_SPE_COMMON_DS_PEER_CORE:
671 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
672 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
673 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
674 		break;
675 	/*
676 	 * We don't know if this is L1 or L2, but we do know it was a cache-2-cache
677 	 * transfer, so set SNOOPX_PEER
678 	 */
679 	case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
680 	case ARM_SPE_COMMON_DS_PEER_CLUSTER:
681 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
682 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
683 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
684 		break;
685 	/*
686 	 * System cache is assumed to be L3
687 	 */
688 	case ARM_SPE_COMMON_DS_SYS_CACHE:
689 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
690 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
691 		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
692 		break;
693 	/*
694 	 * We don't know what level it hit in, except it came from the other
695 	 * socket
696 	 */
697 	case ARM_SPE_COMMON_DS_REMOTE:
698 		data_src->mem_lvl = PERF_MEM_LVL_NA;
699 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
700 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
701 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
702 		break;
703 	case ARM_SPE_COMMON_DS_DRAM:
704 		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
705 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
706 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
707 		break;
708 	default:
709 		break;
710 	}
711 }
712 
713 /*
714  * Source is IMPDEF. Here we convert the source encoding used on AmpereOne
715  * cores to the common (Neoverse, Cortex) one to avoid duplicating the decoding code.
716  */
717 static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
718 						 union perf_mem_data_src *data_src)
719 {
720 	struct arm_spe_record common_record;
721 
722 	switch (record->source) {
723 	case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
724 		common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
725 		break;
726 	case ARM_SPE_AMPEREONE_SLC:
727 		common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
728 		break;
729 	case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
730 		common_record.source = ARM_SPE_COMMON_DS_REMOTE;
731 		break;
732 	case ARM_SPE_AMPEREONE_DDR:
733 		common_record.source = ARM_SPE_COMMON_DS_DRAM;
734 		break;
735 	case ARM_SPE_AMPEREONE_L1D:
736 		common_record.source = ARM_SPE_COMMON_DS_L1D;
737 		break;
738 	case ARM_SPE_AMPEREONE_L2D:
739 		common_record.source = ARM_SPE_COMMON_DS_L2;
740 		break;
741 	default:
742 		pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
743 				record->source);
744 		return;
745 	}
746 
747 	common_record.op = record->op;
748 	arm_spe__synth_data_source_common(&common_record, data_src);
749 }
750 
751 static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record,
752 						union perf_mem_data_src *data_src)
753 {
754 	/* Use common synthesis method to handle store operations */
755 	if (record->op & ARM_SPE_OP_ST) {
756 		arm_spe__synth_data_source_common(record, data_src);
757 		return;
758 	}
759 
760 	switch (record->source) {
761 	case ARM_SPE_HISI_HIP_PEER_CPU:
762 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
763 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
764 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
765 		break;
766 	case ARM_SPE_HISI_HIP_PEER_CPU_HITM:
767 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
768 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
769 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
770 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
771 		break;
772 	case ARM_SPE_HISI_HIP_L3:
773 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
774 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
775 		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
776 		break;
777 	case ARM_SPE_HISI_HIP_L3_HITM:
778 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
779 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
780 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
781 		break;
782 	case ARM_SPE_HISI_HIP_PEER_CLUSTER:
783 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
784 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
785 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
786 		break;
787 	case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM:
788 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
789 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
790 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
791 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
792 		break;
793 	case ARM_SPE_HISI_HIP_REMOTE_SOCKET:
794 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
795 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
796 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
797 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
798 		break;
799 	case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM:
800 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
801 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
802 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
803 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
804 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
805 		break;
806 	case ARM_SPE_HISI_HIP_LOCAL_MEM:
807 		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
808 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
809 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
810 		break;
811 	case ARM_SPE_HISI_HIP_REMOTE_MEM:
812 		data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
813 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
814 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
815 		break;
816 	case ARM_SPE_HISI_HIP_NC_DEV:
817 		data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT;
818 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
819 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
820 		break;
821 	case ARM_SPE_HISI_HIP_L2:
822 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
823 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
824 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
825 		break;
826 	case ARM_SPE_HISI_HIP_L2_HITM:
827 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
828 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
829 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
830 		break;
831 	case ARM_SPE_HISI_HIP_L1:
832 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
833 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
834 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
835 		break;
836 	default:
837 		break;
838 	}
839 }
840 
841 static const struct data_source_handle data_source_handles[] = {
842 	DS(common_ds_encoding_cpus, data_source_common),
843 	DS(ampereone_ds_encoding_cpus, data_source_ampereone),
844 	DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
845 };
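/*
 * arm_spe__synth_ds() below matches the sampled CPU's MIDR against these
 * ranges and calls the paired ds_synth() handler; CPUs that match none of
 * the ranges fall back to the generic memory-level synthesis based on the
 * event type bits.
 */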
846 
847 static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record,
848 					   union perf_mem_data_src *data_src)
849 {
850 	/*
851 	 * To find a cache hit, search in ascending order from the lower level
852 	 * caches to the higher level caches. This reflects the best scenario
853 	 * for a cache hit.
854 	 */
855 	if (arm_spe_is_cache_hit(record->type, L1D)) {
856 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
857 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
858 	} else if (record->type & ARM_SPE_RECENTLY_FETCHED) {
859 		data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
860 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB;
861 	} else if (arm_spe_is_cache_hit(record->type, L2D)) {
862 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
863 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
864 	} else if (arm_spe_is_cache_hit(record->type, LLC)) {
865 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
866 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
867 	/*
868 	 * To find a cache miss, search in descending order from the higher
869 	 * level cache to the lower level cache. This represents the worst
870 	 * scenario for a cache miss.
871 	 */
872 	} else if (arm_spe_is_cache_miss(record->type, LLC)) {
873 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS;
874 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
875 	} else if (arm_spe_is_cache_miss(record->type, L2D)) {
876 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS;
877 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
878 	} else if (arm_spe_is_cache_miss(record->type, L1D)) {
879 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
880 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
881 	}
882 }
883 
884 static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record,
885 					   union perf_mem_data_src *data_src)
886 {
887 	/* Record the highest cache level info for a store operation. */
888 	if (arm_spe_is_cache_level(record->type, LLC)) {
889 		data_src->mem_lvl = PERF_MEM_LVL_L3;
890 		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ?
891 				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
892 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
893 	} else if (arm_spe_is_cache_level(record->type, L2D)) {
894 		data_src->mem_lvl = PERF_MEM_LVL_L2;
895 		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ?
896 				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
897 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
898 	} else if (arm_spe_is_cache_level(record->type, L1D)) {
899 		data_src->mem_lvl = PERF_MEM_LVL_L1;
900 		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ?
901 				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
902 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
903 	}
904 }
905 
906 static void arm_spe__synth_memory_level(struct arm_spe_queue *speq,
907 					const struct arm_spe_record *record,
908 					union perf_mem_data_src *data_src)
909 {
910 	struct arm_spe *spe = speq->spe;
911 
912 	/*
913 	 * The data source packet contains more info for cache levels for
914 	 * peer snooping. So respect the memory level if it has been set by
915 	 * data source parsing.
916 	 */
917 	if (!data_src->mem_lvl) {
918 		if (data_src->mem_op == PERF_MEM_OP_LOAD)
919 			arm_spe__synth_ld_memory_level(record, data_src);
920 		if (data_src->mem_op == PERF_MEM_OP_STORE)
921 			arm_spe__synth_st_memory_level(record, data_src);
922 	}
923 
924 	if (!data_src->mem_lvl) {
925 		data_src->mem_lvl = PERF_MEM_LVL_NA;
926 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
927 	}
928 
929 	/*
930 	 * If 'mem_snoop' has already been set by the data source packet,
931 	 * don't override it here.
932 	 */
933 	if (!data_src->mem_snoop) {
934 		if (record->type & ARM_SPE_DATA_SNOOPED) {
935 			if (record->type & ARM_SPE_HITM)
936 				data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
937 			else
938 				data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
939 		} else {
940 			u64 *metadata =
941 				arm_spe__get_metadata_by_cpu(spe, speq->cpu);
942 
943 			/*
944 			 * Set NA ("Not available") mode if there is no metadata
945 			 * or the SNOOPED event is not supported.
946 			 */
947 			if (!metadata ||
948 			    !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED))
949 				data_src->mem_snoop = PERF_MEM_SNOOP_NA;
950 			else
951 				data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
952 		}
953 	}
954 
955 	if (!data_src->mem_remote) {
956 		if (record->type & ARM_SPE_REMOTE_ACCESS)
957 			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
958 	}
959 }
960 
961 static void arm_spe__synth_ds(struct arm_spe_queue *speq,
962 			      const struct arm_spe_record *record,
963 			      union perf_mem_data_src *data_src)
964 {
965 	struct arm_spe *spe = speq->spe;
966 	u64 *metadata = NULL;
967 	u64 midr;
968 	unsigned int i;
969 
970 	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
971 	if (spe->metadata_ver == 1) {
972 		const char *cpuid;
973 
974 		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
975 		cpuid = perf_env__cpuid(perf_session__env(spe->session));
976 		midr = strtol(cpuid, NULL, 16);
977 	} else {
978 		metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
979 		if (!metadata)
980 			return;
981 
982 		midr = metadata[ARM_SPE_CPU_MIDR];
983 	}
984 
985 	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
986 		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
987 			return data_source_handles[i].ds_synth(record, data_src);
988 		}
989 	}
990 
991 	return;
992 }
993 
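/*
 * Overall flow for building PERF_SAMPLE_DATA_SRC (a summary of the helpers
 * above): classify the operation as load/store, let a MIDR-specific data
 * source handler fill in what it can, fall back to the generic cache-level
 * and snoop deduction, then add the TLB hit/miss bits.
 */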
994 static union perf_mem_data_src
995 arm_spe__synth_data_source(struct arm_spe_queue *speq,
996 			   const struct arm_spe_record *record)
997 {
998 	union perf_mem_data_src	data_src = {};
999 
1000 	if (!is_mem_op(record->op))
1001 		return data_src;
1002 
1003 	if (record->op & ARM_SPE_OP_LD)
1004 		data_src.mem_op = PERF_MEM_OP_LOAD;
1005 	else if (record->op & ARM_SPE_OP_ST)
1006 		data_src.mem_op = PERF_MEM_OP_STORE;
1007 	else
1008 		data_src.mem_op = PERF_MEM_OP_NA;
1009 
1010 	arm_spe__synth_ds(speq, record, &data_src);
1011 	arm_spe__synth_memory_level(speq, record, &data_src);
1012 
1013 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
1014 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
1015 
1016 		if (record->type & ARM_SPE_TLB_MISS)
1017 			data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
1018 		else
1019 			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
1020 	}
1021 
1022 	return data_src;
1023 }
1024 
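/*
 * Note: 'period' here counts SPE records, not time; a sample is only
 * synthesized for every Nth record. One record may then produce several
 * synthetic samples, one per enabled event type whose condition matches.
 */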
1025 static int arm_spe_sample(struct arm_spe_queue *speq)
1026 {
1027 	const struct arm_spe_record *record = &speq->decoder->record;
1028 	struct arm_spe *spe = speq->spe;
1029 	union perf_mem_data_src data_src;
1030 	int err;
1031 
1032 	/*
1033 	 * Discard all samples until period is reached
1034 	 */
1035 	speq->sample_count++;
1036 	if (speq->sample_count < spe->synth_opts.period)
1037 		return 0;
1038 	speq->sample_count = 0;
1039 
1040 	arm_spe__sample_flags(speq);
1041 	data_src = arm_spe__synth_data_source(speq, record);
1042 
1043 	if (spe->sample_flc) {
1044 		if (record->type & ARM_SPE_L1D_MISS) {
1045 			err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
1046 							data_src);
1047 			if (err)
1048 				return err;
1049 		}
1050 
1051 		if (record->type & ARM_SPE_L1D_ACCESS) {
1052 			err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
1053 							data_src);
1054 			if (err)
1055 				return err;
1056 		}
1057 	}
1058 
1059 	if (spe->sample_llc) {
1060 		if (record->type & ARM_SPE_LLC_MISS) {
1061 			err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
1062 							data_src);
1063 			if (err)
1064 				return err;
1065 		}
1066 
1067 		if (record->type & ARM_SPE_LLC_ACCESS) {
1068 			err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
1069 							data_src);
1070 			if (err)
1071 				return err;
1072 		}
1073 	}
1074 
1075 	if (spe->sample_tlb) {
1076 		if (record->type & ARM_SPE_TLB_MISS) {
1077 			err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
1078 							data_src);
1079 			if (err)
1080 				return err;
1081 		}
1082 
1083 		if (record->type & ARM_SPE_TLB_ACCESS) {
1084 			err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
1085 							data_src);
1086 			if (err)
1087 				return err;
1088 		}
1089 	}
1090 
1091 	if (spe->synth_opts.last_branch &&
1092 	    (spe->sample_branch || spe->sample_instructions))
1093 		arm_spe__prep_branch_stack(speq);
1094 
1095 	if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
1096 		err = arm_spe__synth_branch_sample(speq, spe->branch_id);
1097 		if (err)
1098 			return err;
1099 	}
1100 
1101 	if (spe->sample_remote_access &&
1102 	    (record->type & ARM_SPE_REMOTE_ACCESS)) {
1103 		err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
1104 						data_src);
1105 		if (err)
1106 			return err;
1107 	}
1108 
1109 	if (spe->sample_memory && is_mem_op(record->op)) {
1110 		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
1111 		if (err)
1112 			return err;
1113 	}
1114 
1115 	if (spe->sample_instructions) {
1116 		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
1117 		if (err)
1118 			return err;
1119 	}
1120 
1121 	return 0;
1122 }
1123 
1124 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
1125 {
1126 	struct arm_spe *spe = speq->spe;
1127 	struct arm_spe_record *record;
1128 	int ret;
1129 
1130 	if (!spe->kernel_start)
1131 		spe->kernel_start = machine__kernel_start(spe->machine);
1132 
1133 	while (1) {
1134 		/*
1135 		 * The usual logic is to decode the packets first and then
1136 		 * synthesize a sample based on the record; but here the flow
1137 		 * is reversed: arm_spe_sample() is called to synthesize a
1138 		 * sample prior to arm_spe_decode().
1139 		 *
1140 		 * Two reasons for this code logic:
1141 		 * 1. When the queue is set up in arm_spe__setup_queue(), the
1142 		 * trace data has already been decoded and a record generated,
1143 		 * but that record is left until we run to here before a
1144 		 * sample is synthesized for it.
1145 		 * 2. After decoding trace data, the record timestamp must be
1146 		 * compared with the timestamp of the coming perf event; if
1147 		 * the record is later, bail out and push the record onto the
1148 		 * auxtrace heap, so sample synthesis is deferred until the
1149 		 * next time we run to here. This correlates samples between
1150 		 * Arm SPE trace data and other perf events with correct time
1151 		 * ordering.
1152 		 */
1153 
1154 		/*
1155 		 * Update pid/tid info.
1156 		 */
1157 		record = &speq->decoder->record;
1158 		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
1159 			ret = arm_spe_set_tid(speq, record->context_id);
1160 			if (ret)
1161 				return ret;
1162 
1163 			spe->use_ctx_pkt_for_pid = true;
1164 		}
1165 
1166 		ret = arm_spe_sample(speq);
1167 		if (ret)
1168 			return ret;
1169 
1170 		ret = arm_spe_decode(speq->decoder);
1171 		if (!ret) {
1172 			pr_debug("No data or all data has been processed.\n");
1173 			return 1;
1174 		}
1175 
1176 		/*
1177 		 * An error was detected while decoding the SPE trace data;
1178 		 * continue with the next trace data to find more records.
1179 		 */
1180 		if (ret < 0)
1181 			continue;
1182 
1183 		record = &speq->decoder->record;
1184 
1185 		/* Update timestamp for the last record */
1186 		if (record->timestamp > speq->timestamp)
1187 			speq->timestamp = record->timestamp;
1188 
1189 		/*
1190 		 * If the timestamp of the queue is later than the timestamp
1191 		 * of the coming perf event, bail out so that the perf event
1192 		 * can be processed first.
1193 		 */
1194 		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
1195 			*timestamp = speq->timestamp;
1196 			return 0;
1197 		}
1198 	}
1199 
1200 	return 0;
1201 }
1202 
1203 static int arm_spe__setup_queue(struct arm_spe *spe,
1204 			       struct auxtrace_queue *queue,
1205 			       unsigned int queue_nr)
1206 {
1207 	struct arm_spe_queue *speq = queue->priv;
1208 	struct arm_spe_record *record;
1209 
1210 	if (list_empty(&queue->head) || speq)
1211 		return 0;
1212 
1213 	speq = arm_spe__alloc_queue(spe, queue_nr);
1214 
1215 	if (!speq)
1216 		return -ENOMEM;
1217 
1218 	queue->priv = speq;
1219 
1220 	if (queue->cpu != -1)
1221 		speq->cpu = queue->cpu;
1222 
1223 	if (!speq->on_heap) {
1224 		int ret;
1225 
1226 		if (spe->timeless_decoding)
1227 			return 0;
1228 
1229 retry:
1230 		ret = arm_spe_decode(speq->decoder);
1231 
1232 		if (!ret)
1233 			return 0;
1234 
1235 		if (ret < 0)
1236 			goto retry;
1237 
1238 		record = &speq->decoder->record;
1239 
1240 		speq->timestamp = record->timestamp;
1241 		ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
1242 		if (ret)
1243 			return ret;
1244 		speq->on_heap = true;
1245 	}
1246 
1247 	return 0;
1248 }
1249 
1250 static int arm_spe__setup_queues(struct arm_spe *spe)
1251 {
1252 	unsigned int i;
1253 	int ret;
1254 
1255 	for (i = 0; i < spe->queues.nr_queues; i++) {
1256 		ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
1257 		if (ret)
1258 			return ret;
1259 	}
1260 
1261 	return 0;
1262 }
1263 
1264 static int arm_spe__update_queues(struct arm_spe *spe)
1265 {
1266 	if (spe->queues.new_data) {
1267 		spe->queues.new_data = false;
1268 		return arm_spe__setup_queues(spe);
1269 	}
1270 
1271 	return 0;
1272 }
1273 
1274 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
1275 {
1276 	struct evsel *evsel;
1277 	struct evlist *evlist = spe->session->evlist;
1278 	bool timeless_decoding = true;
1279 
1280 	/*
1281 	 * Iterate through the list of events and check whether any of them
1282 	 * has the time bit set.
1283 	 */
1284 	evlist__for_each_entry(evlist, evsel) {
1285 		if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
1286 			timeless_decoding = false;
1287 	}
1288 
1289 	return timeless_decoding;
1290 }
1291 
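/*
 * Queues are kept on a min-heap ordered by timestamp: each iteration pops
 * the earliest queue, decodes it until its timestamp catches up with the
 * limit (the next queue's timestamp or the perf event's), and then pushes
 * it back onto the heap for later rounds.
 */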
1292 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
1293 {
1294 	unsigned int queue_nr;
1295 	u64 ts;
1296 	int ret;
1297 
1298 	while (1) {
1299 		struct auxtrace_queue *queue;
1300 		struct arm_spe_queue *speq;
1301 
1302 		if (!spe->heap.heap_cnt)
1303 			return 0;
1304 
1305 		if (spe->heap.heap_array[0].ordinal >= timestamp)
1306 			return 0;
1307 
1308 		queue_nr = spe->heap.heap_array[0].queue_nr;
1309 		queue = &spe->queues.queue_array[queue_nr];
1310 		speq = queue->priv;
1311 
1312 		auxtrace_heap__pop(&spe->heap);
1313 
1314 		if (spe->heap.heap_cnt) {
1315 			ts = spe->heap.heap_array[0].ordinal + 1;
1316 			if (ts > timestamp)
1317 				ts = timestamp;
1318 		} else {
1319 			ts = timestamp;
1320 		}
1321 
1322 		/*
1323 		 * A previous context-switch event has set pid/tid in the machine's context, so
1324 		 * here we need to update the pid/tid in the thread and SPE queue.
1325 		 */
1326 		if (!spe->use_ctx_pkt_for_pid)
1327 			arm_spe_set_pid_tid_cpu(spe, queue);
1328 
1329 		ret = arm_spe_run_decoder(speq, &ts);
1330 		if (ret < 0) {
1331 			auxtrace_heap__add(&spe->heap, queue_nr, ts);
1332 			return ret;
1333 		}
1334 
1335 		if (!ret) {
1336 			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
1337 			if (ret < 0)
1338 				return ret;
1339 		} else {
1340 			speq->on_heap = false;
1341 		}
1342 	}
1343 
1344 	return 0;
1345 }
1346 
1347 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
1348 					    u64 time_)
1349 {
1350 	struct auxtrace_queues *queues = &spe->queues;
1351 	unsigned int i;
1352 	u64 ts = 0;
1353 
1354 	for (i = 0; i < queues->nr_queues; i++) {
1355 		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
1356 		struct arm_spe_queue *speq = queue->priv;
1357 
1358 		if (speq && (tid == -1 || speq->tid == tid)) {
1359 			speq->time = time_;
1360 			arm_spe_set_pid_tid_cpu(spe, queue);
1361 			arm_spe_run_decoder(speq, &ts);
1362 		}
1363 	}
1364 	return 0;
1365 }
1366 
1367 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
1368 				  struct perf_sample *sample)
1369 {
1370 	pid_t pid, tid;
1371 	int cpu;
1372 
1373 	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
1374 		return 0;
1375 
1376 	pid = event->context_switch.next_prev_pid;
1377 	tid = event->context_switch.next_prev_tid;
1378 	cpu = sample->cpu;
1379 
1380 	if (tid == -1)
1381 		pr_warning("context_switch event has no tid\n");
1382 
1383 	return machine__set_current_tid(spe->machine, cpu, pid, tid);
1384 }
1385 
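/*
 * Two processing modes (see below): with timeless decoding, queues are only
 * flushed when a task exits (PERF_RECORD_EXIT); otherwise each timestamped
 * event advances the queues up to its TSC-converted time, and context-switch
 * events keep the pid/tid mapping up to date when no CONTEXT packets are used.
 */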
1386 static int arm_spe_process_event(struct perf_session *session,
1387 				 union perf_event *event,
1388 				 struct perf_sample *sample,
1389 				 const struct perf_tool *tool)
1390 {
1391 	int err = 0;
1392 	u64 timestamp;
1393 	struct arm_spe *spe = container_of(session->auxtrace,
1394 			struct arm_spe, auxtrace);
1395 
1396 	if (dump_trace)
1397 		return 0;
1398 
1399 	if (!tool->ordered_events) {
1400 		pr_err("SPE trace requires ordered events\n");
1401 		return -EINVAL;
1402 	}
1403 
1404 	if (sample->time && (sample->time != (u64) -1))
1405 		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
1406 	else
1407 		timestamp = 0;
1408 
1409 	if (timestamp || spe->timeless_decoding) {
1410 		err = arm_spe__update_queues(spe);
1411 		if (err)
1412 			return err;
1413 	}
1414 
1415 	if (spe->timeless_decoding) {
1416 		if (event->header.type == PERF_RECORD_EXIT) {
1417 			err = arm_spe_process_timeless_queues(spe,
1418 					event->fork.tid,
1419 					sample->time);
1420 		}
1421 	} else if (timestamp) {
1422 		err = arm_spe_process_queues(spe, timestamp);
1423 		if (err)
1424 			return err;
1425 
1426 		if (!spe->use_ctx_pkt_for_pid &&
1427 		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
1428 		    event->header.type == PERF_RECORD_SWITCH))
1429 			err = arm_spe_context_switch(spe, event, sample);
1430 	}
1431 
1432 	return err;
1433 }
1434 
1435 static int arm_spe_process_auxtrace_event(struct perf_session *session,
1436 					  union perf_event *event,
1437 					  const struct perf_tool *tool __maybe_unused)
1438 {
1439 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1440 					     auxtrace);
1441 
1442 	if (!spe->data_queued) {
1443 		struct auxtrace_buffer *buffer;
1444 		off_t data_offset;
1445 		int fd = perf_data__fd(session->data);
1446 		int err;
1447 
1448 		if (perf_data__is_pipe(session->data)) {
1449 			data_offset = 0;
1450 		} else {
1451 			data_offset = lseek(fd, 0, SEEK_CUR);
1452 			if (data_offset == -1)
1453 				return -errno;
1454 		}
1455 
1456 		err = auxtrace_queues__add_event(&spe->queues, session, event,
1457 				data_offset, &buffer);
1458 		if (err)
1459 			return err;
1460 
1461 		/* Dump here now that we have copied a piped trace out of the pipe */
1462 		if (dump_trace) {
1463 			if (auxtrace_buffer__get_data(buffer, fd)) {
1464 				arm_spe_dump_event(spe, buffer->data,
1465 						buffer->size);
1466 				auxtrace_buffer__put_data(buffer);
1467 			}
1468 		}
1469 	}
1470 
1471 	return 0;
1472 }
1473 
1474 static int arm_spe_flush(struct perf_session *session __maybe_unused,
1475 			 const struct perf_tool *tool __maybe_unused)
1476 {
1477 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1478 			auxtrace);
1479 	int ret;
1480 
1481 	if (dump_trace)
1482 		return 0;
1483 
1484 	if (!tool->ordered_events)
1485 		return -EINVAL;
1486 
1487 	ret = arm_spe__update_queues(spe);
1488 	if (ret < 0)
1489 		return ret;
1490 
1491 	if (spe->timeless_decoding)
1492 		return arm_spe_process_timeless_queues(spe, -1,
1493 				MAX_TIMESTAMP - 1);
1494 
1495 	ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
1496 	if (ret)
1497 		return ret;
1498 
1499 	if (!spe->use_ctx_pkt_for_pid)
1500 		ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
1501 			    "Matching of TIDs to SPE events could be inaccurate.\n");
1502 
1503 	return 0;
1504 }
1505 
1506 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
1507 {
1508 	u64 *metadata;
1509 
1510 	metadata = zalloc(per_cpu_size);
1511 	if (!metadata)
1512 		return NULL;
1513 
1514 	memcpy(metadata, buf, per_cpu_size);
1515 	return metadata;
1516 }
1517 
1518 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
1519 {
1520 	int i;
1521 
1522 	for (i = 0; i < nr_cpu; i++)
1523 		zfree(&metadata[i]);
1524 	free(metadata);
1525 }
1526 
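/*
 * Assumed layout of the auxtrace info private data for metadata version 2+
 * (reconstructed from the parsing below):
 *
 *	u64 header[hdr_sz];		ARM_SPE_HEADER_VERSION, _SIZE, _PMU_TYPE_V2, _CPUS_NUM
 *	u64 cpu[0][per_cpu_sz / 8];	ARM_SPE_MAGIC, _CPU, _CPU_NR_PARAMS, _CPU_MIDR, ...
 *	...
 *	u64 cpu[nr_cpu - 1][per_cpu_sz / 8];
 *
 * Version 1 payloads are ARM_SPE_AUXTRACE_V1_PRIV_SIZE bytes with no
 * per-CPU blocks.
 */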
1527 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
1528 				     u64 *ver, int *nr_cpu)
1529 {
1530 	u64 *ptr = (u64 *)info->priv;
1531 	u64 metadata_size;
1532 	u64 **metadata = NULL;
1533 	int hdr_sz, per_cpu_sz, i;
1534 
1535 	metadata_size = info->header.size -
1536 		sizeof(struct perf_record_auxtrace_info);
1537 
1538 	/* Metadata version 1 */
1539 	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
1540 		*ver = 1;
1541 		*nr_cpu = 0;
1542 		/* No per CPU metadata */
1543 		return NULL;
1544 	}
1545 
1546 	*ver = ptr[ARM_SPE_HEADER_VERSION];
1547 	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
1548 	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];
1549 
1550 	metadata = calloc(*nr_cpu, sizeof(*metadata));
1551 	if (!metadata)
1552 		return NULL;
1553 
1554 	/* Locate the start address of per CPU metadata */
1555 	ptr += hdr_sz;
1556 	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
1557 
1558 	for (i = 0; i < *nr_cpu; i++) {
1559 		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
1560 		if (!metadata[i])
1561 			goto err_per_cpu_metadata;
1562 
1563 		ptr += per_cpu_sz / sizeof(u64);
1564 	}
1565 
1566 	return metadata;
1567 
1568 err_per_cpu_metadata:
1569 	arm_spe__free_metadata(metadata, *nr_cpu);
1570 	return NULL;
1571 }
1572 
1573 static void arm_spe_free_queue(void *priv)
1574 {
1575 	struct arm_spe_queue *speq = priv;
1576 
1577 	if (!speq)
1578 		return;
1579 	thread__zput(speq->thread);
1580 	arm_spe_decoder_free(speq->decoder);
1581 	zfree(&speq->event_buf);
1582 	zfree(&speq->last_branch);
1583 	free(speq);
1584 }
1585 
1586 static void arm_spe_free_events(struct perf_session *session)
1587 {
1588 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1589 					     auxtrace);
1590 	struct auxtrace_queues *queues = &spe->queues;
1591 	unsigned int i;
1592 
1593 	for (i = 0; i < queues->nr_queues; i++) {
1594 		arm_spe_free_queue(queues->queue_array[i].priv);
1595 		queues->queue_array[i].priv = NULL;
1596 	}
1597 	auxtrace_queues__free(queues);
1598 }
1599 
1600 static void arm_spe_free(struct perf_session *session)
1601 {
1602 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1603 					     auxtrace);
1604 
1605 	auxtrace_heap__free(&spe->heap);
1606 	arm_spe_free_events(session);
1607 	session->auxtrace = NULL;
1608 	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
1609 	free(spe);
1610 }
1611 
1612 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
1613 				      struct evsel *evsel)
1614 {
1615 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
1616 
1617 	return evsel->core.attr.type == spe->pmu_type;
1618 }
1619 
1620 static const char * const metadata_hdr_v1_fmts[] = {
1621 	[ARM_SPE_PMU_TYPE]		= "  PMU Type           :%"PRId64"\n",
1622 	[ARM_SPE_PER_CPU_MMAPS]		= "  Per CPU mmaps      :%"PRId64"\n",
1623 };
1624 
1625 static const char * const metadata_hdr_fmts[] = {
1626 	[ARM_SPE_HEADER_VERSION]	= "  Header version     :%"PRId64"\n",
1627 	[ARM_SPE_HEADER_SIZE]		= "  Header size        :%"PRId64"\n",
1628 	[ARM_SPE_PMU_TYPE_V2]		= "  PMU type v2        :%"PRId64"\n",
1629 	[ARM_SPE_CPUS_NUM]		= "  CPU number         :%"PRId64"\n",
1630 };
1631 
1632 static const char * const metadata_per_cpu_fmts[] = {
1633 	[ARM_SPE_MAGIC]			= "    Magic            :0x%"PRIx64"\n",
1634 	[ARM_SPE_CPU]			= "    CPU #            :%"PRId64"\n",
1635 	[ARM_SPE_CPU_NR_PARAMS]		= "    Num of params    :%"PRId64"\n",
1636 	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
1637 	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
1638 	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
1639 	[ARM_SPE_CAP_EVENT_FILTER]	= "    Event Filter     :0x%"PRIx64"\n",
1640 };
1641 
1642 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
1643 {
1644 	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
1645 	const char * const *hdr_fmts;
1646 
1647 	if (!dump_trace)
1648 		return;
1649 
1650 	if (spe->metadata_ver == 1) {
1651 		cpu_num = 0;
1652 		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
1653 		hdr_fmts = metadata_hdr_v1_fmts;
1654 	} else {
1655 		cpu_num = arr[ARM_SPE_CPUS_NUM];
1656 		hdr_size = arr[ARM_SPE_HEADER_SIZE];
1657 		hdr_fmts = metadata_hdr_fmts;
1658 	}
1659 
1660 	for (i = 0; i < hdr_size; i++)
1661 		fprintf(stdout, hdr_fmts[i], arr[i]);
1662 
1663 	arr += hdr_size;
1664 	for (cpu = 0; cpu < cpu_num; cpu++) {
1665 		/*
1666 		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
1667 		 * are fixed. The number of the following parameters is given
1668 		 * by the field 'ARM_SPE_CPU_NR_PARAMS'.
1669 		 */
1670 		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
1671 		for (i = 0; i < cpu_size; i++)
1672 			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
1673 		arr += cpu_size;
1674 	}
1675 }
1676 
1677 static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
1678 				    const char *name)
1679 {
1680 	struct evsel *evsel;
1681 
1682 	evlist__for_each_entry(evlist, evsel) {
1683 		if (evsel->core.id && evsel->core.id[0] == id) {
1684 			if (evsel->name)
1685 				zfree(&evsel->name);
1686 			evsel->name = strdup(name);
1687 			break;
1688 		}
1689 	}
1690 }
1691 
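/*
 * Synthesize one perf event attribute per requested itrace event type
 * (cache, TLB, branch, remote access, memory and, where enabled, instruction
 * samples), assigning consecutive sample IDs starting from
 * auxtrace_synth_id_range_start(evsel).
 */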
1692 static int
1693 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
1694 {
1695 	struct evlist *evlist = session->evlist;
1696 	struct evsel *evsel;
1697 	struct perf_event_attr attr;
1698 	bool found = false;
1699 	u64 id;
1700 	int err;
1701 
1702 	evlist__for_each_entry(evlist, evsel) {
1703 		if (evsel->core.attr.type == spe->pmu_type) {
1704 			found = true;
1705 			break;
1706 		}
1707 	}
1708 
1709 	if (!found) {
1710 		pr_debug("No selected events with SPE trace data\n");
1711 		return 0;
1712 	}
1713 
1714 	memset(&attr, 0, sizeof(struct perf_event_attr));
1715 	attr.size = sizeof(struct perf_event_attr);
1716 	attr.type = PERF_TYPE_HARDWARE;
1717 	attr.sample_type = evsel->core.attr.sample_type &
1718 				(PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
1719 	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
1720 			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
1721 			    PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
1722 	if (spe->timeless_decoding)
1723 		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
1724 	else
1725 		attr.sample_type |= PERF_SAMPLE_TIME;
1726 
1727 	spe->sample_type = attr.sample_type;
1728 
1729 	attr.exclude_user = evsel->core.attr.exclude_user;
1730 	attr.exclude_kernel = evsel->core.attr.exclude_kernel;
1731 	attr.exclude_hv = evsel->core.attr.exclude_hv;
1732 	attr.exclude_host = evsel->core.attr.exclude_host;
1733 	attr.exclude_guest = evsel->core.attr.exclude_guest;
1734 	attr.sample_id_all = evsel->core.attr.sample_id_all;
1735 	attr.read_format = evsel->core.attr.read_format;
1736 	attr.sample_period = spe->synth_opts.period;
1737 
1738 	/* create new id val to be a fixed offset from evsel id */
1739 	id = auxtrace_synth_id_range_start(evsel);
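	/*
	 * Each synthesized event type enabled below consumes one id from
	 * this range, handed out consecutively in creation order.
	 */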
1740 
1741 	if (spe->synth_opts.flc) {
1742 		spe->sample_flc = true;
1743 
1744 		/* Level 1 data cache miss */
1745 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1746 		if (err)
1747 			return err;
1748 		spe->l1d_miss_id = id;
1749 		arm_spe_set_event_name(evlist, id, "l1d-miss");
1750 		id += 1;
1751 
1752 		/* Level 1 data cache access */
1753 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1754 		if (err)
1755 			return err;
1756 		spe->l1d_access_id = id;
1757 		arm_spe_set_event_name(evlist, id, "l1d-access");
1758 		id += 1;
1759 	}
1760 
1761 	if (spe->synth_opts.llc) {
1762 		spe->sample_llc = true;
1763 
1764 		/* Last level cache miss */
1765 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1766 		if (err)
1767 			return err;
1768 		spe->llc_miss_id = id;
1769 		arm_spe_set_event_name(evlist, id, "llc-miss");
1770 		id += 1;
1771 
1772 		/* Last level cache access */
1773 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1774 		if (err)
1775 			return err;
1776 		spe->llc_access_id = id;
1777 		arm_spe_set_event_name(evlist, id, "llc-access");
1778 		id += 1;
1779 	}
1780 
1781 	if (spe->synth_opts.tlb) {
1782 		spe->sample_tlb = true;
1783 
1784 		/* TLB miss */
1785 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1786 		if (err)
1787 			return err;
1788 		spe->tlb_miss_id = id;
1789 		arm_spe_set_event_name(evlist, id, "tlb-miss");
1790 		id += 1;
1791 
1792 		/* TLB access */
1793 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1794 		if (err)
1795 			return err;
1796 		spe->tlb_access_id = id;
1797 		arm_spe_set_event_name(evlist, id, "tlb-access");
1798 		id += 1;
1799 	}
1800 
1801 	if (spe->synth_opts.last_branch) {
1802 		if (spe->synth_opts.last_branch_sz > 2)
1803 			pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
1804 
1805 		attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
1806 		/*
1807 		 * We don't use the hardware index, but the sample generation
1808 		 * code uses the new format branch_stack with this field,
1809 		 * so the event attributes must indicate that it's present.
1810 		 */
1811 		attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
1812 	}
1813 
1814 	if (spe->synth_opts.branches) {
1815 		spe->sample_branch = true;
1816 
1817 		/* Branch */
1818 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1819 		if (err)
1820 			return err;
1821 		spe->branch_id = id;
1822 		arm_spe_set_event_name(evlist, id, "branch");
1823 		id += 1;
1824 	}
1825 
1826 	if (spe->synth_opts.remote_access) {
1827 		spe->sample_remote_access = true;
1828 
1829 		/* Remote access */
1830 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1831 		if (err)
1832 			return err;
1833 		spe->remote_access_id = id;
1834 		arm_spe_set_event_name(evlist, id, "remote-access");
1835 		id += 1;
1836 	}
1837 
1838 	if (spe->synth_opts.mem) {
1839 		spe->sample_memory = true;
1840 
1841 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1842 		if (err)
1843 			return err;
1844 		spe->memory_id = id;
1845 		arm_spe_set_event_name(evlist, id, "memory");
1846 		id += 1;
1847 	}
1848 
1849 	if (spe->synth_opts.instructions) {
1850 		spe->sample_instructions = true;
1851 		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
1852 
1853 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1854 		if (err)
1855 			return err;
1856 		spe->instructions_id = id;
1857 		arm_spe_set_event_name(evlist, id, "instructions");
1858 	}
1859 
1860 	return 0;
1861 }
1862 
1863 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
1864 {
1865 	u64 midr;
1866 	int i;
1867 
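	/*
	 * Treat the system as homogeneous only when metadata is present
	 * for every CPU and all CPUs report the same MIDR.
	 */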
1868 	if (!nr_cpu)
1869 		return false;
1870 
1871 	for (i = 0; i < nr_cpu; i++) {
1872 		if (!metadata[i])
1873 			return false;
1874 
1875 		if (i == 0) {
1876 			midr = metadata[i][ARM_SPE_CPU_MIDR];
1877 			continue;
1878 		}
1879 
1880 		if (midr != metadata[i][ARM_SPE_CPU_MIDR])
1881 			return false;
1882 	}
1883 
1884 	return true;
1885 }
1886 
1887 int arm_spe_process_auxtrace_info(union perf_event *event,
1888 				  struct perf_session *session)
1889 {
1890 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1891 	size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
1892 	struct perf_record_time_conv *tc = &session->time_conv;
1893 	struct arm_spe *spe;
1894 	u64 **metadata = NULL;
1895 	u64 metadata_ver;
1896 	int nr_cpu, err;
1897 
1898 	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
1899 					min_sz)
1900 		return -EINVAL;
1901 
1902 	metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
1903 					   &nr_cpu);
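	/*
	 * Version 1 metadata has no per-CPU records to parse, so a NULL
	 * metadata array is only an error for later metadata versions.
	 */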
1904 	if (!metadata && metadata_ver != 1) {
1905 		pr_err("Failed to parse Arm SPE metadata.\n");
1906 		return -EINVAL;
1907 	}
1908 
1909 	spe = zalloc(sizeof(struct arm_spe));
1910 	if (!spe) {
1911 		err = -ENOMEM;
1912 		goto err_free_metadata;
1913 	}
1914 
1915 	err = auxtrace_queues__init(&spe->queues);
1916 	if (err)
1917 		goto err_free;
1918 
1919 	spe->session = session;
1920 	spe->machine = &session->machines.host; /* No kvm support */
1921 	spe->auxtrace_type = auxtrace_info->type;
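	/* The PMU type sits at a different offset in the v1 and v2 headers. */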
1922 	if (metadata_ver == 1)
1923 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
1924 	else
1925 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
1926 	spe->metadata = metadata;
1927 	spe->metadata_ver = metadata_ver;
1928 	spe->metadata_nr_cpu = nr_cpu;
1929 	spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);
1930 
1931 	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
1932 
1933 	/*
1934 	 * The synthesized event PERF_RECORD_TIME_CONV has already been
1935 	 * handled and the parameters for the hardware clock are stored in
1936 	 * the session context.  Pass these parameters to the struct
1937 	 * perf_tsc_conversion in "spe->tc", which is later used to convert
1938 	 * between the clock counter and timestamps.
1939 	 *
1940 	 * For backward compatibility, copy the fields starting from
1941 	 * "time_cycles" only if they are contained in the event.
1942 	 */
1943 	spe->tc.time_shift = tc->time_shift;
1944 	spe->tc.time_mult = tc->time_mult;
1945 	spe->tc.time_zero = tc->time_zero;
1946 
1947 	if (event_contains(*tc, time_cycles)) {
1948 		spe->tc.time_cycles = tc->time_cycles;
1949 		spe->tc.time_mask = tc->time_mask;
1950 		spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
1951 		spe->tc.cap_user_time_short = tc->cap_user_time_short;
1952 	}
1953 
1954 	spe->auxtrace.process_event = arm_spe_process_event;
1955 	spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
1956 	spe->auxtrace.flush_events = arm_spe_flush;
1957 	spe->auxtrace.free_events = arm_spe_free_events;
1958 	spe->auxtrace.free = arm_spe_free;
1959 	spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
1960 	session->auxtrace = &spe->auxtrace;
1961 
1962 	arm_spe_print_info(spe, &auxtrace_info->priv[0]);
1963 
1964 	if (dump_trace)
1965 		return 0;
1966 
1967 	if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
1968 		spe->synth_opts = *session->itrace_synth_opts;
1969 	} else {
1970 		itrace_synth_opts__set_default(&spe->synth_opts, false);
1971 		/* Default nanoseconds period not supported */
1972 		spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS;
1973 		spe->synth_opts.period = 1;
1974 	}
1975 
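	/*
	 * SPE samples are generated by the hardware at its own interval,
	 * so only an instruction-based --itrace period is accepted; larger
	 * periods merely downsample the SPE records (see the warning
	 * below).
	 */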
1976 	if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
1977 		ui__error("You must only use i (instructions) --itrace period with Arm SPE, e.g. --itrace=i1i\n");
1978 		err = -EINVAL;
1979 		goto err_free_queues;
1980 	}
1981 	if (spe->synth_opts.period > 1)
1982 		ui__warning("Arm SPE has a hardware-based sampling period.\n\n"
1983 			    "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n");
1984 
1985 	err = arm_spe_synth_events(spe, session);
1986 	if (err)
1987 		goto err_free_queues;
1988 
1989 	err = auxtrace_queues__process_index(&spe->queues, session);
1990 	if (err)
1991 		goto err_free_queues;
1992 
1993 	if (spe->queues.populated)
1994 		spe->data_queued = true;
1995 
1996 	return 0;
1997 
1998 err_free_queues:
1999 	auxtrace_queues__free(&spe->queues);
2000 	session->auxtrace = NULL;
2001 err_free:
2002 	free(spe);
2003 err_free_metadata:
2004 	arm_spe__free_metadata(metadata, nr_cpu);
2005 	return err;
2006 }
2007