xref: /linux/tools/perf/util/arm-spe.c (revision 1a646a28a0063f1fb31589f7190f2b4fb613e413)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Arm Statistical Profiling Extensions (SPE) support
4  * Copyright (c) 2017-2018, Arm Ltd.
5  */
6 
7 #include <byteswap.h>
8 #include <endian.h>
9 #include <errno.h>
10 #include <inttypes.h>
11 #include <linux/bitops.h>
12 #include <linux/kernel.h>
13 #include <linux/log2.h>
14 #include <linux/types.h>
15 #include <linux/zalloc.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18 
19 #include "auxtrace.h"
20 #include "color.h"
21 #include "debug.h"
22 #include "evlist.h"
23 #include "evsel.h"
24 #include "machine.h"
25 #include "session.h"
26 #include "symbol.h"
27 #include "thread.h"
28 #include "thread-stack.h"
29 #include "tsc.h"
30 #include "tool.h"
31 #include "util/synthetic-events.h"
32 
33 #include "arm-spe.h"
34 #include "arm-spe-decoder/arm-spe-decoder.h"
35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
36 
37 #include "../../arch/arm64/include/asm/cputype.h"
/* All-ones 64-bit value. */
#define MAX_TIMESTAMP (~0ULL)

/* Evaluates to 1 when the operation type has the load/store bit set, else 0. */
#define is_ldst_op(op)		(!!((op) & ARM_SPE_OP_LDST))

/* Evaluates to 1 when any of the SIMD/FP, SVE, SME or ASE bits is set, else 0. */
#define is_simd_op(op)		(!!((op) & (ARM_SPE_OP_SIMD_FP | ARM_SPE_OP_SVE | \
					    ARM_SPE_OP_SME | ARM_SPE_OP_ASE)))

/* An operation is treated as a memory operation if either test above matches. */
#define is_mem_op(op)		(is_ldst_op(op) || is_simd_op(op))

/* Mask made of the ACCESS and MISS event bits of cache level 'lvl'. */
#define ARM_SPE_CACHE_EVENT(lvl) \
	(ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS)

/* Non-zero when 'type' carries an ACCESS or a MISS event for level 'lvl'. */
#define arm_spe_is_cache_level(type, lvl) \
	((type) & ARM_SPE_CACHE_EVENT(lvl))

/* True only when the ACCESS bit of 'lvl' is set and its MISS bit is clear. */
#define arm_spe_is_cache_hit(type, lvl) \
	(((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS)

/* Non-zero when the MISS bit of 'lvl' is set, whatever the ACCESS bit says. */
#define arm_spe_is_cache_miss(type, lvl) \
	((type) & ARM_SPE_##lvl##_MISS)
58 
/*
 * Top-level Arm SPE decoding state shared by all queues.
 *
 * NOTE(review): several members (the sample_* switches, the *_id values,
 * heap, auxtrace_type, pmu_type, data_queued, num_events and
 * use_ctx_pkt_for_pid) are only used outside this part of the file; they are
 * left undocumented here rather than guessed at.
 */
struct arm_spe {
	struct auxtrace			auxtrace;
	/* queues.queue_array is indexed by arm_spe_queue::queue_nr */
	struct auxtrace_queues		queues;
	struct auxtrace_heap		heap;
	/* inject, period, last_branch and last_branch_sz are read during synthesis */
	struct itrace_synth_opts        synth_opts;
	u32				auxtrace_type;
	/* Source of the data file descriptor and of the perf env; sink for synthesized events */
	struct perf_session		*session;
	/* Used for current-tid bookkeeping and thread lookup */
	struct machine			*machine;
	u32				pmu_type;

	/* Conversion parameters handed to tsc_to_perf_time() */
	struct perf_tsc_conversion	tc;

	/* When zero, sample times are derived from the record timestamp */
	u8				timeless_decoding;
	u8				data_queued;

	/* Sample type used when an event is re-synthesized for injection */
	u64				sample_type;
	u8				sample_flc;
	u8				sample_llc;
	u8				sample_tlb;
	u8				sample_branch;
	u8				sample_remote_access;
	u8				sample_memory;
	u8				sample_instructions;

	u64				l1d_miss_id;
	u64				l1d_access_id;
	u64				llc_miss_id;
	u64				llc_access_id;
	u64				tlb_miss_id;
	u64				tlb_access_id;
	u64				branch_id;
	u64				remote_access_id;
	u64				memory_id;
	u64				instructions_id;

	/* Instruction addresses at or above this value are classified as kernel */
	u64				kernel_start;

	unsigned long			num_events;
	u8				use_ctx_pkt_for_pid;

	/*
	 * Array of metadata_nr_cpu per-CPU metadata arrays; each one is
	 * indexed with ARM_SPE_CPU, ARM_SPE_CPU_MIDR, ARM_SPE_CAP_EVENT_FILTER.
	 */
	u64				**metadata;
	/* Version 1 means the MIDR is taken from the perf env cpuid string */
	u64				metadata_ver;
	u64				metadata_nr_cpu;
	/* When true, metadata[0] stands in for queues that have no CPU number */
	bool				is_homogeneous;
};
104 
/*
 * Decoding state of a single auxtrace queue.
 *
 * NOTE(review): on_heap, done, time, timestamp and sample_count are only used
 * outside this part of the file and are therefore not described here.
 */
struct arm_spe_queue {
	/* Back pointer to the shared decoding state */
	struct arm_spe			*spe;
	/* Index of this queue in spe->queues.queue_array */
	unsigned int			queue_nr;
	/* Buffer most recently returned by auxtrace_buffer__next() */
	struct auxtrace_buffer		*buffer;
	/* Last non-empty buffer handed to the decoder; dropped once replaced */
	struct auxtrace_buffer		*old_buffer;
	/* PERF_SAMPLE_MAX_SIZE bytes of scratch space for synthesized events */
	union perf_event		*event_buf;
	bool				on_heap;
	bool				done;
	/* pid, tid and cpu all start out as -1 */
	pid_t				pid;
	pid_t				tid;
	int				cpu;
	struct arm_spe_decoder		*decoder;
	u64				time;
	u64				timestamp;
	/* Cached thread matching 'tid'; released when the current tid changes */
	struct thread			*thread;
	u64				sample_count;
	/* PERF_IP_FLAG_* bits computed by arm_spe__sample_flags() */
	u32				flags;
	/* Only allocated when synth_opts.last_branch is set; holds at most two entries */
	struct branch_stack		*last_branch;
};
124 
/*
 * Associates a list of CPU MIDR ranges with the routine that translates the
 * data source value of an SPE record into perf_mem_data_src fields.
 */
struct data_source_handle {
	/* MIDR ranges this handler is selected for */
	const struct midr_range *midr_ranges;
	/* Fills 'data_src' from 'record' */
	void (*ds_synth)(const struct arm_spe_record *record,
			 union perf_mem_data_src *data_src);
};

/*
 * Build one data_source_handle initializer: 'range' is the MIDR list and
 * 'func' is pasted after "arm_spe__synth_" to name the handler function.
 */
#define DS(range, func)					\
	{						\
		.midr_ranges = range,			\
		.ds_synth = arm_spe__synth_##func,	\
	}

/* Defined further down; declared here because arm_spe_set_pid_tid_cpu() uses it. */
static int arm_spe__get_midr(struct arm_spe *spe, int cpu, u64 *midr);
138 
139 static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
140 			 unsigned char *buf, size_t len, u64 midr)
141 {
142 	struct arm_spe_pkt packet;
143 	size_t pos = 0;
144 	int ret, pkt_len, i;
145 	char desc[ARM_SPE_PKT_DESC_MAX];
146 	const char *color = PERF_COLOR_BLUE;
147 
148 	color_fprintf(stdout, color,
149 		      ". ... ARM SPE data: size %#zx bytes\n",
150 		      len);
151 
152 	while (len) {
153 		ret = arm_spe_get_packet(buf, len, &packet, midr);
154 
155 		if (ret > 0)
156 			pkt_len = ret;
157 		else
158 			pkt_len = 1;
159 		printf(".");
160 		color_fprintf(stdout, color, "  %08zx: ", pos);
161 		for (i = 0; i < pkt_len; i++)
162 			color_fprintf(stdout, color, " %02x", buf[i]);
163 		for (; i < 16; i++)
164 			color_fprintf(stdout, color, "   ");
165 		if (ret > 0) {
166 			ret = arm_spe_pkt_desc(&packet, desc,
167 					       ARM_SPE_PKT_DESC_MAX);
168 			if (!ret)
169 				color_fprintf(stdout, color, " %s\n", desc);
170 		} else {
171 			color_fprintf(stdout, color, " Bad packet!\n");
172 		}
173 		pos += pkt_len;
174 		buf += pkt_len;
175 		len -= pkt_len;
176 	}
177 }
178 
/*
 * Dump one trace buffer: print a "." separator line, then hand the buffer to
 * arm_spe_dump() with the same arguments.
 */
static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
			       size_t len, u64 midr)
{
	printf(".\n");
	arm_spe_dump(spe, buf, len, midr);
}
185 
186 static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
187 {
188 	struct arm_spe_queue *speq = data;
189 	struct auxtrace_buffer *buffer = speq->buffer;
190 	struct auxtrace_buffer *old_buffer = speq->old_buffer;
191 	struct auxtrace_queue *queue;
192 
193 	queue = &speq->spe->queues.queue_array[speq->queue_nr];
194 
195 	buffer = auxtrace_buffer__next(queue, buffer);
196 	/* If no more data, drop the previous auxtrace_buffer and return */
197 	if (!buffer) {
198 		if (old_buffer)
199 			auxtrace_buffer__drop_data(old_buffer);
200 		b->len = 0;
201 		return 0;
202 	}
203 
204 	speq->buffer = buffer;
205 
206 	/* If the aux_buffer doesn't have data associated, try to load it */
207 	if (!buffer->data) {
208 		/* get the file desc associated with the perf data file */
209 		int fd = perf_data__fd(speq->spe->session->data);
210 
211 		buffer->data = auxtrace_buffer__get_data(buffer, fd);
212 		if (!buffer->data)
213 			return -ENOMEM;
214 	}
215 
216 	b->len = buffer->size;
217 	b->buf = buffer->data;
218 
219 	if (b->len) {
220 		if (old_buffer)
221 			auxtrace_buffer__drop_data(old_buffer);
222 		speq->old_buffer = buffer;
223 	} else {
224 		auxtrace_buffer__drop_data(buffer);
225 		return arm_spe_get_trace(b, data);
226 	}
227 
228 	return 0;
229 }
230 
231 static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
232 		unsigned int queue_nr)
233 {
234 	struct arm_spe_params params = { .get_trace = 0, };
235 	struct arm_spe_queue *speq;
236 
237 	speq = zalloc(sizeof(*speq));
238 	if (!speq)
239 		return NULL;
240 
241 	speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
242 	if (!speq->event_buf)
243 		goto out_free;
244 
245 	speq->spe = spe;
246 	speq->queue_nr = queue_nr;
247 	speq->pid = -1;
248 	speq->tid = -1;
249 	speq->cpu = -1;
250 
251 	/* params set */
252 	params.get_trace = arm_spe_get_trace;
253 	params.data = speq;
254 
255 	if (spe->synth_opts.last_branch) {
256 		size_t sz = sizeof(struct branch_stack);
257 
258 		/* Allocate up to two entries for PBT + TGT */
259 		sz += sizeof(struct branch_entry) *
260 			min(spe->synth_opts.last_branch_sz, 2U);
261 		speq->last_branch = zalloc(sz);
262 		if (!speq->last_branch)
263 			goto out_free;
264 	}
265 
266 	/* create new decoder */
267 	speq->decoder = arm_spe_decoder_new(&params);
268 	if (!speq->decoder)
269 		goto out_free;
270 
271 	return speq;
272 
273 out_free:
274 	zfree(&speq->event_buf);
275 	zfree(&speq->last_branch);
276 	free(speq);
277 
278 	return NULL;
279 }
280 
281 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
282 {
283 	return ip >= spe->kernel_start ?
284 		PERF_RECORD_MISC_KERNEL :
285 		PERF_RECORD_MISC_USER;
286 }
287 
288 static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
289 				    struct auxtrace_queue *queue)
290 {
291 	struct arm_spe_queue *speq = queue->priv;
292 	pid_t tid;
293 
294 	tid = machine__get_current_tid(spe->machine, speq->cpu);
295 	if (tid != -1) {
296 		speq->tid = tid;
297 		thread__zput(speq->thread);
298 	} else
299 		speq->tid = queue->tid;
300 
301 	if ((!speq->thread) && (speq->tid != -1)) {
302 		speq->thread = machine__find_thread(spe->machine, -1,
303 						    speq->tid);
304 	}
305 
306 	if (speq->thread) {
307 		speq->pid = thread__pid(speq->thread);
308 		if (queue->cpu == -1) {
309 			speq->cpu = thread__cpu(speq->thread);
310 			arm_spe__get_midr(spe, speq->cpu, &speq->decoder->midr);
311 		}
312 	}
313 }
314 
315 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
316 {
317 	struct arm_spe *spe = speq->spe;
318 	int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
319 
320 	if (err)
321 		return err;
322 
323 	arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
324 
325 	return 0;
326 }
327 
328 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu)
329 {
330 	u64 i;
331 
332 	if (!spe->metadata)
333 		return NULL;
334 
335 	/* CPU ID is -1 for per-thread mode */
336 	if (cpu < 0) {
337 		/*
338 		 * On the heterogeneous system, due to CPU ID is -1,
339 		 * cannot confirm the data source packet is supported.
340 		 */
341 		if (!spe->is_homogeneous)
342 			return NULL;
343 
344 		/* In homogeneous system, simply use CPU0's metadata */
345 		return spe->metadata[0];
346 	}
347 
348 	for (i = 0; i < spe->metadata_nr_cpu; i++)
349 		if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu)
350 			return spe->metadata[i];
351 
352 	return NULL;
353 }
354 
355 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
356 {
357 	struct simd_flags simd_flags = {};
358 
359 	if (record->op & ARM_SPE_OP_SVE)
360 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
361 	else if (record->op & ARM_SPE_OP_SME)
362 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SME;
363 	else if (record->op & (ARM_SPE_OP_ASE | ARM_SPE_OP_SIMD_FP))
364 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_ASE;
365 
366 	if (record->op & ARM_SPE_OP_SVE) {
367 		if (!(record->op & ARM_SPE_OP_PRED))
368 			simd_flags.pred = SIMD_OP_FLAGS_PRED_DISABLED;
369 		else if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
370 			simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL;
371 		else if (record->type & ARM_SPE_SVE_EMPTY_PRED)
372 			simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY;
373 		else
374 			simd_flags.pred = SIMD_OP_FLAGS_PRED_FULL;
375 	} else {
376 		if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
377 			simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL;
378 		else if (record->type & ARM_SPE_SVE_EMPTY_PRED)
379 			simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY;
380 	}
381 
382 	return simd_flags;
383 }
384 
385 static void arm_spe_prep_sample(struct arm_spe *spe,
386 				struct arm_spe_queue *speq,
387 				union perf_event *event,
388 				struct perf_sample *sample)
389 {
390 	struct arm_spe_record *record = &speq->decoder->record;
391 
392 	if (!spe->timeless_decoding)
393 		sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);
394 
395 	sample->ip = record->from_ip;
396 	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
397 	sample->pid = speq->pid;
398 	sample->tid = speq->tid;
399 	sample->period = spe->synth_opts.period;
400 	sample->cpu = speq->cpu;
401 	sample->simd_flags = arm_spe__synth_simd_flags(record);
402 
403 	event->sample.header.type = PERF_RECORD_SAMPLE;
404 	event->sample.header.misc = sample->cpumode;
405 	event->sample.header.size = sizeof(struct perf_event_header);
406 }
407 
408 static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
409 {
410 	struct arm_spe *spe = speq->spe;
411 	struct arm_spe_record *record = &speq->decoder->record;
412 	struct branch_stack *bstack = speq->last_branch;
413 	struct branch_flags *bs_flags;
414 	unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
415 	bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
416 	bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
417 	size_t sz = sizeof(struct branch_stack) +
418 		    sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
419 	int i = 0;
420 
421 	/* Clean up branch stack */
422 	memset(bstack, 0x0, sz);
423 
424 	if (!have_tgt && !have_pbt)
425 		return;
426 
427 	if (have_tgt) {
428 		bstack->entries[i].from = record->from_ip;
429 		bstack->entries[i].to = record->to_ip;
430 
431 		bs_flags = &bstack->entries[i].flags;
432 		bs_flags->value = 0;
433 
434 		if (record->op & ARM_SPE_OP_BR_CR_BL) {
435 			if (record->op & ARM_SPE_OP_BR_COND)
436 				bs_flags->type |= PERF_BR_COND_CALL;
437 			else
438 				bs_flags->type |= PERF_BR_CALL;
439 		/*
440 		 * Indirect branch instruction without link (e.g. BR),
441 		 * take this case as function return.
442 		 */
443 		} else if (record->op & ARM_SPE_OP_BR_CR_RET ||
444 			   record->op & ARM_SPE_OP_BR_INDIRECT) {
445 			if (record->op & ARM_SPE_OP_BR_COND)
446 				bs_flags->type |= PERF_BR_COND_RET;
447 			else
448 				bs_flags->type |= PERF_BR_RET;
449 		} else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
450 			if (record->op & ARM_SPE_OP_BR_COND)
451 				bs_flags->type |= PERF_BR_COND;
452 			else
453 				bs_flags->type |= PERF_BR_UNCOND;
454 		} else {
455 			if (record->op & ARM_SPE_OP_BR_COND)
456 				bs_flags->type |= PERF_BR_COND;
457 			else
458 				bs_flags->type |= PERF_BR_UNKNOWN;
459 		}
460 
461 		if (record->type & ARM_SPE_BRANCH_MISS) {
462 			bs_flags->mispred = 1;
463 			bs_flags->predicted = 0;
464 		} else {
465 			bs_flags->mispred = 0;
466 			bs_flags->predicted = 1;
467 		}
468 
469 		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
470 			bs_flags->not_taken = 1;
471 
472 		if (record->type & ARM_SPE_IN_TXN)
473 			bs_flags->in_tx = 1;
474 
475 		bs_flags->cycles = min(record->latency, 0xFFFFU);
476 		i++;
477 	}
478 
479 	if (have_pbt) {
480 		bs_flags = &bstack->entries[i].flags;
481 		bs_flags->type |= PERF_BR_UNKNOWN;
482 		bstack->entries[i].to = record->prev_br_tgt;
483 		i++;
484 	}
485 
486 	bstack->nr = i;
487 	bstack->hw_idx = -1ULL;
488 }
489 
490 static int arm_spe__inject_event(struct arm_spe *spe, union perf_event *event,
491 				 struct perf_sample *sample, u64 type)
492 {
493 	struct evsel *evsel = sample->evsel;
494 	u64 branch_sample_type = 0;
495 	size_t sz;
496 
497 	if (!evsel && spe->session && spe->session->evlist)
498 		evsel = evlist__id2evsel(spe->session->evlist, sample->id);
499 
500 	if (evsel)
501 		branch_sample_type = evsel->core.attr.branch_sample_type;
502 
503 	event->header.type = PERF_RECORD_SAMPLE;
504 	sz = perf_event__sample_event_size(sample, type, /*read_format=*/0,
505 					   branch_sample_type);
506 	if (sz >= PERF_SAMPLE_MAX_SIZE) {
507 		pr_err("Sample size %zu exceeds max size %d\n", sz, PERF_SAMPLE_MAX_SIZE);
508 		return -EFAULT;
509 	}
510 	event->header.size = sz;
511 
512 	return perf_event__synthesize_sample(event, type, /*read_format=*/0,
513 					     branch_sample_type, sample);
514 }
515 
516 static inline int
517 arm_spe_deliver_synth_event(struct arm_spe *spe,
518 			    struct arm_spe_queue *speq __maybe_unused,
519 			    union perf_event *event,
520 			    struct perf_sample *sample)
521 {
522 	int ret;
523 
524 	if (spe->synth_opts.inject) {
525 		ret = arm_spe__inject_event(spe, event, sample, spe->sample_type);
526 		if (ret)
527 			return ret;
528 	}
529 
530 	ret = perf_session__deliver_synth_event(spe->session, event, sample);
531 	if (ret)
532 		pr_err("ARM SPE: failed to deliver event, error %d\n", ret);
533 
534 	return ret;
535 }
536 
537 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
538 				     u64 spe_events_id,
539 				     union perf_mem_data_src data_src)
540 {
541 	struct arm_spe *spe = speq->spe;
542 	struct arm_spe_record *record = &speq->decoder->record;
543 	union perf_event *event = speq->event_buf;
544 	struct perf_sample sample;
545 	int ret;
546 
547 	perf_sample__init(&sample, /*all=*/true);
548 	arm_spe_prep_sample(spe, speq, event, &sample);
549 
550 	sample.id = spe_events_id;
551 	sample.stream_id = spe_events_id;
552 	sample.addr = record->virt_addr;
553 	sample.phys_addr = record->phys_addr;
554 	sample.data_src = data_src.val;
555 	sample.weight = record->latency;
556 
557 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
558 	perf_sample__exit(&sample);
559 	return ret;
560 }
561 
562 static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
563 					u64 spe_events_id)
564 {
565 	struct arm_spe *spe = speq->spe;
566 	struct arm_spe_record *record = &speq->decoder->record;
567 	union perf_event *event = speq->event_buf;
568 	struct perf_sample sample;
569 	int ret;
570 
571 	perf_sample__init(&sample, /*all=*/true);
572 	arm_spe_prep_sample(spe, speq, event, &sample);
573 
574 	sample.id = spe_events_id;
575 	sample.stream_id = spe_events_id;
576 	sample.addr = record->to_ip;
577 	sample.weight = record->latency;
578 	sample.flags = speq->flags;
579 	sample.branch_stack = speq->last_branch;
580 
581 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
582 	perf_sample__exit(&sample);
583 	return ret;
584 }
585 
586 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
587 					     u64 spe_events_id,
588 					     union perf_mem_data_src data_src)
589 {
590 	struct arm_spe *spe = speq->spe;
591 	struct arm_spe_record *record = &speq->decoder->record;
592 	union perf_event *event = speq->event_buf;
593 	struct perf_sample sample;
594 	int ret;
595 
596 	perf_sample__init(&sample, /*all=*/true);
597 	arm_spe_prep_sample(spe, speq, event, &sample);
598 
599 	sample.id = spe_events_id;
600 	sample.stream_id = spe_events_id;
601 	sample.addr = record->to_ip;
602 	sample.phys_addr = record->phys_addr;
603 	sample.data_src = data_src.val;
604 	sample.weight = record->latency;
605 	sample.flags = speq->flags;
606 	sample.branch_stack = speq->last_branch;
607 
608 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
609 	perf_sample__exit(&sample);
610 	return ret;
611 }
612 
/*
 * CPUs whose data source values are decoded by
 * arm_spe__synth_data_source_common() (see data_source_handles[]).
 * NOTE(review): the trailing empty entry presumably terminates the list for
 * is_midr_in_range_list() - confirm against its implementation.
 */
static const struct midr_range common_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X4),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
	MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
	{},
};

/* CPUs decoded by arm_spe__synth_data_source_ampereone(). */
static const struct midr_range ampereone_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
	{},
};

/* CPUs decoded by arm_spe__synth_data_source_hisi_hip(). */
static const struct midr_range hisi_hip_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
	{},
};
642 
643 static void arm_spe__sample_flags(struct arm_spe_queue *speq)
644 {
645 	const struct arm_spe_record *record = &speq->decoder->record;
646 
647 	speq->flags = 0;
648 	if (record->op & ARM_SPE_OP_BRANCH_ERET) {
649 		speq->flags = PERF_IP_FLAG_BRANCH;
650 
651 		if (record->type & ARM_SPE_BRANCH_MISS)
652 			speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
653 
654 		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
655 			speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
656 
657 		if (record->type & ARM_SPE_IN_TXN)
658 			speq->flags |= PERF_IP_FLAG_IN_TX;
659 
660 		if (record->op & ARM_SPE_OP_BR_COND)
661 			speq->flags |= PERF_IP_FLAG_CONDITIONAL;
662 
663 		if (record->op & ARM_SPE_OP_BR_CR_BL)
664 			speq->flags |= PERF_IP_FLAG_CALL;
665 		else if (record->op & ARM_SPE_OP_BR_CR_RET)
666 			speq->flags |= PERF_IP_FLAG_RETURN;
667 		/*
668 		 * Indirect branch instruction without link (e.g. BR),
669 		 * take it as a function return.
670 		 */
671 		else if (record->op & ARM_SPE_OP_BR_INDIRECT)
672 			speq->flags |= PERF_IP_FLAG_RETURN;
673 	}
674 }
675 
676 static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
677 					      union perf_mem_data_src *data_src)
678 {
679 	/*
680 	 * Even though four levels of cache hierarchy are possible, no known
681 	 * production Neoverse systems currently include more than three levels
682 	 * so for the time being we assume three exist. If a production system
683 	 * is built with four the this function would have to be changed to
684 	 * detect the number of levels for reporting.
685 	 */
686 
687 	/*
688 	 * We have no data on the hit level or data source for stores in the
689 	 * Neoverse SPE records.
690 	 */
691 	if (record->op & ARM_SPE_OP_ST) {
692 		data_src->mem_lvl = PERF_MEM_LVL_NA;
693 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
694 		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
695 		return;
696 	}
697 
698 	switch (record->source) {
699 	case ARM_SPE_COMMON_DS_L1D:
700 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
701 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
702 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
703 		break;
704 	case ARM_SPE_COMMON_DS_L2:
705 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
706 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
707 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
708 		break;
709 	case ARM_SPE_COMMON_DS_PEER_CORE:
710 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
711 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
712 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
713 		break;
714 	/*
715 	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
716 	 * transfer, so set SNOOPX_PEER
717 	 */
718 	case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
719 	case ARM_SPE_COMMON_DS_PEER_CLUSTER:
720 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
721 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
722 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
723 		break;
724 	/*
725 	 * System cache is assumed to be L3
726 	 */
727 	case ARM_SPE_COMMON_DS_SYS_CACHE:
728 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
729 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
730 		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
731 		break;
732 	/*
733 	 * We don't know what level it hit in, except it came from the other
734 	 * socket
735 	 */
736 	case ARM_SPE_COMMON_DS_REMOTE:
737 		data_src->mem_lvl = PERF_MEM_LVL_NA;
738 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
739 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
740 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
741 		break;
742 	case ARM_SPE_COMMON_DS_DRAM:
743 		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
744 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
745 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
746 		break;
747 	default:
748 		break;
749 	}
750 }
751 
752 /*
753  * Source is IMPDEF. Here we convert the source code used on AmpereOne cores
754  * to the common (Neoverse, Cortex) to avoid duplicating the decoding code.
755  */
756 static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
757 						 union perf_mem_data_src *data_src)
758 {
759 	struct arm_spe_record common_record;
760 
761 	switch (record->source) {
762 	case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
763 		common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
764 		break;
765 	case ARM_SPE_AMPEREONE_SLC:
766 		common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
767 		break;
768 	case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
769 		common_record.source = ARM_SPE_COMMON_DS_REMOTE;
770 		break;
771 	case ARM_SPE_AMPEREONE_DDR:
772 		common_record.source = ARM_SPE_COMMON_DS_DRAM;
773 		break;
774 	case ARM_SPE_AMPEREONE_L1D:
775 		common_record.source = ARM_SPE_COMMON_DS_L1D;
776 		break;
777 	case ARM_SPE_AMPEREONE_L2D:
778 		common_record.source = ARM_SPE_COMMON_DS_L2;
779 		break;
780 	default:
781 		pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
782 				record->source);
783 		return;
784 	}
785 
786 	common_record.op = record->op;
787 	arm_spe__synth_data_source_common(&common_record, data_src);
788 }
789 
790 static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record,
791 						union perf_mem_data_src *data_src)
792 {
793 	/* Use common synthesis method to handle store operations */
794 	if (record->op & ARM_SPE_OP_ST) {
795 		arm_spe__synth_data_source_common(record, data_src);
796 		return;
797 	}
798 
799 	switch (record->source) {
800 	case ARM_SPE_HISI_HIP_PEER_CPU:
801 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
802 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
803 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
804 		break;
805 	case ARM_SPE_HISI_HIP_PEER_CPU_HITM:
806 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
807 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
808 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
809 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
810 		break;
811 	case ARM_SPE_HISI_HIP_L3:
812 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
813 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
814 		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
815 		break;
816 	case ARM_SPE_HISI_HIP_L3_HITM:
817 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
818 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
819 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
820 		break;
821 	case ARM_SPE_HISI_HIP_PEER_CLUSTER:
822 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
823 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
824 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
825 		break;
826 	case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM:
827 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
828 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
829 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
830 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
831 		break;
832 	case ARM_SPE_HISI_HIP_REMOTE_SOCKET:
833 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
834 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
835 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
836 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
837 		break;
838 	case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM:
839 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
840 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
841 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
842 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
843 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
844 		break;
845 	case ARM_SPE_HISI_HIP_LOCAL_MEM:
846 		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
847 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
848 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
849 		break;
850 	case ARM_SPE_HISI_HIP_REMOTE_MEM:
851 		data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
852 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
853 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
854 		break;
855 	case ARM_SPE_HISI_HIP_NC_DEV:
856 		data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT;
857 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
858 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
859 		break;
860 	case ARM_SPE_HISI_HIP_L2:
861 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
862 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
863 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
864 		break;
865 	case ARM_SPE_HISI_HIP_L2_HITM:
866 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
867 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
868 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
869 		break;
870 	case ARM_SPE_HISI_HIP_L1:
871 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
872 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
873 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
874 		break;
875 	default:
876 		break;
877 	}
878 }
879 
/*
 * Data source decoders, each paired with the list of CPUs it applies to.
 * arm_spe__synth_ds() walks this table in order and uses the first entry
 * whose MIDR list matches.
 */
static const struct data_source_handle data_source_handles[] = {
	DS(common_ds_encoding_cpus, data_source_common),
	DS(ampereone_ds_encoding_cpus, data_source_ampereone),
	DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
};
885 
886 static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record,
887 					   union perf_mem_data_src *data_src)
888 {
889 	/*
890 	 * To find a cache hit, search in ascending order from the lower level
891 	 * caches to the higher level caches. This reflects the best scenario
892 	 * for a cache hit.
893 	 */
894 	if (arm_spe_is_cache_hit(record->type, L1D)) {
895 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
896 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
897 	} else if (record->type & ARM_SPE_RECENTLY_FETCHED) {
898 		data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
899 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB;
900 	} else if (arm_spe_is_cache_hit(record->type, L2D)) {
901 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
902 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
903 	} else if (arm_spe_is_cache_hit(record->type, LLC)) {
904 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
905 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
906 	/*
907 	 * To find a cache miss, search in descending order from the higher
908 	 * level cache to the lower level cache. This represents the worst
909 	 * scenario for a cache miss.
910 	 */
911 	} else if (arm_spe_is_cache_miss(record->type, LLC)) {
912 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS;
913 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
914 	} else if (arm_spe_is_cache_miss(record->type, L2D)) {
915 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS;
916 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
917 	} else if (arm_spe_is_cache_miss(record->type, L1D)) {
918 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
919 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
920 	}
921 }
922 
923 static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record,
924 					   union perf_mem_data_src *data_src)
925 {
926 	/* Record the greatest level info for a store operation. */
927 	if (arm_spe_is_cache_level(record->type, LLC)) {
928 		data_src->mem_lvl = PERF_MEM_LVL_L3;
929 		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ?
930 				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
931 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
932 	} else if (arm_spe_is_cache_level(record->type, L2D)) {
933 		data_src->mem_lvl = PERF_MEM_LVL_L2;
934 		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ?
935 				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
936 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
937 	} else if (arm_spe_is_cache_level(record->type, L1D)) {
938 		data_src->mem_lvl = PERF_MEM_LVL_L1;
939 		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ?
940 				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
941 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
942 	}
943 }
944 
945 static void arm_spe__synth_memory_level(struct arm_spe_queue *speq,
946 					const struct arm_spe_record *record,
947 					union perf_mem_data_src *data_src)
948 {
949 	struct arm_spe *spe = speq->spe;
950 
951 	/*
952 	 * The data source packet contains more info for cache levels for
953 	 * peer snooping. So respect the memory level if has been set by
954 	 * data source parsing.
955 	 */
956 	if (!data_src->mem_lvl) {
957 		if (data_src->mem_op == PERF_MEM_OP_LOAD)
958 			arm_spe__synth_ld_memory_level(record, data_src);
959 		if (data_src->mem_op == PERF_MEM_OP_STORE)
960 			arm_spe__synth_st_memory_level(record, data_src);
961 	}
962 
963 	if (!data_src->mem_lvl) {
964 		data_src->mem_lvl = PERF_MEM_LVL_NA;
965 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
966 	}
967 
968 	/*
969 	 * If 'mem_snoop' has been set by data source packet, skip to set
970 	 * it at here.
971 	 */
972 	if (!data_src->mem_snoop) {
973 		if (record->type & ARM_SPE_DATA_SNOOPED) {
974 			if (record->type & ARM_SPE_HITM)
975 				data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
976 			else
977 				data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
978 		} else {
979 			u64 *metadata =
980 				arm_spe__get_metadata_by_cpu(spe, speq->cpu);
981 
982 			/*
983 			 * Set NA ("Not available") mode if no meta data or the
984 			 * SNOOPED event is not supported.
985 			 */
986 			if (!metadata ||
987 			    !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED))
988 				data_src->mem_snoop = PERF_MEM_SNOOP_NA;
989 			else
990 				data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
991 		}
992 	}
993 
994 	if (!data_src->mem_remote) {
995 		if (record->type & ARM_SPE_REMOTE_ACCESS)
996 			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
997 	}
998 }
999 
1000 static int arm_spe__get_midr(struct arm_spe *spe, int cpu, u64 *midr)
1001 {
1002 	u64 *metadata;
1003 
1004 	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
1005 	if (spe->metadata_ver == 1) {
1006 		const char *cpuid;
1007 
1008 		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
1009 		cpuid = perf_env__cpuid(perf_session__env(spe->session));
1010 		if (!cpuid)
1011 			goto err;
1012 
1013 		*midr = strtol(cpuid, NULL, 16);
1014 		return 0;
1015 	}
1016 
1017 	metadata = arm_spe__get_metadata_by_cpu(spe, cpu);
1018 	if (!metadata)
1019 		goto err;
1020 
1021 	*midr = metadata[ARM_SPE_CPU_MIDR];
1022 	return 0;
1023 
1024 err:
1025 	pr_warning_once("Failed to get MIDR for CPU %d\n", cpu);
1026 	return -EINVAL;
1027 }
1028 
1029 static void arm_spe__synth_ds(struct arm_spe_queue *speq,
1030 			      const struct arm_spe_record *record,
1031 			      union perf_mem_data_src *data_src)
1032 {
1033 	u64 midr;
1034 	unsigned int i;
1035 
1036 	if (arm_spe__get_midr(speq->spe, speq->cpu, &midr))
1037 		return;
1038 
1039 	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
1040 		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
1041 			return data_source_handles[i].ds_synth(record, data_src);
1042 		}
1043 	}
1044 
1045 	return;
1046 }
1047 
1048 static union perf_mem_data_src
1049 arm_spe__synth_data_source(struct arm_spe_queue *speq,
1050 			   const struct arm_spe_record *record)
1051 {
1052 	union perf_mem_data_src	data_src = {};
1053 
1054 	if (!is_mem_op(record->op))
1055 		return data_src;
1056 
1057 	if (record->op & ARM_SPE_OP_LD)
1058 		data_src.mem_op = PERF_MEM_OP_LOAD;
1059 	else if (record->op & ARM_SPE_OP_ST)
1060 		data_src.mem_op = PERF_MEM_OP_STORE;
1061 	else
1062 		data_src.mem_op = PERF_MEM_OP_NA;
1063 
1064 	arm_spe__synth_ds(speq, record, &data_src);
1065 	arm_spe__synth_memory_level(speq, record, &data_src);
1066 
1067 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
1068 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
1069 
1070 		if (record->type & ARM_SPE_TLB_MISS)
1071 			data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
1072 		else
1073 			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
1074 	}
1075 
1076 	return data_src;
1077 }
1078 
1079 static int arm_spe_sample(struct arm_spe_queue *speq)
1080 {
1081 	const struct arm_spe_record *record = &speq->decoder->record;
1082 	struct arm_spe *spe = speq->spe;
1083 	union perf_mem_data_src data_src;
1084 	int err;
1085 
1086 	/*
1087 	 * Discard all samples until period is reached
1088 	 */
1089 	speq->sample_count++;
1090 	if (speq->sample_count < spe->synth_opts.period)
1091 		return 0;
1092 	speq->sample_count = 0;
1093 
1094 	arm_spe__sample_flags(speq);
1095 	data_src = arm_spe__synth_data_source(speq, record);
1096 
1097 	if (spe->sample_flc) {
1098 		if (record->type & ARM_SPE_L1D_MISS) {
1099 			err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
1100 							data_src);
1101 			if (err)
1102 				return err;
1103 		}
1104 
1105 		if (record->type & ARM_SPE_L1D_ACCESS) {
1106 			err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
1107 							data_src);
1108 			if (err)
1109 				return err;
1110 		}
1111 	}
1112 
1113 	if (spe->sample_llc) {
1114 		if (record->type & ARM_SPE_LLC_MISS) {
1115 			err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
1116 							data_src);
1117 			if (err)
1118 				return err;
1119 		}
1120 
1121 		if (record->type & ARM_SPE_LLC_ACCESS) {
1122 			err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
1123 							data_src);
1124 			if (err)
1125 				return err;
1126 		}
1127 	}
1128 
1129 	if (spe->sample_tlb) {
1130 		if (record->type & ARM_SPE_TLB_MISS) {
1131 			err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
1132 							data_src);
1133 			if (err)
1134 				return err;
1135 		}
1136 
1137 		if (record->type & ARM_SPE_TLB_ACCESS) {
1138 			err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
1139 							data_src);
1140 			if (err)
1141 				return err;
1142 		}
1143 	}
1144 
1145 	if (spe->synth_opts.last_branch &&
1146 	    (spe->sample_branch || spe->sample_instructions))
1147 		arm_spe__prep_branch_stack(speq);
1148 
1149 	if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
1150 		err = arm_spe__synth_branch_sample(speq, spe->branch_id);
1151 		if (err)
1152 			return err;
1153 	}
1154 
1155 	if (spe->sample_remote_access &&
1156 	    (record->type & ARM_SPE_REMOTE_ACCESS)) {
1157 		err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
1158 						data_src);
1159 		if (err)
1160 			return err;
1161 	}
1162 
1163 	if (spe->sample_memory && is_mem_op(record->op)) {
1164 		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
1165 		if (err)
1166 			return err;
1167 	}
1168 
1169 	if (spe->sample_instructions) {
1170 		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
1171 		if (err)
1172 			return err;
1173 	}
1174 
1175 	return 0;
1176 }
1177 
1178 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
1179 {
1180 	struct arm_spe *spe = speq->spe;
1181 	struct arm_spe_record *record;
1182 	int ret;
1183 
1184 	if (!spe->kernel_start)
1185 		spe->kernel_start = machine__kernel_start(spe->machine);
1186 
1187 	while (1) {
1188 		/*
1189 		 * The usual logic is firstly to decode the packets, and then
1190 		 * based the record to synthesize sample; but here the flow is
1191 		 * reversed: it calls arm_spe_sample() for synthesizing samples
1192 		 * prior to arm_spe_decode().
1193 		 *
1194 		 * Two reasons for this code logic:
1195 		 * 1. Firstly, when setup queue in arm_spe__setup_queue(), it
1196 		 * has decoded trace data and generated a record, but the record
1197 		 * is left to generate sample until run to here, so it's correct
1198 		 * to synthesize sample for the left record.
1199 		 * 2. After decoding trace data, it needs to compare the record
1200 		 * timestamp with the coming perf event, if the record timestamp
1201 		 * is later than the perf event, it needs bail out and pushs the
1202 		 * record into auxtrace heap, thus the record can be deferred to
1203 		 * synthesize sample until run to here at the next time; so this
1204 		 * can correlate samples between Arm SPE trace data and other
1205 		 * perf events with correct time ordering.
1206 		 */
1207 
1208 		/*
1209 		 * Update pid/tid info.
1210 		 */
1211 		record = &speq->decoder->record;
1212 		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
1213 			ret = arm_spe_set_tid(speq, record->context_id);
1214 			if (ret)
1215 				return ret;
1216 
1217 			spe->use_ctx_pkt_for_pid = true;
1218 		}
1219 
1220 		ret = arm_spe_sample(speq);
1221 		if (ret)
1222 			return ret;
1223 
1224 		ret = arm_spe_decode(speq->decoder);
1225 		if (!ret) {
1226 			pr_debug("No data or all data has been processed.\n");
1227 			return 1;
1228 		}
1229 
1230 		/*
1231 		 * Error is detected when decode SPE trace data, continue to
1232 		 * the next trace data and find out more records.
1233 		 */
1234 		if (ret < 0)
1235 			continue;
1236 
1237 		record = &speq->decoder->record;
1238 
1239 		/* Update timestamp for the last record */
1240 		if (record->timestamp > speq->timestamp)
1241 			speq->timestamp = record->timestamp;
1242 
1243 		/*
1244 		 * If the timestamp of the queue is later than timestamp of the
1245 		 * coming perf event, bail out so can allow the perf event to
1246 		 * be processed ahead.
1247 		 */
1248 		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
1249 			*timestamp = speq->timestamp;
1250 			return 0;
1251 		}
1252 	}
1253 
1254 	return 0;
1255 }
1256 
1257 static int arm_spe__setup_queue(struct arm_spe *spe,
1258 			       struct auxtrace_queue *queue,
1259 			       unsigned int queue_nr)
1260 {
1261 	struct arm_spe_queue *speq = queue->priv;
1262 	struct arm_spe_record *record;
1263 
1264 	if (list_empty(&queue->head) || speq)
1265 		return 0;
1266 
1267 	speq = arm_spe__alloc_queue(spe, queue_nr);
1268 
1269 	if (!speq)
1270 		return -ENOMEM;
1271 
1272 	queue->priv = speq;
1273 
1274 	if (queue->cpu != -1)
1275 		speq->cpu = queue->cpu;
1276 	arm_spe__get_midr(spe, queue->cpu, &speq->decoder->midr);
1277 
1278 	if (!speq->on_heap) {
1279 		int ret;
1280 
1281 		if (spe->timeless_decoding)
1282 			return 0;
1283 
1284 retry:
1285 		ret = arm_spe_decode(speq->decoder);
1286 
1287 		if (!ret)
1288 			return 0;
1289 
1290 		if (ret < 0)
1291 			goto retry;
1292 
1293 		record = &speq->decoder->record;
1294 
1295 		speq->timestamp = record->timestamp;
1296 		ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
1297 		if (ret)
1298 			return ret;
1299 		speq->on_heap = true;
1300 	}
1301 
1302 	return 0;
1303 }
1304 
1305 static int arm_spe__setup_queues(struct arm_spe *spe)
1306 {
1307 	unsigned int i;
1308 	int ret;
1309 
1310 	for (i = 0; i < spe->queues.nr_queues; i++) {
1311 		ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
1312 		if (ret)
1313 			return ret;
1314 	}
1315 
1316 	return 0;
1317 }
1318 
1319 static int arm_spe__update_queues(struct arm_spe *spe)
1320 {
1321 	if (spe->queues.new_data) {
1322 		spe->queues.new_data = false;
1323 		return arm_spe__setup_queues(spe);
1324 	}
1325 
1326 	return 0;
1327 }
1328 
1329 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
1330 {
1331 	struct evsel *evsel;
1332 	struct evlist *evlist = spe->session->evlist;
1333 	bool timeless_decoding = true;
1334 
1335 	/*
1336 	 * Circle through the list of event and complain if we find one
1337 	 * with the time bit set.
1338 	 */
1339 	evlist__for_each_entry(evlist, evsel) {
1340 		if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
1341 			timeless_decoding = false;
1342 	}
1343 
1344 	return timeless_decoding;
1345 }
1346 
1347 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
1348 {
1349 	unsigned int queue_nr;
1350 	u64 ts;
1351 	int ret;
1352 
1353 	while (1) {
1354 		struct auxtrace_queue *queue;
1355 		struct arm_spe_queue *speq;
1356 
1357 		if (!spe->heap.heap_cnt)
1358 			return 0;
1359 
1360 		if (spe->heap.heap_array[0].ordinal >= timestamp)
1361 			return 0;
1362 
1363 		queue_nr = spe->heap.heap_array[0].queue_nr;
1364 		queue = &spe->queues.queue_array[queue_nr];
1365 		speq = queue->priv;
1366 
1367 		auxtrace_heap__pop(&spe->heap);
1368 
1369 		if (spe->heap.heap_cnt) {
1370 			ts = spe->heap.heap_array[0].ordinal + 1;
1371 			if (ts > timestamp)
1372 				ts = timestamp;
1373 		} else {
1374 			ts = timestamp;
1375 		}
1376 
1377 		/*
1378 		 * A previous context-switch event has set pid/tid in the machine's context, so
1379 		 * here we need to update the pid/tid in the thread and SPE queue.
1380 		 */
1381 		if (!spe->use_ctx_pkt_for_pid)
1382 			arm_spe_set_pid_tid_cpu(spe, queue);
1383 
1384 		ret = arm_spe_run_decoder(speq, &ts);
1385 		if (ret < 0) {
1386 			auxtrace_heap__add(&spe->heap, queue_nr, ts);
1387 			return ret;
1388 		}
1389 
1390 		if (!ret) {
1391 			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
1392 			if (ret < 0)
1393 				return ret;
1394 		} else {
1395 			speq->on_heap = false;
1396 		}
1397 	}
1398 
1399 	return 0;
1400 }
1401 
1402 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
1403 					    u64 time_)
1404 {
1405 	struct auxtrace_queues *queues = &spe->queues;
1406 	unsigned int i;
1407 	u64 ts = 0;
1408 
1409 	for (i = 0; i < queues->nr_queues; i++) {
1410 		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
1411 		struct arm_spe_queue *speq = queue->priv;
1412 
1413 		if (speq && (tid == -1 || speq->tid == tid)) {
1414 			speq->time = time_;
1415 			arm_spe_set_pid_tid_cpu(spe, queue);
1416 			arm_spe_run_decoder(speq, &ts);
1417 		}
1418 	}
1419 	return 0;
1420 }
1421 
1422 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
1423 				  struct perf_sample *sample)
1424 {
1425 	pid_t pid, tid;
1426 	int cpu;
1427 
1428 	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
1429 		return 0;
1430 
1431 	pid = event->context_switch.next_prev_pid;
1432 	tid = event->context_switch.next_prev_tid;
1433 	cpu = sample->cpu;
1434 
1435 	if (tid == -1)
1436 		pr_warning("context_switch event has no tid\n");
1437 
1438 	return machine__set_current_tid(spe->machine, cpu, pid, tid);
1439 }
1440 
1441 static int arm_spe_process_event(struct perf_session *session,
1442 				 union perf_event *event,
1443 				 struct perf_sample *sample,
1444 				 const struct perf_tool *tool)
1445 {
1446 	int err = 0;
1447 	u64 timestamp;
1448 	struct arm_spe *spe = container_of(session->auxtrace,
1449 			struct arm_spe, auxtrace);
1450 
1451 	if (dump_trace)
1452 		return 0;
1453 
1454 	if (!tool->ordered_events) {
1455 		pr_err("SPE trace requires ordered events\n");
1456 		return -EINVAL;
1457 	}
1458 
1459 	if (sample->time && (sample->time != (u64) -1))
1460 		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
1461 	else
1462 		timestamp = 0;
1463 
1464 	if (timestamp || spe->timeless_decoding) {
1465 		err = arm_spe__update_queues(spe);
1466 		if (err)
1467 			return err;
1468 	}
1469 
1470 	if (spe->timeless_decoding) {
1471 		if (event->header.type == PERF_RECORD_EXIT) {
1472 			err = arm_spe_process_timeless_queues(spe,
1473 					event->fork.tid,
1474 					sample->time);
1475 		}
1476 	} else if (timestamp) {
1477 		err = arm_spe_process_queues(spe, timestamp);
1478 		if (err)
1479 			return err;
1480 
1481 		if (!spe->use_ctx_pkt_for_pid &&
1482 		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
1483 		    event->header.type == PERF_RECORD_SWITCH))
1484 			err = arm_spe_context_switch(spe, event, sample);
1485 	}
1486 
1487 	return err;
1488 }
1489 
1490 static int arm_spe_process_auxtrace_event(struct perf_session *session,
1491 					  union perf_event *event,
1492 					  const struct perf_tool *tool __maybe_unused)
1493 {
1494 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1495 					     auxtrace);
1496 
1497 	if (!spe->data_queued) {
1498 		struct auxtrace_buffer *buffer;
1499 		off_t data_offset;
1500 		int fd = perf_data__fd(session->data);
1501 		int err;
1502 
1503 		if (perf_data__is_pipe(session->data)) {
1504 			data_offset = 0;
1505 		} else {
1506 			data_offset = lseek(fd, 0, SEEK_CUR);
1507 			if (data_offset == -1)
1508 				return -errno;
1509 		}
1510 
1511 		err = auxtrace_queues__add_event(&spe->queues, session, event,
1512 				data_offset, &buffer);
1513 		if (err)
1514 			return err;
1515 
1516 		/* Dump here now we have copied a piped trace out of the pipe */
1517 		if (dump_trace) {
1518 			if (auxtrace_buffer__get_data(buffer, fd)) {
1519 				u64 midr = 0;
1520 
1521 				arm_spe__get_midr(spe, buffer->cpu.cpu, &midr);
1522 				arm_spe_dump_event(spe, buffer->data,
1523 						   buffer->size, midr);
1524 				auxtrace_buffer__put_data(buffer);
1525 			}
1526 		}
1527 	}
1528 
1529 	return 0;
1530 }
1531 
1532 static int arm_spe_flush(struct perf_session *session __maybe_unused,
1533 			 const struct perf_tool *tool __maybe_unused)
1534 {
1535 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1536 			auxtrace);
1537 	int ret;
1538 
1539 	if (dump_trace)
1540 		return 0;
1541 
1542 	if (!tool->ordered_events)
1543 		return -EINVAL;
1544 
1545 	ret = arm_spe__update_queues(spe);
1546 	if (ret < 0)
1547 		return ret;
1548 
1549 	if (spe->timeless_decoding)
1550 		return arm_spe_process_timeless_queues(spe, -1,
1551 				MAX_TIMESTAMP - 1);
1552 
1553 	ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
1554 	if (ret)
1555 		return ret;
1556 
1557 	if (!spe->use_ctx_pkt_for_pid)
1558 		ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
1559 			    "Matching of TIDs to SPE events could be inaccurate.\n");
1560 
1561 	return 0;
1562 }
1563 
1564 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
1565 {
1566 	u64 *metadata;
1567 
1568 	metadata = zalloc(per_cpu_size);
1569 	if (!metadata)
1570 		return NULL;
1571 
1572 	memcpy(metadata, buf, per_cpu_size);
1573 	return metadata;
1574 }
1575 
1576 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
1577 {
1578 	int i;
1579 
1580 	for (i = 0; i < nr_cpu; i++)
1581 		zfree(&metadata[i]);
1582 	free(metadata);
1583 }
1584 
1585 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
1586 				     u64 *ver, int *nr_cpu)
1587 {
1588 	u64 *ptr = (u64 *)info->priv;
1589 	u64 metadata_size;
1590 	u64 **metadata = NULL;
1591 	int hdr_sz, per_cpu_sz, i;
1592 
1593 	metadata_size = info->header.size -
1594 		sizeof(struct perf_record_auxtrace_info);
1595 
1596 	/* Metadata version 1 */
1597 	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
1598 		*ver = 1;
1599 		*nr_cpu = 0;
1600 		/* No per CPU metadata */
1601 		return NULL;
1602 	}
1603 
1604 	*ver = ptr[ARM_SPE_HEADER_VERSION];
1605 	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
1606 	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];
1607 
1608 	metadata = calloc(*nr_cpu, sizeof(*metadata));
1609 	if (!metadata)
1610 		return NULL;
1611 
1612 	/* Locate the start address of per CPU metadata */
1613 	ptr += hdr_sz;
1614 	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
1615 
1616 	for (i = 0; i < *nr_cpu; i++) {
1617 		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
1618 		if (!metadata[i])
1619 			goto err_per_cpu_metadata;
1620 
1621 		ptr += per_cpu_sz / sizeof(u64);
1622 	}
1623 
1624 	return metadata;
1625 
1626 err_per_cpu_metadata:
1627 	arm_spe__free_metadata(metadata, *nr_cpu);
1628 	return NULL;
1629 }
1630 
1631 static void arm_spe_free_queue(void *priv)
1632 {
1633 	struct arm_spe_queue *speq = priv;
1634 
1635 	if (!speq)
1636 		return;
1637 	thread__zput(speq->thread);
1638 	arm_spe_decoder_free(speq->decoder);
1639 	zfree(&speq->event_buf);
1640 	zfree(&speq->last_branch);
1641 	free(speq);
1642 }
1643 
1644 static void arm_spe_free_events(struct perf_session *session)
1645 {
1646 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1647 					     auxtrace);
1648 	struct auxtrace_queues *queues = &spe->queues;
1649 	unsigned int i;
1650 
1651 	for (i = 0; i < queues->nr_queues; i++) {
1652 		arm_spe_free_queue(queues->queue_array[i].priv);
1653 		queues->queue_array[i].priv = NULL;
1654 	}
1655 	auxtrace_queues__free(queues);
1656 }
1657 
1658 static void arm_spe_free(struct perf_session *session)
1659 {
1660 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1661 					     auxtrace);
1662 
1663 	auxtrace_heap__free(&spe->heap);
1664 	arm_spe_free_events(session);
1665 	session->auxtrace = NULL;
1666 	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
1667 	free(spe);
1668 }
1669 
1670 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
1671 				      struct evsel *evsel)
1672 {
1673 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
1674 
1675 	return evsel->core.attr.type == spe->pmu_type;
1676 }
1677 
/*
 * printf formats used by arm_spe_print_info() to dump the metadata. Each
 * table is indexed by the position of a field in the u64 metadata array, and
 * every format consumes exactly one 64-bit argument.
 */

/* Header fields of the version 1 metadata layout. */
static const char * const metadata_hdr_v1_fmts[] = {
	[ARM_SPE_PMU_TYPE]		= "  PMU Type           :%"PRId64"\n",
	[ARM_SPE_PER_CPU_MMAPS]		= "  Per CPU mmaps      :%"PRId64"\n",
};

/* Header fields of the newer, versioned metadata layout. */
static const char * const metadata_hdr_fmts[] = {
	[ARM_SPE_HEADER_VERSION]	= "  Header version     :%"PRId64"\n",
	[ARM_SPE_HEADER_SIZE]		= "  Header size        :%"PRId64"\n",
	[ARM_SPE_PMU_TYPE_V2]		= "  PMU type v2        :%"PRId64"\n",
	[ARM_SPE_CPUS_NUM]		= "  CPU number         :%"PRId64"\n",
};

/* Fields of one per-CPU metadata block (versioned layout only). */
static const char * const metadata_per_cpu_fmts[] = {
	[ARM_SPE_MAGIC]			= "    Magic            :0x%"PRIx64"\n",
	[ARM_SPE_CPU]			= "    CPU #            :%"PRId64"\n",
	[ARM_SPE_CPU_NR_PARAMS]		= "    Num of params    :%"PRId64"\n",
	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
	[ARM_SPE_CAP_EVENT_FILTER]	= "    Event Filter     :0x%"PRIx64"\n",
};
1699 
1700 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
1701 {
1702 	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
1703 	const char * const *hdr_fmts;
1704 
1705 	if (!dump_trace)
1706 		return;
1707 
1708 	if (spe->metadata_ver == 1) {
1709 		cpu_num = 0;
1710 		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
1711 		hdr_fmts = metadata_hdr_v1_fmts;
1712 	} else {
1713 		cpu_num = arr[ARM_SPE_CPUS_NUM];
1714 		hdr_size = arr[ARM_SPE_HEADER_SIZE];
1715 		hdr_fmts = metadata_hdr_fmts;
1716 	}
1717 
1718 	for (i = 0; i < hdr_size; i++)
1719 		fprintf(stdout, hdr_fmts[i], arr[i]);
1720 
1721 	arr += hdr_size;
1722 	for (cpu = 0; cpu < cpu_num; cpu++) {
1723 		/*
1724 		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
1725 		 * are fixed. The sequential parameter size is decided by the
1726 		 * field 'ARM_SPE_CPU_NR_PARAMS'.
1727 		 */
1728 		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
1729 		for (i = 0; i < cpu_size; i++)
1730 			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
1731 		arr += cpu_size;
1732 	}
1733 }
1734 
1735 static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
1736 				    const char *name)
1737 {
1738 	struct evsel *evsel;
1739 
1740 	evlist__for_each_entry(evlist, evsel) {
1741 		if (evsel->core.id && evsel->core.id[0] == id) {
1742 			if (evsel->name)
1743 				zfree(&evsel->name);
1744 			evsel->name = strdup(name);
1745 			break;
1746 		}
1747 	}
1748 }
1749 
1750 static int
1751 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
1752 {
1753 	struct evlist *evlist = session->evlist;
1754 	struct evsel *evsel;
1755 	struct perf_event_attr attr;
1756 	bool found = false;
1757 	u64 id;
1758 	int err;
1759 
1760 	evlist__for_each_entry(evlist, evsel) {
1761 		if (evsel->core.attr.type == spe->pmu_type) {
1762 			found = true;
1763 			break;
1764 		}
1765 	}
1766 
1767 	if (!found) {
1768 		pr_debug("No selected events with SPE trace data\n");
1769 		return 0;
1770 	}
1771 
1772 	memset(&attr, 0, sizeof(struct perf_event_attr));
1773 	attr.size = sizeof(struct perf_event_attr);
1774 	attr.type = PERF_TYPE_HARDWARE;
1775 	attr.sample_type = evsel->core.attr.sample_type &
1776 				(PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
1777 	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
1778 			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
1779 			    PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
1780 	if (spe->timeless_decoding)
1781 		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
1782 	else
1783 		attr.sample_type |= PERF_SAMPLE_TIME;
1784 
1785 	spe->sample_type = attr.sample_type;
1786 
1787 	attr.exclude_user = evsel->core.attr.exclude_user;
1788 	attr.exclude_kernel = evsel->core.attr.exclude_kernel;
1789 	attr.exclude_hv = evsel->core.attr.exclude_hv;
1790 	attr.exclude_host = evsel->core.attr.exclude_host;
1791 	attr.exclude_guest = evsel->core.attr.exclude_guest;
1792 	attr.sample_id_all = evsel->core.attr.sample_id_all;
1793 	attr.read_format = evsel->core.attr.read_format;
1794 	attr.sample_period = spe->synth_opts.period;
1795 
1796 	/* create new id val to be a fixed offset from evsel id */
1797 	id = auxtrace_synth_id_range_start(evsel);
1798 
1799 	if (spe->synth_opts.flc) {
1800 		spe->sample_flc = true;
1801 
1802 		/* Level 1 data cache miss */
1803 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1804 		if (err)
1805 			return err;
1806 		spe->l1d_miss_id = id;
1807 		arm_spe_set_event_name(evlist, id, "l1d-miss");
1808 		id += 1;
1809 
1810 		/* Level 1 data cache access */
1811 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1812 		if (err)
1813 			return err;
1814 		spe->l1d_access_id = id;
1815 		arm_spe_set_event_name(evlist, id, "l1d-access");
1816 		id += 1;
1817 	}
1818 
1819 	if (spe->synth_opts.llc) {
1820 		spe->sample_llc = true;
1821 
1822 		/* Last level cache miss */
1823 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1824 		if (err)
1825 			return err;
1826 		spe->llc_miss_id = id;
1827 		arm_spe_set_event_name(evlist, id, "llc-miss");
1828 		id += 1;
1829 
1830 		/* Last level cache access */
1831 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1832 		if (err)
1833 			return err;
1834 		spe->llc_access_id = id;
1835 		arm_spe_set_event_name(evlist, id, "llc-access");
1836 		id += 1;
1837 	}
1838 
1839 	if (spe->synth_opts.tlb) {
1840 		spe->sample_tlb = true;
1841 
1842 		/* TLB miss */
1843 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1844 		if (err)
1845 			return err;
1846 		spe->tlb_miss_id = id;
1847 		arm_spe_set_event_name(evlist, id, "tlb-miss");
1848 		id += 1;
1849 
1850 		/* TLB access */
1851 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1852 		if (err)
1853 			return err;
1854 		spe->tlb_access_id = id;
1855 		arm_spe_set_event_name(evlist, id, "tlb-access");
1856 		id += 1;
1857 	}
1858 
1859 	if (spe->synth_opts.last_branch) {
1860 		if (spe->synth_opts.last_branch_sz > 2)
1861 			pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
1862 
1863 		attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
1864 		/*
1865 		 * We don't use the hardware index, but the sample generation
1866 		 * code uses the new format branch_stack with this field,
1867 		 * so the event attributes must indicate that it's present.
1868 		 */
1869 		attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
1870 	}
1871 
1872 	if (spe->synth_opts.branches) {
1873 		spe->sample_branch = true;
1874 
1875 		/* Branch */
1876 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1877 		if (err)
1878 			return err;
1879 		spe->branch_id = id;
1880 		arm_spe_set_event_name(evlist, id, "branch");
1881 		id += 1;
1882 	}
1883 
1884 	if (spe->synth_opts.remote_access) {
1885 		spe->sample_remote_access = true;
1886 
1887 		/* Remote access */
1888 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1889 		if (err)
1890 			return err;
1891 		spe->remote_access_id = id;
1892 		arm_spe_set_event_name(evlist, id, "remote-access");
1893 		id += 1;
1894 	}
1895 
1896 	if (spe->synth_opts.mem) {
1897 		spe->sample_memory = true;
1898 
1899 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1900 		if (err)
1901 			return err;
1902 		spe->memory_id = id;
1903 		arm_spe_set_event_name(evlist, id, "memory");
1904 		id += 1;
1905 	}
1906 
1907 	if (spe->synth_opts.instructions) {
1908 		spe->sample_instructions = true;
1909 		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
1910 
1911 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1912 		if (err)
1913 			return err;
1914 		spe->instructions_id = id;
1915 		arm_spe_set_event_name(evlist, id, "instructions");
1916 	}
1917 
1918 	return 0;
1919 }
1920 
1921 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
1922 {
1923 	u64 midr;
1924 	int i;
1925 
1926 	if (!nr_cpu)
1927 		return false;
1928 
1929 	for (i = 0; i < nr_cpu; i++) {
1930 		if (!metadata[i])
1931 			return false;
1932 
1933 		if (i == 0) {
1934 			midr = metadata[i][ARM_SPE_CPU_MIDR];
1935 			continue;
1936 		}
1937 
1938 		if (midr != metadata[i][ARM_SPE_CPU_MIDR])
1939 			return false;
1940 	}
1941 
1942 	return true;
1943 }
1944 
1945 int arm_spe_process_auxtrace_info(union perf_event *event,
1946 				  struct perf_session *session)
1947 {
1948 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1949 	size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
1950 	struct perf_record_time_conv *tc = &session->time_conv;
1951 	struct arm_spe *spe;
1952 	u64 **metadata = NULL;
1953 	u64 metadata_ver;
1954 	int nr_cpu, err;
1955 
1956 	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
1957 					min_sz)
1958 		return -EINVAL;
1959 
1960 	metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
1961 					   &nr_cpu);
1962 	if (!metadata && metadata_ver != 1) {
1963 		pr_err("Failed to parse Arm SPE metadata.\n");
1964 		return -EINVAL;
1965 	}
1966 
1967 	spe = zalloc(sizeof(struct arm_spe));
1968 	if (!spe) {
1969 		err = -ENOMEM;
1970 		goto err_free_metadata;
1971 	}
1972 
1973 	err = auxtrace_queues__init(&spe->queues);
1974 	if (err)
1975 		goto err_free;
1976 
1977 	spe->session = session;
1978 	spe->machine = &session->machines.host; /* No kvm support */
1979 	spe->auxtrace_type = auxtrace_info->type;
1980 	if (metadata_ver == 1)
1981 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
1982 	else
1983 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
1984 	spe->metadata = metadata;
1985 	spe->metadata_ver = metadata_ver;
1986 	spe->metadata_nr_cpu = nr_cpu;
1987 	spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);
1988 
1989 	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
1990 
1991 	/*
1992 	 * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead
1993 	 * and the parameters for hardware clock are stored in the session
1994 	 * context.  Passes these parameters to the struct perf_tsc_conversion
1995 	 * in "spe->tc", which is used for later conversion between clock
1996 	 * counter and timestamp.
1997 	 *
1998 	 * For backward compatibility, copies the fields starting from
1999 	 * "time_cycles" only if they are contained in the event.
2000 	 */
2001 	spe->tc.time_shift = tc->time_shift;
2002 	spe->tc.time_mult = tc->time_mult;
2003 	spe->tc.time_zero = tc->time_zero;
2004 
2005 	if (event_contains(*tc, cap_user_time_short)) {
2006 		spe->tc.time_cycles = tc->time_cycles;
2007 		spe->tc.time_mask = tc->time_mask;
2008 		spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
2009 		spe->tc.cap_user_time_short = tc->cap_user_time_short;
2010 	}
2011 
2012 	spe->auxtrace.process_event = arm_spe_process_event;
2013 	spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
2014 	spe->auxtrace.flush_events = arm_spe_flush;
2015 	spe->auxtrace.free_events = arm_spe_free_events;
2016 	spe->auxtrace.free = arm_spe_free;
2017 	spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
2018 	session->auxtrace = &spe->auxtrace;
2019 
2020 	arm_spe_print_info(spe, &auxtrace_info->priv[0]);
2021 
2022 	if (dump_trace)
2023 		return 0;
2024 
2025 	if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
2026 		spe->synth_opts = *session->itrace_synth_opts;
2027 	} else {
2028 		itrace_synth_opts__set_default(&spe->synth_opts, false);
2029 		/* Default nanoseconds period not supported */
2030 		spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS;
2031 		spe->synth_opts.period = 1;
2032 	}
2033 
2034 	if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
2035 		ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n");
2036 		err = -EINVAL;
2037 		goto err_free_queues;
2038 	}
2039 	if (spe->synth_opts.period > 1)
2040 		ui__warning("Arm SPE has a hardware-based sampling period.\n\n"
2041 			    "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n");
2042 
2043 	err = arm_spe_synth_events(spe, session);
2044 	if (err)
2045 		goto err_free_queues;
2046 
2047 	err = auxtrace_queues__process_index(&spe->queues, session);
2048 	if (err)
2049 		goto err_free_queues;
2050 
2051 	if (spe->queues.populated)
2052 		spe->data_queued = true;
2053 
2054 	return 0;
2055 
2056 err_free_queues:
2057 	auxtrace_queues__free(&spe->queues);
2058 	session->auxtrace = NULL;
2059 err_free:
2060 	free(spe);
2061 err_free_metadata:
2062 	arm_spe__free_metadata(metadata, nr_cpu);
2063 	return err;
2064 }
2065