xref: /linux/tools/perf/util/arm-spe.c (revision 4e03d6494f9504f8af46ba68a2a8b6877c196789)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Arm Statistical Profiling Extensions (SPE) support
4  * Copyright (c) 2017-2018, Arm Ltd.
5  */
6 
7 #include <byteswap.h>
8 #include <endian.h>
9 #include <errno.h>
10 #include <inttypes.h>
11 #include <linux/bitops.h>
12 #include <linux/kernel.h>
13 #include <linux/log2.h>
14 #include <linux/types.h>
15 #include <linux/zalloc.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18 
19 #include "auxtrace.h"
20 #include "color.h"
21 #include "debug.h"
22 #include "evlist.h"
23 #include "evsel.h"
24 #include "machine.h"
25 #include "session.h"
26 #include "symbol.h"
27 #include "thread.h"
28 #include "thread-stack.h"
29 #include "tsc.h"
30 #include "tool.h"
31 #include "util/synthetic-events.h"
32 
33 #include "arm-spe.h"
34 #include "arm-spe-decoder/arm-spe-decoder.h"
35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
36 
37 #include "../../arch/arm64/include/asm/cputype.h"
38 #define MAX_TIMESTAMP (~0ULL)
39 
/* True when the operation has the load/store class bit set */
#define is_ldst_op(op)		(!!((op) & ARM_SPE_OP_LDST))

/* True for any SIMD/FP, SVE, SME or Advanced SIMD operation */
#define is_simd_op(op)		(!!((op) & (ARM_SPE_OP_SIMD_FP | ARM_SPE_OP_SVE | \
					    ARM_SPE_OP_SME | ARM_SPE_OP_ASE)))

/* Operations that may carry memory access information */
#define is_mem_op(op)		(is_ldst_op(op) || is_simd_op(op))

/* Mask of both the ACCESS and MISS event bits for a cache level */
#define ARM_SPE_CACHE_EVENT(lvl) \
	(ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS)

/* Any event (access or miss) was recorded for this cache level */
#define arm_spe_is_cache_level(type, lvl) \
	((type) & ARM_SPE_CACHE_EVENT(lvl))

/* Access bit set with the miss bit clear: a hit in this cache level */
#define arm_spe_is_cache_hit(type, lvl) \
	(((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS)

/* A miss was recorded for this cache level */
#define arm_spe_is_cache_miss(type, lvl) \
	((type) & ARM_SPE_##lvl##_MISS)
58 
/*
 * Per-session ARM SPE decode state: auxtrace queues/heap, synthesis
 * options, per-event-type sample IDs and the per-CPU metadata decoded
 * from the auxtrace info event.
 */
struct arm_spe {
	struct auxtrace			auxtrace;
	struct auxtrace_queues		queues;
	struct auxtrace_heap		heap;
	struct itrace_synth_opts        synth_opts;
	u32				auxtrace_type;
	struct perf_session		*session;
	struct machine			*machine;
	u32				pmu_type;

	/* Parameters for converting SPE timestamps to perf time */
	struct perf_tsc_conversion	tc;

	u8				timeless_decoding;
	u8				data_queued;

	/* perf sample type used when re-encoding events for inject */
	u64				sample_type;
	/* Flags selecting which sample kinds to synthesize */
	u8				sample_flc;
	u8				sample_llc;
	u8				sample_tlb;
	u8				sample_branch;
	u8				sample_remote_access;
	u8				sample_memory;
	u8				sample_instructions;

	/* Sample/stream IDs for each synthesized event type */
	u64				l1d_miss_id;
	u64				l1d_access_id;
	u64				llc_miss_id;
	u64				llc_access_id;
	u64				tlb_miss_id;
	u64				tlb_access_id;
	u64				branch_id;
	u64				remote_access_id;
	u64				memory_id;
	u64				instructions_id;

	u64				kernel_start;

	unsigned long			num_events;
	u8				use_ctx_pkt_for_pid;

	/* Per-CPU metadata records, their version and count */
	u64				**metadata;
	u64				metadata_ver;
	u64				metadata_nr_cpu;
	/* True when all CPUs share the same micro-architecture */
	bool				is_homogeneous;
};
104 
/*
 * Per-queue (per-CPU or per-thread) decode state: current AUX buffers,
 * decoder instance and the task context attributed to samples.
 */
struct arm_spe_queue {
	struct arm_spe			*spe;
	unsigned int			queue_nr;
	/* Buffer being decoded, and the previous one pending drop */
	struct auxtrace_buffer		*buffer;
	struct auxtrace_buffer		*old_buffer;
	/* Scratch space for synthesizing sample events */
	union perf_event		*event_buf;
	bool				on_heap;
	bool				done;
	pid_t				pid;
	pid_t				tid;
	int				cpu;
	struct arm_spe_decoder		*decoder;
	u64				time;
	u64				timestamp;
	struct thread			*thread;
	/* Records seen since the last synthesized sample (period counter) */
	u64				sample_count;
	/* PERF_IP_FLAG_* derived from the current record */
	u32				flags;
	/* Synthetic branch stack (at most PBT + TGT entries) */
	struct branch_stack		*last_branch;
};
124 
/*
 * Maps a set of CPU MIDR ranges to the routine that decodes their
 * (possibly IMPDEF) data source encoding into a perf_mem_data_src.
 */
struct data_source_handle {
	const struct midr_range *midr_ranges;
	void (*ds_synth)(const struct arm_spe_record *record,
			 union perf_mem_data_src *data_src);
};

/* Shorthand for initializing a data_source_handles[] table entry */
#define DS(range, func)					\
	{						\
		.midr_ranges = range,			\
		.ds_synth = arm_spe__synth_##func,	\
	}
136 
137 static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
138 			 unsigned char *buf, size_t len)
139 {
140 	struct arm_spe_pkt packet;
141 	size_t pos = 0;
142 	int ret, pkt_len, i;
143 	char desc[ARM_SPE_PKT_DESC_MAX];
144 	const char *color = PERF_COLOR_BLUE;
145 
146 	color_fprintf(stdout, color,
147 		      ". ... ARM SPE data: size %#zx bytes\n",
148 		      len);
149 
150 	while (len) {
151 		ret = arm_spe_get_packet(buf, len, &packet);
152 		if (ret > 0)
153 			pkt_len = ret;
154 		else
155 			pkt_len = 1;
156 		printf(".");
157 		color_fprintf(stdout, color, "  %08zx: ", pos);
158 		for (i = 0; i < pkt_len; i++)
159 			color_fprintf(stdout, color, " %02x", buf[i]);
160 		for (; i < 16; i++)
161 			color_fprintf(stdout, color, "   ");
162 		if (ret > 0) {
163 			ret = arm_spe_pkt_desc(&packet, desc,
164 					       ARM_SPE_PKT_DESC_MAX);
165 			if (!ret)
166 				color_fprintf(stdout, color, " %s\n", desc);
167 		} else {
168 			color_fprintf(stdout, color, " Bad packet!\n");
169 		}
170 		pos += pkt_len;
171 		buf += pkt_len;
172 		len -= pkt_len;
173 	}
174 }
175 
/* Dump one AUX trace buffer, preceded by a separator line. */
static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
			       size_t len)
{
	printf(".\n");
	arm_spe_dump(spe, buf, len);
}
182 
/*
 * Decoder callback: fetch the next chunk of AUX trace data for a queue.
 *
 * Walks the queue's auxtrace buffers, lazily reading a buffer's payload
 * from the perf.data file on first use.  The previous buffer is kept
 * alive until the next fetch (the decoder may still reference its data)
 * and only dropped here.  Empty buffers are skipped by recursing.
 * Sets b->len to 0 once the queue is exhausted.
 */
static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
{
	struct arm_spe_queue *speq = data;
	struct auxtrace_buffer *buffer = speq->buffer;
	struct auxtrace_buffer *old_buffer = speq->old_buffer;
	struct auxtrace_queue *queue;

	queue = &speq->spe->queues.queue_array[speq->queue_nr];

	buffer = auxtrace_buffer__next(queue, buffer);
	/* If no more data, drop the previous auxtrace_buffer and return */
	if (!buffer) {
		if (old_buffer)
			auxtrace_buffer__drop_data(old_buffer);
		b->len = 0;
		return 0;
	}

	speq->buffer = buffer;

	/* If the aux_buffer doesn't have data associated, try to load it */
	if (!buffer->data) {
		/* get the file desc associated with the perf data file */
		int fd = perf_data__fd(speq->spe->session->data);

		buffer->data = auxtrace_buffer__get_data(buffer, fd);
		if (!buffer->data)
			return -ENOMEM;
	}

	b->len = buffer->size;
	b->buf = buffer->data;

	if (b->len) {
		if (old_buffer)
			auxtrace_buffer__drop_data(old_buffer);
		speq->old_buffer = buffer;
	} else {
		/* Zero-sized buffer: drop it and move on to the next one */
		auxtrace_buffer__drop_data(buffer);
		return arm_spe_get_trace(b, data);
	}

	return 0;
}
227 
/*
 * Allocate a per-queue decode context: the sample event scratch buffer,
 * an optional last-branch stack and a new SPE packet decoder.  Returns
 * NULL if any allocation fails (zfree() tolerates the never-allocated
 * members on the error path).
 */
static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
		unsigned int queue_nr)
{
	struct arm_spe_params params = { .get_trace = 0, };
	struct arm_spe_queue *speq;

	speq = zalloc(sizeof(*speq));
	if (!speq)
		return NULL;

	speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
	if (!speq->event_buf)
		goto out_free;

	speq->spe = spe;
	speq->queue_nr = queue_nr;
	/* pid/tid/cpu are unknown until context is established */
	speq->pid = -1;
	speq->tid = -1;
	speq->cpu = -1;

	/* params set */
	params.get_trace = arm_spe_get_trace;
	params.data = speq;

	if (spe->synth_opts.last_branch) {
		size_t sz = sizeof(struct branch_stack);

		/* Allocate up to two entries for PBT + TGT */
		sz += sizeof(struct branch_entry) *
			min(spe->synth_opts.last_branch_sz, 2U);
		speq->last_branch = zalloc(sz);
		if (!speq->last_branch)
			goto out_free;
	}

	/* create new decoder */
	speq->decoder = arm_spe_decoder_new(&params);
	if (!speq->decoder)
		goto out_free;

	return speq;

out_free:
	zfree(&speq->event_buf);
	zfree(&speq->last_branch);
	free(speq);

	return NULL;
}
277 
278 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
279 {
280 	return ip >= spe->kernel_start ?
281 		PERF_RECORD_MISC_KERNEL :
282 		PERF_RECORD_MISC_USER;
283 }
284 
/*
 * Refresh the queue's cached pid/tid/cpu from the machine's notion of
 * the task currently running on the queue's CPU, falling back to the
 * tid recorded for the queue itself.  Drops the cached thread reference
 * when the tid changes so it is re-looked-up below.
 */
static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
				    struct auxtrace_queue *queue)
{
	struct arm_spe_queue *speq = queue->priv;
	pid_t tid;

	tid = machine__get_current_tid(spe->machine, speq->cpu);
	if (tid != -1) {
		speq->tid = tid;
		thread__zput(speq->thread);
	} else
		speq->tid = queue->tid;

	if ((!speq->thread) && (speq->tid != -1)) {
		speq->thread = machine__find_thread(spe->machine, -1,
						    speq->tid);
	}

	if (speq->thread) {
		speq->pid = thread__pid(speq->thread);
		/* In per-thread mode, take the CPU from the thread */
		if (queue->cpu == -1)
			speq->cpu = thread__cpu(speq->thread);
	}
}
309 
310 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
311 {
312 	struct arm_spe *spe = speq->spe;
313 	int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
314 
315 	if (err)
316 		return err;
317 
318 	arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
319 
320 	return 0;
321 }
322 
323 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu)
324 {
325 	u64 i;
326 
327 	if (!spe->metadata)
328 		return NULL;
329 
330 	/* CPU ID is -1 for per-thread mode */
331 	if (cpu < 0) {
332 		/*
333 		 * On the heterogeneous system, due to CPU ID is -1,
334 		 * cannot confirm the data source packet is supported.
335 		 */
336 		if (!spe->is_homogeneous)
337 			return NULL;
338 
339 		/* In homogeneous system, simply use CPU0's metadata */
340 		return spe->metadata[0];
341 	}
342 
343 	for (i = 0; i < spe->metadata_nr_cpu; i++)
344 		if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu)
345 			return spe->metadata[i];
346 
347 	return NULL;
348 }
349 
350 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
351 {
352 	struct simd_flags simd_flags = {};
353 
354 	if (record->op & ARM_SPE_OP_SVE)
355 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
356 	else if (record->op & ARM_SPE_OP_SME)
357 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SME;
358 	else if (record->op & (ARM_SPE_OP_ASE | ARM_SPE_OP_SIMD_FP))
359 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_ASE;
360 
361 	if (record->op & ARM_SPE_OP_SVE) {
362 		if (!(record->op & ARM_SPE_OP_PRED))
363 			simd_flags.pred = SIMD_OP_FLAGS_PRED_DISABLED;
364 		else if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
365 			simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL;
366 		else if (record->type & ARM_SPE_SVE_EMPTY_PRED)
367 			simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY;
368 		else
369 			simd_flags.pred = SIMD_OP_FLAGS_PRED_FULL;
370 	} else {
371 		if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
372 			simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL;
373 		else if (record->type & ARM_SPE_SVE_EMPTY_PRED)
374 			simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY;
375 	}
376 
377 	return simd_flags;
378 }
379 
/*
 * Fill in the perf_sample fields common to all synthesized samples
 * (time, ip, cpumode, pid/tid, period, cpu, SIMD flags) and initialise
 * the sample event header from the current decoded record.
 */
static void arm_spe_prep_sample(struct arm_spe *spe,
				struct arm_spe_queue *speq,
				union perf_event *event,
				struct perf_sample *sample)
{
	struct arm_spe_record *record = &speq->decoder->record;

	/* Timeless decoding has no usable timestamps to convert */
	if (!spe->timeless_decoding)
		sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);

	sample->ip = record->from_ip;
	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
	sample->pid = speq->pid;
	sample->tid = speq->tid;
	sample->period = spe->synth_opts.period;
	sample->cpu = speq->cpu;
	sample->simd_flags = arm_spe__synth_simd_flags(record);

	event->sample.header.type = PERF_RECORD_SAMPLE;
	event->sample.header.misc = sample->cpumode;
	event->sample.header.size = sizeof(struct perf_event_header);
}
402 
/*
 * Build a synthetic branch stack with at most two entries from the
 * current record: the sampled branch itself (TGT, with full flags) and
 * the previous branch target (PBT, target only), as permitted by the
 * configured last_branch_sz.
 */
static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	struct branch_stack *bstack = speq->last_branch;
	struct branch_flags *bs_flags;
	unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
	bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
	bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
	size_t sz = sizeof(struct branch_stack) +
		    sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
	int i = 0;

	/* Clean up branch stack */
	memset(bstack, 0x0, sz);

	if (!have_tgt && !have_pbt)
		return;

	if (have_tgt) {
		bstack->entries[i].from = record->from_ip;
		bstack->entries[i].to = record->to_ip;

		bs_flags = &bstack->entries[i].flags;
		bs_flags->value = 0;

		if (record->op & ARM_SPE_OP_BR_CR_BL) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND_CALL;
			else
				bs_flags->type |= PERF_BR_CALL;
		/*
		 * Indirect branch instruction without link (e.g. BR),
		 * take this case as function return.
		 */
		} else if (record->op & ARM_SPE_OP_BR_CR_RET ||
			   record->op & ARM_SPE_OP_BR_INDIRECT) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND_RET;
			else
				bs_flags->type |= PERF_BR_RET;
		} else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND;
			else
				bs_flags->type |= PERF_BR_UNCOND;
		} else {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND;
			else
				bs_flags->type |= PERF_BR_UNKNOWN;
		}

		if (record->type & ARM_SPE_BRANCH_MISS) {
			bs_flags->mispred = 1;
			bs_flags->predicted = 0;
		} else {
			bs_flags->mispred = 0;
			bs_flags->predicted = 1;
		}

		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
			bs_flags->not_taken = 1;

		if (record->type & ARM_SPE_IN_TXN)
			bs_flags->in_tx = 1;

		/* The cycles field is 16 bits wide; clamp the latency */
		bs_flags->cycles = min(record->latency, 0xFFFFU);
		i++;
	}

	if (have_pbt) {
		/* Only the target of the previous branch is known */
		bs_flags = &bstack->entries[i].flags;
		bs_flags->type |= PERF_BR_UNKNOWN;
		bstack->entries[i].to = record->prev_br_tgt;
		i++;
	}

	bstack->nr = i;
	bstack->hw_idx = -1ULL;
}
484 
/*
 * Re-encode @sample into @event with the given sample @type so the
 * event can be injected into the output stream.
 */
static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
{
	event->header.size = perf_event__sample_event_size(sample, type, 0);
	return perf_event__synthesize_sample(event, type, 0, sample);
}
490 
491 static inline int
492 arm_spe_deliver_synth_event(struct arm_spe *spe,
493 			    struct arm_spe_queue *speq __maybe_unused,
494 			    union perf_event *event,
495 			    struct perf_sample *sample)
496 {
497 	int ret;
498 
499 	if (spe->synth_opts.inject) {
500 		ret = arm_spe__inject_event(event, sample, spe->sample_type);
501 		if (ret)
502 			return ret;
503 	}
504 
505 	ret = perf_session__deliver_synth_event(spe->session, event, sample);
506 	if (ret)
507 		pr_err("ARM SPE: failed to deliver event, error %d\n", ret);
508 
509 	return ret;
510 }
511 
/*
 * Synthesize one memory sample (virtual/physical address, data source
 * and latency weight) for the given event ID and deliver it.
 */
static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
				     u64 spe_events_id,
				     union perf_mem_data_src data_src)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->virt_addr;
	sample.phys_addr = record->phys_addr;
	sample.data_src = data_src.val;
	sample.weight = record->latency;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}
536 
/*
 * Synthesize one branch sample (target address, latency weight, sample
 * flags and the synthetic branch stack) for the given event ID and
 * deliver it.
 */
static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
					u64 spe_events_id)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->to_ip;
	sample.weight = record->latency;
	sample.flags = speq->flags;
	sample.branch_stack = speq->last_branch;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}
560 
/*
 * Synthesize one instruction sample carrying both memory information
 * (physical address, data source) and branch information (flags,
 * branch stack) for the given event ID and deliver it.
 */
static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
					     u64 spe_events_id,
					     union perf_mem_data_src data_src)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->to_ip;
	sample.phys_addr = record->phys_addr;
	sample.data_src = data_src.val;
	sample.weight = record->latency;
	sample.flags = speq->flags;
	sample.branch_stack = speq->last_branch;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}
587 
/* CPUs that use the common (Neoverse/Cortex) data source encoding */
static const struct midr_range common_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X4),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
	MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
	{},
};
607 
/* AmpereOne cores with an IMPDEF data source encoding */
static const struct midr_range ampereone_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
	{},
};
612 
/* HiSilicon HIP cores with an IMPDEF data source encoding */
static const struct midr_range hisi_hip_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
	{},
};
617 
618 static void arm_spe__sample_flags(struct arm_spe_queue *speq)
619 {
620 	const struct arm_spe_record *record = &speq->decoder->record;
621 
622 	speq->flags = 0;
623 	if (record->op & ARM_SPE_OP_BRANCH_ERET) {
624 		speq->flags = PERF_IP_FLAG_BRANCH;
625 
626 		if (record->type & ARM_SPE_BRANCH_MISS)
627 			speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
628 
629 		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
630 			speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
631 
632 		if (record->type & ARM_SPE_IN_TXN)
633 			speq->flags |= PERF_IP_FLAG_IN_TX;
634 
635 		if (record->op & ARM_SPE_OP_BR_COND)
636 			speq->flags |= PERF_IP_FLAG_CONDITIONAL;
637 
638 		if (record->op & ARM_SPE_OP_BR_CR_BL)
639 			speq->flags |= PERF_IP_FLAG_CALL;
640 		else if (record->op & ARM_SPE_OP_BR_CR_RET)
641 			speq->flags |= PERF_IP_FLAG_RETURN;
642 		/*
643 		 * Indirect branch instruction without link (e.g. BR),
644 		 * take it as a function return.
645 		 */
646 		else if (record->op & ARM_SPE_OP_BR_INDIRECT)
647 			speq->flags |= PERF_IP_FLAG_RETURN;
648 	}
649 }
650 
/*
 * Decode the common (Neoverse/Cortex) data source encoding into perf
 * memory level and snoop attributes.  Only loads carry a usable data
 * source; stores are reported as "not available".
 */
static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
					      union perf_mem_data_src *data_src)
{
	/*
	 * Even though four levels of cache hierarchy are possible, no known
	 * production Neoverse systems currently include more than three levels
	 * so for the time being we assume three exist. If a production system
	 * is built with four the this function would have to be changed to
	 * detect the number of levels for reporting.
	 */

	/*
	 * We have no data on the hit level or data source for stores in the
	 * Neoverse SPE records.
	 */
	if (record->op & ARM_SPE_OP_ST) {
		data_src->mem_lvl = PERF_MEM_LVL_NA;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
		return;
	}

	switch (record->source) {
	case ARM_SPE_COMMON_DS_L1D:
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_COMMON_DS_L2:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_COMMON_DS_PEER_CORE:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	/*
	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
	 * transfer, so set SNOOPX_PEER
	 */
	case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
	case ARM_SPE_COMMON_DS_PEER_CLUSTER:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	/*
	 * System cache is assumed to be L3
	 */
	case ARM_SPE_COMMON_DS_SYS_CACHE:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
		break;
	/*
	 * We don't know what level it hit in, except it came from the other
	 * socket
	 */
	case ARM_SPE_COMMON_DS_REMOTE:
		data_src->mem_lvl = PERF_MEM_LVL_NA;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_COMMON_DS_DRAM:
		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	default:
		break;
	}
}
726 
727 /*
728  * Source is IMPDEF. Here we convert the source code used on AmpereOne cores
729  * to the common (Neoverse, Cortex) to avoid duplicating the decoding code.
730  */
static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
						 union perf_mem_data_src *data_src)
{
	/* Only .source and .op are consumed by the common decoder below */
	struct arm_spe_record common_record;

	switch (record->source) {
	case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
		common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
		break;
	case ARM_SPE_AMPEREONE_SLC:
		common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
		break;
	case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
		common_record.source = ARM_SPE_COMMON_DS_REMOTE;
		break;
	case ARM_SPE_AMPEREONE_DDR:
		common_record.source = ARM_SPE_COMMON_DS_DRAM;
		break;
	case ARM_SPE_AMPEREONE_L1D:
		common_record.source = ARM_SPE_COMMON_DS_L1D;
		break;
	case ARM_SPE_AMPEREONE_L2D:
		common_record.source = ARM_SPE_COMMON_DS_L2;
		break;
	default:
		/* Leave data_src untouched for sources we cannot map */
		pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
				record->source);
		return;
	}

	common_record.op = record->op;
	arm_spe__synth_data_source_common(&common_record, data_src);
}
764 
/*
 * Decode the HiSilicon HIP IMPDEF data source encoding.  Stores are
 * delegated to the common decoder; loads map each source to a perf
 * memory level plus snoop/remote attributes (including HITM variants).
 */
static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record,
						union perf_mem_data_src *data_src)
{
	/* Use common synthesis method to handle store operations */
	if (record->op & ARM_SPE_OP_ST) {
		arm_spe__synth_data_source_common(record, data_src);
		return;
	}

	switch (record->source) {
	case ARM_SPE_HISI_HIP_PEER_CPU:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_PEER_CPU_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_L3:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
		break;
	case ARM_SPE_HISI_HIP_L3_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		break;
	case ARM_SPE_HISI_HIP_PEER_CLUSTER:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_REMOTE_SOCKET:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_LOCAL_MEM:
		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_HISI_HIP_REMOTE_MEM:
		data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		break;
	case ARM_SPE_HISI_HIP_NC_DEV:
		data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_HISI_HIP_L2:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_HISI_HIP_L2_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		break;
	case ARM_SPE_HISI_HIP_L1:
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	default:
		break;
	}
}
854 
/* Dispatch table: MIDR ranges -> matching data source decoder */
static const struct data_source_handle data_source_handles[] = {
	DS(common_ds_encoding_cpus, data_source_common),
	DS(ampereone_ds_encoding_cpus, data_source_ampereone),
	DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
};
860 
/*
 * Derive the memory hierarchy level for a load from the cache
 * access/miss event bits, used when no data source packet set it.
 */
static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record,
					   union perf_mem_data_src *data_src)
{
	/*
	 * To find a cache hit, search in ascending order from the lower level
	 * caches to the higher level caches. This reflects the best scenario
	 * for a cache hit.
	 */
	if (arm_spe_is_cache_hit(record->type, L1D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
	} else if (record->type & ARM_SPE_RECENTLY_FETCHED) {
		data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB;
	} else if (arm_spe_is_cache_hit(record->type, L2D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
	} else if (arm_spe_is_cache_hit(record->type, LLC)) {
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
	/*
	 * To find a cache miss, search in descending order from the higher
	 * level cache to the lower level cache. This represents the worst
	 * scenario for a cache miss.
	 */
	} else if (arm_spe_is_cache_miss(record->type, LLC)) {
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
	} else if (arm_spe_is_cache_miss(record->type, L2D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
	} else if (arm_spe_is_cache_miss(record->type, L1D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
	}
}
897 
/*
 * Derive the memory hierarchy level for a store from the cache
 * access/miss event bits, reporting the highest level with any event.
 */
static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record,
					   union perf_mem_data_src *data_src)
{
	/* Record the greatest level info for a store operation. */
	if (arm_spe_is_cache_level(record->type, LLC)) {
		data_src->mem_lvl = PERF_MEM_LVL_L3;
		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ?
				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
	} else if (arm_spe_is_cache_level(record->type, L2D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L2;
		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ?
				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
	} else if (arm_spe_is_cache_level(record->type, L1D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L1;
		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ?
				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
	}
}
919 
/*
 * Fill in memory level, snoop and remote attributes, preferring values
 * already set by data source decoding and falling back to the event
 * type bits and per-CPU capability metadata.
 */
static void arm_spe__synth_memory_level(struct arm_spe_queue *speq,
					const struct arm_spe_record *record,
					union perf_mem_data_src *data_src)
{
	struct arm_spe *spe = speq->spe;

	/*
	 * The data source packet contains more info for cache levels for
	 * peer snooping. So respect the memory level if has been set by
	 * data source parsing.
	 */
	if (!data_src->mem_lvl) {
		if (data_src->mem_op == PERF_MEM_OP_LOAD)
			arm_spe__synth_ld_memory_level(record, data_src);
		if (data_src->mem_op == PERF_MEM_OP_STORE)
			arm_spe__synth_st_memory_level(record, data_src);
	}

	/* Still nothing: report the level as not available */
	if (!data_src->mem_lvl) {
		data_src->mem_lvl = PERF_MEM_LVL_NA;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
	}

	/*
	 * If 'mem_snoop' has been set by data source packet, skip to set
	 * it at here.
	 */
	if (!data_src->mem_snoop) {
		if (record->type & ARM_SPE_DATA_SNOOPED) {
			if (record->type & ARM_SPE_HITM)
				data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
			else
				data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
		} else {
			u64 *metadata =
				arm_spe__get_metadata_by_cpu(spe, speq->cpu);

			/*
			 * Set NA ("Not available") mode if no meta data or the
			 * SNOOPED event is not supported.
			 */
			if (!metadata ||
			    !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED))
				data_src->mem_snoop = PERF_MEM_SNOOP_NA;
			else
				data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		}
	}

	if (!data_src->mem_remote) {
		if (record->type & ARM_SPE_REMOTE_ACCESS)
			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
	}
}
974 
975 static void arm_spe__synth_ds(struct arm_spe_queue *speq,
976 			      const struct arm_spe_record *record,
977 			      union perf_mem_data_src *data_src)
978 {
979 	struct arm_spe *spe = speq->spe;
980 	u64 *metadata = NULL;
981 	u64 midr;
982 	unsigned int i;
983 
984 	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
985 	if (spe->metadata_ver == 1) {
986 		const char *cpuid;
987 
988 		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
989 		cpuid = perf_env__cpuid(perf_session__env(spe->session));
990 		midr = strtol(cpuid, NULL, 16);
991 	} else {
992 		metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
993 		if (!metadata)
994 			return;
995 
996 		midr = metadata[ARM_SPE_CPU_MIDR];
997 	}
998 
999 	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
1000 		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
1001 			return data_source_handles[i].ds_synth(record, data_src);
1002 		}
1003 	}
1004 
1005 	return;
1006 }
1007 
1008 static union perf_mem_data_src
1009 arm_spe__synth_data_source(struct arm_spe_queue *speq,
1010 			   const struct arm_spe_record *record)
1011 {
1012 	union perf_mem_data_src	data_src = {};
1013 
1014 	if (!is_mem_op(record->op))
1015 		return data_src;
1016 
1017 	if (record->op & ARM_SPE_OP_LD)
1018 		data_src.mem_op = PERF_MEM_OP_LOAD;
1019 	else if (record->op & ARM_SPE_OP_ST)
1020 		data_src.mem_op = PERF_MEM_OP_STORE;
1021 	else
1022 		data_src.mem_op = PERF_MEM_OP_NA;
1023 
1024 	arm_spe__synth_ds(speq, record, &data_src);
1025 	arm_spe__synth_memory_level(speq, record, &data_src);
1026 
1027 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
1028 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
1029 
1030 		if (record->type & ARM_SPE_TLB_MISS)
1031 			data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
1032 		else
1033 			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
1034 	}
1035 
1036 	return data_src;
1037 }
1038 
1039 static int arm_spe_sample(struct arm_spe_queue *speq)
1040 {
1041 	const struct arm_spe_record *record = &speq->decoder->record;
1042 	struct arm_spe *spe = speq->spe;
1043 	union perf_mem_data_src data_src;
1044 	int err;
1045 
1046 	/*
1047 	 * Discard all samples until period is reached
1048 	 */
1049 	speq->sample_count++;
1050 	if (speq->sample_count < spe->synth_opts.period)
1051 		return 0;
1052 	speq->sample_count = 0;
1053 
1054 	arm_spe__sample_flags(speq);
1055 	data_src = arm_spe__synth_data_source(speq, record);
1056 
1057 	if (spe->sample_flc) {
1058 		if (record->type & ARM_SPE_L1D_MISS) {
1059 			err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
1060 							data_src);
1061 			if (err)
1062 				return err;
1063 		}
1064 
1065 		if (record->type & ARM_SPE_L1D_ACCESS) {
1066 			err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
1067 							data_src);
1068 			if (err)
1069 				return err;
1070 		}
1071 	}
1072 
1073 	if (spe->sample_llc) {
1074 		if (record->type & ARM_SPE_LLC_MISS) {
1075 			err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
1076 							data_src);
1077 			if (err)
1078 				return err;
1079 		}
1080 
1081 		if (record->type & ARM_SPE_LLC_ACCESS) {
1082 			err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
1083 							data_src);
1084 			if (err)
1085 				return err;
1086 		}
1087 	}
1088 
1089 	if (spe->sample_tlb) {
1090 		if (record->type & ARM_SPE_TLB_MISS) {
1091 			err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
1092 							data_src);
1093 			if (err)
1094 				return err;
1095 		}
1096 
1097 		if (record->type & ARM_SPE_TLB_ACCESS) {
1098 			err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
1099 							data_src);
1100 			if (err)
1101 				return err;
1102 		}
1103 	}
1104 
1105 	if (spe->synth_opts.last_branch &&
1106 	    (spe->sample_branch || spe->sample_instructions))
1107 		arm_spe__prep_branch_stack(speq);
1108 
1109 	if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
1110 		err = arm_spe__synth_branch_sample(speq, spe->branch_id);
1111 		if (err)
1112 			return err;
1113 	}
1114 
1115 	if (spe->sample_remote_access &&
1116 	    (record->type & ARM_SPE_REMOTE_ACCESS)) {
1117 		err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
1118 						data_src);
1119 		if (err)
1120 			return err;
1121 	}
1122 
1123 	if (spe->sample_memory && is_mem_op(record->op)) {
1124 		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
1125 		if (err)
1126 			return err;
1127 	}
1128 
1129 	if (spe->sample_instructions) {
1130 		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
1131 		if (err)
1132 			return err;
1133 	}
1134 
1135 	return 0;
1136 }
1137 
/*
 * Decode loop for one queue: synthesize samples for the record left pending
 * by the previous pass, then decode the next record. Returns 1 when the
 * queue's trace data is exhausted, 0 when (in timed decoding) the queue's
 * timestamp reaches *timestamp (which is then updated to the queue's
 * timestamp), or a negative error code.
 */
static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record;
	int ret;

	if (!spe->kernel_start)
		spe->kernel_start = machine__kernel_start(spe->machine);

	while (1) {
		/*
		 * The usual logic is firstly to decode the packets, and then
		 * based on the record to synthesize sample; but here the flow
		 * is reversed: it calls arm_spe_sample() for synthesizing
		 * samples prior to arm_spe_decode().
		 *
		 * Two reasons for this code logic:
		 * 1. Firstly, when setup queue in arm_spe__setup_queue(), it
		 * has decoded trace data and generated a record, but the record
		 * is left to generate sample until run to here, so it's correct
		 * to synthesize sample for the left record.
		 * 2. After decoding trace data, it needs to compare the record
		 * timestamp with the coming perf event, if the record timestamp
		 * is later than the perf event, it needs bail out and pushes
		 * the record into auxtrace heap, thus the record can be
		 * deferred to synthesize sample until run to here at the next
		 * time; so this can correlate samples between Arm SPE trace
		 * data and other perf events with correct time ordering.
		 */

		/*
		 * Update pid/tid info.
		 */
		record = &speq->decoder->record;
		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
			ret = arm_spe_set_tid(speq, record->context_id);
			if (ret)
				return ret;

			/* CONTEXT packets found: prefer them for pid tracking */
			spe->use_ctx_pkt_for_pid = true;
		}

		ret = arm_spe_sample(speq);
		if (ret)
			return ret;

		ret = arm_spe_decode(speq->decoder);
		if (!ret) {
			pr_debug("No data or all data has been processed.\n");
			return 1;
		}

		/*
		 * Error is detected when decode SPE trace data, continue to
		 * the next trace data and find out more records.
		 */
		if (ret < 0)
			continue;

		record = &speq->decoder->record;

		/* Update timestamp for the last record */
		if (record->timestamp > speq->timestamp)
			speq->timestamp = record->timestamp;

		/*
		 * If the timestamp of the queue is later than timestamp of the
		 * coming perf event, bail out so can allow the perf event to
		 * be processed ahead.
		 */
		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
			*timestamp = speq->timestamp;
			return 0;
		}
	}

	return 0;
}
1216 
1217 static int arm_spe__setup_queue(struct arm_spe *spe,
1218 			       struct auxtrace_queue *queue,
1219 			       unsigned int queue_nr)
1220 {
1221 	struct arm_spe_queue *speq = queue->priv;
1222 	struct arm_spe_record *record;
1223 
1224 	if (list_empty(&queue->head) || speq)
1225 		return 0;
1226 
1227 	speq = arm_spe__alloc_queue(spe, queue_nr);
1228 
1229 	if (!speq)
1230 		return -ENOMEM;
1231 
1232 	queue->priv = speq;
1233 
1234 	if (queue->cpu != -1)
1235 		speq->cpu = queue->cpu;
1236 
1237 	if (!speq->on_heap) {
1238 		int ret;
1239 
1240 		if (spe->timeless_decoding)
1241 			return 0;
1242 
1243 retry:
1244 		ret = arm_spe_decode(speq->decoder);
1245 
1246 		if (!ret)
1247 			return 0;
1248 
1249 		if (ret < 0)
1250 			goto retry;
1251 
1252 		record = &speq->decoder->record;
1253 
1254 		speq->timestamp = record->timestamp;
1255 		ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
1256 		if (ret)
1257 			return ret;
1258 		speq->on_heap = true;
1259 	}
1260 
1261 	return 0;
1262 }
1263 
1264 static int arm_spe__setup_queues(struct arm_spe *spe)
1265 {
1266 	unsigned int i;
1267 	int ret;
1268 
1269 	for (i = 0; i < spe->queues.nr_queues; i++) {
1270 		ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
1271 		if (ret)
1272 			return ret;
1273 	}
1274 
1275 	return 0;
1276 }
1277 
1278 static int arm_spe__update_queues(struct arm_spe *spe)
1279 {
1280 	if (spe->queues.new_data) {
1281 		spe->queues.new_data = false;
1282 		return arm_spe__setup_queues(spe);
1283 	}
1284 
1285 	return 0;
1286 }
1287 
1288 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
1289 {
1290 	struct evsel *evsel;
1291 	struct evlist *evlist = spe->session->evlist;
1292 	bool timeless_decoding = true;
1293 
1294 	/*
1295 	 * Circle through the list of event and complain if we find one
1296 	 * with the time bit set.
1297 	 */
1298 	evlist__for_each_entry(evlist, evsel) {
1299 		if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
1300 			timeless_decoding = false;
1301 	}
1302 
1303 	return timeless_decoding;
1304 }
1305 
/*
 * Drain all queues whose pending data is older than 'timestamp', oldest
 * first, so synthesized samples interleave with other perf events in global
 * time order. The auxtrace heap keeps queues sorted by the timestamp of
 * their next pending record.
 */
static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
{
	unsigned int queue_nr;
	u64 ts;
	int ret;

	while (1) {
		struct auxtrace_queue *queue;
		struct arm_spe_queue *speq;

		if (!spe->heap.heap_cnt)
			return 0;

		/* Oldest queue is already newer than the limit: done */
		if (spe->heap.heap_array[0].ordinal >= timestamp)
			return 0;

		queue_nr = spe->heap.heap_array[0].queue_nr;
		queue = &spe->queues.queue_array[queue_nr];
		speq = queue->priv;

		auxtrace_heap__pop(&spe->heap);

		/*
		 * Decode this queue no further than the next queue's first
		 * record (+1), capped by the caller's limit.
		 */
		if (spe->heap.heap_cnt) {
			ts = spe->heap.heap_array[0].ordinal + 1;
			if (ts > timestamp)
				ts = timestamp;
		} else {
			ts = timestamp;
		}

		/*
		 * A previous context-switch event has set pid/tid in the machine's context, so
		 * here we need to update the pid/tid in the thread and SPE queue.
		 */
		if (!spe->use_ctx_pkt_for_pid)
			arm_spe_set_pid_tid_cpu(spe, queue);

		ret = arm_spe_run_decoder(speq, &ts);
		if (ret < 0) {
			/* Keep the queue schedulable despite the error */
			auxtrace_heap__add(&spe->heap, queue_nr, ts);
			return ret;
		}

		/*
		 * ret == 0: the queue hit the 'ts' limit and is re-added;
		 * ret > 0: the queue ran out of data and leaves the heap.
		 */
		if (!ret) {
			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
			if (ret < 0)
				return ret;
		} else {
			speq->on_heap = false;
		}
	}

	return 0;
}
1360 
1361 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
1362 					    u64 time_)
1363 {
1364 	struct auxtrace_queues *queues = &spe->queues;
1365 	unsigned int i;
1366 	u64 ts = 0;
1367 
1368 	for (i = 0; i < queues->nr_queues; i++) {
1369 		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
1370 		struct arm_spe_queue *speq = queue->priv;
1371 
1372 		if (speq && (tid == -1 || speq->tid == tid)) {
1373 			speq->time = time_;
1374 			arm_spe_set_pid_tid_cpu(spe, queue);
1375 			arm_spe_run_decoder(speq, &ts);
1376 		}
1377 	}
1378 	return 0;
1379 }
1380 
1381 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
1382 				  struct perf_sample *sample)
1383 {
1384 	pid_t pid, tid;
1385 	int cpu;
1386 
1387 	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
1388 		return 0;
1389 
1390 	pid = event->context_switch.next_prev_pid;
1391 	tid = event->context_switch.next_prev_tid;
1392 	cpu = sample->cpu;
1393 
1394 	if (tid == -1)
1395 		pr_warning("context_switch event has no tid\n");
1396 
1397 	return machine__set_current_tid(spe->machine, cpu, pid, tid);
1398 }
1399 
/*
 * Session event callback: converts the event's time to a TSC value, keeps
 * the auxtrace queues current, and drives decoding. Timeless mode flushes a
 * task's queues on PERF_RECORD_EXIT; timed mode first processes all trace
 * data older than the event, and falls back to context-switch events for
 * pid/tid tracking when the trace carries no CONTEXT packets.
 */
static int arm_spe_process_event(struct perf_session *session,
				 union perf_event *event,
				 struct perf_sample *sample,
				 const struct perf_tool *tool)
{
	int err = 0;
	u64 timestamp;
	struct arm_spe *spe = container_of(session->auxtrace,
			struct arm_spe, auxtrace);

	if (dump_trace)
		return 0;

	if (!tool->ordered_events) {
		pr_err("SPE trace requires ordered events\n");
		return -EINVAL;
	}

	/* A zero or (u64)-1 sample time is treated as "no timestamp" */
	if (sample->time && (sample->time != (u64) -1))
		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
	else
		timestamp = 0;

	if (timestamp || spe->timeless_decoding) {
		err = arm_spe__update_queues(spe);
		if (err)
			return err;
	}

	if (spe->timeless_decoding) {
		if (event->header.type == PERF_RECORD_EXIT) {
			err = arm_spe_process_timeless_queues(spe,
					event->fork.tid,
					sample->time);
		}
	} else if (timestamp) {
		err = arm_spe_process_queues(spe, timestamp);
		if (err)
			return err;

		if (!spe->use_ctx_pkt_for_pid &&
		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
		    event->header.type == PERF_RECORD_SWITCH))
			err = arm_spe_context_switch(spe, event, sample);
	}

	return err;
}
1448 
1449 static int arm_spe_process_auxtrace_event(struct perf_session *session,
1450 					  union perf_event *event,
1451 					  const struct perf_tool *tool __maybe_unused)
1452 {
1453 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1454 					     auxtrace);
1455 
1456 	if (!spe->data_queued) {
1457 		struct auxtrace_buffer *buffer;
1458 		off_t data_offset;
1459 		int fd = perf_data__fd(session->data);
1460 		int err;
1461 
1462 		if (perf_data__is_pipe(session->data)) {
1463 			data_offset = 0;
1464 		} else {
1465 			data_offset = lseek(fd, 0, SEEK_CUR);
1466 			if (data_offset == -1)
1467 				return -errno;
1468 		}
1469 
1470 		err = auxtrace_queues__add_event(&spe->queues, session, event,
1471 				data_offset, &buffer);
1472 		if (err)
1473 			return err;
1474 
1475 		/* Dump here now we have copied a piped trace out of the pipe */
1476 		if (dump_trace) {
1477 			if (auxtrace_buffer__get_data(buffer, fd)) {
1478 				arm_spe_dump_event(spe, buffer->data,
1479 						buffer->size);
1480 				auxtrace_buffer__put_data(buffer);
1481 			}
1482 		}
1483 	}
1484 
1485 	return 0;
1486 }
1487 
1488 static int arm_spe_flush(struct perf_session *session __maybe_unused,
1489 			 const struct perf_tool *tool __maybe_unused)
1490 {
1491 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1492 			auxtrace);
1493 	int ret;
1494 
1495 	if (dump_trace)
1496 		return 0;
1497 
1498 	if (!tool->ordered_events)
1499 		return -EINVAL;
1500 
1501 	ret = arm_spe__update_queues(spe);
1502 	if (ret < 0)
1503 		return ret;
1504 
1505 	if (spe->timeless_decoding)
1506 		return arm_spe_process_timeless_queues(spe, -1,
1507 				MAX_TIMESTAMP - 1);
1508 
1509 	ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
1510 	if (ret)
1511 		return ret;
1512 
1513 	if (!spe->use_ctx_pkt_for_pid)
1514 		ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
1515 			    "Matching of TIDs to SPE events could be inaccurate.\n");
1516 
1517 	return 0;
1518 }
1519 
1520 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
1521 {
1522 	u64 *metadata;
1523 
1524 	metadata = zalloc(per_cpu_size);
1525 	if (!metadata)
1526 		return NULL;
1527 
1528 	memcpy(metadata, buf, per_cpu_size);
1529 	return metadata;
1530 }
1531 
1532 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
1533 {
1534 	int i;
1535 
1536 	for (i = 0; i < nr_cpu; i++)
1537 		zfree(&metadata[i]);
1538 	free(metadata);
1539 }
1540 
1541 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
1542 				     u64 *ver, int *nr_cpu)
1543 {
1544 	u64 *ptr = (u64 *)info->priv;
1545 	u64 metadata_size;
1546 	u64 **metadata = NULL;
1547 	int hdr_sz, per_cpu_sz, i;
1548 
1549 	metadata_size = info->header.size -
1550 		sizeof(struct perf_record_auxtrace_info);
1551 
1552 	/* Metadata version 1 */
1553 	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
1554 		*ver = 1;
1555 		*nr_cpu = 0;
1556 		/* No per CPU metadata */
1557 		return NULL;
1558 	}
1559 
1560 	*ver = ptr[ARM_SPE_HEADER_VERSION];
1561 	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
1562 	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];
1563 
1564 	metadata = calloc(*nr_cpu, sizeof(*metadata));
1565 	if (!metadata)
1566 		return NULL;
1567 
1568 	/* Locate the start address of per CPU metadata */
1569 	ptr += hdr_sz;
1570 	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
1571 
1572 	for (i = 0; i < *nr_cpu; i++) {
1573 		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
1574 		if (!metadata[i])
1575 			goto err_per_cpu_metadata;
1576 
1577 		ptr += per_cpu_sz / sizeof(u64);
1578 	}
1579 
1580 	return metadata;
1581 
1582 err_per_cpu_metadata:
1583 	arm_spe__free_metadata(metadata, *nr_cpu);
1584 	return NULL;
1585 }
1586 
1587 static void arm_spe_free_queue(void *priv)
1588 {
1589 	struct arm_spe_queue *speq = priv;
1590 
1591 	if (!speq)
1592 		return;
1593 	thread__zput(speq->thread);
1594 	arm_spe_decoder_free(speq->decoder);
1595 	zfree(&speq->event_buf);
1596 	zfree(&speq->last_branch);
1597 	free(speq);
1598 }
1599 
1600 static void arm_spe_free_events(struct perf_session *session)
1601 {
1602 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1603 					     auxtrace);
1604 	struct auxtrace_queues *queues = &spe->queues;
1605 	unsigned int i;
1606 
1607 	for (i = 0; i < queues->nr_queues; i++) {
1608 		arm_spe_free_queue(queues->queue_array[i].priv);
1609 		queues->queue_array[i].priv = NULL;
1610 	}
1611 	auxtrace_queues__free(queues);
1612 }
1613 
/* session->auxtrace teardown callback: releases everything arm_spe owns. */
static void arm_spe_free(struct perf_session *session)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					     auxtrace);

	auxtrace_heap__free(&spe->heap);
	/*
	 * arm_spe_free_events() reaches 'spe' through session->auxtrace, so
	 * it must run before the pointer is cleared below.
	 */
	arm_spe_free_events(session);
	session->auxtrace = NULL;
	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
	free(spe);
}
1625 
1626 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
1627 				      struct evsel *evsel)
1628 {
1629 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
1630 
1631 	return evsel->core.attr.type == spe->pmu_type;
1632 }
1633 
/* printf formats for the version 1 auxtrace info header fields */
static const char * const metadata_hdr_v1_fmts[] = {
	[ARM_SPE_PMU_TYPE]		= "  PMU Type           :%"PRId64"\n",
	[ARM_SPE_PER_CPU_MMAPS]		= "  Per CPU mmaps      :%"PRId64"\n",
};

/* printf formats for the version 2+ auxtrace info header fields */
static const char * const metadata_hdr_fmts[] = {
	[ARM_SPE_HEADER_VERSION]	= "  Header version     :%"PRId64"\n",
	[ARM_SPE_HEADER_SIZE]		= "  Header size        :%"PRId64"\n",
	[ARM_SPE_PMU_TYPE_V2]		= "  PMU type v2        :%"PRId64"\n",
	[ARM_SPE_CPUS_NUM]		= "  CPU number         :%"PRId64"\n",
};

/* printf formats for the per-CPU metadata records (version 2+ only) */
static const char * const metadata_per_cpu_fmts[] = {
	[ARM_SPE_MAGIC]			= "    Magic            :0x%"PRIx64"\n",
	[ARM_SPE_CPU]			= "    CPU #            :%"PRId64"\n",
	[ARM_SPE_CPU_NR_PARAMS]		= "    Num of params    :%"PRId64"\n",
	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
	[ARM_SPE_CAP_EVENT_FILTER]	= "    Event Filter     :0x%"PRIx64"\n",
};
1655 
/*
 * Dump the auxtrace info payload (header fields plus, for metadata v2+, one
 * record per CPU) to stdout. Only active under trace dump mode.
 */
static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
{
	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
	const char * const *hdr_fmts;

	if (!dump_trace)
		return;

	if (spe->metadata_ver == 1) {
		/* v1 has a fixed-size header and no per-CPU records */
		cpu_num = 0;
		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
		hdr_fmts = metadata_hdr_v1_fmts;
	} else {
		cpu_num = arr[ARM_SPE_CPUS_NUM];
		hdr_size = arr[ARM_SPE_HEADER_SIZE];
		hdr_fmts = metadata_hdr_fmts;
	}

	for (i = 0; i < hdr_size; i++)
		fprintf(stdout, hdr_fmts[i], arr[i]);

	arr += hdr_size;
	for (cpu = 0; cpu < cpu_num; cpu++) {
		/*
		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
		 * are fixed. The sequential parameter size is decided by the
		 * field 'ARM_SPE_CPU_NR_PARAMS'.
		 */
		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
		for (i = 0; i < cpu_size; i++)
			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
		arr += cpu_size;
	}
}
1690 
1691 static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
1692 				    const char *name)
1693 {
1694 	struct evsel *evsel;
1695 
1696 	evlist__for_each_entry(evlist, evsel) {
1697 		if (evsel->core.id && evsel->core.id[0] == id) {
1698 			if (evsel->name)
1699 				zfree(&evsel->name);
1700 			evsel->name = strdup(name);
1701 			break;
1702 		}
1703 	}
1704 }
1705 
1706 static int
1707 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
1708 {
1709 	struct evlist *evlist = session->evlist;
1710 	struct evsel *evsel;
1711 	struct perf_event_attr attr;
1712 	bool found = false;
1713 	u64 id;
1714 	int err;
1715 
1716 	evlist__for_each_entry(evlist, evsel) {
1717 		if (evsel->core.attr.type == spe->pmu_type) {
1718 			found = true;
1719 			break;
1720 		}
1721 	}
1722 
1723 	if (!found) {
1724 		pr_debug("No selected events with SPE trace data\n");
1725 		return 0;
1726 	}
1727 
1728 	memset(&attr, 0, sizeof(struct perf_event_attr));
1729 	attr.size = sizeof(struct perf_event_attr);
1730 	attr.type = PERF_TYPE_HARDWARE;
1731 	attr.sample_type = evsel->core.attr.sample_type &
1732 				(PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
1733 	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
1734 			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
1735 			    PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
1736 	if (spe->timeless_decoding)
1737 		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
1738 	else
1739 		attr.sample_type |= PERF_SAMPLE_TIME;
1740 
1741 	spe->sample_type = attr.sample_type;
1742 
1743 	attr.exclude_user = evsel->core.attr.exclude_user;
1744 	attr.exclude_kernel = evsel->core.attr.exclude_kernel;
1745 	attr.exclude_hv = evsel->core.attr.exclude_hv;
1746 	attr.exclude_host = evsel->core.attr.exclude_host;
1747 	attr.exclude_guest = evsel->core.attr.exclude_guest;
1748 	attr.sample_id_all = evsel->core.attr.sample_id_all;
1749 	attr.read_format = evsel->core.attr.read_format;
1750 	attr.sample_period = spe->synth_opts.period;
1751 
1752 	/* create new id val to be a fixed offset from evsel id */
1753 	id = auxtrace_synth_id_range_start(evsel);
1754 
1755 	if (spe->synth_opts.flc) {
1756 		spe->sample_flc = true;
1757 
1758 		/* Level 1 data cache miss */
1759 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1760 		if (err)
1761 			return err;
1762 		spe->l1d_miss_id = id;
1763 		arm_spe_set_event_name(evlist, id, "l1d-miss");
1764 		id += 1;
1765 
1766 		/* Level 1 data cache access */
1767 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1768 		if (err)
1769 			return err;
1770 		spe->l1d_access_id = id;
1771 		arm_spe_set_event_name(evlist, id, "l1d-access");
1772 		id += 1;
1773 	}
1774 
1775 	if (spe->synth_opts.llc) {
1776 		spe->sample_llc = true;
1777 
1778 		/* Last level cache miss */
1779 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1780 		if (err)
1781 			return err;
1782 		spe->llc_miss_id = id;
1783 		arm_spe_set_event_name(evlist, id, "llc-miss");
1784 		id += 1;
1785 
1786 		/* Last level cache access */
1787 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1788 		if (err)
1789 			return err;
1790 		spe->llc_access_id = id;
1791 		arm_spe_set_event_name(evlist, id, "llc-access");
1792 		id += 1;
1793 	}
1794 
1795 	if (spe->synth_opts.tlb) {
1796 		spe->sample_tlb = true;
1797 
1798 		/* TLB miss */
1799 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1800 		if (err)
1801 			return err;
1802 		spe->tlb_miss_id = id;
1803 		arm_spe_set_event_name(evlist, id, "tlb-miss");
1804 		id += 1;
1805 
1806 		/* TLB access */
1807 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1808 		if (err)
1809 			return err;
1810 		spe->tlb_access_id = id;
1811 		arm_spe_set_event_name(evlist, id, "tlb-access");
1812 		id += 1;
1813 	}
1814 
1815 	if (spe->synth_opts.last_branch) {
1816 		if (spe->synth_opts.last_branch_sz > 2)
1817 			pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
1818 
1819 		attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
1820 		/*
1821 		 * We don't use the hardware index, but the sample generation
1822 		 * code uses the new format branch_stack with this field,
1823 		 * so the event attributes must indicate that it's present.
1824 		 */
1825 		attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
1826 	}
1827 
1828 	if (spe->synth_opts.branches) {
1829 		spe->sample_branch = true;
1830 
1831 		/* Branch */
1832 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1833 		if (err)
1834 			return err;
1835 		spe->branch_id = id;
1836 		arm_spe_set_event_name(evlist, id, "branch");
1837 		id += 1;
1838 	}
1839 
1840 	if (spe->synth_opts.remote_access) {
1841 		spe->sample_remote_access = true;
1842 
1843 		/* Remote access */
1844 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1845 		if (err)
1846 			return err;
1847 		spe->remote_access_id = id;
1848 		arm_spe_set_event_name(evlist, id, "remote-access");
1849 		id += 1;
1850 	}
1851 
1852 	if (spe->synth_opts.mem) {
1853 		spe->sample_memory = true;
1854 
1855 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1856 		if (err)
1857 			return err;
1858 		spe->memory_id = id;
1859 		arm_spe_set_event_name(evlist, id, "memory");
1860 		id += 1;
1861 	}
1862 
1863 	if (spe->synth_opts.instructions) {
1864 		spe->sample_instructions = true;
1865 		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
1866 
1867 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1868 		if (err)
1869 			return err;
1870 		spe->instructions_id = id;
1871 		arm_spe_set_event_name(evlist, id, "instructions");
1872 	}
1873 
1874 	return 0;
1875 }
1876 
1877 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
1878 {
1879 	u64 midr;
1880 	int i;
1881 
1882 	if (!nr_cpu)
1883 		return false;
1884 
1885 	for (i = 0; i < nr_cpu; i++) {
1886 		if (!metadata[i])
1887 			return false;
1888 
1889 		if (i == 0) {
1890 			midr = metadata[i][ARM_SPE_CPU_MIDR];
1891 			continue;
1892 		}
1893 
1894 		if (midr != metadata[i][ARM_SPE_CPU_MIDR])
1895 			return false;
1896 	}
1897 
1898 	return true;
1899 }
1900 
/*
 * Parse the PERF_RECORD_AUXTRACE_INFO event for Arm SPE: validate and copy
 * the (possibly per-CPU) metadata, allocate and wire up the struct arm_spe
 * decoder context, register the auxtrace callbacks on the session, and
 * synthesize the requested perf events. Returns 0 on success or a negative
 * error code; on failure the session is left without auxtrace state.
 */
int arm_spe_process_auxtrace_info(union perf_event *event,
				  struct perf_session *session)
{
	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
	size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
	struct perf_record_time_conv *tc = &session->time_conv;
	struct arm_spe *spe;
	u64 **metadata = NULL;
	u64 metadata_ver;
	int nr_cpu, err;

	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
					min_sz)
		return -EINVAL;

	/* Version 1 metadata legitimately yields a NULL table (no per-CPU data) */
	metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
					   &nr_cpu);
	if (!metadata && metadata_ver != 1) {
		pr_err("Failed to parse Arm SPE metadata.\n");
		return -EINVAL;
	}

	spe = zalloc(sizeof(struct arm_spe));
	if (!spe) {
		err = -ENOMEM;
		goto err_free_metadata;
	}

	err = auxtrace_queues__init(&spe->queues);
	if (err)
		goto err_free;

	spe->session = session;
	spe->machine = &session->machines.host; /* No kvm support */
	spe->auxtrace_type = auxtrace_info->type;
	/* The PMU type lives at a different header index depending on version */
	if (metadata_ver == 1)
		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
	else
		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
	spe->metadata = metadata;
	spe->metadata_ver = metadata_ver;
	spe->metadata_nr_cpu = nr_cpu;
	spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);

	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);

	/*
	 * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead
	 * and the parameters for hardware clock are stored in the session
	 * context.  Passes these parameters to the struct perf_tsc_conversion
	 * in "spe->tc", which is used for later conversion between clock
	 * counter and timestamp.
	 *
	 * For backward compatibility, copies the fields starting from
	 * "time_cycles" only if they are contained in the event.
	 */
	spe->tc.time_shift = tc->time_shift;
	spe->tc.time_mult = tc->time_mult;
	spe->tc.time_zero = tc->time_zero;

	if (event_contains(*tc, time_cycles)) {
		spe->tc.time_cycles = tc->time_cycles;
		spe->tc.time_mask = tc->time_mask;
		spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
		spe->tc.cap_user_time_short = tc->cap_user_time_short;
	}

	spe->auxtrace.process_event = arm_spe_process_event;
	spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
	spe->auxtrace.flush_events = arm_spe_flush;
	spe->auxtrace.free_events = arm_spe_free_events;
	spe->auxtrace.free = arm_spe_free;
	spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
	session->auxtrace = &spe->auxtrace;

	arm_spe_print_info(spe, &auxtrace_info->priv[0]);

	/* Raw dump mode needs no synthesized events or queued data */
	if (dump_trace)
		return 0;

	if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
		spe->synth_opts = *session->itrace_synth_opts;
	} else {
		itrace_synth_opts__set_default(&spe->synth_opts, false);
		/* Default nanoseconds period not supported */
		spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS;
		spe->synth_opts.period = 1;
	}

	if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
		ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n");
		err = -EINVAL;
		goto err_free_queues;
	}
	if (spe->synth_opts.period > 1)
		ui__warning("Arm SPE has a hardware-based sampling period.\n\n"
			    "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n");

	err = arm_spe_synth_events(spe, session);
	if (err)
		goto err_free_queues;

	err = auxtrace_queues__process_index(&spe->queues, session);
	if (err)
		goto err_free_queues;

	if (spe->queues.populated)
		spe->data_queued = true;

	return 0;

err_free_queues:
	auxtrace_queues__free(&spe->queues);
	session->auxtrace = NULL;
err_free:
	free(spe);
err_free_metadata:
	arm_spe__free_metadata(metadata, nr_cpu);
	return err;
}
2021