xref: /linux/tools/perf/util/arm-spe.c (revision df2e3152f1cb798ed8ffa7e488c50261e6dc50e3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Arm Statistical Profiling Extensions (SPE) support
4  * Copyright (c) 2017-2018, Arm Ltd.
5  */
6 
7 #include <byteswap.h>
8 #include <endian.h>
9 #include <errno.h>
10 #include <inttypes.h>
11 #include <linux/bitops.h>
12 #include <linux/kernel.h>
13 #include <linux/log2.h>
14 #include <linux/types.h>
15 #include <linux/zalloc.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18 
19 #include "auxtrace.h"
20 #include "color.h"
21 #include "debug.h"
22 #include "evlist.h"
23 #include "evsel.h"
24 #include "machine.h"
25 #include "session.h"
26 #include "symbol.h"
27 #include "thread.h"
28 #include "thread-stack.h"
29 #include "tsc.h"
30 #include "tool.h"
31 #include "util/synthetic-events.h"
32 
33 #include "arm-spe.h"
34 #include "arm-spe-decoder/arm-spe-decoder.h"
35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
36 
37 #include "../../arch/arm64/include/asm/cputype.h"
38 #define MAX_TIMESTAMP (~0ULL)
39 
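/*
 * Per-session Arm SPE decoding state: auxtrace queues and heap, synthesized
 * event configuration and sample IDs, timestamp conversion parameters and
 * the per-CPU metadata parsed from the auxtrace info event.
 */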
40 struct arm_spe {
41 	struct auxtrace			auxtrace;
42 	struct auxtrace_queues		queues;
43 	struct auxtrace_heap		heap;
44 	struct itrace_synth_opts        synth_opts;
45 	u32				auxtrace_type;
46 	struct perf_session		*session;
47 	struct machine			*machine;
48 	u32				pmu_type;
49 
50 	struct perf_tsc_conversion	tc;
51 
52 	u8				timeless_decoding;
53 	u8				data_queued;
54 
55 	u64				sample_type;
56 	u8				sample_flc;
57 	u8				sample_llc;
58 	u8				sample_tlb;
59 	u8				sample_branch;
60 	u8				sample_remote_access;
61 	u8				sample_memory;
62 	u8				sample_instructions;
63 	u64				instructions_sample_period;
64 
65 	u64				l1d_miss_id;
66 	u64				l1d_access_id;
67 	u64				llc_miss_id;
68 	u64				llc_access_id;
69 	u64				tlb_miss_id;
70 	u64				tlb_access_id;
71 	u64				branch_id;
72 	u64				remote_access_id;
73 	u64				memory_id;
74 	u64				instructions_id;
75 
76 	u64				kernel_start;
77 
78 	unsigned long			num_events;
79 	u8				use_ctx_pkt_for_pid;
80 
81 	u64				**metadata;
82 	u64				metadata_ver;
83 	u64				metadata_nr_cpu;
84 	bool				is_homogeneous;
85 };
86 
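/*
 * Decoding state for a single auxtrace queue: the buffers being consumed,
 * the packet decoder instance and the pid/tid/cpu context used when
 * synthesizing samples.
 */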
87 struct arm_spe_queue {
88 	struct arm_spe			*spe;
89 	unsigned int			queue_nr;
90 	struct auxtrace_buffer		*buffer;
91 	struct auxtrace_buffer		*old_buffer;
92 	union perf_event		*event_buf;
93 	bool				on_heap;
94 	bool				done;
95 	pid_t				pid;
96 	pid_t				tid;
97 	int				cpu;
98 	struct arm_spe_decoder		*decoder;
99 	u64				time;
100 	u64				timestamp;
101 	struct thread			*thread;
102 	u64				period_instructions;
103 	u32				flags;
104 };
105 
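/* Maps a set of CPU MIDRs to the data source decode routine they use */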
106 struct data_source_handle {
107 	const struct midr_range *midr_ranges;
108 	void (*ds_synth)(const struct arm_spe_record *record,
109 			 union perf_mem_data_src *data_src);
110 };
111 
112 #define DS(range, func)					\
113 	{						\
114 		.midr_ranges = range,			\
115 		.ds_synth = arm_spe__synth_##func,	\
116 	}
117 
118 static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
119 			 unsigned char *buf, size_t len)
120 {
121 	struct arm_spe_pkt packet;
122 	size_t pos = 0;
123 	int ret, pkt_len, i;
124 	char desc[ARM_SPE_PKT_DESC_MAX];
125 	const char *color = PERF_COLOR_BLUE;
126 
127 	color_fprintf(stdout, color,
128 		      ". ... ARM SPE data: size %#zx bytes\n",
129 		      len);
130 
131 	while (len) {
132 		ret = arm_spe_get_packet(buf, len, &packet);
133 		if (ret > 0)
134 			pkt_len = ret;
135 		else
136 			pkt_len = 1;
137 		printf(".");
138 		color_fprintf(stdout, color, "  %08zx: ", pos);
139 		for (i = 0; i < pkt_len; i++)
140 			color_fprintf(stdout, color, " %02x", buf[i]);
141 		for (; i < 16; i++)
142 			color_fprintf(stdout, color, "   ");
143 		if (ret > 0) {
144 			ret = arm_spe_pkt_desc(&packet, desc,
145 					       ARM_SPE_PKT_DESC_MAX);
146 			if (!ret)
147 				color_fprintf(stdout, color, " %s\n", desc);
148 		} else {
149 			color_fprintf(stdout, color, " Bad packet!\n");
150 		}
151 		pos += pkt_len;
152 		buf += pkt_len;
153 		len -= pkt_len;
154 	}
155 }
156 
157 static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
158 			       size_t len)
159 {
160 	printf(".\n");
161 	arm_spe_dump(spe, buf, len);
162 }
163 
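/*
 * Decoder callback: fetch the next auxtrace buffer for this queue and hand
 * its data to the decoder, dropping the previously consumed buffer.
 */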
164 static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
165 {
166 	struct arm_spe_queue *speq = data;
167 	struct auxtrace_buffer *buffer = speq->buffer;
168 	struct auxtrace_buffer *old_buffer = speq->old_buffer;
169 	struct auxtrace_queue *queue;
170 
171 	queue = &speq->spe->queues.queue_array[speq->queue_nr];
172 
173 	buffer = auxtrace_buffer__next(queue, buffer);
174 	/* If no more data, drop the previous auxtrace_buffer and return */
175 	if (!buffer) {
176 		if (old_buffer)
177 			auxtrace_buffer__drop_data(old_buffer);
178 		b->len = 0;
179 		return 0;
180 	}
181 
182 	speq->buffer = buffer;
183 
184 	/* If the aux_buffer doesn't have data associated, try to load it */
185 	if (!buffer->data) {
186 		/* get the file desc associated with the perf data file */
187 		int fd = perf_data__fd(speq->spe->session->data);
188 
189 		buffer->data = auxtrace_buffer__get_data(buffer, fd);
190 		if (!buffer->data)
191 			return -ENOMEM;
192 	}
193 
194 	b->len = buffer->size;
195 	b->buf = buffer->data;
196 
197 	if (b->len) {
198 		if (old_buffer)
199 			auxtrace_buffer__drop_data(old_buffer);
200 		speq->old_buffer = buffer;
201 	} else {
202 		auxtrace_buffer__drop_data(buffer);
203 		return arm_spe_get_trace(b, data);
204 	}
205 
206 	return 0;
207 }
208 
209 static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
210 		unsigned int queue_nr)
211 {
212 	struct arm_spe_params params = { .get_trace = 0, };
213 	struct arm_spe_queue *speq;
214 
215 	speq = zalloc(sizeof(*speq));
216 	if (!speq)
217 		return NULL;
218 
219 	speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
220 	if (!speq->event_buf)
221 		goto out_free;
222 
223 	speq->spe = spe;
224 	speq->queue_nr = queue_nr;
225 	speq->pid = -1;
226 	speq->tid = -1;
227 	speq->cpu = -1;
228 	speq->period_instructions = 0;
229 
230 	/* params set */
231 	params.get_trace = arm_spe_get_trace;
232 	params.data = speq;
233 
234 	/* create new decoder */
235 	speq->decoder = arm_spe_decoder_new(&params);
236 	if (!speq->decoder)
237 		goto out_free;
238 
239 	return speq;
240 
241 out_free:
242 	zfree(&speq->event_buf);
243 	free(speq);
244 
245 	return NULL;
246 }
247 
248 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
249 {
250 	return ip >= spe->kernel_start ?
251 		PERF_RECORD_MISC_KERNEL :
252 		PERF_RECORD_MISC_USER;
253 }
254 
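/*
 * Refresh the queue's pid/tid/cpu from the machine's current-tid state
 * (set by context switch events) or, failing that, from the queue itself.
 */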
255 static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
256 				    struct auxtrace_queue *queue)
257 {
258 	struct arm_spe_queue *speq = queue->priv;
259 	pid_t tid;
260 
261 	tid = machine__get_current_tid(spe->machine, speq->cpu);
262 	if (tid != -1) {
263 		speq->tid = tid;
264 		thread__zput(speq->thread);
265 	} else
266 		speq->tid = queue->tid;
267 
268 	if ((!speq->thread) && (speq->tid != -1)) {
269 		speq->thread = machine__find_thread(spe->machine, -1,
270 						    speq->tid);
271 	}
272 
273 	if (speq->thread) {
274 		speq->pid = thread__pid(speq->thread);
275 		if (queue->cpu == -1)
276 			speq->cpu = thread__cpu(speq->thread);
277 	}
278 }
279 
280 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
281 {
282 	struct arm_spe *spe = speq->spe;
283 	int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
284 
285 	if (err)
286 		return err;
287 
288 	arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
289 
290 	return 0;
291 }
292 
293 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu)
294 {
295 	u64 i;
296 
297 	if (!spe->metadata)
298 		return NULL;
299 
300 	for (i = 0; i < spe->metadata_nr_cpu; i++)
301 		if (spe->metadata[i][ARM_SPE_CPU] == cpu)
302 			return spe->metadata[i];
303 
304 	return NULL;
305 }
306 
307 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
308 {
309 	struct simd_flags simd_flags = {};
310 
311 	if ((record->op & ARM_SPE_OP_LDST) && (record->op & ARM_SPE_OP_SVE_LDST))
312 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
313 
314 	if ((record->op & ARM_SPE_OP_OTHER) && (record->op & ARM_SPE_OP_SVE_OTHER))
315 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
316 
317 	if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
318 		simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;
319 
320 	if (record->type & ARM_SPE_SVE_EMPTY_PRED)
321 		simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;
322 
323 	return simd_flags;
324 }
325 
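/* Fill in the common sample fields shared by all synthesized SPE samples */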
326 static void arm_spe_prep_sample(struct arm_spe *spe,
327 				struct arm_spe_queue *speq,
328 				union perf_event *event,
329 				struct perf_sample *sample)
330 {
331 	struct arm_spe_record *record = &speq->decoder->record;
332 
333 	if (!spe->timeless_decoding)
334 		sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);
335 
336 	sample->ip = record->from_ip;
337 	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
338 	sample->pid = speq->pid;
339 	sample->tid = speq->tid;
340 	sample->period = 1;
341 	sample->cpu = speq->cpu;
342 	sample->simd_flags = arm_spe__synth_simd_flags(record);
343 
344 	event->sample.header.type = PERF_RECORD_SAMPLE;
345 	event->sample.header.misc = sample->cpumode;
346 	event->sample.header.size = sizeof(struct perf_event_header);
347 }
348 
349 static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
350 {
351 	event->header.size = perf_event__sample_event_size(sample, type, 0);
352 	return perf_event__synthesize_sample(event, type, 0, sample);
353 }
354 
355 static inline int
356 arm_spe_deliver_synth_event(struct arm_spe *spe,
357 			    struct arm_spe_queue *speq __maybe_unused,
358 			    union perf_event *event,
359 			    struct perf_sample *sample)
360 {
361 	int ret;
362 
363 	if (spe->synth_opts.inject) {
364 		ret = arm_spe__inject_event(event, sample, spe->sample_type);
365 		if (ret)
366 			return ret;
367 	}
368 
369 	ret = perf_session__deliver_synth_event(spe->session, event, sample);
370 	if (ret)
371 		pr_err("ARM SPE: failed to deliver event, error %d\n", ret);
372 
373 	return ret;
374 }
375 
376 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
377 				     u64 spe_events_id, u64 data_src)
378 {
379 	struct arm_spe *spe = speq->spe;
380 	struct arm_spe_record *record = &speq->decoder->record;
381 	union perf_event *event = speq->event_buf;
382 	struct perf_sample sample = { .ip = 0, };
383 
384 	arm_spe_prep_sample(spe, speq, event, &sample);
385 
386 	sample.id = spe_events_id;
387 	sample.stream_id = spe_events_id;
388 	sample.addr = record->virt_addr;
389 	sample.phys_addr = record->phys_addr;
390 	sample.data_src = data_src;
391 	sample.weight = record->latency;
392 
393 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
394 }
395 
396 static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
397 					u64 spe_events_id)
398 {
399 	struct arm_spe *spe = speq->spe;
400 	struct arm_spe_record *record = &speq->decoder->record;
401 	union perf_event *event = speq->event_buf;
402 	struct perf_sample sample = { .ip = 0, };
403 
404 	arm_spe_prep_sample(spe, speq, event, &sample);
405 
406 	sample.id = spe_events_id;
407 	sample.stream_id = spe_events_id;
408 	sample.addr = record->to_ip;
409 	sample.weight = record->latency;
410 	sample.flags = speq->flags;
411 
412 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
413 }
414 
415 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
416 					     u64 spe_events_id, u64 data_src)
417 {
418 	struct arm_spe *spe = speq->spe;
419 	struct arm_spe_record *record = &speq->decoder->record;
420 	union perf_event *event = speq->event_buf;
421 	struct perf_sample sample = { .ip = 0, };
422 
423 	/*
424 	 * Handles perf instruction sampling period.
425 	 */
426 	speq->period_instructions++;
427 	if (speq->period_instructions < spe->instructions_sample_period)
428 		return 0;
429 	speq->period_instructions = 0;
430 
431 	arm_spe_prep_sample(spe, speq, event, &sample);
432 
433 	sample.id = spe_events_id;
434 	sample.stream_id = spe_events_id;
435 	sample.addr = record->to_ip;
436 	sample.phys_addr = record->phys_addr;
437 	sample.data_src = data_src;
438 	sample.period = spe->instructions_sample_period;
439 	sample.weight = record->latency;
440 	sample.flags = speq->flags;
441 
442 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
443 }
444 
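/* CPUs that use the common (Neoverse/Cortex) data source packet encoding */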
445 static const struct midr_range common_ds_encoding_cpus[] = {
446 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
447 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
448 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
449 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
450 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
451 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
452 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
453 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
454 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
455 	{},
456 };
457 
458 static const struct midr_range ampereone_ds_encoding_cpus[] = {
459 	MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
460 	{},
461 };
462 
463 static void arm_spe__sample_flags(struct arm_spe_queue *speq)
464 {
465 	const struct arm_spe_record *record = &speq->decoder->record;
466 
467 	speq->flags = 0;
468 	if (record->op & ARM_SPE_OP_BRANCH_ERET) {
469 		speq->flags = PERF_IP_FLAG_BRANCH;
470 
471 		if (record->type & ARM_SPE_BRANCH_MISS)
472 			speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
473 	}
474 }
475 
476 static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
477 					      union perf_mem_data_src *data_src)
478 {
479 	/*
480 	 * Even though four levels of cache hierarchy are possible, no known
481 	 * production Neoverse systems currently include more than three levels,
482 	 * so for the time being we assume three exist. If a production system
483 	 * is built with four, this function would have to be changed to
484 	 * detect the number of levels for reporting.
485 	 */
486 
487 	/*
488 	 * We have no data on the hit level or data source for stores in the
489 	 * Neoverse SPE records.
490 	 */
491 	if (record->op & ARM_SPE_OP_ST) {
492 		data_src->mem_lvl = PERF_MEM_LVL_NA;
493 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
494 		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
495 		return;
496 	}
497 
498 	switch (record->source) {
499 	case ARM_SPE_COMMON_DS_L1D:
500 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
501 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
502 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
503 		break;
504 	case ARM_SPE_COMMON_DS_L2:
505 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
506 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
507 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
508 		break;
509 	case ARM_SPE_COMMON_DS_PEER_CORE:
510 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
511 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
512 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
513 		break;
514 	/*
515 	 * We don't know whether this hit in L1 or L2, but we do know it was a
516 	 * cache-to-cache transfer, so set SNOOPX_PEER
517 	 */
518 	case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
519 	case ARM_SPE_COMMON_DS_PEER_CLUSTER:
520 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
521 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
522 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
523 		break;
524 	/*
525 	 * System cache is assumed to be L3
526 	 */
527 	case ARM_SPE_COMMON_DS_SYS_CACHE:
528 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
529 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
530 		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
531 		break;
532 	/*
533 	 * We don't know what level it hit in, except it came from the other
534 	 * socket
535 	 */
536 	case ARM_SPE_COMMON_DS_REMOTE:
537 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
538 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
539 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
540 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
541 		break;
542 	case ARM_SPE_COMMON_DS_DRAM:
543 		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
544 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
545 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
546 		break;
547 	default:
548 		break;
549 	}
550 }
551 
552 /*
553  * The source field is IMPDEF. Here we convert the encoding used on AmpereOne
554  * cores to the common (Neoverse, Cortex) one to avoid duplicating the decoder.
555  */
556 static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
557 						 union perf_mem_data_src *data_src)
558 {
559 	struct arm_spe_record common_record;
560 
561 	switch (record->source) {
562 	case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
563 		common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
564 		break;
565 	case ARM_SPE_AMPEREONE_SLC:
566 		common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
567 		break;
568 	case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
569 		common_record.source = ARM_SPE_COMMON_DS_REMOTE;
570 		break;
571 	case ARM_SPE_AMPEREONE_DDR:
572 		common_record.source = ARM_SPE_COMMON_DS_DRAM;
573 		break;
574 	case ARM_SPE_AMPEREONE_L1D:
575 		common_record.source = ARM_SPE_COMMON_DS_L1D;
576 		break;
577 	case ARM_SPE_AMPEREONE_L2D:
578 		common_record.source = ARM_SPE_COMMON_DS_L2;
579 		break;
580 	default:
581 		pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
582 				record->source);
583 		return;
584 	}
585 
586 	common_record.op = record->op;
587 	arm_spe__synth_data_source_common(&common_record, data_src);
588 }
589 
590 static const struct data_source_handle data_source_handles[] = {
591 	DS(common_ds_encoding_cpus, data_source_common),
592 	DS(ampereone_ds_encoding_cpus, data_source_ampereone),
593 };
594 
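/*
 * Fallback when the data source packet cannot be decoded: derive the memory
 * level from the L1D/LLC/remote-access event type bits instead.
 */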
595 static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
596 					union perf_mem_data_src *data_src)
597 {
598 	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
599 		data_src->mem_lvl = PERF_MEM_LVL_L3;
600 
601 		if (record->type & ARM_SPE_LLC_MISS)
602 			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
603 		else
604 			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
605 	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
606 		data_src->mem_lvl = PERF_MEM_LVL_L1;
607 
608 		if (record->type & ARM_SPE_L1D_MISS)
609 			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
610 		else
611 			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
612 	}
613 
614 	if (record->type & ARM_SPE_REMOTE_ACCESS)
615 		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
616 }
617 
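/*
 * Look up the CPU's MIDR in the metadata and, if it matches a known data
 * source encoding, synthesize the data source from the record. Returns
 * false when the encoding for this CPU is unknown.
 */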
618 static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
619 			      const struct arm_spe_record *record,
620 			      union perf_mem_data_src *data_src)
621 {
622 	struct arm_spe *spe = speq->spe;
623 	u64 *metadata = NULL;
624 	u64 midr;
625 	unsigned int i;
626 
627 	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
628 	if (spe->metadata_ver == 1) {
629 		const char *cpuid;
630 
631 		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
632 		cpuid = perf_env__cpuid(spe->session->evlist->env);
633 		midr = strtol(cpuid, NULL, 16);
634 	} else {
635 		/* CPU ID is -1 for per-thread mode */
636 		if (speq->cpu < 0) {
637 			/*
638 			 * On a heterogeneous system, since the CPU ID is -1 we
639 			 * cannot confirm that the data source packet is supported.
640 			 */
641 			if (!spe->is_homogeneous)
642 				return false;
643 
644 			/* In a homogeneous system, simply use CPU0's metadata */
645 			if (spe->metadata)
646 				metadata = spe->metadata[0];
647 		} else {
648 			metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
649 		}
650 
651 		if (!metadata)
652 			return false;
653 
654 		midr = metadata[ARM_SPE_CPU_MIDR];
655 	}
656 
657 	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
658 		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
659 			data_source_handles[i].ds_synth(record, data_src);
660 			return true;
661 		}
662 	}
663 
664 	return false;
665 }
666 
667 static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
668 				      const struct arm_spe_record *record)
669 {
670 	union perf_mem_data_src	data_src = { .mem_op = PERF_MEM_OP_NA };
671 
672 	if (record->op & ARM_SPE_OP_LD)
673 		data_src.mem_op = PERF_MEM_OP_LOAD;
674 	else if (record->op & ARM_SPE_OP_ST)
675 		data_src.mem_op = PERF_MEM_OP_STORE;
676 	else
677 		return 0;
678 
679 	if (!arm_spe__synth_ds(speq, record, &data_src))
680 		arm_spe__synth_memory_level(record, &data_src);
681 
682 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
683 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
684 
685 		if (record->type & ARM_SPE_TLB_MISS)
686 			data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
687 		else
688 			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
689 	}
690 
691 	return data_src.val;
692 }
693 
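/*
 * Synthesize samples for the current record, one for each event type that
 * was requested via the itrace options and matches the record.
 */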
694 static int arm_spe_sample(struct arm_spe_queue *speq)
695 {
696 	const struct arm_spe_record *record = &speq->decoder->record;
697 	struct arm_spe *spe = speq->spe;
698 	u64 data_src;
699 	int err;
700 
701 	arm_spe__sample_flags(speq);
702 	data_src = arm_spe__synth_data_source(speq, record);
703 
704 	if (spe->sample_flc) {
705 		if (record->type & ARM_SPE_L1D_MISS) {
706 			err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
707 							data_src);
708 			if (err)
709 				return err;
710 		}
711 
712 		if (record->type & ARM_SPE_L1D_ACCESS) {
713 			err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
714 							data_src);
715 			if (err)
716 				return err;
717 		}
718 	}
719 
720 	if (spe->sample_llc) {
721 		if (record->type & ARM_SPE_LLC_MISS) {
722 			err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
723 							data_src);
724 			if (err)
725 				return err;
726 		}
727 
728 		if (record->type & ARM_SPE_LLC_ACCESS) {
729 			err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
730 							data_src);
731 			if (err)
732 				return err;
733 		}
734 	}
735 
736 	if (spe->sample_tlb) {
737 		if (record->type & ARM_SPE_TLB_MISS) {
738 			err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
739 							data_src);
740 			if (err)
741 				return err;
742 		}
743 
744 		if (record->type & ARM_SPE_TLB_ACCESS) {
745 			err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
746 							data_src);
747 			if (err)
748 				return err;
749 		}
750 	}
751 
752 	if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
753 		err = arm_spe__synth_branch_sample(speq, spe->branch_id);
754 		if (err)
755 			return err;
756 	}
757 
758 	if (spe->sample_remote_access &&
759 	    (record->type & ARM_SPE_REMOTE_ACCESS)) {
760 		err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
761 						data_src);
762 		if (err)
763 			return err;
764 	}
765 
766 	/*
767 	 * When data_src is zero the record is not a memory operation, so skip
768 	 * synthesizing a memory sample in that case.
769 	 */
770 	if (spe->sample_memory && data_src) {
771 		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
772 		if (err)
773 			return err;
774 	}
775 
776 	if (spe->sample_instructions) {
777 		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
778 		if (err)
779 			return err;
780 	}
781 
782 	return 0;
783 }
784 
785 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
786 {
787 	struct arm_spe *spe = speq->spe;
788 	struct arm_spe_record *record;
789 	int ret;
790 
791 	if (!spe->kernel_start)
792 		spe->kernel_start = machine__kernel_start(spe->machine);
793 
794 	while (1) {
795 		/*
796 		 * The usual logic would be to decode the packets first and then
797 		 * synthesize a sample based on the resulting record; here the
798 		 * flow is reversed: arm_spe_sample() is called to synthesize
799 		 * samples before arm_spe_decode().
800 		 *
801 		 * There are two reasons for this ordering:
802 		 * 1. When the queue is set up in arm_spe__setup_queue(), the
803 		 * trace data has already been decoded and a record generated,
804 		 * but no sample has been synthesized for that record yet; the
805 		 * leftover record is therefore handled here first.
806 		 * 2. After decoding trace data, the record timestamp must be
807 		 * compared with the timestamp of the incoming perf event. If
808 		 * the record is later, this function bails out and pushes the
809 		 * record onto the auxtrace heap, so synthesizing its sample is
810 		 * deferred until the next call here; this keeps samples from
811 		 * Arm SPE trace data correlated with other perf events in
812 		 * correct time order.
813 		 */
814 
815 		/*
816 		 * Update pid/tid info.
817 		 */
818 		record = &speq->decoder->record;
819 		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
820 			ret = arm_spe_set_tid(speq, record->context_id);
821 			if (ret)
822 				return ret;
823 
824 			spe->use_ctx_pkt_for_pid = true;
825 		}
826 
827 		ret = arm_spe_sample(speq);
828 		if (ret)
829 			return ret;
830 
831 		ret = arm_spe_decode(speq->decoder);
832 		if (!ret) {
833 			pr_debug("No data or all data has been processed.\n");
834 			return 1;
835 		}
836 
837 		/*
838 		 * An error was detected while decoding the SPE trace data;
839 		 * continue with the next trace data to find more records.
840 		 */
841 		if (ret < 0)
842 			continue;
843 
844 		record = &speq->decoder->record;
845 
846 		/* Update timestamp for the last record */
847 		if (record->timestamp > speq->timestamp)
848 			speq->timestamp = record->timestamp;
849 
850 		/*
851 		 * If the queue's timestamp is later than the timestamp of the
852 		 * incoming perf event, bail out so that the perf event can be
853 		 * processed first.
854 		 */
855 		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
856 			*timestamp = speq->timestamp;
857 			return 0;
858 		}
859 	}
860 
861 	return 0;
862 }
863 
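/*
 * Allocate the queue's decoder state and, for timed decoding, decode up to
 * the first record so the queue can be placed on the timestamp-ordered heap.
 */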
864 static int arm_spe__setup_queue(struct arm_spe *spe,
865 			       struct auxtrace_queue *queue,
866 			       unsigned int queue_nr)
867 {
868 	struct arm_spe_queue *speq = queue->priv;
869 	struct arm_spe_record *record;
870 
871 	if (list_empty(&queue->head) || speq)
872 		return 0;
873 
874 	speq = arm_spe__alloc_queue(spe, queue_nr);
875 
876 	if (!speq)
877 		return -ENOMEM;
878 
879 	queue->priv = speq;
880 
881 	if (queue->cpu != -1)
882 		speq->cpu = queue->cpu;
883 
884 	if (!speq->on_heap) {
885 		int ret;
886 
887 		if (spe->timeless_decoding)
888 			return 0;
889 
890 retry:
891 		ret = arm_spe_decode(speq->decoder);
892 
893 		if (!ret)
894 			return 0;
895 
896 		if (ret < 0)
897 			goto retry;
898 
899 		record = &speq->decoder->record;
900 
901 		speq->timestamp = record->timestamp;
902 		ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
903 		if (ret)
904 			return ret;
905 		speq->on_heap = true;
906 	}
907 
908 	return 0;
909 }
910 
911 static int arm_spe__setup_queues(struct arm_spe *spe)
912 {
913 	unsigned int i;
914 	int ret;
915 
916 	for (i = 0; i < spe->queues.nr_queues; i++) {
917 		ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
918 		if (ret)
919 			return ret;
920 	}
921 
922 	return 0;
923 }
924 
925 static int arm_spe__update_queues(struct arm_spe *spe)
926 {
927 	if (spe->queues.new_data) {
928 		spe->queues.new_data = false;
929 		return arm_spe__setup_queues(spe);
930 	}
931 
932 	return 0;
933 }
934 
935 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
936 {
937 	struct evsel *evsel;
938 	struct evlist *evlist = spe->session->evlist;
939 	bool timeless_decoding = true;
940 
941 	/*
942 	 * Cycle through the list of events; if any has the time bit set,
943 	 * timestamps are present and decoding is not timeless.
944 	 */
945 	evlist__for_each_entry(evlist, evsel) {
946 		if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
947 			timeless_decoding = false;
948 	}
949 
950 	return timeless_decoding;
951 }
952 
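/*
 * Run the decoders for all queues in timestamp order, stopping once every
 * queued record is later than the given timestamp.
 */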
953 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
954 {
955 	unsigned int queue_nr;
956 	u64 ts;
957 	int ret;
958 
959 	while (1) {
960 		struct auxtrace_queue *queue;
961 		struct arm_spe_queue *speq;
962 
963 		if (!spe->heap.heap_cnt)
964 			return 0;
965 
966 		if (spe->heap.heap_array[0].ordinal >= timestamp)
967 			return 0;
968 
969 		queue_nr = spe->heap.heap_array[0].queue_nr;
970 		queue = &spe->queues.queue_array[queue_nr];
971 		speq = queue->priv;
972 
973 		auxtrace_heap__pop(&spe->heap);
974 
975 		if (spe->heap.heap_cnt) {
976 			ts = spe->heap.heap_array[0].ordinal + 1;
977 			if (ts > timestamp)
978 				ts = timestamp;
979 		} else {
980 			ts = timestamp;
981 		}
982 
983 		/*
984 		 * A previous context-switch event has set pid/tid in the machine's context, so
985 		 * here we need to update the pid/tid in the thread and SPE queue.
986 		 */
987 		if (!spe->use_ctx_pkt_for_pid)
988 			arm_spe_set_pid_tid_cpu(spe, queue);
989 
990 		ret = arm_spe_run_decoder(speq, &ts);
991 		if (ret < 0) {
992 			auxtrace_heap__add(&spe->heap, queue_nr, ts);
993 			return ret;
994 		}
995 
996 		if (!ret) {
997 			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
998 			if (ret < 0)
999 				return ret;
1000 		} else {
1001 			speq->on_heap = false;
1002 		}
1003 	}
1004 
1005 	return 0;
1006 }
1007 
1008 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
1009 					    u64 time_)
1010 {
1011 	struct auxtrace_queues *queues = &spe->queues;
1012 	unsigned int i;
1013 	u64 ts = 0;
1014 
1015 	for (i = 0; i < queues->nr_queues; i++) {
1016 		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
1017 		struct arm_spe_queue *speq = queue->priv;
1018 
1019 		if (speq && (tid == -1 || speq->tid == tid)) {
1020 			speq->time = time_;
1021 			arm_spe_set_pid_tid_cpu(spe, queue);
1022 			arm_spe_run_decoder(speq, &ts);
1023 		}
1024 	}
1025 	return 0;
1026 }
1027 
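/*
 * On a switch-out event, record the incoming task's pid/tid for the CPU so
 * later samples can be attributed when CONTEXT packets are unavailable.
 */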
1028 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
1029 				  struct perf_sample *sample)
1030 {
1031 	pid_t pid, tid;
1032 	int cpu;
1033 
1034 	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
1035 		return 0;
1036 
1037 	pid = event->context_switch.next_prev_pid;
1038 	tid = event->context_switch.next_prev_tid;
1039 	cpu = sample->cpu;
1040 
1041 	if (tid == -1)
1042 		pr_warning("context_switch event has no tid\n");
1043 
1044 	return machine__set_current_tid(spe->machine, cpu, pid, tid);
1045 }
1046 
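/*
 * Per-event hook: processes queued SPE data up to the event's timestamp for
 * timed decoding, flushes a task's queues on exit for timeless decoding, and
 * tracks context switches when CONTEXT packets are not in use.
 */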
1047 static int arm_spe_process_event(struct perf_session *session,
1048 				 union perf_event *event,
1049 				 struct perf_sample *sample,
1050 				 const struct perf_tool *tool)
1051 {
1052 	int err = 0;
1053 	u64 timestamp;
1054 	struct arm_spe *spe = container_of(session->auxtrace,
1055 			struct arm_spe, auxtrace);
1056 
1057 	if (dump_trace)
1058 		return 0;
1059 
1060 	if (!tool->ordered_events) {
1061 		pr_err("SPE trace requires ordered events\n");
1062 		return -EINVAL;
1063 	}
1064 
1065 	if (sample->time && (sample->time != (u64) -1))
1066 		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
1067 	else
1068 		timestamp = 0;
1069 
1070 	if (timestamp || spe->timeless_decoding) {
1071 		err = arm_spe__update_queues(spe);
1072 		if (err)
1073 			return err;
1074 	}
1075 
1076 	if (spe->timeless_decoding) {
1077 		if (event->header.type == PERF_RECORD_EXIT) {
1078 			err = arm_spe_process_timeless_queues(spe,
1079 					event->fork.tid,
1080 					sample->time);
1081 		}
1082 	} else if (timestamp) {
1083 		err = arm_spe_process_queues(spe, timestamp);
1084 		if (err)
1085 			return err;
1086 
1087 		if (!spe->use_ctx_pkt_for_pid &&
1088 		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
1089 		    event->header.type == PERF_RECORD_SWITCH))
1090 			err = arm_spe_context_switch(spe, event, sample);
1091 	}
1092 
1093 	return err;
1094 }
1095 
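/*
 * Queue incoming AUXTRACE data. When reading from an indexed file the data
 * has already been queued; for a pipe it is copied and queued here.
 */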
1096 static int arm_spe_process_auxtrace_event(struct perf_session *session,
1097 					  union perf_event *event,
1098 					  const struct perf_tool *tool __maybe_unused)
1099 {
1100 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1101 					     auxtrace);
1102 
1103 	if (!spe->data_queued) {
1104 		struct auxtrace_buffer *buffer;
1105 		off_t data_offset;
1106 		int fd = perf_data__fd(session->data);
1107 		int err;
1108 
1109 		if (perf_data__is_pipe(session->data)) {
1110 			data_offset = 0;
1111 		} else {
1112 			data_offset = lseek(fd, 0, SEEK_CUR);
1113 			if (data_offset == -1)
1114 				return -errno;
1115 		}
1116 
1117 		err = auxtrace_queues__add_event(&spe->queues, session, event,
1118 				data_offset, &buffer);
1119 		if (err)
1120 			return err;
1121 
1122 		/* Dump here now that we have copied a piped trace out of the pipe */
1123 		if (dump_trace) {
1124 			if (auxtrace_buffer__get_data(buffer, fd)) {
1125 				arm_spe_dump_event(spe, buffer->data,
1126 						buffer->size);
1127 				auxtrace_buffer__put_data(buffer);
1128 			}
1129 		}
1130 	}
1131 
1132 	return 0;
1133 }
1134 
1135 static int arm_spe_flush(struct perf_session *session,
1136 			 const struct perf_tool *tool)
1137 {
1138 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1139 			auxtrace);
1140 	int ret;
1141 
1142 	if (dump_trace)
1143 		return 0;
1144 
1145 	if (!tool->ordered_events)
1146 		return -EINVAL;
1147 
1148 	ret = arm_spe__update_queues(spe);
1149 	if (ret < 0)
1150 		return ret;
1151 
1152 	if (spe->timeless_decoding)
1153 		return arm_spe_process_timeless_queues(spe, -1,
1154 				MAX_TIMESTAMP - 1);
1155 
1156 	ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
1157 	if (ret)
1158 		return ret;
1159 
1160 	if (!spe->use_ctx_pkt_for_pid)
1161 		ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
1162 			    "Matching of TIDs to SPE events could be inaccurate.\n");
1163 
1164 	return 0;
1165 }
1166 
1167 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
1168 {
1169 	u64 *metadata;
1170 
1171 	metadata = zalloc(per_cpu_size);
1172 	if (!metadata)
1173 		return NULL;
1174 
1175 	memcpy(metadata, buf, per_cpu_size);
1176 	return metadata;
1177 }
1178 
1179 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
1180 {
1181 	int i;
1182 
1183 	for (i = 0; i < nr_cpu; i++)
1184 		zfree(&metadata[i]);
1185 	free(metadata);
1186 }
1187 
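/*
 * Parse the SPE metadata from the auxtrace info event. Version 1 carries no
 * per-CPU data; newer versions provide a header followed by one metadata
 * block per CPU, which is duplicated here for later lookups by CPU.
 */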
1188 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
1189 				     u64 *ver, int *nr_cpu)
1190 {
1191 	u64 *ptr = (u64 *)info->priv;
1192 	u64 metadata_size;
1193 	u64 **metadata = NULL;
1194 	int hdr_sz, per_cpu_sz, i;
1195 
1196 	metadata_size = info->header.size -
1197 		sizeof(struct perf_record_auxtrace_info);
1198 
1199 	/* Metadata version 1 */
1200 	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
1201 		*ver = 1;
1202 		*nr_cpu = 0;
1203 		/* No per CPU metadata */
1204 		return NULL;
1205 	}
1206 
1207 	*ver = ptr[ARM_SPE_HEADER_VERSION];
1208 	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
1209 	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];
1210 
1211 	metadata = calloc(*nr_cpu, sizeof(*metadata));
1212 	if (!metadata)
1213 		return NULL;
1214 
1215 	/* Locate the start address of per CPU metadata */
1216 	ptr += hdr_sz;
1217 	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
1218 
1219 	for (i = 0; i < *nr_cpu; i++) {
1220 		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
1221 		if (!metadata[i])
1222 			goto err_per_cpu_metadata;
1223 
1224 		ptr += per_cpu_sz / sizeof(u64);
1225 	}
1226 
1227 	return metadata;
1228 
1229 err_per_cpu_metadata:
1230 	arm_spe__free_metadata(metadata, *nr_cpu);
1231 	return NULL;
1232 }
1233 
1234 static void arm_spe_free_queue(void *priv)
1235 {
1236 	struct arm_spe_queue *speq = priv;
1237 
1238 	if (!speq)
1239 		return;
1240 	thread__zput(speq->thread);
1241 	arm_spe_decoder_free(speq->decoder);
1242 	zfree(&speq->event_buf);
1243 	free(speq);
1244 }
1245 
1246 static void arm_spe_free_events(struct perf_session *session)
1247 {
1248 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1249 					     auxtrace);
1250 	struct auxtrace_queues *queues = &spe->queues;
1251 	unsigned int i;
1252 
1253 	for (i = 0; i < queues->nr_queues; i++) {
1254 		arm_spe_free_queue(queues->queue_array[i].priv);
1255 		queues->queue_array[i].priv = NULL;
1256 	}
1257 	auxtrace_queues__free(queues);
1258 }
1259 
1260 static void arm_spe_free(struct perf_session *session)
1261 {
1262 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1263 					     auxtrace);
1264 
1265 	auxtrace_heap__free(&spe->heap);
1266 	arm_spe_free_events(session);
1267 	session->auxtrace = NULL;
1268 	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
1269 	free(spe);
1270 }
1271 
1272 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
1273 				      struct evsel *evsel)
1274 {
1275 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
1276 
1277 	return evsel->core.attr.type == spe->pmu_type;
1278 }
1279 
1280 static const char * const metadata_hdr_v1_fmts[] = {
1281 	[ARM_SPE_PMU_TYPE]		= "  PMU Type           :%"PRId64"\n",
1282 	[ARM_SPE_PER_CPU_MMAPS]		= "  Per CPU mmaps      :%"PRId64"\n",
1283 };
1284 
1285 static const char * const metadata_hdr_fmts[] = {
1286 	[ARM_SPE_HEADER_VERSION]	= "  Header version     :%"PRId64"\n",
1287 	[ARM_SPE_HEADER_SIZE]		= "  Header size        :%"PRId64"\n",
1288 	[ARM_SPE_PMU_TYPE_V2]		= "  PMU type v2        :%"PRId64"\n",
1289 	[ARM_SPE_CPUS_NUM]		= "  CPU number         :%"PRId64"\n",
1290 };
1291 
1292 static const char * const metadata_per_cpu_fmts[] = {
1293 	[ARM_SPE_MAGIC]			= "    Magic            :0x%"PRIx64"\n",
1294 	[ARM_SPE_CPU]			= "    CPU #            :%"PRId64"\n",
1295 	[ARM_SPE_CPU_NR_PARAMS]		= "    Num of params    :%"PRId64"\n",
1296 	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
1297 	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
1298 	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
1299 };
1300 
1301 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
1302 {
1303 	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
1304 	const char * const *hdr_fmts;
1305 
1306 	if (!dump_trace)
1307 		return;
1308 
1309 	if (spe->metadata_ver == 1) {
1310 		cpu_num = 0;
1311 		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
1312 		hdr_fmts = metadata_hdr_v1_fmts;
1313 	} else {
1314 		cpu_num = arr[ARM_SPE_CPUS_NUM];
1315 		hdr_size = arr[ARM_SPE_HEADER_SIZE];
1316 		hdr_fmts = metadata_hdr_fmts;
1317 	}
1318 
1319 	for (i = 0; i < hdr_size; i++)
1320 		fprintf(stdout, hdr_fmts[i], arr[i]);
1321 
1322 	arr += hdr_size;
1323 	for (cpu = 0; cpu < cpu_num; cpu++) {
1324 		/*
1325 		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
1326 		 * are fixed. The number of subsequent parameters is given by
1327 		 * the field 'ARM_SPE_CPU_NR_PARAMS'.
1328 		 */
1329 		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
1330 		for (i = 0; i < cpu_size; i++)
1331 			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
1332 		arr += cpu_size;
1333 	}
1334 }
1335 
1336 static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
1337 				    const char *name)
1338 {
1339 	struct evsel *evsel;
1340 
1341 	evlist__for_each_entry(evlist, evsel) {
1342 		if (evsel->core.id && evsel->core.id[0] == id) {
1343 			if (evsel->name)
1344 				zfree(&evsel->name);
1345 			evsel->name = strdup(name);
1346 			break;
1347 		}
1348 	}
1349 }
1350 
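/*
 * Create the synthetic events (cache, TLB, branch, remote access, memory and
 * instructions) requested by the itrace options, deriving their attributes
 * from the SPE evsel and assigning each a new sample ID.
 */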
1351 static int
1352 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
1353 {
1354 	struct evlist *evlist = session->evlist;
1355 	struct evsel *evsel;
1356 	struct perf_event_attr attr;
1357 	bool found = false;
1358 	u64 id;
1359 	int err;
1360 
1361 	evlist__for_each_entry(evlist, evsel) {
1362 		if (evsel->core.attr.type == spe->pmu_type) {
1363 			found = true;
1364 			break;
1365 		}
1366 	}
1367 
1368 	if (!found) {
1369 		pr_debug("No selected events with SPE trace data\n");
1370 		return 0;
1371 	}
1372 
1373 	memset(&attr, 0, sizeof(struct perf_event_attr));
1374 	attr.size = sizeof(struct perf_event_attr);
1375 	attr.type = PERF_TYPE_HARDWARE;
1376 	attr.sample_type = evsel->core.attr.sample_type &
1377 				(PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
1378 	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
1379 			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
1380 			    PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
1381 	if (spe->timeless_decoding)
1382 		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
1383 	else
1384 		attr.sample_type |= PERF_SAMPLE_TIME;
1385 
1386 	spe->sample_type = attr.sample_type;
1387 
1388 	attr.exclude_user = evsel->core.attr.exclude_user;
1389 	attr.exclude_kernel = evsel->core.attr.exclude_kernel;
1390 	attr.exclude_hv = evsel->core.attr.exclude_hv;
1391 	attr.exclude_host = evsel->core.attr.exclude_host;
1392 	attr.exclude_guest = evsel->core.attr.exclude_guest;
1393 	attr.sample_id_all = evsel->core.attr.sample_id_all;
1394 	attr.read_format = evsel->core.attr.read_format;
1395 
1396 	/* create new id val to be a fixed offset from evsel id */
1397 	id = evsel->core.id[0] + 1000000000;
1398 
1399 	if (!id)
1400 		id = 1;
1401 
1402 	if (spe->synth_opts.flc) {
1403 		spe->sample_flc = true;
1404 
1405 		/* Level 1 data cache miss */
1406 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1407 		if (err)
1408 			return err;
1409 		spe->l1d_miss_id = id;
1410 		arm_spe_set_event_name(evlist, id, "l1d-miss");
1411 		id += 1;
1412 
1413 		/* Level 1 data cache access */
1414 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1415 		if (err)
1416 			return err;
1417 		spe->l1d_access_id = id;
1418 		arm_spe_set_event_name(evlist, id, "l1d-access");
1419 		id += 1;
1420 	}
1421 
1422 	if (spe->synth_opts.llc) {
1423 		spe->sample_llc = true;
1424 
1425 		/* Last level cache miss */
1426 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1427 		if (err)
1428 			return err;
1429 		spe->llc_miss_id = id;
1430 		arm_spe_set_event_name(evlist, id, "llc-miss");
1431 		id += 1;
1432 
1433 		/* Last level cache access */
1434 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1435 		if (err)
1436 			return err;
1437 		spe->llc_access_id = id;
1438 		arm_spe_set_event_name(evlist, id, "llc-access");
1439 		id += 1;
1440 	}
1441 
1442 	if (spe->synth_opts.tlb) {
1443 		spe->sample_tlb = true;
1444 
1445 		/* TLB miss */
1446 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1447 		if (err)
1448 			return err;
1449 		spe->tlb_miss_id = id;
1450 		arm_spe_set_event_name(evlist, id, "tlb-miss");
1451 		id += 1;
1452 
1453 		/* TLB access */
1454 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1455 		if (err)
1456 			return err;
1457 		spe->tlb_access_id = id;
1458 		arm_spe_set_event_name(evlist, id, "tlb-access");
1459 		id += 1;
1460 	}
1461 
1462 	if (spe->synth_opts.branches) {
1463 		spe->sample_branch = true;
1464 
1465 		/* Branch */
1466 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1467 		if (err)
1468 			return err;
1469 		spe->branch_id = id;
1470 		arm_spe_set_event_name(evlist, id, "branch");
1471 		id += 1;
1472 	}
1473 
1474 	if (spe->synth_opts.remote_access) {
1475 		spe->sample_remote_access = true;
1476 
1477 		/* Remote access */
1478 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1479 		if (err)
1480 			return err;
1481 		spe->remote_access_id = id;
1482 		arm_spe_set_event_name(evlist, id, "remote-access");
1483 		id += 1;
1484 	}
1485 
1486 	if (spe->synth_opts.mem) {
1487 		spe->sample_memory = true;
1488 
1489 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1490 		if (err)
1491 			return err;
1492 		spe->memory_id = id;
1493 		arm_spe_set_event_name(evlist, id, "memory");
1494 		id += 1;
1495 	}
1496 
1497 	if (spe->synth_opts.instructions) {
1498 		if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
1499 			pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n");
1500 			goto synth_instructions_out;
1501 		}
1502 		if (spe->synth_opts.period > 1)
1503 			pr_warning("Arm SPE has a hardware-based sample period.\n"
1504 				   "Additional instruction events will be discarded by --itrace\n");
1505 
1506 		spe->sample_instructions = true;
1507 		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
1508 		attr.sample_period = spe->synth_opts.period;
1509 		spe->instructions_sample_period = attr.sample_period;
1510 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1511 		if (err)
1512 			return err;
1513 		spe->instructions_id = id;
1514 		arm_spe_set_event_name(evlist, id, "instructions");
1515 	}
1516 synth_instructions_out:
1517 
1518 	return 0;
1519 }
1520 
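/* A system is considered homogeneous when every CPU reports the same MIDR */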
1521 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
1522 {
1523 	u64 midr;
1524 	int i;
1525 
1526 	if (!nr_cpu)
1527 		return false;
1528 
1529 	for (i = 0; i < nr_cpu; i++) {
1530 		if (!metadata[i])
1531 			return false;
1532 
1533 		if (i == 0) {
1534 			midr = metadata[i][ARM_SPE_CPU_MIDR];
1535 			continue;
1536 		}
1537 
1538 		if (midr != metadata[i][ARM_SPE_CPU_MIDR])
1539 			return false;
1540 	}
1541 
1542 	return true;
1543 }
1544 
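/*
 * Set up Arm SPE decoding for the session: parse the metadata, initialize
 * the queues and timestamp conversion, register the auxtrace callbacks and
 * create the synthetic events.
 */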
1545 int arm_spe_process_auxtrace_info(union perf_event *event,
1546 				  struct perf_session *session)
1547 {
1548 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1549 	size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
1550 	struct perf_record_time_conv *tc = &session->time_conv;
1551 	struct arm_spe *spe;
1552 	u64 **metadata = NULL;
1553 	u64 metadata_ver;
1554 	int nr_cpu, err;
1555 
1556 	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
1557 					min_sz)
1558 		return -EINVAL;
1559 
1560 	metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
1561 					   &nr_cpu);
1562 	if (!metadata && metadata_ver != 1) {
1563 		pr_err("Failed to parse Arm SPE metadata.\n");
1564 		return -EINVAL;
1565 	}
1566 
1567 	spe = zalloc(sizeof(struct arm_spe));
1568 	if (!spe) {
1569 		err = -ENOMEM;
1570 		goto err_free_metadata;
1571 	}
1572 
1573 	err = auxtrace_queues__init(&spe->queues);
1574 	if (err)
1575 		goto err_free;
1576 
1577 	spe->session = session;
1578 	spe->machine = &session->machines.host; /* No kvm support */
1579 	spe->auxtrace_type = auxtrace_info->type;
1580 	if (metadata_ver == 1)
1581 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
1582 	else
1583 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
1584 	spe->metadata = metadata;
1585 	spe->metadata_ver = metadata_ver;
1586 	spe->metadata_nr_cpu = nr_cpu;
1587 	spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);
1588 
1589 	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
1590 
1591 	/*
1592 	 * The synthesized event PERF_RECORD_TIME_CONV has already been handled
1593 	 * and the parameters for the hardware clock are stored in the session
1594 	 * context.  Pass these parameters to the struct perf_tsc_conversion in
1595 	 * "spe->tc", which is used later to convert between the clock counter
1596 	 * and timestamps.
1597 	 *
1598 	 * For backward compatibility, copy the fields starting from
1599 	 * "time_cycles" only if they are contained in the event.
1600 	 */
1601 	spe->tc.time_shift = tc->time_shift;
1602 	spe->tc.time_mult = tc->time_mult;
1603 	spe->tc.time_zero = tc->time_zero;
1604 
1605 	if (event_contains(*tc, time_cycles)) {
1606 		spe->tc.time_cycles = tc->time_cycles;
1607 		spe->tc.time_mask = tc->time_mask;
1608 		spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
1609 		spe->tc.cap_user_time_short = tc->cap_user_time_short;
1610 	}
1611 
1612 	spe->auxtrace.process_event = arm_spe_process_event;
1613 	spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
1614 	spe->auxtrace.flush_events = arm_spe_flush;
1615 	spe->auxtrace.free_events = arm_spe_free_events;
1616 	spe->auxtrace.free = arm_spe_free;
1617 	spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
1618 	session->auxtrace = &spe->auxtrace;
1619 
1620 	arm_spe_print_info(spe, &auxtrace_info->priv[0]);
1621 
1622 	if (dump_trace)
1623 		return 0;
1624 
1625 	if (session->itrace_synth_opts && session->itrace_synth_opts->set)
1626 		spe->synth_opts = *session->itrace_synth_opts;
1627 	else
1628 		itrace_synth_opts__set_default(&spe->synth_opts, false);
1629 
1630 	err = arm_spe_synth_events(spe, session);
1631 	if (err)
1632 		goto err_free_queues;
1633 
1634 	err = auxtrace_queues__process_index(&spe->queues, session);
1635 	if (err)
1636 		goto err_free_queues;
1637 
1638 	if (spe->queues.populated)
1639 		spe->data_queued = true;
1640 
1641 	return 0;
1642 
1643 err_free_queues:
1644 	auxtrace_queues__free(&spe->queues);
1645 	session->auxtrace = NULL;
1646 err_free:
1647 	free(spe);
1648 err_free_metadata:
1649 	arm_spe__free_metadata(metadata, nr_cpu);
1650 	return err;
1651 }
1652