xref: /linux/tools/perf/arch/x86/util/intel-pt.c (revision 566ab427f827b0256d3e8ce0235d088e6a9c28bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_pt.c: Intel Processor Trace support
4  * Copyright (c) 2013-2015, Intel Corporation.
5  */
6 
7 #include <errno.h>
8 #include <stdbool.h>
9 #include <linux/kernel.h>
10 #include <linux/types.h>
11 #include <linux/bitops.h>
12 #include <linux/log2.h>
13 #include <linux/zalloc.h>
14 #include <linux/err.h>
15 #include <cpuid.h>
16 
17 #include "../../../util/session.h"
18 #include "../../../util/event.h"
19 #include "../../../util/evlist.h"
20 #include "../../../util/evsel.h"
21 #include "../../../util/evsel_config.h"
22 #include "../../../util/cpumap.h"
23 #include "../../../util/mmap.h"
24 #include <subcmd/parse-options.h>
25 #include "../../../util/parse-events.h"
26 #include "../../../util/pmus.h"
27 #include "../../../util/debug.h"
28 #include "../../../util/auxtrace.h"
29 #include "../../../util/perf_api_probe.h"
30 #include "../../../util/record.h"
31 #include "../../../util/target.h"
32 #include "../../../util/tsc.h"
33 #include <internal/lib.h> // page_size
34 #include "../../../util/intel-pt.h"
35 #include <api/fs/fs.h>
36 
/* Convert a count of KiB / MiB into a count of bytes. */
#define KiB(x) ((x) * 1024)
#define MiB(x) ((x) * 1024 * 1024)
/* Byte count of x KiB / x MiB minus one (an offset mask when x is a power of 2). */
#define KiB_MASK(x) (KiB(x) - 1)
#define MiB_MASK(x) (MiB(x) - 1)

/*
 * Margin added to the PSB period when checking whether a snapshot or sample
 * size may be too small (see intel_pt_recording_options()).
 */
#define INTEL_PT_PSB_PERIOD_NEAR	256
43 
/*
 * Per-mmap reference data used in snapshot mode to detect whether the AUX
 * buffer has wrapped (see intel_pt_wrapped()).
 */
struct intel_pt_snapshot_ref {
	void *ref_buf;		/* copy of trace bytes, filled by intel_pt_copy_ref() */
	size_t ref_offset;	/* offset in the AUX buffer that ref_buf is compared against */
	bool wrapped;		/* NOTE(review): not referenced in the visible part of this file */
};
49 
/*
 * Intel PT recording state.  The generic auxtrace_record is embedded as @itr
 * and the callbacks recover this structure from it with container_of().
 */
struct intel_pt_recording {
	struct auxtrace_record		itr;
	struct perf_pmu			*intel_pt_pmu;
	/*
	 * 0 if no switch information was arranged, otherwise 1, 2 or 3 as set
	 * by intel_pt_recording_options(); stored in the auxtrace info.
	 */
	int				have_sched_switch;
	/* evlist passed to intel_pt_recording_options() */
	struct evlist		*evlist;
	/* copy of opts->auxtrace_snapshot_mode */
	bool				snapshot_mode;
	/* set once intel_pt_snapshot_init() has run */
	bool				snapshot_init_done;
	/* snapshot size parsed by intel_pt_parse_snapshot_options() */
	size_t				snapshot_size;
	/* size of each reference buffer; 0 means no reference buffer is used */
	size_t				snapshot_ref_buf_size;
	/* number of entries allocated in snapshot_refs */
	int				snapshot_ref_cnt;
	/* array of reference data, indexed by mmap index */
	struct intel_pt_snapshot_ref	*snapshot_refs;
	/* size computed by intel_pt_info_priv_size(), checked by intel_pt_info_fill() */
	size_t				priv_size;
};
63 
64 static int intel_pt_parse_terms_with_default(const struct perf_pmu *pmu,
65 					     const char *str,
66 					     u64 *config)
67 {
68 	struct parse_events_terms terms;
69 	struct perf_event_attr attr = { .size = 0, };
70 	int err;
71 
72 	parse_events_terms__init(&terms);
73 	err = parse_events_terms(&terms, str, /*input=*/ NULL);
74 	if (err)
75 		goto out_free;
76 
77 	attr.config = *config;
78 	err = perf_pmu__config_terms(pmu, &attr, &terms, /*zero=*/true, /*err=*/NULL);
79 	if (err)
80 		goto out_free;
81 
82 	*config = attr.config;
83 out_free:
84 	parse_events_terms__exit(&terms);
85 	return err;
86 }
87 
88 static int intel_pt_parse_terms(const struct perf_pmu *pmu, const char *str, u64 *config)
89 {
90 	*config = 0;
91 	return intel_pt_parse_terms_with_default(pmu, str, config);
92 }
93 
94 static u64 intel_pt_masked_bits(u64 mask, u64 bits)
95 {
96 	const u64 top_bit = 1ULL << 63;
97 	u64 res = 0;
98 	int i;
99 
100 	for (i = 0; i < 64; i++) {
101 		if (mask & top_bit) {
102 			res <<= 1;
103 			if (bits & top_bit)
104 				res |= 1;
105 		}
106 		mask <<= 1;
107 		bits <<= 1;
108 	}
109 
110 	return res;
111 }
112 
113 static int intel_pt_read_config(struct perf_pmu *intel_pt_pmu, const char *str,
114 				struct evlist *evlist, u64 *res)
115 {
116 	struct evsel *evsel;
117 	u64 mask;
118 
119 	*res = 0;
120 
121 	mask = perf_pmu__format_bits(intel_pt_pmu, str);
122 	if (!mask)
123 		return -EINVAL;
124 
125 	evlist__for_each_entry(evlist, evsel) {
126 		if (evsel->core.attr.type == intel_pt_pmu->type) {
127 			*res = intel_pt_masked_bits(mask, evsel->core.attr.config);
128 			return 0;
129 		}
130 	}
131 
132 	return -EINVAL;
133 }
134 
135 static size_t intel_pt_psb_period(struct perf_pmu *intel_pt_pmu,
136 				  struct evlist *evlist)
137 {
138 	u64 val;
139 	int err, topa_multiple_entries;
140 	size_t psb_period;
141 
142 	if (perf_pmu__scan_file(intel_pt_pmu, "caps/topa_multiple_entries",
143 				"%d", &topa_multiple_entries) != 1)
144 		topa_multiple_entries = 0;
145 
146 	/*
147 	 * Use caps/topa_multiple_entries to indicate early hardware that had
148 	 * extra frequent PSBs.
149 	 */
150 	if (!topa_multiple_entries) {
151 		psb_period = 256;
152 		goto out;
153 	}
154 
155 	err = intel_pt_read_config(intel_pt_pmu, "psb_period", evlist, &val);
156 	if (err)
157 		val = 0;
158 
159 	psb_period = 1 << (val + 11);
160 out:
161 	pr_debug2("%s psb_period %zu\n", intel_pt_pmu->name, psb_period);
162 	return psb_period;
163 }
164 
/*
 * Choose a set bit of @bits close to bit number @target.
 *
 * Returns the highest set bit position that is <= @target; if there is none,
 * the lowest set bit position above @target; or -1 when @bits is 0.
 */
static int intel_pt_pick_bit(int bits, int target)
{
	int best = -1;
	int bit = 0;

	while (bits) {
		if (bits & 1) {
			if (best < 0 || bit <= target)
				best = bit;
			/* Nothing at or beyond the target can improve the pick */
			if (bit >= target)
				return best;
		}
		bits >>= 1;
		bit++;
	}

	return best;
}
180 
181 static u64 intel_pt_default_config(const struct perf_pmu *intel_pt_pmu)
182 {
183 	char buf[256];
184 	int mtc, mtc_periods = 0, mtc_period;
185 	int psb_cyc, psb_periods, psb_period;
186 	int pos = 0;
187 	u64 config;
188 	char c;
189 	int dirfd;
190 
191 	dirfd = perf_pmu__event_source_devices_fd();
192 
193 	pos += scnprintf(buf + pos, sizeof(buf) - pos, "tsc");
194 
195 	if (perf_pmu__scan_file_at(intel_pt_pmu, dirfd, "caps/mtc", "%d",
196 				   &mtc) != 1)
197 		mtc = 1;
198 
199 	if (mtc) {
200 		if (perf_pmu__scan_file_at(intel_pt_pmu, dirfd, "caps/mtc_periods", "%x",
201 					   &mtc_periods) != 1)
202 			mtc_periods = 0;
203 		if (mtc_periods) {
204 			mtc_period = intel_pt_pick_bit(mtc_periods, 3);
205 			pos += scnprintf(buf + pos, sizeof(buf) - pos,
206 					 ",mtc,mtc_period=%d", mtc_period);
207 		}
208 	}
209 
210 	if (perf_pmu__scan_file_at(intel_pt_pmu, dirfd, "caps/psb_cyc", "%d",
211 				   &psb_cyc) != 1)
212 		psb_cyc = 1;
213 
214 	if (psb_cyc && mtc_periods) {
215 		if (perf_pmu__scan_file_at(intel_pt_pmu, dirfd, "caps/psb_periods", "%x",
216 					   &psb_periods) != 1)
217 			psb_periods = 0;
218 		if (psb_periods) {
219 			psb_period = intel_pt_pick_bit(psb_periods, 3);
220 			pos += scnprintf(buf + pos, sizeof(buf) - pos,
221 					 ",psb_period=%d", psb_period);
222 		}
223 	}
224 
225 	if (perf_pmu__scan_file_at(intel_pt_pmu, dirfd, "format/pt", "%c", &c) == 1 &&
226 	    perf_pmu__scan_file_at(intel_pt_pmu, dirfd, "format/branch", "%c", &c) == 1)
227 		pos += scnprintf(buf + pos, sizeof(buf) - pos, ",pt,branch");
228 
229 	pr_debug2("%s default config: %s\n", intel_pt_pmu->name, buf);
230 
231 	intel_pt_parse_terms(intel_pt_pmu, buf, &config);
232 
233 	close(dirfd);
234 	return config;
235 }
236 
237 static int intel_pt_parse_snapshot_options(struct auxtrace_record *itr,
238 					   struct record_opts *opts,
239 					   const char *str)
240 {
241 	struct intel_pt_recording *ptr =
242 			container_of(itr, struct intel_pt_recording, itr);
243 	unsigned long long snapshot_size = 0;
244 	char *endptr;
245 
246 	if (str) {
247 		snapshot_size = strtoull(str, &endptr, 0);
248 		if (*endptr || snapshot_size > SIZE_MAX)
249 			return -1;
250 	}
251 
252 	opts->auxtrace_snapshot_mode = true;
253 	opts->auxtrace_snapshot_size = snapshot_size;
254 
255 	ptr->snapshot_size = snapshot_size;
256 
257 	return 0;
258 }
259 
260 void intel_pt_pmu_default_config(const struct perf_pmu *intel_pt_pmu,
261 				 struct perf_event_attr *attr)
262 {
263 	static u64 config;
264 	static bool initialized;
265 
266 	if (!initialized) {
267 		config = intel_pt_default_config(intel_pt_pmu);
268 		initialized = true;
269 	}
270 	attr->config = config;
271 }
272 
273 static const char *intel_pt_find_filter(struct evlist *evlist,
274 					struct perf_pmu *intel_pt_pmu)
275 {
276 	struct evsel *evsel;
277 
278 	evlist__for_each_entry(evlist, evsel) {
279 		if (evsel->core.attr.type == intel_pt_pmu->type)
280 			return evsel->filter;
281 	}
282 
283 	return NULL;
284 }
285 
/*
 * Number of bytes needed to store @filter including its terminating NUL,
 * rounded up to a multiple of 8.  Returns 0 for a NULL or empty filter.
 */
static size_t intel_pt_filter_bytes(const char *filter)
{
	size_t len;

	if (!filter)
		return 0;

	len = strlen(filter);
	if (!len)
		return 0;

	return roundup(len + 1, 8);
}
292 
293 static size_t
294 intel_pt_info_priv_size(struct auxtrace_record *itr, struct evlist *evlist)
295 {
296 	struct intel_pt_recording *ptr =
297 			container_of(itr, struct intel_pt_recording, itr);
298 	const char *filter = intel_pt_find_filter(evlist, ptr->intel_pt_pmu);
299 
300 	ptr->priv_size = (INTEL_PT_AUXTRACE_PRIV_MAX * sizeof(u64)) +
301 			 intel_pt_filter_bytes(filter);
302 	ptr->priv_size += sizeof(u64); /* Cap Event Trace */
303 
304 	return ptr->priv_size;
305 }
306 
307 static void intel_pt_tsc_ctc_ratio(u32 *n, u32 *d)
308 {
309 	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
310 
311 	__get_cpuid(0x15, &eax, &ebx, &ecx, &edx);
312 	*n = ebx;
313 	*d = eax;
314 }
315 
316 static int intel_pt_info_fill(struct auxtrace_record *itr,
317 			      struct perf_session *session,
318 			      struct perf_record_auxtrace_info *auxtrace_info,
319 			      size_t priv_size)
320 {
321 	struct intel_pt_recording *ptr =
322 			container_of(itr, struct intel_pt_recording, itr);
323 	struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu;
324 	struct perf_event_mmap_page *pc;
325 	struct perf_tsc_conversion tc = { .time_mult = 0, };
326 	bool cap_user_time_zero = false, per_cpu_mmaps;
327 	u64 tsc_bit, mtc_bit, mtc_freq_bits, cyc_bit, noretcomp_bit;
328 	u32 tsc_ctc_ratio_n, tsc_ctc_ratio_d;
329 	unsigned long max_non_turbo_ratio;
330 	size_t filter_str_len;
331 	const char *filter;
332 	int event_trace;
333 	__u64 *info;
334 	int err;
335 
336 	if (priv_size != ptr->priv_size)
337 		return -EINVAL;
338 
339 	intel_pt_parse_terms(intel_pt_pmu, "tsc", &tsc_bit);
340 	intel_pt_parse_terms(intel_pt_pmu, "noretcomp", &noretcomp_bit);
341 	intel_pt_parse_terms(intel_pt_pmu, "mtc", &mtc_bit);
342 	mtc_freq_bits = perf_pmu__format_bits(intel_pt_pmu, "mtc_period");
343 	intel_pt_parse_terms(intel_pt_pmu, "cyc", &cyc_bit);
344 
345 	intel_pt_tsc_ctc_ratio(&tsc_ctc_ratio_n, &tsc_ctc_ratio_d);
346 
347 	if (perf_pmu__scan_file(intel_pt_pmu, "max_nonturbo_ratio",
348 				"%lu", &max_non_turbo_ratio) != 1)
349 		max_non_turbo_ratio = 0;
350 	if (perf_pmu__scan_file(intel_pt_pmu, "caps/event_trace",
351 				"%d", &event_trace) != 1)
352 		event_trace = 0;
353 
354 	filter = intel_pt_find_filter(session->evlist, ptr->intel_pt_pmu);
355 	filter_str_len = filter ? strlen(filter) : 0;
356 
357 	if (!session->evlist->core.nr_mmaps)
358 		return -EINVAL;
359 
360 	pc = session->evlist->mmap[0].core.base;
361 	if (pc) {
362 		err = perf_read_tsc_conversion(pc, &tc);
363 		if (err) {
364 			if (err != -EOPNOTSUPP)
365 				return err;
366 		} else {
367 			cap_user_time_zero = tc.time_mult != 0;
368 		}
369 		if (!cap_user_time_zero)
370 			ui__warning("Intel Processor Trace: TSC not available\n");
371 	}
372 
373 	per_cpu_mmaps = !perf_cpu_map__is_any_cpu_or_is_empty(session->evlist->core.user_requested_cpus);
374 
375 	auxtrace_info->type = PERF_AUXTRACE_INTEL_PT;
376 	auxtrace_info->priv[INTEL_PT_PMU_TYPE] = intel_pt_pmu->type;
377 	auxtrace_info->priv[INTEL_PT_TIME_SHIFT] = tc.time_shift;
378 	auxtrace_info->priv[INTEL_PT_TIME_MULT] = tc.time_mult;
379 	auxtrace_info->priv[INTEL_PT_TIME_ZERO] = tc.time_zero;
380 	auxtrace_info->priv[INTEL_PT_CAP_USER_TIME_ZERO] = cap_user_time_zero;
381 	auxtrace_info->priv[INTEL_PT_TSC_BIT] = tsc_bit;
382 	auxtrace_info->priv[INTEL_PT_NORETCOMP_BIT] = noretcomp_bit;
383 	auxtrace_info->priv[INTEL_PT_HAVE_SCHED_SWITCH] = ptr->have_sched_switch;
384 	auxtrace_info->priv[INTEL_PT_SNAPSHOT_MODE] = ptr->snapshot_mode;
385 	auxtrace_info->priv[INTEL_PT_PER_CPU_MMAPS] = per_cpu_mmaps;
386 	auxtrace_info->priv[INTEL_PT_MTC_BIT] = mtc_bit;
387 	auxtrace_info->priv[INTEL_PT_MTC_FREQ_BITS] = mtc_freq_bits;
388 	auxtrace_info->priv[INTEL_PT_TSC_CTC_N] = tsc_ctc_ratio_n;
389 	auxtrace_info->priv[INTEL_PT_TSC_CTC_D] = tsc_ctc_ratio_d;
390 	auxtrace_info->priv[INTEL_PT_CYC_BIT] = cyc_bit;
391 	auxtrace_info->priv[INTEL_PT_MAX_NONTURBO_RATIO] = max_non_turbo_ratio;
392 	auxtrace_info->priv[INTEL_PT_FILTER_STR_LEN] = filter_str_len;
393 
394 	info = &auxtrace_info->priv[INTEL_PT_FILTER_STR_LEN] + 1;
395 
396 	if (filter_str_len) {
397 		size_t len = intel_pt_filter_bytes(filter);
398 
399 		strncpy((char *)info, filter, len);
400 		info += len >> 3;
401 	}
402 
403 	*info++ = event_trace;
404 
405 	return 0;
406 }
407 
408 #ifdef HAVE_LIBTRACEEVENT
409 static int intel_pt_track_switches(struct evlist *evlist)
410 {
411 	const char *sched_switch = "sched:sched_switch";
412 	struct evsel *evsel;
413 	int err;
414 
415 	if (!evlist__can_select_event(evlist, sched_switch))
416 		return -EPERM;
417 
418 	evsel = evlist__add_sched_switch(evlist, true);
419 	if (IS_ERR(evsel)) {
420 		err = PTR_ERR(evsel);
421 		pr_debug2("%s: failed to create %s, error = %d\n",
422 			  __func__, sched_switch, err);
423 		return err;
424 	}
425 
426 	evsel->immediate = true;
427 
428 	return 0;
429 }
430 #endif
431 
/*
 * Return true only when the kvm_intel module parameter pt_mode can be read
 * from sysfs and equals 1.
 */
static bool intel_pt_exclude_guest(void)
{
	int pt_mode;
	bool read_ok;

	read_ok = !sysfs__read_int("module/kvm_intel/parameters/pt_mode", &pt_mode);

	/* pt_mode is only examined when the read succeeded */
	return read_ok && pt_mode == 1;
}
441 
/*
 * Format the set bits of @valid into @str (capacity @len) as a list of bit
 * numbers.  A single value or a pair of consecutive values is written
 * comma-separated ("1,2"); a run of three or more consecutive values is
 * written as "first-last" ("4-7").
 *
 * States:
 *   1 - nothing has been printed yet
 *   0 - something has been printed and no run is open; the next value needs
 *       a leading comma
 *   2 - the first value of the current run has been printed
 *   3 - the run holds two values; the second (in @last) is not printed yet
 *   4 - the run holds three or more values; the final one (in @last) is not
 *       printed yet
 */
static void intel_pt_valid_str(char *str, size_t len, u64 valid)
{
	unsigned int val, last = 0, state = 1;
	int p = 0;

	str[0] = '\0';

	/*
	 * val goes up to 64 inclusive: by then valid has been shifted down to
	 * 0, so the extra pass closes a run that ends at bit 63.
	 */
	for (val = 0; val <= 64; val++, valid >>= 1) {
		if (valid & 1) {
			last = val;
			switch (state) {
			case 0:
				p += scnprintf(str + p, len - p, ",");
				/* Fall through */
			case 1:
				p += scnprintf(str + p, len - p, "%u", val);
				state = 2;
				break;
			case 2:
				state = 3;
				break;
			case 3:
				state = 4;
				break;
			default:
				break;
			}
		} else {
			/* Bit clear: finish whatever run was open */
			switch (state) {
			case 3:
				p += scnprintf(str + p, len - p, ",%u", last);
				state = 0;
				break;
			case 4:
				p += scnprintf(str + p, len - p, "-%u", last);
				state = 0;
				break;
			default:
				break;
			}
			/* Stay in state 1 until the first value has been printed */
			if (state != 1)
				state = 0;
		}
	}
}
487 
488 static int intel_pt_val_config_term(struct perf_pmu *intel_pt_pmu, int dirfd,
489 				    const char *caps, const char *name,
490 				    const char *supported, u64 config)
491 {
492 	char valid_str[256];
493 	unsigned int shift;
494 	unsigned long long valid;
495 	u64 bits;
496 	int ok;
497 
498 	if (perf_pmu__scan_file_at(intel_pt_pmu, dirfd, caps, "%llx", &valid) != 1)
499 		valid = 0;
500 
501 	if (supported &&
502 	    perf_pmu__scan_file_at(intel_pt_pmu, dirfd, supported, "%d", &ok) == 1 && !ok)
503 		valid = 0;
504 
505 	valid |= 1;
506 
507 	bits = perf_pmu__format_bits(intel_pt_pmu, name);
508 
509 	config &= bits;
510 
511 	for (shift = 0; bits && !(bits & 1); shift++)
512 		bits >>= 1;
513 
514 	config >>= shift;
515 
516 	if (config > 63)
517 		goto out_err;
518 
519 	if (valid & (1 << config))
520 		return 0;
521 out_err:
522 	intel_pt_valid_str(valid_str, sizeof(valid_str), valid);
523 	pr_err("Invalid %s for %s. Valid values are: %s\n",
524 	       name, INTEL_PT_PMU_NAME, valid_str);
525 	return -EINVAL;
526 }
527 
528 static int intel_pt_validate_config(struct perf_pmu *intel_pt_pmu,
529 				    struct evsel *evsel)
530 {
531 	int err, dirfd;
532 	char c;
533 
534 	if (!evsel)
535 		return 0;
536 
537 	dirfd = perf_pmu__event_source_devices_fd();
538 	if (dirfd < 0)
539 		return dirfd;
540 
541 	/*
542 	 * If supported, force pass-through config term (pt=1) even if user
543 	 * sets pt=0, which avoids senseless kernel errors.
544 	 */
545 	if (perf_pmu__scan_file_at(intel_pt_pmu, dirfd, "format/pt", "%c", &c) == 1 &&
546 	    !(evsel->core.attr.config & 1)) {
547 		pr_warning("pt=0 doesn't make sense, forcing pt=1\n");
548 		evsel->core.attr.config |= 1;
549 	}
550 
551 	err = intel_pt_val_config_term(intel_pt_pmu, dirfd, "caps/cycle_thresholds",
552 				       "cyc_thresh", "caps/psb_cyc",
553 				       evsel->core.attr.config);
554 	if (err)
555 		goto out;
556 
557 	err = intel_pt_val_config_term(intel_pt_pmu, dirfd, "caps/mtc_periods",
558 				       "mtc_period", "caps/mtc",
559 				       evsel->core.attr.config);
560 	if (err)
561 		goto out;
562 
563 	err = intel_pt_val_config_term(intel_pt_pmu, dirfd, "caps/psb_periods",
564 					"psb_period", "caps/psb_cyc",
565 					evsel->core.attr.config);
566 
567 out:
568 	close(dirfd);
569 	return err;
570 }
571 
572 static void intel_pt_min_max_sample_sz(struct evlist *evlist,
573 				       size_t *min_sz, size_t *max_sz)
574 {
575 	struct evsel *evsel;
576 
577 	evlist__for_each_entry(evlist, evsel) {
578 		size_t sz = evsel->core.attr.aux_sample_size;
579 
580 		if (!sz)
581 			continue;
582 		if (min_sz && (sz < *min_sz || !*min_sz))
583 			*min_sz = sz;
584 		if (max_sz && sz > *max_sz)
585 			*max_sz = sz;
586 	}
587 }
588 
589 /*
590  * Currently, there is not enough information to disambiguate different PEBS
591  * events, so only allow one.
592  */
593 static bool intel_pt_too_many_aux_output(struct evlist *evlist)
594 {
595 	struct evsel *evsel;
596 	int aux_output_cnt = 0;
597 
598 	evlist__for_each_entry(evlist, evsel)
599 		aux_output_cnt += !!evsel->core.attr.aux_output;
600 
601 	if (aux_output_cnt > 1) {
602 		pr_err(INTEL_PT_PMU_NAME " supports at most one event with aux-output\n");
603 		return true;
604 	}
605 
606 	return false;
607 }
608 
/*
 * Apply Intel PT specific recording options.
 *
 * Locates the single Intel PT evsel in @evlist and configures it, rejects
 * unsupported option combinations, picks default AUX mmap / snapshot / sample
 * sizes, arranges for context switch information where it is needed and adds
 * a dummy tracking event.
 *
 * Returns 0 on success (including when no Intel PT event is present) or a
 * negative error code.
 */
static int intel_pt_recording_options(struct auxtrace_record *itr,
				      struct evlist *evlist,
				      struct record_opts *opts)
{
	struct intel_pt_recording *ptr =
			container_of(itr, struct intel_pt_recording, itr);
	struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu;
	bool have_timing_info, need_immediate = false;
	struct evsel *evsel, *intel_pt_evsel = NULL;
	const struct perf_cpu_map *cpus = evlist->core.user_requested_cpus;
	bool privileged = perf_event_paranoid_check(-1);
	u64 tsc_bit;
	int err;

	ptr->evlist = evlist;
	ptr->snapshot_mode = opts->auxtrace_snapshot_mode;

	/* Find and set up the Intel PT event; more than one is an error */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.attr.type == intel_pt_pmu->type) {
			if (intel_pt_evsel) {
				pr_err("There may be only one " INTEL_PT_PMU_NAME " event\n");
				return -EINVAL;
			}
			evsel->core.attr.freq = 0;
			evsel->core.attr.sample_period = 1;
			evsel->core.attr.exclude_guest = intel_pt_exclude_guest();
			evsel->no_aux_samples = true;
			evsel->needs_auxtrace_mmap = true;
			intel_pt_evsel = evsel;
			opts->full_auxtrace = true;
		}
	}

	if (opts->auxtrace_snapshot_mode && !opts->full_auxtrace) {
		pr_err("Snapshot mode (-S option) requires " INTEL_PT_PMU_NAME " PMU event (-e " INTEL_PT_PMU_NAME ")\n");
		return -EINVAL;
	}

	if (opts->auxtrace_snapshot_mode && opts->auxtrace_sample_mode) {
		pr_err("Snapshot mode (" INTEL_PT_PMU_NAME " PMU) and sample trace cannot be used together\n");
		return -EINVAL;
	}

	if (opts->use_clockid) {
		pr_err("Cannot use clockid (-k option) with " INTEL_PT_PMU_NAME "\n");
		return -EINVAL;
	}

	if (intel_pt_too_many_aux_output(evlist))
		return -EINVAL;

	/* Nothing more to do when AUX area tracing is not in use */
	if (!opts->full_auxtrace)
		return 0;

	if (opts->auxtrace_sample_mode)
		evsel__set_config_if_unset(intel_pt_pmu, intel_pt_evsel,
					   "psb_period", 0);

	err = intel_pt_validate_config(intel_pt_pmu, intel_pt_evsel);
	if (err)
		return err;

	/* Set default sizes for snapshot mode */
	if (opts->auxtrace_snapshot_mode) {
		size_t psb_period = intel_pt_psb_period(intel_pt_pmu, evlist);

		if (!opts->auxtrace_snapshot_size && !opts->auxtrace_mmap_pages) {
			if (privileged) {
				opts->auxtrace_mmap_pages = MiB(4) / page_size;
			} else {
				opts->auxtrace_mmap_pages = KiB(128) / page_size;
				if (opts->mmap_pages == UINT_MAX)
					opts->mmap_pages = KiB(256) / page_size;
			}
		} else if (!opts->auxtrace_mmap_pages && !privileged &&
			   opts->mmap_pages == UINT_MAX) {
			opts->mmap_pages = KiB(256) / page_size;
		}
		/* Default the snapshot size to the whole AUX mmap ... */
		if (!opts->auxtrace_snapshot_size)
			opts->auxtrace_snapshot_size =
				opts->auxtrace_mmap_pages * (size_t)page_size;
		/* ... or derive the AUX mmap size from the snapshot size */
		if (!opts->auxtrace_mmap_pages) {
			size_t sz = opts->auxtrace_snapshot_size;

			sz = round_up(sz, page_size) / page_size;
			opts->auxtrace_mmap_pages = roundup_pow_of_two(sz);
		}
		if (opts->auxtrace_snapshot_size >
				opts->auxtrace_mmap_pages * (size_t)page_size) {
			pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n",
			       opts->auxtrace_snapshot_size,
			       opts->auxtrace_mmap_pages * (size_t)page_size);
			return -EINVAL;
		}
		if (!opts->auxtrace_snapshot_size || !opts->auxtrace_mmap_pages) {
			pr_err("Failed to calculate default snapshot size and/or AUX area tracing mmap pages\n");
			return -EINVAL;
		}
		pr_debug2("Intel PT snapshot size: %zu\n",
			  opts->auxtrace_snapshot_size);
		if (psb_period &&
		    opts->auxtrace_snapshot_size <= psb_period +
						  INTEL_PT_PSB_PERIOD_NEAR)
			ui__warning("Intel PT snapshot size (%zu) may be too small for PSB period (%zu)\n",
				    opts->auxtrace_snapshot_size, psb_period);
	}

	/* Set default sizes for sample mode */
	if (opts->auxtrace_sample_mode) {
		size_t psb_period = intel_pt_psb_period(intel_pt_pmu, evlist);
		size_t min_sz = 0, max_sz = 0;

		intel_pt_min_max_sample_sz(evlist, &min_sz, &max_sz);
		if (!opts->auxtrace_mmap_pages && !privileged &&
		    opts->mmap_pages == UINT_MAX)
			opts->mmap_pages = KiB(256) / page_size;
		/* Size the AUX mmap to hold the largest requested sample */
		if (!opts->auxtrace_mmap_pages) {
			size_t sz = round_up(max_sz, page_size) / page_size;

			opts->auxtrace_mmap_pages = roundup_pow_of_two(sz);
		}
		if (max_sz > opts->auxtrace_mmap_pages * (size_t)page_size) {
			pr_err("Sample size %zu must not be greater than AUX area tracing mmap size %zu\n",
			       max_sz,
			       opts->auxtrace_mmap_pages * (size_t)page_size);
			return -EINVAL;
		}
		pr_debug2("Intel PT min. sample size: %zu max. sample size: %zu\n",
			  min_sz, max_sz);
		if (psb_period &&
		    min_sz <= psb_period + INTEL_PT_PSB_PERIOD_NEAR)
			ui__warning("Intel PT sample size (%zu) may be too small for PSB period (%zu)\n",
				    min_sz, psb_period);
	}

	/* Set default sizes for full trace mode */
	if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) {
		if (privileged) {
			opts->auxtrace_mmap_pages = MiB(4) / page_size;
		} else {
			opts->auxtrace_mmap_pages = KiB(128) / page_size;
			if (opts->mmap_pages == UINT_MAX)
				opts->mmap_pages = KiB(256) / page_size;
		}
	}

	/* Validate auxtrace_mmap_pages */
	if (opts->auxtrace_mmap_pages) {
		size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size;
		size_t min_sz;

		if (opts->auxtrace_snapshot_mode || opts->auxtrace_sample_mode)
			min_sz = KiB(4);
		else
			min_sz = KiB(8);

		if (sz < min_sz || !is_power_of_2(sz)) {
			pr_err("Invalid mmap size for Intel Processor Trace: must be at least %zuKiB and a power of 2\n",
			       min_sz / 1024);
			return -EINVAL;
		}
	}

	/* In full trace mode, set the AUX watermark to a quarter of the buffer */
	if (!opts->auxtrace_snapshot_mode && !opts->auxtrace_sample_mode) {
		size_t aw = opts->auxtrace_mmap_pages * (size_t)page_size / 4;
		u32 aux_watermark = aw > UINT_MAX ? UINT_MAX : aw;

		intel_pt_evsel->core.attr.aux_watermark = aux_watermark;
	}

	intel_pt_parse_terms(intel_pt_pmu, "tsc", &tsc_bit);

	/* Timing information is available when the TSC config bit is set */
	if (opts->full_auxtrace && (intel_pt_evsel->core.attr.config & tsc_bit))
		have_timing_info = true;
	else
		have_timing_info = false;

	/*
	 * Per-cpu recording needs sched_switch events to distinguish different
	 * threads.
	 */
	if (have_timing_info && !perf_cpu_map__is_any_cpu_or_is_empty(cpus) &&
	    !record_opts__no_switch_events(opts)) {
		if (perf_can_record_switch_events()) {
			bool cpu_wide = !target__none(&opts->target) &&
					!target__has_task(&opts->target);

			if (!cpu_wide && perf_can_record_cpu_wide()) {
				struct evsel *switch_evsel;

				/* Use a dummy event on all CPUs to collect context switches */
				switch_evsel = evlist__add_dummy_on_all_cpus(evlist);
				if (!switch_evsel)
					return -ENOMEM;

				switch_evsel->core.attr.context_switch = 1;
				switch_evsel->immediate = true;

				evsel__set_sample_bit(switch_evsel, TID);
				evsel__set_sample_bit(switch_evsel, TIME);
				evsel__set_sample_bit(switch_evsel, CPU);
				evsel__reset_sample_bit(switch_evsel, BRANCH_STACK);

				opts->record_switch_events = false;
				ptr->have_sched_switch = 3;
			} else {
				opts->record_switch_events = true;
				need_immediate = true;
				if (cpu_wide)
					ptr->have_sched_switch = 3;
				else
					ptr->have_sched_switch = 2;
			}
		} else {
#ifdef HAVE_LIBTRACEEVENT
			/* Fall back to the sched:sched_switch tracepoint */
			err = intel_pt_track_switches(evlist);
			if (err == -EPERM)
				pr_debug2("Unable to select sched:sched_switch\n");
			else if (err)
				return err;
			else
				ptr->have_sched_switch = 1;
#endif
		}
	}

	if (have_timing_info && !intel_pt_evsel->core.attr.exclude_kernel &&
	    perf_can_record_text_poke_events() && perf_can_record_cpu_wide())
		opts->text_poke = true;

	if (intel_pt_evsel) {
		/*
		 * To obtain the auxtrace buffer file descriptor, the auxtrace
		 * event must come first.
		 */
		evlist__to_front(evlist, intel_pt_evsel);
		/*
		 * In the case of per-cpu mmaps, we need the CPU on the
		 * AUX event.
		 */
		if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus))
			evsel__set_sample_bit(intel_pt_evsel, CPU);
	}

	/* Add dummy event to keep tracking */
	if (opts->full_auxtrace) {
		bool need_system_wide_tracking;
		struct evsel *tracking_evsel;

		/*
		 * User space tasks can migrate between CPUs, so when tracing
		 * selected CPUs, sideband for all CPUs is still needed.
		 */
		need_system_wide_tracking = opts->target.cpu_list &&
					    !intel_pt_evsel->core.attr.exclude_user;

		tracking_evsel = evlist__add_aux_dummy(evlist, need_system_wide_tracking);
		if (!tracking_evsel)
			return -ENOMEM;

		evlist__set_tracking_event(evlist, tracking_evsel);

		if (need_immediate)
			tracking_evsel->immediate = true;

		/* In per-cpu case, always need the time of mmap events etc */
		if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) {
			evsel__set_sample_bit(tracking_evsel, TIME);
			/* And the CPU for switch events */
			evsel__set_sample_bit(tracking_evsel, CPU);
		}
		evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK);
	}

	/*
	 * Warn the user when we do not have enough information to decode i.e.
	 * per-cpu with no sched_switch (except workload-only).
	 */
	if (!ptr->have_sched_switch && !perf_cpu_map__is_any_cpu_or_is_empty(cpus) &&
	    !target__none(&opts->target) &&
	    !intel_pt_evsel->core.attr.exclude_user)
		ui__warning("Intel Processor Trace decoding will not be possible except for kernel tracing!\n");

	return 0;
}
893 
894 static int intel_pt_snapshot_start(struct auxtrace_record *itr)
895 {
896 	struct intel_pt_recording *ptr =
897 			container_of(itr, struct intel_pt_recording, itr);
898 	struct evsel *evsel;
899 
900 	evlist__for_each_entry(ptr->evlist, evsel) {
901 		if (evsel->core.attr.type == ptr->intel_pt_pmu->type)
902 			return evsel__disable(evsel);
903 	}
904 	return -EINVAL;
905 }
906 
907 static int intel_pt_snapshot_finish(struct auxtrace_record *itr)
908 {
909 	struct intel_pt_recording *ptr =
910 			container_of(itr, struct intel_pt_recording, itr);
911 	struct evsel *evsel;
912 
913 	evlist__for_each_entry(ptr->evlist, evsel) {
914 		if (evsel->core.attr.type == ptr->intel_pt_pmu->type)
915 			return evsel__enable(evsel);
916 	}
917 	return -EINVAL;
918 }
919 
920 static int intel_pt_alloc_snapshot_refs(struct intel_pt_recording *ptr, int idx)
921 {
922 	const size_t sz = sizeof(struct intel_pt_snapshot_ref);
923 	int cnt = ptr->snapshot_ref_cnt, new_cnt = cnt * 2;
924 	struct intel_pt_snapshot_ref *refs;
925 
926 	if (!new_cnt)
927 		new_cnt = 16;
928 
929 	while (new_cnt <= idx)
930 		new_cnt *= 2;
931 
932 	refs = calloc(new_cnt, sz);
933 	if (!refs)
934 		return -ENOMEM;
935 
936 	memcpy(refs, ptr->snapshot_refs, cnt * sz);
937 
938 	ptr->snapshot_refs = refs;
939 	ptr->snapshot_ref_cnt = new_cnt;
940 
941 	return 0;
942 }
943 
944 static void intel_pt_free_snapshot_refs(struct intel_pt_recording *ptr)
945 {
946 	int i;
947 
948 	for (i = 0; i < ptr->snapshot_ref_cnt; i++)
949 		zfree(&ptr->snapshot_refs[i].ref_buf);
950 	zfree(&ptr->snapshot_refs);
951 }
952 
953 static void intel_pt_recording_free(struct auxtrace_record *itr)
954 {
955 	struct intel_pt_recording *ptr =
956 			container_of(itr, struct intel_pt_recording, itr);
957 
958 	intel_pt_free_snapshot_refs(ptr);
959 	free(ptr);
960 }
961 
962 static int intel_pt_alloc_snapshot_ref(struct intel_pt_recording *ptr, int idx,
963 				       size_t snapshot_buf_size)
964 {
965 	size_t ref_buf_size = ptr->snapshot_ref_buf_size;
966 	void *ref_buf;
967 
968 	ref_buf = zalloc(ref_buf_size);
969 	if (!ref_buf)
970 		return -ENOMEM;
971 
972 	ptr->snapshot_refs[idx].ref_buf = ref_buf;
973 	ptr->snapshot_refs[idx].ref_offset = snapshot_buf_size - ref_buf_size;
974 
975 	return 0;
976 }
977 
978 static size_t intel_pt_snapshot_ref_buf_size(struct intel_pt_recording *ptr,
979 					     size_t snapshot_buf_size)
980 {
981 	const size_t max_size = 256 * 1024;
982 	size_t buf_size = 0, psb_period;
983 
984 	if (ptr->snapshot_size <= 64 * 1024)
985 		return 0;
986 
987 	psb_period = intel_pt_psb_period(ptr->intel_pt_pmu, ptr->evlist);
988 	if (psb_period)
989 		buf_size = psb_period * 2;
990 
991 	if (!buf_size || buf_size > max_size)
992 		buf_size = max_size;
993 
994 	if (buf_size >= snapshot_buf_size)
995 		return 0;
996 
997 	if (buf_size >= ptr->snapshot_size / 2)
998 		return 0;
999 
1000 	return buf_size;
1001 }
1002 
1003 static int intel_pt_snapshot_init(struct intel_pt_recording *ptr,
1004 				  size_t snapshot_buf_size)
1005 {
1006 	if (ptr->snapshot_init_done)
1007 		return 0;
1008 
1009 	ptr->snapshot_init_done = true;
1010 
1011 	ptr->snapshot_ref_buf_size = intel_pt_snapshot_ref_buf_size(ptr,
1012 							snapshot_buf_size);
1013 
1014 	return 0;
1015 }
1016 
/**
 * intel_pt_compare_buffers - compare bytes in a buffer to a circular buffer.
 * @buf1: first buffer
 * @compare_size: number of bytes to compare
 * @buf2: second buffer (a circular buffer)
 * @offs2: offset in second buffer
 * @buf2_size: size of second buffer
 *
 * The comparison allows for the possibility that the bytes to compare in the
 * circular buffer are not contiguous: bytes from @offs2 up to the end of
 * @buf2 are compared first, and the remainder is compared against the start of
 * @buf2.  It is assumed that @compare_size <= @buf2_size and that
 * @offs2 <= @buf2_size.  This function returns %false if the bytes are
 * identical, %true otherwise.
 */
static bool intel_pt_compare_buffers(void *buf1, size_t compare_size,
				     void *buf2, size_t offs2, size_t buf2_size)
{
	const unsigned char *lin = buf1;
	const unsigned char *ring = buf2;
	size_t tail_size;

	if (offs2 + compare_size <= buf2_size)
		return memcmp(lin, ring + offs2, compare_size) != 0;

	/* Bytes available from @offs2 up to the end of the circular buffer */
	tail_size = buf2_size - offs2;
	if (memcmp(lin, ring + offs2, tail_size))
		return true;

	/* The rest continues from the start of the circular buffer */
	return memcmp(lin + tail_size, ring, compare_size - tail_size) != 0;
}
1046 
/*
 * Compare the saved reference bytes with the current buffer contents.
 *
 * Returns true when @head lies inside the reference region (which starts at
 * @ref_offset, is @ref_size bytes long and may wrap around @buf_size), or
 * when the bytes of that region in @data differ from @ref_buf.  Returns false
 * when the region is unchanged.
 */
static bool intel_pt_compare_ref(void *ref_buf, size_t ref_offset,
				 size_t ref_size, size_t buf_size,
				 void *data, size_t head)
{
	const size_t ref_end = ref_offset + ref_size;
	bool head_in_ref;

	if (ref_end <= buf_size)
		head_in_ref = head > ref_offset && head < ref_end;
	else
		head_in_ref = head > ref_offset || head < ref_end - buf_size;

	if (head_in_ref)
		return true;

	return intel_pt_compare_buffers(ref_buf, ref_size, data, ref_offset,
					buf_size);
}
1063 
/*
 * Fill @ref_buf with @ref_size bytes taken from the circular buffer @data
 * (of size @buf_size) relative to @head.
 *
 * If at least @ref_size bytes precede @head, those bytes are copied as one
 * contiguous run.  Otherwise the first @head bytes of @ref_buf come from the
 * start of @data and the rest of @ref_buf comes from the end of @data.
 */
static void intel_pt_copy_ref(void *ref_buf, size_t ref_size, size_t buf_size,
			      void *data, size_t head)
{
	unsigned char *dst = ref_buf;
	const unsigned char *src = data;
	size_t tail_size;

	if (head >= ref_size) {
		memcpy(dst, src + head - ref_size, ref_size);
		return;
	}

	/* Bytes still needed after taking everything in front of @head */
	tail_size = ref_size - head;

	memcpy(dst, src, head);
	memcpy(dst + head, src + buf_size - tail_size, tail_size);
}
1075 
1076 static bool intel_pt_wrapped(struct intel_pt_recording *ptr, int idx,
1077 			     struct auxtrace_mmap *mm, unsigned char *data,
1078 			     u64 head)
1079 {
1080 	struct intel_pt_snapshot_ref *ref = &ptr->snapshot_refs[idx];
1081 	bool wrapped;
1082 
1083 	wrapped = intel_pt_compare_ref(ref->ref_buf, ref->ref_offset,
1084 				       ptr->snapshot_ref_buf_size, mm->len,
1085 				       data, head);
1086 
1087 	intel_pt_copy_ref(ref->ref_buf, ptr->snapshot_ref_buf_size, mm->len,
1088 			  data, head);
1089 
1090 	return wrapped;
1091 }
1092 
1093 static bool intel_pt_first_wrap(u64 *data, size_t buf_size)
1094 {
1095 	int i, a, b;
1096 
1097 	b = buf_size >> 3;
1098 	a = b - 512;
1099 	if (a < 0)
1100 		a = 0;
1101 
1102 	for (i = a; i < b; i++) {
1103 		if (data[i])
1104 			return true;
1105 	}
1106 
1107 	return false;
1108 }
1109 
1110 static int intel_pt_find_snapshot(struct auxtrace_record *itr, int idx,
1111 				  struct auxtrace_mmap *mm, unsigned char *data,
1112 				  u64 *head, u64 *old)
1113 {
1114 	struct intel_pt_recording *ptr =
1115 			container_of(itr, struct intel_pt_recording, itr);
1116 	bool wrapped;
1117 	int err;
1118 
1119 	pr_debug3("%s: mmap index %d old head %zu new head %zu\n",
1120 		  __func__, idx, (size_t)*old, (size_t)*head);
1121 
1122 	err = intel_pt_snapshot_init(ptr, mm->len);
1123 	if (err)
1124 		goto out_err;
1125 
1126 	if (idx >= ptr->snapshot_ref_cnt) {
1127 		err = intel_pt_alloc_snapshot_refs(ptr, idx);
1128 		if (err)
1129 			goto out_err;
1130 	}
1131 
1132 	if (ptr->snapshot_ref_buf_size) {
1133 		if (!ptr->snapshot_refs[idx].ref_buf) {
1134 			err = intel_pt_alloc_snapshot_ref(ptr, idx, mm->len);
1135 			if (err)
1136 				goto out_err;
1137 		}
1138 		wrapped = intel_pt_wrapped(ptr, idx, mm, data, *head);
1139 	} else {
1140 		wrapped = ptr->snapshot_refs[idx].wrapped;
1141 		if (!wrapped && intel_pt_first_wrap((u64 *)data, mm->len)) {
1142 			ptr->snapshot_refs[idx].wrapped = true;
1143 			wrapped = true;
1144 		}
1145 	}
1146 
1147 	/*
1148 	 * In full trace mode 'head' continually increases.  However in snapshot
1149 	 * mode 'head' is an offset within the buffer.  Here 'old' and 'head'
1150 	 * are adjusted to match the full trace case which expects that 'old' is
1151 	 * always less than 'head'.
1152 	 */
1153 	if (wrapped) {
1154 		*old = *head;
1155 		*head += mm->len;
1156 	} else {
1157 		if (mm->mask)
1158 			*old &= mm->mask;
1159 		else
1160 			*old %= mm->len;
1161 		if (*old > *head)
1162 			*head += mm->len;
1163 	}
1164 
1165 	pr_debug3("%s: wrap-around %sdetected, adjusted old head %zu adjusted new head %zu\n",
1166 		  __func__, wrapped ? "" : "not ", (size_t)*old, (size_t)*head);
1167 
1168 	return 0;
1169 
1170 out_err:
1171 	pr_err("%s: failed, error %d\n", __func__, err);
1172 	return err;
1173 }
1174 
1175 static u64 intel_pt_reference(struct auxtrace_record *itr __maybe_unused)
1176 {
1177 	return rdtsc();
1178 }
1179 
1180 struct auxtrace_record *intel_pt_recording_init(int *err)
1181 {
1182 	struct perf_pmu *intel_pt_pmu = perf_pmus__find(INTEL_PT_PMU_NAME);
1183 	struct intel_pt_recording *ptr;
1184 
1185 	if (!intel_pt_pmu)
1186 		return NULL;
1187 
1188 	if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) {
1189 		*err = -errno;
1190 		return NULL;
1191 	}
1192 
1193 	ptr = zalloc(sizeof(struct intel_pt_recording));
1194 	if (!ptr) {
1195 		*err = -ENOMEM;
1196 		return NULL;
1197 	}
1198 
1199 	ptr->intel_pt_pmu = intel_pt_pmu;
1200 	ptr->itr.recording_options = intel_pt_recording_options;
1201 	ptr->itr.info_priv_size = intel_pt_info_priv_size;
1202 	ptr->itr.info_fill = intel_pt_info_fill;
1203 	ptr->itr.free = intel_pt_recording_free;
1204 	ptr->itr.snapshot_start = intel_pt_snapshot_start;
1205 	ptr->itr.snapshot_finish = intel_pt_snapshot_finish;
1206 	ptr->itr.find_snapshot = intel_pt_find_snapshot;
1207 	ptr->itr.parse_snapshot_options = intel_pt_parse_snapshot_options;
1208 	ptr->itr.reference = intel_pt_reference;
1209 	ptr->itr.read_finish = auxtrace_record__read_finish;
1210 	/*
1211 	 * Decoding starts at a PSB packet. Minimum PSB period is 2K so 4K
1212 	 * should give at least 1 PSB per sample.
1213 	 */
1214 	ptr->itr.default_aux_sample_size = 4096;
1215 	return &ptr->itr;
1216 }
1217