xref: /linux/arch/x86/events/intel/pt.c (revision f3a8b6645dc2e60d11f20c1c23afd964ff4e55ae)
1 /*
2  * Intel(R) Processor Trace PMU driver for perf
3  * Copyright (c) 2013-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  *
14  * Intel PT is specified in the Intel Architecture Instruction Set Extensions
15  * Programming Reference:
16  * http://software.intel.com/en-us/intel-isa-extensions
17  */
18 
19 #undef DEBUG
20 
21 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22 
23 #include <linux/types.h>
24 #include <linux/slab.h>
25 #include <linux/device.h>
26 
27 #include <asm/perf_event.h>
28 #include <asm/insn.h>
29 #include <asm/io.h>
30 #include <asm/intel_pt.h>
31 
32 #include "../perf_event.h"
33 #include "pt.h"
34 
35 static DEFINE_PER_CPU(struct pt, pt_ctx);
36 
37 static struct pt_pmu pt_pmu;
38 
/*
 * Slot of each CPUID output register within one leaf's worth of cached
 * capability words in pt_pmu.caps[].  The order (EAX, ECX, EDX, EBX) only
 * has to agree between the code that fills the cache and the capability
 * table that reads it back.
 */
enum cpuid_regs {
	CR_EAX	= 0,
	CR_ECX	= 1,
	CR_EDX	= 2,
	CR_EBX	= 3,
};
45 
46 /*
47  * Capabilities of Intel PT hardware, such as number of address bits or
48  * supported output schemes, are cached and exported to userspace as "caps"
49  * attribute group of pt pmu device
50  * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
51  * relevant bits together with intel_pt traces.
52  *
53  * These are necessary for both trace decoding (payloads_lip, contains address
54  * width encoded in IP-related packets), and event configuration (bitmasks with
55  * permitted values for certain bit fields).
56  */
57 #define PT_CAP(_n, _l, _r, _m)						\
58 	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
59 			    .reg = _r, .mask = _m }
60 
61 static struct pt_cap_desc {
62 	const char	*name;
63 	u32		leaf;
64 	u8		reg;
65 	u32		mask;
66 } pt_caps[] = {
67 	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
68 	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
69 	PT_CAP(psb_cyc,			0, CR_EBX, BIT(1)),
70 	PT_CAP(ip_filtering,		0, CR_EBX, BIT(2)),
71 	PT_CAP(mtc,			0, CR_EBX, BIT(3)),
72 	PT_CAP(ptwrite,			0, CR_EBX, BIT(4)),
73 	PT_CAP(power_event_trace,	0, CR_EBX, BIT(5)),
74 	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
75 	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
76 	PT_CAP(single_range_output,	0, CR_ECX, BIT(2)),
77 	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
78 	PT_CAP(num_address_ranges,	1, CR_EAX, 0x3),
79 	PT_CAP(mtc_periods,		1, CR_EAX, 0xffff0000),
80 	PT_CAP(cycle_thresholds,	1, CR_EBX, 0xffff),
81 	PT_CAP(psb_periods,		1, CR_EBX, 0xffff0000),
82 };
83 
84 static u32 pt_cap_get(enum pt_capabilities cap)
85 {
86 	struct pt_cap_desc *cd = &pt_caps[cap];
87 	u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
88 	unsigned int shift = __ffs(cd->mask);
89 
90 	return (c & cd->mask) >> shift;
91 }
92 
93 static ssize_t pt_cap_show(struct device *cdev,
94 			   struct device_attribute *attr,
95 			   char *buf)
96 {
97 	struct dev_ext_attribute *ea =
98 		container_of(attr, struct dev_ext_attribute, attr);
99 	enum pt_capabilities cap = (long)ea->var;
100 
101 	return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
102 }
103 
104 static struct attribute_group pt_cap_group = {
105 	.name	= "caps",
106 };
107 
108 PMU_FORMAT_ATTR(cyc,		"config:1"	);
109 PMU_FORMAT_ATTR(mtc,		"config:9"	);
110 PMU_FORMAT_ATTR(tsc,		"config:10"	);
111 PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
112 PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
113 PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
114 PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);
115 
116 static struct attribute *pt_formats_attr[] = {
117 	&format_attr_cyc.attr,
118 	&format_attr_mtc.attr,
119 	&format_attr_tsc.attr,
120 	&format_attr_noretcomp.attr,
121 	&format_attr_mtc_period.attr,
122 	&format_attr_cyc_thresh.attr,
123 	&format_attr_psb_period.attr,
124 	NULL,
125 };
126 
127 static struct attribute_group pt_format_group = {
128 	.name	= "format",
129 	.attrs	= pt_formats_attr,
130 };
131 
132 static ssize_t
133 pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
134 		    char *page)
135 {
136 	struct perf_pmu_events_attr *pmu_attr =
137 		container_of(attr, struct perf_pmu_events_attr, attr);
138 
139 	switch (pmu_attr->id) {
140 	case 0:
141 		return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
142 	case 1:
143 		return sprintf(page, "%u:%u\n",
144 			       pt_pmu.tsc_art_num,
145 			       pt_pmu.tsc_art_den);
146 	default:
147 		break;
148 	}
149 
150 	return -EINVAL;
151 }
152 
153 PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
154 	       pt_timing_attr_show);
155 PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
156 	       pt_timing_attr_show);
157 
158 static struct attribute *pt_timing_attr[] = {
159 	&timing_attr_max_nonturbo_ratio.attr.attr,
160 	&timing_attr_tsc_art_ratio.attr.attr,
161 	NULL,
162 };
163 
164 static struct attribute_group pt_timing_group = {
165 	.attrs	= pt_timing_attr,
166 };
167 
168 static const struct attribute_group *pt_attr_groups[] = {
169 	&pt_cap_group,
170 	&pt_format_group,
171 	&pt_timing_group,
172 	NULL,
173 };
174 
175 static int __init pt_pmu_hw_init(void)
176 {
177 	struct dev_ext_attribute *de_attrs;
178 	struct attribute **attrs;
179 	size_t size;
180 	u64 reg;
181 	int ret;
182 	long i;
183 
184 	rdmsrl(MSR_PLATFORM_INFO, reg);
185 	pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
186 
187 	/*
188 	 * if available, read in TSC to core crystal clock ratio,
189 	 * otherwise, zero for numerator stands for "not enumerated"
190 	 * as per SDM
191 	 */
192 	if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
193 		u32 eax, ebx, ecx, edx;
194 
195 		cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
196 
197 		pt_pmu.tsc_art_num = ebx;
198 		pt_pmu.tsc_art_den = eax;
199 	}
200 
201 	if (boot_cpu_has(X86_FEATURE_VMX)) {
202 		/*
203 		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
204 		 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
205 		 * post-VMXON.
206 		 */
207 		rdmsrl(MSR_IA32_VMX_MISC, reg);
208 		if (reg & BIT(14))
209 			pt_pmu.vmx = true;
210 	}
211 
212 	attrs = NULL;
213 
214 	for (i = 0; i < PT_CPUID_LEAVES; i++) {
215 		cpuid_count(20, i,
216 			    &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
217 			    &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
218 			    &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
219 			    &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
220 	}
221 
222 	ret = -ENOMEM;
223 	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
224 	attrs = kzalloc(size, GFP_KERNEL);
225 	if (!attrs)
226 		goto fail;
227 
228 	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
229 	de_attrs = kzalloc(size, GFP_KERNEL);
230 	if (!de_attrs)
231 		goto fail;
232 
233 	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
234 		struct dev_ext_attribute *de_attr = de_attrs + i;
235 
236 		de_attr->attr.attr.name = pt_caps[i].name;
237 
238 		sysfs_attr_init(&de_attr->attr.attr);
239 
240 		de_attr->attr.attr.mode		= S_IRUGO;
241 		de_attr->attr.show		= pt_cap_show;
242 		de_attr->var			= (void *)i;
243 
244 		attrs[i] = &de_attr->attr.attr;
245 	}
246 
247 	pt_cap_group.attrs = attrs;
248 
249 	return 0;
250 
251 fail:
252 	kfree(attrs);
253 
254 	return ret;
255 }
256 
/* cycle-accurate mode together with its threshold and the PSB frequency */
#define RTIT_CTL_CYC_PSB	(RTIT_CTL_CYCLEACC | RTIT_CTL_CYC_THRESH | \
				 RTIT_CTL_PSB_FREQ)

/* MTC enable together with its period field */
#define RTIT_CTL_MTC		(RTIT_CTL_MTC_EN | RTIT_CTL_MTC_RANGE)

/* PTWRITE enable together with FUP-on-PTWRITE */
#define RTIT_CTL_PTW		(RTIT_CTL_PTW_EN | RTIT_CTL_FUP_ON_PTW)

/* all RTIT_CTL bits that userspace may request through attr.config */
#define PT_CONFIG_MASK		(RTIT_CTL_TSC_EN	| \
				 RTIT_CTL_DISRETC	| \
				 RTIT_CTL_CYC_PSB	| \
				 RTIT_CTL_MTC		| \
				 RTIT_CTL_PWR_EVT_EN	| \
				 RTIT_CTL_PTW)
274 
275 static bool pt_event_valid(struct perf_event *event)
276 {
277 	u64 config = event->attr.config;
278 	u64 allowed, requested;
279 
280 	if ((config & PT_CONFIG_MASK) != config)
281 		return false;
282 
283 	if (config & RTIT_CTL_CYC_PSB) {
284 		if (!pt_cap_get(PT_CAP_psb_cyc))
285 			return false;
286 
287 		allowed = pt_cap_get(PT_CAP_psb_periods);
288 		requested = (config & RTIT_CTL_PSB_FREQ) >>
289 			RTIT_CTL_PSB_FREQ_OFFSET;
290 		if (requested && (!(allowed & BIT(requested))))
291 			return false;
292 
293 		allowed = pt_cap_get(PT_CAP_cycle_thresholds);
294 		requested = (config & RTIT_CTL_CYC_THRESH) >>
295 			RTIT_CTL_CYC_THRESH_OFFSET;
296 		if (requested && (!(allowed & BIT(requested))))
297 			return false;
298 	}
299 
300 	if (config & RTIT_CTL_MTC) {
301 		/*
302 		 * In the unlikely case that CPUID lists valid mtc periods,
303 		 * but not the mtc capability, drop out here.
304 		 *
305 		 * Spec says that setting mtc period bits while mtc bit in
306 		 * CPUID is 0 will #GP, so better safe than sorry.
307 		 */
308 		if (!pt_cap_get(PT_CAP_mtc))
309 			return false;
310 
311 		allowed = pt_cap_get(PT_CAP_mtc_periods);
312 		if (!allowed)
313 			return false;
314 
315 		requested = (config & RTIT_CTL_MTC_RANGE) >>
316 			RTIT_CTL_MTC_RANGE_OFFSET;
317 
318 		if (!(allowed & BIT(requested)))
319 			return false;
320 	}
321 
322 	if (config & RTIT_CTL_PWR_EVT_EN &&
323 	    !pt_cap_get(PT_CAP_power_event_trace))
324 		return false;
325 
326 	if (config & RTIT_CTL_PTW) {
327 		if (!pt_cap_get(PT_CAP_ptwrite))
328 			return false;
329 
330 		/* FUPonPTW without PTW doesn't make sense */
331 		if ((config & RTIT_CTL_FUP_ON_PTW) &&
332 		    !(config & RTIT_CTL_PTW_EN))
333 			return false;
334 	}
335 
336 	return true;
337 }
338 
339 /*
340  * PT configuration helpers
341  * These all are cpu affine and operate on a local PT
342  */
343 
344 /* Address ranges and their corresponding msr configuration registers */
345 static const struct pt_address_range {
346 	unsigned long	msr_a;
347 	unsigned long	msr_b;
348 	unsigned int	reg_off;
349 } pt_address_ranges[] = {
350 	{
351 		.msr_a	 = MSR_IA32_RTIT_ADDR0_A,
352 		.msr_b	 = MSR_IA32_RTIT_ADDR0_B,
353 		.reg_off = RTIT_CTL_ADDR0_OFFSET,
354 	},
355 	{
356 		.msr_a	 = MSR_IA32_RTIT_ADDR1_A,
357 		.msr_b	 = MSR_IA32_RTIT_ADDR1_B,
358 		.reg_off = RTIT_CTL_ADDR1_OFFSET,
359 	},
360 	{
361 		.msr_a	 = MSR_IA32_RTIT_ADDR2_A,
362 		.msr_b	 = MSR_IA32_RTIT_ADDR2_B,
363 		.reg_off = RTIT_CTL_ADDR2_OFFSET,
364 	},
365 	{
366 		.msr_a	 = MSR_IA32_RTIT_ADDR3_A,
367 		.msr_b	 = MSR_IA32_RTIT_ADDR3_B,
368 		.reg_off = RTIT_CTL_ADDR3_OFFSET,
369 	}
370 };
371 
372 static u64 pt_config_filters(struct perf_event *event)
373 {
374 	struct pt_filters *filters = event->hw.addr_filters;
375 	struct pt *pt = this_cpu_ptr(&pt_ctx);
376 	unsigned int range = 0;
377 	u64 rtit_ctl = 0;
378 
379 	if (!filters)
380 		return 0;
381 
382 	perf_event_addr_filters_sync(event);
383 
384 	for (range = 0; range < filters->nr_filters; range++) {
385 		struct pt_filter *filter = &filters->filter[range];
386 
387 		/*
388 		 * Note, if the range has zero start/end addresses due
389 		 * to its dynamic object not being loaded yet, we just
390 		 * go ahead and program zeroed range, which will simply
391 		 * produce no data. Note^2: if executable code at 0x0
392 		 * is a concern, we can set up an "invalid" configuration
393 		 * such as msr_b < msr_a.
394 		 */
395 
396 		/* avoid redundant msr writes */
397 		if (pt->filters.filter[range].msr_a != filter->msr_a) {
398 			wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
399 			pt->filters.filter[range].msr_a = filter->msr_a;
400 		}
401 
402 		if (pt->filters.filter[range].msr_b != filter->msr_b) {
403 			wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
404 			pt->filters.filter[range].msr_b = filter->msr_b;
405 		}
406 
407 		rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
408 	}
409 
410 	return rtit_ctl;
411 }
412 
413 static void pt_config(struct perf_event *event)
414 {
415 	u64 reg;
416 
417 	if (!event->hw.itrace_started) {
418 		event->hw.itrace_started = 1;
419 		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
420 	}
421 
422 	reg = pt_config_filters(event);
423 	reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
424 
425 	if (!event->attr.exclude_kernel)
426 		reg |= RTIT_CTL_OS;
427 	if (!event->attr.exclude_user)
428 		reg |= RTIT_CTL_USR;
429 
430 	reg |= (event->attr.config & PT_CONFIG_MASK);
431 
432 	event->hw.config = reg;
433 	wrmsrl(MSR_IA32_RTIT_CTL, reg);
434 }
435 
436 static void pt_config_stop(struct perf_event *event)
437 {
438 	u64 ctl = READ_ONCE(event->hw.config);
439 
440 	/* may be already stopped by a PMI */
441 	if (!(ctl & RTIT_CTL_TRACEEN))
442 		return;
443 
444 	ctl &= ~RTIT_CTL_TRACEEN;
445 	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
446 
447 	WRITE_ONCE(event->hw.config, ctl);
448 
449 	/*
450 	 * A wrmsr that disables trace generation serializes other PT
451 	 * registers and causes all data packets to be written to memory,
452 	 * but a fence is required for the data to become globally visible.
453 	 *
454 	 * The below WMB, separating data store and aux_head store matches
455 	 * the consumer's RMB that separates aux_head load and data load.
456 	 */
457 	wmb();
458 }
459 
460 static void pt_config_buffer(void *buf, unsigned int topa_idx,
461 			     unsigned int output_off)
462 {
463 	u64 reg;
464 
465 	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
466 
467 	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
468 
469 	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
470 }
471 
472 /*
473  * Keep ToPA table-related metadata on the same page as the actual table,
474  * taking up a few words from the top
475  */
476 
477 #define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
478 
479 /**
480  * struct topa - page-sized ToPA table with metadata at the top
481  * @table:	actual ToPA table entries, as understood by PT hardware
482  * @list:	linkage to struct pt_buffer's list of tables
483  * @phys:	physical address of this page
484  * @offset:	offset of the first entry in this table in the buffer
485  * @size:	total size of all entries in this table
486  * @last:	index of the last initialized entry in this table
487  */
488 struct topa {
489 	struct topa_entry	table[TENTS_PER_PAGE];
490 	struct list_head	list;
491 	u64			phys;
492 	u64			offset;
493 	size_t			size;
494 	int			last;
495 };
496 
497 /* make -1 stand for the last table entry */
498 #define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
499 
500 /**
501  * topa_alloc() - allocate page-sized ToPA table
502  * @cpu:	CPU on which to allocate.
503  * @gfp:	Allocation flags.
504  *
505  * Return:	On success, return the pointer to ToPA table page.
506  */
507 static struct topa *topa_alloc(int cpu, gfp_t gfp)
508 {
509 	int node = cpu_to_node(cpu);
510 	struct topa *topa;
511 	struct page *p;
512 
513 	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
514 	if (!p)
515 		return NULL;
516 
517 	topa = page_address(p);
518 	topa->last = 0;
519 	topa->phys = page_to_phys(p);
520 
521 	/*
522 	 * In case of singe-entry ToPA, always put the self-referencing END
523 	 * link as the 2nd entry in the table
524 	 */
525 	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
526 		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
527 		TOPA_ENTRY(topa, 1)->end = 1;
528 	}
529 
530 	return topa;
531 }
532 
/**
 * topa_free() - free a page-sized ToPA table
 * @topa:	Table to deallocate.
 *
 * The table occupies exactly the page it was allocated as, so the table
 * pointer doubles as the page address.
 */
static void topa_free(struct topa *topa)
{
	unsigned long page_addr = (unsigned long)topa;

	free_page(page_addr);
}
541 
542 /**
543  * topa_insert_table() - insert a ToPA table into a buffer
544  * @buf:	 PT buffer that's being extended.
545  * @topa:	 New topa table to be inserted.
546  *
547  * If it's the first table in this buffer, set up buffer's pointers
548  * accordingly; otherwise, add a END=1 link entry to @topa to the current
549  * "last" table and adjust the last table pointer to @topa.
550  */
551 static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
552 {
553 	struct topa *last = buf->last;
554 
555 	list_add_tail(&topa->list, &buf->tables);
556 
557 	if (!buf->first) {
558 		buf->first = buf->last = buf->cur = topa;
559 		return;
560 	}
561 
562 	topa->offset = last->offset + last->size;
563 	buf->last = topa;
564 
565 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
566 		return;
567 
568 	BUG_ON(last->last != TENTS_PER_PAGE - 1);
569 
570 	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
571 	TOPA_ENTRY(last, -1)->end = 1;
572 }
573 
574 /**
575  * topa_table_full() - check if a ToPA table is filled up
576  * @topa:	ToPA table.
577  */
578 static bool topa_table_full(struct topa *topa)
579 {
580 	/* single-entry ToPA is a special case */
581 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
582 		return !!topa->last;
583 
584 	return topa->last == TENTS_PER_PAGE - 1;
585 }
586 
587 /**
588  * topa_insert_pages() - create a list of ToPA tables
589  * @buf:	PT buffer being initialized.
590  * @gfp:	Allocation flags.
591  *
592  * This initializes a list of ToPA tables with entries from
593  * the data_pages provided by rb_alloc_aux().
594  *
595  * Return:	0 on success or error code.
596  */
597 static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
598 {
599 	struct topa *topa = buf->last;
600 	int order = 0;
601 	struct page *p;
602 
603 	p = virt_to_page(buf->data_pages[buf->nr_pages]);
604 	if (PagePrivate(p))
605 		order = page_private(p);
606 
607 	if (topa_table_full(topa)) {
608 		topa = topa_alloc(buf->cpu, gfp);
609 		if (!topa)
610 			return -ENOMEM;
611 
612 		topa_insert_table(buf, topa);
613 	}
614 
615 	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
616 	TOPA_ENTRY(topa, -1)->size = order;
617 	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
618 		TOPA_ENTRY(topa, -1)->intr = 1;
619 		TOPA_ENTRY(topa, -1)->stop = 1;
620 	}
621 
622 	topa->last++;
623 	topa->size += sizes(order);
624 
625 	buf->nr_pages += 1ul << order;
626 
627 	return 0;
628 }
629 
630 /**
631  * pt_topa_dump() - print ToPA tables and their entries
632  * @buf:	PT buffer.
633  */
634 static void pt_topa_dump(struct pt_buffer *buf)
635 {
636 	struct topa *topa;
637 
638 	list_for_each_entry(topa, &buf->tables, list) {
639 		int i;
640 
641 		pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
642 			 topa->phys, topa->offset, topa->size);
643 		for (i = 0; i < TENTS_PER_PAGE; i++) {
644 			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
645 				 &topa->table[i],
646 				 (unsigned long)topa->table[i].base << TOPA_SHIFT,
647 				 sizes(topa->table[i].size),
648 				 topa->table[i].end ?  'E' : ' ',
649 				 topa->table[i].intr ? 'I' : ' ',
650 				 topa->table[i].stop ? 'S' : ' ',
651 				 *(u64 *)&topa->table[i]);
652 			if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
653 			     topa->table[i].stop) ||
654 			    topa->table[i].end)
655 				break;
656 		}
657 	}
658 }
659 
660 /**
661  * pt_buffer_advance() - advance to the next output region
662  * @buf:	PT buffer.
663  *
664  * Advance the current pointers in the buffer to the next ToPA entry.
665  */
666 static void pt_buffer_advance(struct pt_buffer *buf)
667 {
668 	buf->output_off = 0;
669 	buf->cur_idx++;
670 
671 	if (buf->cur_idx == buf->cur->last) {
672 		if (buf->cur == buf->last)
673 			buf->cur = buf->first;
674 		else
675 			buf->cur = list_entry(buf->cur->list.next, struct topa,
676 					      list);
677 		buf->cur_idx = 0;
678 	}
679 }
680 
681 /**
682  * pt_update_head() - calculate current offsets and sizes
683  * @pt:		Per-cpu pt context.
684  *
685  * Update buffer's current write pointer position and data size.
686  */
687 static void pt_update_head(struct pt *pt)
688 {
689 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
690 	u64 topa_idx, base, old;
691 
692 	/* offset of the first region in this table from the beginning of buf */
693 	base = buf->cur->offset + buf->output_off;
694 
695 	/* offset of the current output region within this table */
696 	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
697 		base += sizes(buf->cur->table[topa_idx].size);
698 
699 	if (buf->snapshot) {
700 		local_set(&buf->data_size, base);
701 	} else {
702 		old = (local64_xchg(&buf->head, base) &
703 		       ((buf->nr_pages << PAGE_SHIFT) - 1));
704 		if (base < old)
705 			base += buf->nr_pages << PAGE_SHIFT;
706 
707 		local_add(base - old, &buf->data_size);
708 	}
709 }
710 
711 /**
712  * pt_buffer_region() - obtain current output region's address
713  * @buf:	PT buffer.
714  */
715 static void *pt_buffer_region(struct pt_buffer *buf)
716 {
717 	return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
718 }
719 
720 /**
721  * pt_buffer_region_size() - obtain current output region's size
722  * @buf:	PT buffer.
723  */
724 static size_t pt_buffer_region_size(struct pt_buffer *buf)
725 {
726 	return sizes(buf->cur->table[buf->cur_idx].size);
727 }
728 
729 /**
730  * pt_handle_status() - take care of possible status conditions
731  * @pt:		Per-cpu pt context.
732  */
733 static void pt_handle_status(struct pt *pt)
734 {
735 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
736 	int advance = 0;
737 	u64 status;
738 
739 	rdmsrl(MSR_IA32_RTIT_STATUS, status);
740 
741 	if (status & RTIT_STATUS_ERROR) {
742 		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
743 		pt_topa_dump(buf);
744 		status &= ~RTIT_STATUS_ERROR;
745 	}
746 
747 	if (status & RTIT_STATUS_STOPPED) {
748 		status &= ~RTIT_STATUS_STOPPED;
749 
750 		/*
751 		 * On systems that only do single-entry ToPA, hitting STOP
752 		 * means we are already losing data; need to let the decoder
753 		 * know.
754 		 */
755 		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
756 		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
757 			local_inc(&buf->lost);
758 			advance++;
759 		}
760 	}
761 
762 	/*
763 	 * Also on single-entry ToPA implementations, interrupt will come
764 	 * before the output reaches its output region's boundary.
765 	 */
766 	if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
767 	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
768 		void *head = pt_buffer_region(buf);
769 
770 		/* everything within this margin needs to be zeroed out */
771 		memset(head + buf->output_off, 0,
772 		       pt_buffer_region_size(buf) -
773 		       buf->output_off);
774 		advance++;
775 	}
776 
777 	if (advance)
778 		pt_buffer_advance(buf);
779 
780 	wrmsrl(MSR_IA32_RTIT_STATUS, status);
781 }
782 
783 /**
784  * pt_read_offset() - translate registers into buffer pointers
785  * @buf:	PT buffer.
786  *
787  * Set buffer's output pointers from MSR values.
788  */
789 static void pt_read_offset(struct pt_buffer *buf)
790 {
791 	u64 offset, base_topa;
792 
793 	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
794 	buf->cur = phys_to_virt(base_topa);
795 
796 	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
797 	/* offset within current output region */
798 	buf->output_off = offset >> 32;
799 	/* index of current output region within this table */
800 	buf->cur_idx = (offset & 0xffffff80) >> 7;
801 }
802 
803 /**
804  * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
805  * @buf:	PT buffer.
806  * @pg:		Page offset in the buffer.
807  *
808  * When advancing to the next output region (ToPA entry), given a page offset
809  * into the buffer, we need to find the offset of the first page in the next
810  * region.
811  */
812 static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
813 {
814 	struct topa_entry *te = buf->topa_index[pg];
815 
816 	/* one region */
817 	if (buf->first == buf->last && buf->first->last == 1)
818 		return pg;
819 
820 	do {
821 		pg++;
822 		pg &= buf->nr_pages - 1;
823 	} while (buf->topa_index[pg] == te);
824 
825 	return pg;
826 }
827 
828 /**
829  * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
830  * @buf:	PT buffer.
831  * @handle:	Current output handle.
832  *
833  * Place INT and STOP marks to prevent overwriting old data that the consumer
834  * hasn't yet collected and waking up the consumer after a certain fraction of
835  * the buffer has filled up. Only needed and sensible for non-snapshot counters.
836  *
837  * This obviously relies on buf::head to figure out buffer markers, so it has
838  * to be called after pt_buffer_reset_offsets() and before the hardware tracing
839  * is enabled.
840  */
841 static int pt_buffer_reset_markers(struct pt_buffer *buf,
842 				   struct perf_output_handle *handle)
843 
844 {
845 	unsigned long head = local64_read(&buf->head);
846 	unsigned long idx, npages, wakeup;
847 
848 	/* can't stop in the middle of an output region */
849 	if (buf->output_off + handle->size + 1 <
850 	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
851 		return -EINVAL;
852 
853 
854 	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
855 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
856 		return 0;
857 
858 	/* clear STOP and INT from current entry */
859 	buf->topa_index[buf->stop_pos]->stop = 0;
860 	buf->topa_index[buf->stop_pos]->intr = 0;
861 	buf->topa_index[buf->intr_pos]->intr = 0;
862 
863 	/* how many pages till the STOP marker */
864 	npages = handle->size >> PAGE_SHIFT;
865 
866 	/* if it's on a page boundary, fill up one more page */
867 	if (!offset_in_page(head + handle->size + 1))
868 		npages++;
869 
870 	idx = (head >> PAGE_SHIFT) + npages;
871 	idx &= buf->nr_pages - 1;
872 	buf->stop_pos = idx;
873 
874 	wakeup = handle->wakeup >> PAGE_SHIFT;
875 
876 	/* in the worst case, wake up the consumer one page before hard stop */
877 	idx = (head >> PAGE_SHIFT) + npages - 1;
878 	if (idx > wakeup)
879 		idx = wakeup;
880 
881 	idx &= buf->nr_pages - 1;
882 	buf->intr_pos = idx;
883 
884 	buf->topa_index[buf->stop_pos]->stop = 1;
885 	buf->topa_index[buf->stop_pos]->intr = 1;
886 	buf->topa_index[buf->intr_pos]->intr = 1;
887 
888 	return 0;
889 }
890 
891 /**
892  * pt_buffer_setup_topa_index() - build topa_index[] table of regions
893  * @buf:	PT buffer.
894  *
895  * topa_index[] references output regions indexed by offset into the
896  * buffer for purposes of quick reverse lookup.
897  */
898 static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
899 {
900 	struct topa *cur = buf->first, *prev = buf->last;
901 	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
902 		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
903 	int pg = 0, idx = 0;
904 
905 	while (pg < buf->nr_pages) {
906 		int tidx;
907 
908 		/* pages within one topa entry */
909 		for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
910 			buf->topa_index[pg] = te_prev;
911 
912 		te_prev = te_cur;
913 
914 		if (idx == cur->last - 1) {
915 			/* advance to next topa table */
916 			idx = 0;
917 			cur = list_entry(cur->list.next, struct topa, list);
918 		} else {
919 			idx++;
920 		}
921 		te_cur = TOPA_ENTRY(cur, idx);
922 	}
923 
924 }
925 
926 /**
927  * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
928  * @buf:	PT buffer.
929  * @head:	Write pointer (aux_head) from AUX buffer.
930  *
931  * Find the ToPA table and entry corresponding to given @head and set buffer's
932  * "current" pointers accordingly. This is done after we have obtained the
933  * current aux_head position from a successful call to perf_aux_output_begin()
934  * to make sure the hardware is writing to the right place.
935  *
936  * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
937  * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
938  * which are used to determine INT and STOP markers' locations by a subsequent
939  * call to pt_buffer_reset_markers().
940  */
941 static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
942 {
943 	int pg;
944 
945 	if (buf->snapshot)
946 		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
947 
948 	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
949 	pg = pt_topa_next_entry(buf, pg);
950 
951 	buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
952 	buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
953 			(unsigned long)buf->cur) / sizeof(struct topa_entry);
954 	buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
955 
956 	local64_set(&buf->head, head);
957 	local_set(&buf->data_size, 0);
958 }
959 
960 /**
961  * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
962  * @buf:	PT buffer.
963  */
964 static void pt_buffer_fini_topa(struct pt_buffer *buf)
965 {
966 	struct topa *topa, *iter;
967 
968 	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
969 		/*
970 		 * right now, this is in free_aux() path only, so
971 		 * no need to unlink this table from the list
972 		 */
973 		topa_free(topa);
974 	}
975 }
976 
977 /**
978  * pt_buffer_init_topa() - initialize ToPA table for pt buffer
979  * @buf:	PT buffer.
980  * @size:	Total size of all regions within this ToPA.
981  * @gfp:	Allocation flags.
982  */
983 static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
984 			       gfp_t gfp)
985 {
986 	struct topa *topa;
987 	int err;
988 
989 	topa = topa_alloc(buf->cpu, gfp);
990 	if (!topa)
991 		return -ENOMEM;
992 
993 	topa_insert_table(buf, topa);
994 
995 	while (buf->nr_pages < nr_pages) {
996 		err = topa_insert_pages(buf, gfp);
997 		if (err) {
998 			pt_buffer_fini_topa(buf);
999 			return -ENOMEM;
1000 		}
1001 	}
1002 
1003 	pt_buffer_setup_topa_index(buf);
1004 
1005 	/* link last table to the first one, unless we're double buffering */
1006 	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
1007 		TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
1008 		TOPA_ENTRY(buf->last, -1)->end = 1;
1009 	}
1010 
1011 	pt_topa_dump(buf);
1012 	return 0;
1013 }
1014 
1015 /**
1016  * pt_buffer_setup_aux() - set up topa tables for a PT buffer
1017  * @cpu:	Cpu on which to allocate, -1 means current.
1018  * @pages:	Array of pointers to buffer pages passed from perf core.
1019  * @nr_pages:	Number of pages in the buffer.
1020  * @snapshot:	If this is a snapshot/overwrite counter.
1021  *
1022  * This is a pmu::setup_aux callback that sets up ToPA tables and all the
1023  * bookkeeping for an AUX buffer.
1024  *
1025  * Return:	Our private PT buffer structure.
1026  */
1027 static void *
1028 pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
1029 {
1030 	struct pt_buffer *buf;
1031 	int node, ret;
1032 
1033 	if (!nr_pages)
1034 		return NULL;
1035 
1036 	if (cpu == -1)
1037 		cpu = raw_smp_processor_id();
1038 	node = cpu_to_node(cpu);
1039 
1040 	buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
1041 			   GFP_KERNEL, node);
1042 	if (!buf)
1043 		return NULL;
1044 
1045 	buf->cpu = cpu;
1046 	buf->snapshot = snapshot;
1047 	buf->data_pages = pages;
1048 
1049 	INIT_LIST_HEAD(&buf->tables);
1050 
1051 	ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
1052 	if (ret) {
1053 		kfree(buf);
1054 		return NULL;
1055 	}
1056 
1057 	return buf;
1058 }
1059 
/**
 * pt_buffer_free_aux() - perf AUX deallocation path callback
 * @data:	PT buffer.
 *
 * Releases the ToPA tables first and then the buffer structure itself.
 */
static void pt_buffer_free_aux(void *data)
{
	struct pt_buffer *pt_buf = data;

	pt_buffer_fini_topa(pt_buf);
	kfree(pt_buf);
}
1071 
1072 static int pt_addr_filters_init(struct perf_event *event)
1073 {
1074 	struct pt_filters *filters;
1075 	int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
1076 
1077 	if (!pt_cap_get(PT_CAP_num_address_ranges))
1078 		return 0;
1079 
1080 	filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
1081 	if (!filters)
1082 		return -ENOMEM;
1083 
1084 	if (event->parent)
1085 		memcpy(filters, event->parent->hw.addr_filters,
1086 		       sizeof(*filters));
1087 
1088 	event->hw.addr_filters = filters;
1089 
1090 	return 0;
1091 }
1092 
1093 static void pt_addr_filters_fini(struct perf_event *event)
1094 {
1095 	kfree(event->hw.addr_filters);
1096 	event->hw.addr_filters = NULL;
1097 }
1098 
/*
 * An address is accepted only if it passes both virt_addr_valid() and
 * kernel_ip(); kernel_ip() is not consulted when virt_addr_valid() fails.
 */
static inline bool valid_kernel_ip(unsigned long ip)
{
	if (!virt_addr_valid(ip))
		return false;

	return kernel_ip(ip);
}
1103 
1104 static int pt_event_addr_filters_validate(struct list_head *filters)
1105 {
1106 	struct perf_addr_filter *filter;
1107 	int range = 0;
1108 
1109 	list_for_each_entry(filter, filters, entry) {
1110 		/* PT doesn't support single address triggers */
1111 		if (!filter->range || !filter->size)
1112 			return -EOPNOTSUPP;
1113 
1114 		if (!filter->inode) {
1115 			if (!valid_kernel_ip(filter->offset))
1116 				return -EINVAL;
1117 
1118 			if (!valid_kernel_ip(filter->offset + filter->size))
1119 				return -EINVAL;
1120 		}
1121 
1122 		if (++range > pt_cap_get(PT_CAP_num_address_ranges))
1123 			return -EOPNOTSUPP;
1124 	}
1125 
1126 	return 0;
1127 }
1128 
1129 static void pt_event_addr_filters_sync(struct perf_event *event)
1130 {
1131 	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
1132 	unsigned long msr_a, msr_b, *offs = event->addr_filters_offs;
1133 	struct pt_filters *filters = event->hw.addr_filters;
1134 	struct perf_addr_filter *filter;
1135 	int range = 0;
1136 
1137 	if (!filters)
1138 		return;
1139 
1140 	list_for_each_entry(filter, &head->list, entry) {
1141 		if (filter->inode && !offs[range]) {
1142 			msr_a = msr_b = 0;
1143 		} else {
1144 			/* apply the offset */
1145 			msr_a = filter->offset + offs[range];
1146 			msr_b = filter->size + msr_a - 1;
1147 		}
1148 
1149 		filters->filter[range].msr_a  = msr_a;
1150 		filters->filter[range].msr_b  = msr_b;
1151 		filters->filter[range].config = filter->filter ? 1 : 2;
1152 		range++;
1153 	}
1154 
1155 	filters->nr_filters = range;
1156 }
1157 
1158 /**
1159  * intel_pt_interrupt() - PT PMI handler
1160  */
1161 void intel_pt_interrupt(void)
1162 {
1163 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1164 	struct pt_buffer *buf;
1165 	struct perf_event *event = pt->handle.event;
1166 
1167 	/*
1168 	 * There may be a dangling PT bit in the interrupt status register
1169 	 * after PT has been disabled by pt_event_stop(). Make sure we don't
1170 	 * do anything (particularly, re-enable) for this event here.
1171 	 */
1172 	if (!READ_ONCE(pt->handle_nmi))
1173 		return;
1174 
1175 	/*
1176 	 * If VMX is on and PT does not support it, don't touch anything.
1177 	 */
1178 	if (READ_ONCE(pt->vmx_on))
1179 		return;
1180 
1181 	if (!event)
1182 		return;
1183 
1184 	pt_config_stop(event);
1185 
1186 	buf = perf_get_aux(&pt->handle);
1187 	if (!buf)
1188 		return;
1189 
1190 	pt_read_offset(buf);
1191 
1192 	pt_handle_status(pt);
1193 
1194 	pt_update_head(pt);
1195 
1196 	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
1197 			    local_xchg(&buf->lost, 0));
1198 
1199 	if (!event->hw.state) {
1200 		int ret;
1201 
1202 		buf = perf_aux_output_begin(&pt->handle, event);
1203 		if (!buf) {
1204 			event->hw.state = PERF_HES_STOPPED;
1205 			return;
1206 		}
1207 
1208 		pt_buffer_reset_offsets(buf, pt->handle.head);
1209 		/* snapshot counters don't use PMI, so it's safe */
1210 		ret = pt_buffer_reset_markers(buf, &pt->handle);
1211 		if (ret) {
1212 			perf_aux_output_end(&pt->handle, 0, true);
1213 			return;
1214 		}
1215 
1216 		pt_config_buffer(buf->cur->table, buf->cur_idx,
1217 				 buf->output_off);
1218 		pt_config(event);
1219 	}
1220 }
1221 
1222 void intel_pt_handle_vmx(int on)
1223 {
1224 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1225 	struct perf_event *event;
1226 	unsigned long flags;
1227 
1228 	/* PT plays nice with VMX, do nothing */
1229 	if (pt_pmu.vmx)
1230 		return;
1231 
1232 	/*
1233 	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
1234 	 * sure to not try to set it while VMX is on. Disable
1235 	 * interrupts to avoid racing with pmu callbacks;
1236 	 * concurrent PMI should be handled fine.
1237 	 */
1238 	local_irq_save(flags);
1239 	WRITE_ONCE(pt->vmx_on, on);
1240 
1241 	if (on) {
1242 		/* prevent pt_config_stop() from writing RTIT_CTL */
1243 		event = pt->handle.event;
1244 		if (event)
1245 			event->hw.config = 0;
1246 	}
1247 	local_irq_restore(flags);
1248 }
1249 EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
1250 
1251 /*
1252  * PMU callbacks
1253  */
1254 
1255 static void pt_event_start(struct perf_event *event, int mode)
1256 {
1257 	struct hw_perf_event *hwc = &event->hw;
1258 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1259 	struct pt_buffer *buf;
1260 
1261 	if (READ_ONCE(pt->vmx_on))
1262 		return;
1263 
1264 	buf = perf_aux_output_begin(&pt->handle, event);
1265 	if (!buf)
1266 		goto fail_stop;
1267 
1268 	pt_buffer_reset_offsets(buf, pt->handle.head);
1269 	if (!buf->snapshot) {
1270 		if (pt_buffer_reset_markers(buf, &pt->handle))
1271 			goto fail_end_stop;
1272 	}
1273 
1274 	WRITE_ONCE(pt->handle_nmi, 1);
1275 	hwc->state = 0;
1276 
1277 	pt_config_buffer(buf->cur->table, buf->cur_idx,
1278 			 buf->output_off);
1279 	pt_config(event);
1280 
1281 	return;
1282 
1283 fail_end_stop:
1284 	perf_aux_output_end(&pt->handle, 0, true);
1285 fail_stop:
1286 	hwc->state = PERF_HES_STOPPED;
1287 }
1288 
1289 static void pt_event_stop(struct perf_event *event, int mode)
1290 {
1291 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1292 
1293 	/*
1294 	 * Protect against the PMI racing with disabling wrmsr,
1295 	 * see comment in intel_pt_interrupt().
1296 	 */
1297 	WRITE_ONCE(pt->handle_nmi, 0);
1298 
1299 	pt_config_stop(event);
1300 
1301 	if (event->hw.state == PERF_HES_STOPPED)
1302 		return;
1303 
1304 	event->hw.state = PERF_HES_STOPPED;
1305 
1306 	if (mode & PERF_EF_UPDATE) {
1307 		struct pt_buffer *buf = perf_get_aux(&pt->handle);
1308 
1309 		if (!buf)
1310 			return;
1311 
1312 		if (WARN_ON_ONCE(pt->handle.event != event))
1313 			return;
1314 
1315 		pt_read_offset(buf);
1316 
1317 		pt_handle_status(pt);
1318 
1319 		pt_update_head(pt);
1320 
1321 		if (buf->snapshot)
1322 			pt->handle.head =
1323 				local_xchg(&buf->data_size,
1324 					   buf->nr_pages << PAGE_SHIFT);
1325 		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
1326 				    local_xchg(&buf->lost, 0));
1327 	}
1328 }
1329 
1330 static void pt_event_del(struct perf_event *event, int mode)
1331 {
1332 	pt_event_stop(event, PERF_EF_UPDATE);
1333 }
1334 
1335 static int pt_event_add(struct perf_event *event, int mode)
1336 {
1337 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1338 	struct hw_perf_event *hwc = &event->hw;
1339 	int ret = -EBUSY;
1340 
1341 	if (pt->handle.event)
1342 		goto fail;
1343 
1344 	if (mode & PERF_EF_START) {
1345 		pt_event_start(event, 0);
1346 		ret = -EINVAL;
1347 		if (hwc->state == PERF_HES_STOPPED)
1348 			goto fail;
1349 	} else {
1350 		hwc->state = PERF_HES_STOPPED;
1351 	}
1352 
1353 	ret = 0;
1354 fail:
1355 
1356 	return ret;
1357 }
1358 
/* pmu::read callback: intentionally empty, nothing is done for PT events. */
static void pt_event_read(struct perf_event *event)
{
	/* nothing to do */
}
1362 
1363 static void pt_event_destroy(struct perf_event *event)
1364 {
1365 	pt_addr_filters_fini(event);
1366 	x86_del_exclusive(x86_lbr_exclusive_pt);
1367 }
1368 
1369 static int pt_event_init(struct perf_event *event)
1370 {
1371 	if (event->attr.type != pt_pmu.pmu.type)
1372 		return -ENOENT;
1373 
1374 	if (!pt_event_valid(event))
1375 		return -EINVAL;
1376 
1377 	if (x86_add_exclusive(x86_lbr_exclusive_pt))
1378 		return -EBUSY;
1379 
1380 	if (pt_addr_filters_init(event)) {
1381 		x86_del_exclusive(x86_lbr_exclusive_pt);
1382 		return -ENOMEM;
1383 	}
1384 
1385 	event->destroy = pt_event_destroy;
1386 
1387 	return 0;
1388 }
1389 
1390 void cpu_emergency_stop_pt(void)
1391 {
1392 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1393 
1394 	if (pt->handle.event)
1395 		pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
1396 }
1397 
1398 static __init int pt_init(void)
1399 {
1400 	int ret, cpu, prior_warn = 0;
1401 
1402 	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
1403 
1404 	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
1405 		return -ENODEV;
1406 
1407 	get_online_cpus();
1408 	for_each_online_cpu(cpu) {
1409 		u64 ctl;
1410 
1411 		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
1412 		if (!ret && (ctl & RTIT_CTL_TRACEEN))
1413 			prior_warn++;
1414 	}
1415 	put_online_cpus();
1416 
1417 	if (prior_warn) {
1418 		x86_add_exclusive(x86_lbr_exclusive_pt);
1419 		pr_warn("PT is enabled at boot time, doing nothing\n");
1420 
1421 		return -EBUSY;
1422 	}
1423 
1424 	ret = pt_pmu_hw_init();
1425 	if (ret)
1426 		return ret;
1427 
1428 	if (!pt_cap_get(PT_CAP_topa_output)) {
1429 		pr_warn("ToPA output is not supported on this CPU\n");
1430 		return -ENODEV;
1431 	}
1432 
1433 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
1434 		pt_pmu.pmu.capabilities =
1435 			PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
1436 
1437 	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
1438 	pt_pmu.pmu.attr_groups		 = pt_attr_groups;
1439 	pt_pmu.pmu.task_ctx_nr		 = perf_sw_context;
1440 	pt_pmu.pmu.event_init		 = pt_event_init;
1441 	pt_pmu.pmu.add			 = pt_event_add;
1442 	pt_pmu.pmu.del			 = pt_event_del;
1443 	pt_pmu.pmu.start		 = pt_event_start;
1444 	pt_pmu.pmu.stop			 = pt_event_stop;
1445 	pt_pmu.pmu.read			 = pt_event_read;
1446 	pt_pmu.pmu.setup_aux		 = pt_buffer_setup_aux;
1447 	pt_pmu.pmu.free_aux		 = pt_buffer_free_aux;
1448 	pt_pmu.pmu.addr_filters_sync     = pt_event_addr_filters_sync;
1449 	pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
1450 	pt_pmu.pmu.nr_addr_filters       =
1451 		pt_cap_get(PT_CAP_num_address_ranges);
1452 
1453 	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1454 
1455 	return ret;
1456 }
1457 arch_initcall(pt_init);
1458