xref: /linux/arch/x86/events/amd/ibs.c (revision fbf5df34a4dbcd09d433dd4f0916bf9b2ddb16de)
1 /*
2  * Performance events - AMD IBS
3  *
4  *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
5  *
6  *  For licencing details see kernel-base/COPYING
7  */
8 
9 #include <linux/perf_event.h>
10 #include <linux/init.h>
11 #include <linux/export.h>
12 #include <linux/pci.h>
13 #include <linux/ptrace.h>
14 #include <linux/syscore_ops.h>
15 #include <linux/sched/clock.h>
16 
17 #include <asm/apic.h>
18 #include <asm/msr.h>
19 
20 #include "../perf_event.h"
21 
22 static u32 ibs_caps;
23 
24 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
25 
26 #include <linux/kprobes.h>
27 #include <linux/hardirq.h>
28 
29 #include <asm/nmi.h>
30 #include <asm/amd/ibs.h>
31 
/* attr.config2: bit 0 is exposed to user space as the "swfilt" format field */
#define IBS_SW_FILTER_MASK			1

/* attr.config1 (IBS op): bits 0-11 load latency threshold, bit 12 strmst */
#define IBS_OP_CONFIG1_STRMST_SHIFT		12
#define IBS_OP_CONFIG1_STRMST_MASK		(1ULL << IBS_OP_CONFIG1_STRMST_SHIFT)
#define IBS_OP_CONFIG1_LDLAT_MASK		0xFFFULL

/* attr.config1 (IBS fetch): bits 0-10 fetch latency threshold */
#define IBS_FETCH_CONFIG1_FETCHLAT_MASK		0x7FFULL
41 
42 /*
43  * IBS states:
44  *
45  * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken
46  * and any further add()s must fail.
47  *
48  * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
49  * complicated by the fact that the IBS hardware can send late NMIs (ie. after
50  * we've cleared the EN bit).
51  *
52  * In order to consume these late NMIs we have the STOPPED state, any NMI that
53  * happens after we've cleared the EN state will clear this bit and report the
54  * NMI handled (this is fundamentally racy in the face of multiple NMI sources,
55  * someone else can consume our BIT and our NMI will go unhandled).
56  *
57  * And since we cannot set/clear this separate bit together with the EN bit,
58  * there are races; if we cleared STARTED early, an NMI could land in
59  * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
60  * could happen if the period is small enough), and consume our STOPPED bit
61  * and trigger streams of unhandled NMIs.
62  *
63  * If, however, we clear STARTED late, an NMI can hit between clearing the
64  * EN bit and clearing STARTED, still see STARTED set and process the event.
65  * If this event will have the VALID bit clear, we bail properly, but this
66  * is not a given. With VALID set we can end up calling pmu::stop() again
67  * (the throttle logic) and trigger the WARNs in there.
68  *
69  * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
70  * nesting, and clear STARTED late, so that we have a well defined state over
71  * the clearing of the EN bit.
72  *
73  * XXX: we could probably be using !atomic bitops for all this.
74  */
75 
/* Bit positions used with the atomic bitops on cpu_perf_ibs::state. */
enum ibs_states {
	IBS_ENABLED,		/* bit 0: set by pmu::add(), cleared by pmu::del() */
	IBS_STARTED,		/* bit 1 */
	IBS_STOPPING,		/* bit 2 */
	IBS_STOPPED,		/* bit 3 */

	IBS_MAX_STATES,		/* number of state bits, sizes the bitmap */
};
84 
85 struct cpu_perf_ibs {
86 	struct perf_event	*event;
87 	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)];
88 };
89 
90 struct perf_ibs {
91 	struct pmu			pmu;
92 	unsigned int			msr;
93 	unsigned int			msr2;
94 	u64				config_mask;
95 	u64				cnt_mask;
96 	u64				enable_mask;
97 	u64				disable_mask;
98 	u64				valid_mask;
99 	u16				min_period;
100 	u64				max_period;
101 	unsigned long			offset_mask[1];
102 	int				offset_max;
103 	unsigned int			fetch_count_reset_broken : 1;
104 	unsigned int			fetch_ignore_if_zero_rip : 1;
105 	struct cpu_perf_ibs __percpu	*pcpu;
106 
107 	u64				(*get_count)(u64 config);
108 };
109 
110 static int
111 perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
112 {
113 	s64 left = local64_read(&hwc->period_left);
114 	s64 period = hwc->sample_period;
115 	int overflow = 0;
116 
117 	/*
118 	 * If we are way outside a reasonable range then just skip forward:
119 	 */
120 	if (unlikely(left <= -period)) {
121 		left = period;
122 		local64_set(&hwc->period_left, left);
123 		hwc->last_period = period;
124 		overflow = 1;
125 	}
126 
127 	if (unlikely(left < (s64)min)) {
128 		left += period;
129 		local64_set(&hwc->period_left, left);
130 		hwc->last_period = period;
131 		overflow = 1;
132 	}
133 
134 	/*
135 	 * If the hw period that triggers the sw overflow is too short
136 	 * we might hit the irq handler. This biases the results.
137 	 * Thus we shorten the next-to-last period and set the last
138 	 * period to the max period.
139 	 */
140 	if (left > max) {
141 		left -= max;
142 		if (left > max)
143 			left = max;
144 		else if (left < min)
145 			left = min;
146 	}
147 
148 	*hw_period = (u64)left;
149 
150 	return overflow;
151 }
152 
153 static  int
154 perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
155 {
156 	struct hw_perf_event *hwc = &event->hw;
157 	int shift = 64 - width;
158 	u64 prev_raw_count;
159 	u64 delta;
160 
161 	/*
162 	 * Careful: an NMI might modify the previous event value.
163 	 *
164 	 * Our tactic to handle this is to first atomically read and
165 	 * exchange a new raw count - then add that new-prev delta
166 	 * count to the generic event atomically:
167 	 */
168 	prev_raw_count = local64_read(&hwc->prev_count);
169 	if (!local64_try_cmpxchg(&hwc->prev_count,
170 				 &prev_raw_count, new_raw_count))
171 		return 0;
172 
173 	/*
174 	 * Now we have the new raw value and have updated the prev
175 	 * timestamp already. We can now calculate the elapsed delta
176 	 * (event-)time and add that to the generic event.
177 	 *
178 	 * Careful, not all hw sign-extends above the physical width
179 	 * of the count.
180 	 */
181 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
182 	delta >>= shift;
183 
184 	local64_add(delta, &event->count);
185 	local64_sub(delta, &hwc->period_left);
186 
187 	return 1;
188 }
189 
190 static struct perf_ibs perf_ibs_fetch;
191 static struct perf_ibs perf_ibs_op;
192 
193 static struct perf_ibs *get_ibs_pmu(int type)
194 {
195 	if (perf_ibs_fetch.pmu.type == type)
196 		return &perf_ibs_fetch;
197 	if (perf_ibs_op.pmu.type == type)
198 		return &perf_ibs_op;
199 	return NULL;
200 }
201 
202 /*
203  * core pmu config -> IBS config
204  *
205  *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
206  *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
207  *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
208  *
209  * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
210  * MSRC001_1033) is used to select either cycle or micro-ops counting
211  * mode.
212  */
213 static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
214 {
215 	switch (event->attr.type) {
216 	case PERF_TYPE_HARDWARE:
217 		switch (event->attr.config) {
218 		case PERF_COUNT_HW_CPU_CYCLES:
219 			*config = 0;
220 			return 0;
221 		}
222 		break;
223 	case PERF_TYPE_RAW:
224 		switch (event->attr.config) {
225 		case 0x0076:
226 			*config = 0;
227 			return 0;
228 		case 0x00C1:
229 			*config = IBS_OP_CNT_CTL;
230 			return 0;
231 		}
232 		break;
233 	default:
234 		return -ENOENT;
235 	}
236 
237 	return -EOPNOTSUPP;
238 }
239 
240 /*
241  * The rip of IBS samples has skid 0. Thus, IBS supports precise
242  * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
243  * rip is invalid when IBS was not able to record the rip correctly.
244  * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
245  */
246 int forward_event_to_ibs(struct perf_event *event)
247 {
248 	u64 config = 0;
249 
250 	if (!event->attr.precise_ip || event->attr.precise_ip > 2)
251 		return -EOPNOTSUPP;
252 
253 	if (!core_pmu_ibs_config(event, &config)) {
254 		event->attr.type = perf_ibs_op.pmu.type;
255 		event->attr.config = config;
256 	}
257 	return -ENOENT;
258 }
259 
260 /*
261  * Grouping of IBS events is not possible since IBS can have only
262  * one event active at any point in time.
263  */
264 static int validate_group(struct perf_event *event)
265 {
266 	struct perf_event *sibling;
267 
268 	if (event->group_leader == event)
269 		return 0;
270 
271 	if (event->group_leader->pmu == event->pmu)
272 		return -EINVAL;
273 
274 	for_each_sibling_event(sibling, event->group_leader) {
275 		if (sibling->pmu == event->pmu)
276 			return -EINVAL;
277 	}
278 	return 0;
279 }
280 
281 static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs,
282 				 struct perf_event *event)
283 {
284 	return perf_ibs == &perf_ibs_op &&
285 	       (ibs_caps & IBS_CAPS_OPLDLAT) &&
286 	       (event->attr.config1 & IBS_OP_CONFIG1_LDLAT_MASK);
287 }
288 
289 static bool perf_ibs_fetch_lat_event(struct perf_ibs *perf_ibs,
290 				     struct perf_event *event)
291 {
292 	return perf_ibs == &perf_ibs_fetch &&
293 	       (ibs_caps & IBS_CAPS_FETCHLAT) &&
294 	       (event->attr.config1 & IBS_FETCH_CONFIG1_FETCHLAT_MASK);
295 }
296 
297 static bool perf_ibs_strmst_event(struct perf_ibs *perf_ibs,
298 				  struct perf_event *event)
299 {
300 	return perf_ibs == &perf_ibs_op &&
301 	       (ibs_caps & IBS_CAPS_STRMST_RMTSOCKET) &&
302 	       (event->attr.config1 & IBS_OP_CONFIG1_STRMST_MASK);
303 }
304 
305 static int perf_ibs_init(struct perf_event *event)
306 {
307 	struct hw_perf_event *hwc = &event->hw;
308 	struct perf_ibs *perf_ibs;
309 	u64 config;
310 	int ret;
311 
312 	perf_ibs = get_ibs_pmu(event->attr.type);
313 	if (!perf_ibs)
314 		return -ENOENT;
315 
316 	config = event->attr.config;
317 	hwc->extra_reg.config = 0;
318 	hwc->extra_reg.reg = 0;
319 
320 	if (event->pmu != &perf_ibs->pmu)
321 		return -ENOENT;
322 
323 	if (config & ~perf_ibs->config_mask)
324 		return -EINVAL;
325 
326 	if (has_branch_stack(event))
327 		return -EOPNOTSUPP;
328 
329 	/* handle exclude_{user,kernel} in the IRQ handler */
330 	if (event->attr.exclude_host || event->attr.exclude_guest ||
331 	    event->attr.exclude_idle)
332 		return -EINVAL;
333 
334 	ret = validate_group(event);
335 	if (ret)
336 		return ret;
337 
338 	if (perf_allow_kernel())
339 		hwc->flags |= PERF_X86_EVENT_UNPRIVILEGED;
340 
341 	if (ibs_caps & IBS_CAPS_DIS) {
342 		hwc->extra_reg.config &= ~perf_ibs->disable_mask;
343 		hwc->extra_reg.reg = perf_ibs->msr2;
344 	}
345 
346 	if (ibs_caps & IBS_CAPS_BIT63_FILTER) {
347 		if (perf_ibs == &perf_ibs_fetch) {
348 			if (event->attr.exclude_kernel) {
349 				hwc->extra_reg.config |= IBS_FETCH_2_EXCL_RIP_63_EQ_1;
350 				hwc->extra_reg.reg = perf_ibs->msr2;
351 			}
352 			if (event->attr.exclude_user) {
353 				hwc->extra_reg.config |= IBS_FETCH_2_EXCL_RIP_63_EQ_0;
354 				hwc->extra_reg.reg = perf_ibs->msr2;
355 			}
356 		} else {
357 			if (event->attr.exclude_kernel) {
358 				hwc->extra_reg.config |= IBS_OP_2_EXCL_RIP_63_EQ_1;
359 				hwc->extra_reg.reg = perf_ibs->msr2;
360 			}
361 			if (event->attr.exclude_user) {
362 				hwc->extra_reg.config |= IBS_OP_2_EXCL_RIP_63_EQ_0;
363 				hwc->extra_reg.reg = perf_ibs->msr2;
364 			}
365 		}
366 	} else if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
367 		   (event->attr.exclude_kernel || event->attr.exclude_user ||
368 		    event->attr.exclude_hv)) {
369 		return -EINVAL;
370 	}
371 
372 	if (hwc->sample_period) {
373 		if (config & perf_ibs->cnt_mask)
374 			/* raw max_cnt may not be set */
375 			return -EINVAL;
376 
377 		if (event->attr.freq) {
378 			hwc->sample_period = perf_ibs->min_period;
379 		} else {
380 			/* Silently mask off lower nibble. IBS hw mandates it. */
381 			hwc->sample_period &= ~0x0FULL;
382 			if (hwc->sample_period < perf_ibs->min_period)
383 				return -EINVAL;
384 		}
385 	} else {
386 		u64 period = 0;
387 
388 		if (event->attr.freq)
389 			return -EINVAL;
390 
391 		if (perf_ibs == &perf_ibs_op) {
392 			period = (config & IBS_OP_MAX_CNT) << 4;
393 			if (ibs_caps & IBS_CAPS_OPCNTEXT)
394 				period |= config & IBS_OP_MAX_CNT_EXT_MASK;
395 		} else {
396 			period = (config & IBS_FETCH_MAX_CNT) << 4;
397 		}
398 
399 		config &= ~perf_ibs->cnt_mask;
400 		event->attr.sample_period = period;
401 		hwc->sample_period = period;
402 
403 		if (hwc->sample_period < perf_ibs->min_period)
404 			return -EINVAL;
405 	}
406 
407 	if (perf_ibs_ldlat_event(perf_ibs, event)) {
408 		u64 ldlat = event->attr.config1 & IBS_OP_CONFIG1_LDLAT_MASK;
409 
410 		if (ldlat < 128 || ldlat > 2048)
411 			return -EINVAL;
412 		ldlat >>= 7;
413 
414 		config |= (ldlat - 1) << IBS_OP_LDLAT_THRSH_SHIFT;
415 
416 		config |= IBS_OP_LDLAT_EN;
417 		if (cpu_feature_enabled(X86_FEATURE_ZEN5))
418 			config |= IBS_OP_L3MISSONLY;
419 	}
420 
421 	if (perf_ibs_fetch_lat_event(perf_ibs, event)) {
422 		u64 fetchlat = event->attr.config1 & IBS_FETCH_CONFIG1_FETCHLAT_MASK;
423 
424 		if (fetchlat < 128 || fetchlat > 1920)
425 			return -EINVAL;
426 		fetchlat >>= 7;
427 
428 		hwc->extra_reg.reg = perf_ibs->msr2;
429 		hwc->extra_reg.config |= fetchlat << IBS_FETCH_2_FETCHLAT_FILTER_SHIFT;
430 	}
431 
432 	if (perf_ibs_strmst_event(perf_ibs, event)) {
433 		u64 strmst = event->attr.config1 & IBS_OP_CONFIG1_STRMST_MASK;
434 
435 		strmst >>= IBS_OP_CONFIG1_STRMST_SHIFT;
436 
437 		hwc->extra_reg.reg = perf_ibs->msr2;
438 		hwc->extra_reg.config |= strmst << IBS_OP_2_STRM_ST_FILTER_SHIFT;
439 	}
440 
441 	/*
442 	 * If we modify hwc->sample_period, we also need to update
443 	 * hwc->last_period and hwc->period_left.
444 	 */
445 	hwc->last_period = hwc->sample_period;
446 	local64_set(&hwc->period_left, hwc->sample_period);
447 
448 	hwc->config_base = perf_ibs->msr;
449 	hwc->config = config;
450 
451 	return 0;
452 }
453 
454 static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
455 			       struct hw_perf_event *hwc, u64 *period)
456 {
457 	int overflow;
458 
459 	/* ignore lower 4 bits in min count: */
460 	overflow = perf_event_set_period(hwc, perf_ibs->min_period,
461 					 perf_ibs->max_period, period);
462 	local64_set(&hwc->prev_count, 0);
463 
464 	return overflow;
465 }
466 
467 static u64 get_ibs_fetch_count(u64 config)
468 {
469 	union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config;
470 
471 	return fetch_ctl.fetch_cnt << 4;
472 }
473 
474 static u64 get_ibs_op_count(u64 config)
475 {
476 	union ibs_op_ctl op_ctl = (union ibs_op_ctl)config;
477 	u64 count = 0;
478 
479 	/*
480 	 * If the internal 27-bit counter rolled over, the count is MaxCnt
481 	 * and the lower 7 bits of CurCnt are randomized.
482 	 * Otherwise CurCnt has the full 27-bit current counter value.
483 	 */
484 	if (op_ctl.op_val) {
485 		count = op_ctl.opmaxcnt << 4;
486 		if (ibs_caps & IBS_CAPS_OPCNTEXT)
487 			count += op_ctl.opmaxcnt_ext << 20;
488 	} else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
489 		count = op_ctl.opcurcnt;
490 	}
491 
492 	return count;
493 }
494 
495 static void
496 perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
497 		      u64 *config)
498 {
499 	u64 count = perf_ibs->get_count(*config);
500 
501 	/*
502 	 * Set width to 64 since we do not overflow on max width but
503 	 * instead on max count. In perf_ibs_set_period() we clear
504 	 * prev count manually on overflow.
505 	 */
506 	while (!perf_event_try_update(event, count, 64)) {
507 		rdmsrq(event->hw.config_base, *config);
508 		count = perf_ibs->get_count(*config);
509 	}
510 }
511 
512 static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
513 					 struct hw_perf_event *hwc, u64 config)
514 {
515 	u64 tmp = hwc->config | config;
516 
517 	if (perf_ibs->fetch_count_reset_broken)
518 		wrmsrq(hwc->config_base, tmp & ~perf_ibs->enable_mask);
519 
520 	wrmsrq(hwc->config_base, tmp | perf_ibs->enable_mask);
521 
522 	if (hwc->extra_reg.reg)
523 		wrmsrq(hwc->extra_reg.reg, hwc->extra_reg.config);
524 }
525 
526 /*
527  * Erratum #420 Instruction-Based Sampling Engine May Generate
528  * Interrupt that Cannot Be Cleared:
529  *
530  * Must clear counter mask first, then clear the enable bit. See
531  * Revision Guide for AMD Family 10h Processors, Publication #41322.
532  */
533 static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
534 					  struct hw_perf_event *hwc, u64 config)
535 {
536 	if (ibs_caps & IBS_CAPS_DIS) {
537 		wrmsrq(hwc->extra_reg.reg, perf_ibs->disable_mask);
538 		return;
539 	}
540 
541 	config &= ~perf_ibs->cnt_mask;
542 	if (boot_cpu_data.x86 == 0x10)
543 		wrmsrq(hwc->config_base, config);
544 	config &= ~perf_ibs->enable_mask;
545 	wrmsrq(hwc->config_base, config);
546 }
547 
548 /*
549  * We cannot restore the ibs pmu state, so we always need to update
550  * the event while stopping it and then reset the state when starting
551  * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
552  * perf_ibs_start()/perf_ibs_stop() and instead always do it.
553  */
554 static void perf_ibs_start(struct perf_event *event, int flags)
555 {
556 	struct hw_perf_event *hwc = &event->hw;
557 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
558 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
559 	u64 period, config = 0;
560 
561 	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
562 		return;
563 
564 	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
565 	hwc->state = 0;
566 
567 	if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
568 		hwc->sample_period = perf_ibs->min_period;
569 
570 	perf_ibs_set_period(perf_ibs, hwc, &period);
571 	if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
572 		config |= period & IBS_OP_MAX_CNT_EXT_MASK;
573 		period &= ~IBS_OP_MAX_CNT_EXT_MASK;
574 	}
575 	config |= period >> 4;
576 
577 	/*
578 	 * Reset the IBS_{FETCH|OP}_CTL MSR before updating pcpu->state.
579 	 * Doing so prevents a race condition in which an NMI due to other
580 	 * source might accidentally activate the event before we enable
581 	 * it ourselves.
582 	 */
583 	perf_ibs_disable_event(perf_ibs, hwc, 0);
584 
585 	/*
586 	 * Set STARTED before enabling the hardware, such that a subsequent NMI
587 	 * must observe it.
588 	 */
589 	set_bit(IBS_STARTED,    pcpu->state);
590 	clear_bit(IBS_STOPPING, pcpu->state);
591 	perf_ibs_enable_event(perf_ibs, hwc, config);
592 
593 	perf_event_update_userpage(event);
594 }
595 
596 static void perf_ibs_stop(struct perf_event *event, int flags)
597 {
598 	struct hw_perf_event *hwc = &event->hw;
599 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
600 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
601 	u64 config;
602 	int stopping;
603 
604 	if (test_and_set_bit(IBS_STOPPING, pcpu->state))
605 		return;
606 
607 	stopping = test_bit(IBS_STARTED, pcpu->state);
608 
609 	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
610 		return;
611 
612 	rdmsrq(hwc->config_base, config);
613 
614 	if (stopping) {
615 		/*
616 		 * Set STOPPED before disabling the hardware, such that it
617 		 * must be visible to NMIs the moment we clear the EN bit,
618 		 * at which point we can generate an !VALID sample which
619 		 * we need to consume.
620 		 */
621 		set_bit(IBS_STOPPED, pcpu->state);
622 		perf_ibs_disable_event(perf_ibs, hwc, config);
623 		/*
624 		 * Clear STARTED after disabling the hardware; if it were
625 		 * cleared before an NMI hitting after the clear but before
626 		 * clearing the EN bit might think it a spurious NMI and not
627 		 * handle it.
628 		 *
629 		 * Clearing it after, however, creates the problem of the NMI
630 		 * handler seeing STARTED but not having a valid sample.
631 		 */
632 		clear_bit(IBS_STARTED, pcpu->state);
633 		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
634 		hwc->state |= PERF_HES_STOPPED;
635 	}
636 
637 	if (hwc->state & PERF_HES_UPTODATE)
638 		return;
639 
640 	/*
641 	 * Clear valid bit to not count rollovers on update, rollovers
642 	 * are only updated in the irq handler.
643 	 */
644 	config &= ~perf_ibs->valid_mask;
645 
646 	perf_ibs_event_update(perf_ibs, event, &config);
647 	hwc->state |= PERF_HES_UPTODATE;
648 }
649 
650 static int perf_ibs_add(struct perf_event *event, int flags)
651 {
652 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
653 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
654 
655 	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
656 		return -ENOSPC;
657 
658 	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
659 
660 	pcpu->event = event;
661 
662 	if (flags & PERF_EF_START)
663 		perf_ibs_start(event, PERF_EF_RELOAD);
664 
665 	return 0;
666 }
667 
668 static void perf_ibs_del(struct perf_event *event, int flags)
669 {
670 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
671 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
672 
673 	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
674 		return;
675 
676 	perf_ibs_stop(event, PERF_EF_UPDATE);
677 
678 	pcpu->event = NULL;
679 
680 	perf_event_update_userpage(event);
681 }
682 
/* pmu::read() callback: a no-op, nothing is done on read. */
static void perf_ibs_read(struct perf_event *event)
{
}
684 
685 static int perf_ibs_check_period(struct perf_event *event, u64 value)
686 {
687 	struct perf_ibs *perf_ibs;
688 	u64 low_nibble;
689 
690 	if (event->attr.freq)
691 		return 0;
692 
693 	perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
694 	low_nibble = value & 0xFULL;
695 
696 	/*
697 	 * This contradicts with perf_ibs_init() which allows sample period
698 	 * with lower nibble bits set but silently masks them off. Whereas
699 	 * this returns error.
700 	 */
701 	if (low_nibble || value < perf_ibs->min_period)
702 		return -EINVAL;
703 
704 	return 0;
705 }
706 
707 /*
708  * We need to initialize with empty group if all attributes in the
709  * group are dynamic.
710  */
711 static struct attribute *attrs_empty[] = {
712 	NULL,
713 };
714 
715 static struct attribute_group empty_caps_group = {
716 	.name = "caps",
717 	.attrs = attrs_empty,
718 };
719 
720 PMU_FORMAT_ATTR(rand_en,	"config:57");
721 PMU_FORMAT_ATTR(cnt_ctl,	"config:19");
722 PMU_FORMAT_ATTR(swfilt,		"config2:0");
723 PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
724 PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
725 PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11");
726 PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
727 PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1");
728 PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1");
729 PMU_EVENT_ATTR_STRING(fetchlat, ibs_fetch_lat_format, "config1:0-10");
730 PMU_EVENT_ATTR_STRING(fetchlat, ibs_fetch_lat_cap, "1");
731 PMU_EVENT_ATTR_STRING(strmst, ibs_op_strmst_format, "config1:12");
732 PMU_EVENT_ATTR_STRING(strmst, ibs_op_strmst_cap, "1");
733 PMU_EVENT_ATTR_STRING(rmtsocket, ibs_op_rmtsocket_cap, "1");
734 
735 static umode_t
736 zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
737 {
738 	return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
739 }
740 
741 static umode_t
742 ibs_fetch_lat_is_visible(struct kobject *kobj, struct attribute *attr, int i)
743 {
744 	return ibs_caps & IBS_CAPS_FETCHLAT ? attr->mode : 0;
745 }
746 
747 static umode_t
748 ibs_op_strmst_is_visible(struct kobject *kobj, struct attribute *attr, int i)
749 {
750 	return ibs_caps & IBS_CAPS_STRMST_RMTSOCKET ? attr->mode : 0;
751 }
752 
753 static umode_t
754 ibs_op_rmtsocket_is_visible(struct kobject *kobj, struct attribute *attr, int i)
755 {
756 	return ibs_caps & IBS_CAPS_STRMST_RMTSOCKET ? attr->mode : 0;
757 }
758 
759 static umode_t
760 ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i)
761 {
762 	return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0;
763 }
764 
765 static umode_t
766 ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i)
767 {
768 	return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0;
769 }
770 
771 static struct attribute *fetch_attrs[] = {
772 	&format_attr_rand_en.attr,
773 	&format_attr_swfilt.attr,
774 	NULL,
775 };
776 
777 static struct attribute *fetch_l3missonly_attrs[] = {
778 	&fetch_l3missonly.attr.attr,
779 	NULL,
780 };
781 
782 static struct attribute *zen4_ibs_extensions_attrs[] = {
783 	&zen4_ibs_extensions.attr.attr,
784 	NULL,
785 };
786 
787 static struct attribute *ibs_fetch_lat_format_attrs[] = {
788 	&ibs_fetch_lat_format.attr.attr,
789 	NULL,
790 };
791 
792 static struct attribute *ibs_fetch_lat_cap_attrs[] = {
793 	&ibs_fetch_lat_cap.attr.attr,
794 	NULL,
795 };
796 
797 static struct attribute *ibs_op_ldlat_cap_attrs[] = {
798 	&ibs_op_ldlat_cap.attr.attr,
799 	NULL,
800 };
801 
802 static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = {
803 	&ibs_op_dtlb_pgsize_cap.attr.attr,
804 	NULL,
805 };
806 
807 static struct attribute *ibs_op_strmst_cap_attrs[] = {
808 	&ibs_op_strmst_cap.attr.attr,
809 	NULL,
810 };
811 
812 static struct attribute *ibs_op_rmtsocket_cap_attrs[] = {
813 	&ibs_op_rmtsocket_cap.attr.attr,
814 	NULL,
815 };
816 
817 static struct attribute_group group_fetch_formats = {
818 	.name = "format",
819 	.attrs = fetch_attrs,
820 };
821 
822 static struct attribute_group group_fetch_l3missonly = {
823 	.name = "format",
824 	.attrs = fetch_l3missonly_attrs,
825 	.is_visible = zen4_ibs_extensions_is_visible,
826 };
827 
828 static struct attribute_group group_zen4_ibs_extensions = {
829 	.name = "caps",
830 	.attrs = zen4_ibs_extensions_attrs,
831 	.is_visible = zen4_ibs_extensions_is_visible,
832 };
833 
834 static struct attribute_group group_ibs_fetch_lat_cap = {
835 	.name = "caps",
836 	.attrs = ibs_fetch_lat_cap_attrs,
837 	.is_visible = ibs_fetch_lat_is_visible,
838 };
839 
840 static struct attribute_group group_ibs_fetch_lat_format = {
841 	.name = "format",
842 	.attrs = ibs_fetch_lat_format_attrs,
843 	.is_visible = ibs_fetch_lat_is_visible,
844 };
845 
846 static struct attribute_group group_ibs_op_ldlat_cap = {
847 	.name = "caps",
848 	.attrs = ibs_op_ldlat_cap_attrs,
849 	.is_visible = ibs_op_ldlat_is_visible,
850 };
851 
852 static struct attribute_group group_ibs_op_dtlb_pgsize_cap = {
853 	.name = "caps",
854 	.attrs = ibs_op_dtlb_pgsize_cap_attrs,
855 	.is_visible = ibs_op_dtlb_pgsize_is_visible,
856 };
857 
858 static struct attribute_group group_ibs_op_strmst_cap = {
859 	.name = "caps",
860 	.attrs = ibs_op_strmst_cap_attrs,
861 	.is_visible = ibs_op_strmst_is_visible,
862 };
863 
864 static struct attribute_group group_ibs_op_rmtsocket_cap = {
865 	.name = "caps",
866 	.attrs = ibs_op_rmtsocket_cap_attrs,
867 	.is_visible = ibs_op_rmtsocket_is_visible,
868 };
869 
870 static const struct attribute_group *fetch_attr_groups[] = {
871 	&group_fetch_formats,
872 	&empty_caps_group,
873 	NULL,
874 };
875 
876 static const struct attribute_group *fetch_attr_update[] = {
877 	&group_fetch_l3missonly,
878 	&group_zen4_ibs_extensions,
879 	&group_ibs_fetch_lat_cap,
880 	&group_ibs_fetch_lat_format,
881 	NULL,
882 };
883 
884 static umode_t
885 cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
886 {
887 	return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
888 }
889 
890 static struct attribute *op_attrs[] = {
891 	&format_attr_swfilt.attr,
892 	NULL,
893 };
894 
895 static struct attribute *cnt_ctl_attrs[] = {
896 	&format_attr_cnt_ctl.attr,
897 	NULL,
898 };
899 
900 static struct attribute *op_l3missonly_attrs[] = {
901 	&op_l3missonly.attr.attr,
902 	NULL,
903 };
904 
905 static struct attribute_group group_op_formats = {
906 	.name = "format",
907 	.attrs = op_attrs,
908 };
909 
910 static struct attribute *ibs_op_ldlat_format_attrs[] = {
911 	&ibs_op_ldlat_format.attr.attr,
912 	NULL,
913 };
914 
915 static struct attribute *ibs_op_strmst_format_attrs[] = {
916 	&ibs_op_strmst_format.attr.attr,
917 	NULL,
918 };
919 
920 static struct attribute_group group_cnt_ctl = {
921 	.name = "format",
922 	.attrs = cnt_ctl_attrs,
923 	.is_visible = cnt_ctl_is_visible,
924 };
925 
926 static struct attribute_group group_op_l3missonly = {
927 	.name = "format",
928 	.attrs = op_l3missonly_attrs,
929 	.is_visible = zen4_ibs_extensions_is_visible,
930 };
931 
932 static const struct attribute_group *op_attr_groups[] = {
933 	&group_op_formats,
934 	&empty_caps_group,
935 	NULL,
936 };
937 
938 static struct attribute_group group_ibs_op_ldlat_format = {
939 	.name = "format",
940 	.attrs = ibs_op_ldlat_format_attrs,
941 	.is_visible = ibs_op_ldlat_is_visible,
942 };
943 
944 static struct attribute_group group_ibs_op_strmst_format = {
945 	.name = "format",
946 	.attrs = ibs_op_strmst_format_attrs,
947 	.is_visible = ibs_op_strmst_is_visible,
948 };
949 
950 static const struct attribute_group *op_attr_update[] = {
951 	&group_cnt_ctl,
952 	&group_op_l3missonly,
953 	&group_zen4_ibs_extensions,
954 	&group_ibs_op_ldlat_cap,
955 	&group_ibs_op_ldlat_format,
956 	&group_ibs_op_dtlb_pgsize_cap,
957 	&group_ibs_op_strmst_cap,
958 	&group_ibs_op_strmst_format,
959 	&group_ibs_op_rmtsocket_cap,
960 	NULL,
961 };
962 
963 static struct perf_ibs perf_ibs_fetch = {
964 	.pmu = {
965 		.task_ctx_nr	= perf_hw_context,
966 
967 		.event_init	= perf_ibs_init,
968 		.add		= perf_ibs_add,
969 		.del		= perf_ibs_del,
970 		.start		= perf_ibs_start,
971 		.stop		= perf_ibs_stop,
972 		.read		= perf_ibs_read,
973 		.check_period	= perf_ibs_check_period,
974 	},
975 	.msr			= MSR_AMD64_IBSFETCHCTL,
976 	.msr2			= MSR_AMD64_IBSFETCHCTL2,
977 	.config_mask		= IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN,
978 	.cnt_mask		= IBS_FETCH_MAX_CNT,
979 	.enable_mask		= IBS_FETCH_ENABLE,
980 	.valid_mask		= IBS_FETCH_VAL,
981 	.min_period		= 0x10,
982 	.max_period		= IBS_FETCH_MAX_CNT << 4,
983 	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
984 	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
985 
986 	.get_count		= get_ibs_fetch_count,
987 };
988 
989 static struct perf_ibs perf_ibs_op = {
990 	.pmu = {
991 		.task_ctx_nr	= perf_hw_context,
992 
993 		.event_init	= perf_ibs_init,
994 		.add		= perf_ibs_add,
995 		.del		= perf_ibs_del,
996 		.start		= perf_ibs_start,
997 		.stop		= perf_ibs_stop,
998 		.read		= perf_ibs_read,
999 		.check_period	= perf_ibs_check_period,
1000 	},
1001 	.msr			= MSR_AMD64_IBSOPCTL,
1002 	.msr2			= MSR_AMD64_IBSOPCTL2,
1003 	.config_mask		= IBS_OP_MAX_CNT,
1004 	.cnt_mask		= IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
1005 				  IBS_OP_CUR_CNT_RAND,
1006 	.enable_mask		= IBS_OP_ENABLE,
1007 	.valid_mask		= IBS_OP_VAL,
1008 	.min_period		= 0x90,
1009 	.max_period		= IBS_OP_MAX_CNT << 4,
1010 	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
1011 	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
1012 
1013 	.get_count		= get_ibs_op_count,
1014 };
1015 
1016 static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3,
1017 				struct perf_sample_data *data)
1018 {
1019 	union perf_mem_data_src *data_src = &data->data_src;
1020 
1021 	data_src->mem_op = PERF_MEM_OP_NA;
1022 
1023 	if (op_data3->ld_op)
1024 		data_src->mem_op = PERF_MEM_OP_LOAD;
1025 	else if (op_data3->st_op)
1026 		data_src->mem_op = PERF_MEM_OP_STORE;
1027 }
1028 
1029 /*
1030  * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has
1031  * more fine granular DataSrc encodings. Others have coarse.
1032  */
1033 static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
1034 {
1035 	if (ibs_caps & IBS_CAPS_ZEN4)
1036 		return (op_data2->data_src_hi << 3) | op_data2->data_src_lo;
1037 
1038 	return op_data2->data_src_lo;
1039 }
1040 
1041 #define	L(x)		(PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
1042 #define	LN(x)		PERF_MEM_S(LVLNUM, x)
1043 #define	REM		PERF_MEM_S(REMOTE, REMOTE)
1044 #define	HOPS(x)		PERF_MEM_S(HOPS, x)
1045 
1046 static u64 g_data_src[8] = {
1047 	[IBS_DATA_SRC_LOC_CACHE]	  = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
1048 	[IBS_DATA_SRC_DRAM]		  = L(LOC_RAM) | LN(RAM),
1049 	[IBS_DATA_SRC_REM_CACHE]	  = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
1050 	[IBS_DATA_SRC_IO]		  = L(IO) | LN(IO),
1051 };
1052 
/* Coarse DataSrc values for which the remote-node indication applies. */
#define RMT_NODE_BITS			(1 << IBS_DATA_SRC_DRAM)
/*
 * Non-zero if DataSrc value x is one of RMT_NODE_BITS. The argument is
 * parenthesized so that an expression argument is shifted as a whole.
 */
#define RMT_NODE_APPLICABLE(x)		(RMT_NODE_BITS & (1 << (x)))
1055 
1056 static u64 g_zen4_data_src[32] = {
1057 	[IBS_DATA_SRC_EXT_LOC_CACHE]	  = L(L3) | LN(L3),
1058 	[IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
1059 	[IBS_DATA_SRC_EXT_DRAM]		  = L(LOC_RAM) | LN(RAM),
1060 	[IBS_DATA_SRC_EXT_FAR_CCX_CACHE]  = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
1061 	[IBS_DATA_SRC_EXT_PMEM]		  = LN(PMEM),
1062 	[IBS_DATA_SRC_EXT_IO]		  = L(IO) | LN(IO),
1063 	[IBS_DATA_SRC_EXT_EXT_MEM]	  = LN(CXL),
1064 };
1065 
/*
 * Bitmask, indexed by extended DataSrc value, of the encodings for which
 * the RmtNode bit is taken into account. Unsigned shifts keep the macro
 * well defined for every index up to 31, and the argument is
 * parenthesized so that any expression can be passed safely.
 */
#define ZEN4_RMT_NODE_BITS		((1U << IBS_DATA_SRC_EXT_DRAM) | \
					 (1U << IBS_DATA_SRC_EXT_PMEM) | \
					 (1U << IBS_DATA_SRC_EXT_EXT_MEM))
#define ZEN4_RMT_NODE_APPLICABLE(x)	(ZEN4_RMT_NODE_BITS & (1U << (x)))
1070 
1071 static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
1072 				  union ibs_op_data3 *op_data3,
1073 				  struct perf_sample_data *data)
1074 {
1075 	union perf_mem_data_src *data_src = &data->data_src;
1076 	u8 ibs_data_src = perf_ibs_data_src(op_data2);
1077 
1078 	data_src->mem_lvl = 0;
1079 	data_src->mem_lvl_num = 0;
1080 
1081 	/*
1082 	 * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
1083 	 * memory accesses. So, check DcUcMemAcc bit early.
1084 	 */
1085 	if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
1086 		return L(UNC) | LN(UNC);
1087 
1088 	/* L1 Hit */
1089 	if (op_data3->dc_miss == 0)
1090 		return L(L1) | LN(L1);
1091 
1092 	/* L2 Hit */
1093 	if (op_data3->l2_miss == 0) {
1094 		/* Erratum #1293 */
1095 		if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
1096 		    !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
1097 			return L(L2) | LN(L2);
1098 	}
1099 
1100 	/*
1101 	 * OP_DATA2 is valid only for load ops. Skip all checks which
1102 	 * uses OP_DATA2[DataSrc].
1103 	 */
1104 	if (data_src->mem_op != PERF_MEM_OP_LOAD)
1105 		goto check_mab;
1106 
1107 	if (ibs_caps & IBS_CAPS_ZEN4) {
1108 		u64 val = g_zen4_data_src[ibs_data_src];
1109 
1110 		if (!val)
1111 			goto check_mab;
1112 
1113 		/* HOPS_1 because IBS doesn't provide remote socket detail */
1114 		if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
1115 			if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
1116 				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
1117 			else
1118 				val |= REM | HOPS(1);
1119 		}
1120 
1121 		return val;
1122 	} else {
1123 		u64 val = g_data_src[ibs_data_src];
1124 
1125 		if (!val)
1126 			goto check_mab;
1127 
1128 		/* HOPS_1 because IBS doesn't provide remote socket detail */
1129 		if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
1130 			if (ibs_data_src == IBS_DATA_SRC_DRAM)
1131 				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
1132 			else
1133 				val |= REM | HOPS(1);
1134 		}
1135 
1136 		return val;
1137 	}
1138 
1139 check_mab:
1140 	/*
1141 	 * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding
1142 	 * DC misses. However, such data may come from any level in mem
1143 	 * hierarchy. IBS provides detail about both MAB as well as actual
1144 	 * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
1145 	 * MAB only when IBS fails to provide DataSrc.
1146 	 */
1147 	if (op_data3->dc_miss_no_mab_alloc)
1148 		return L(LFB) | LN(LFB);
1149 
1150 	/* Don't set HIT with NA */
1151 	return PERF_MEM_S(LVL, NA) | LN(NA);
1152 }
1153 
1154 static bool perf_ibs_cache_hit_st_valid(void)
1155 {
1156 	/* 0: Uninitialized, 1: Valid, -1: Invalid */
1157 	static int cache_hit_st_valid;
1158 
1159 	if (unlikely(!cache_hit_st_valid)) {
1160 		if (boot_cpu_data.x86 == 0x19 &&
1161 		    (boot_cpu_data.x86_model <= 0xF ||
1162 		    (boot_cpu_data.x86_model >= 0x20 &&
1163 		     boot_cpu_data.x86_model <= 0x5F))) {
1164 			cache_hit_st_valid = -1;
1165 		} else {
1166 			cache_hit_st_valid = 1;
1167 		}
1168 	}
1169 
1170 	return cache_hit_st_valid == 1;
1171 }
1172 
1173 static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2,
1174 				   struct perf_sample_data *data)
1175 {
1176 	union perf_mem_data_src *data_src = &data->data_src;
1177 	u8 ibs_data_src;
1178 
1179 	data_src->mem_snoop = PERF_MEM_SNOOP_NA;
1180 
1181 	if (!perf_ibs_cache_hit_st_valid() ||
1182 	    data_src->mem_op != PERF_MEM_OP_LOAD ||
1183 	    data_src->mem_lvl & PERF_MEM_LVL_L1 ||
1184 	    data_src->mem_lvl & PERF_MEM_LVL_L2 ||
1185 	    op_data2->cache_hit_st)
1186 		return;
1187 
1188 	ibs_data_src = perf_ibs_data_src(op_data2);
1189 
1190 	if (ibs_caps & IBS_CAPS_ZEN4) {
1191 		if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE ||
1192 		    ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE ||
1193 		    ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE)
1194 			data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
1195 	} else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
1196 		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
1197 	}
1198 }
1199 
1200 static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3,
1201 				 struct perf_sample_data *data)
1202 {
1203 	union perf_mem_data_src *data_src = &data->data_src;
1204 
1205 	data_src->mem_dtlb = PERF_MEM_TLB_NA;
1206 
1207 	if (!op_data3->dc_lin_addr_valid)
1208 		return;
1209 
1210 	if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) &&
1211 	    !op_data3->dc_phy_addr_valid)
1212 		return;
1213 
1214 	if (!op_data3->dc_l1tlb_miss) {
1215 		data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT;
1216 		return;
1217 	}
1218 
1219 	if (!op_data3->dc_l2tlb_miss) {
1220 		data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT;
1221 		return;
1222 	}
1223 
1224 	data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS;
1225 }
1226 
1227 static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3,
1228 				  struct perf_sample_data *data)
1229 {
1230 	union perf_mem_data_src *data_src = &data->data_src;
1231 
1232 	data_src->mem_lock = PERF_MEM_LOCK_NA;
1233 
1234 	if (op_data3->dc_locked_op)
1235 		data_src->mem_lock = PERF_MEM_LOCK_LOCKED;
1236 }
1237 
/*
 * Translate an IBS MSR address into its index relative to the first
 * fetch/op control MSR. Be careful: this works only for contiguous MSRs.
 * The argument is parenthesized so that any expression can be passed.
 */
#define ibs_fetch_msr_idx(msr)	((msr) - MSR_AMD64_IBSFETCHCTL)
#define ibs_op_msr_idx(msr)	((msr) - MSR_AMD64_IBSOPCTL)
1241 
1242 static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
1243 				  struct perf_sample_data *data,
1244 				  union ibs_op_data2 *op_data2,
1245 				  union ibs_op_data3 *op_data3)
1246 {
1247 	union perf_mem_data_src *data_src = &data->data_src;
1248 
1249 	data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
1250 	perf_ibs_get_mem_snoop(op_data2, data);
1251 	perf_ibs_get_tlb_lvl(op_data3, data);
1252 	perf_ibs_get_mem_lock(op_data3, data);
1253 }
1254 
1255 static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data,
1256 				   union ibs_op_data3 *op_data3)
1257 {
1258 	__u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)];
1259 
1260 	/* Erratum #1293 */
1261 	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF &&
1262 	    (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
1263 		/*
1264 		 * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode.
1265 		 * DataSrc=0 is 'No valid status' and RmtNode is invalid when
1266 		 * DataSrc=0.
1267 		 */
1268 		val = 0;
1269 	}
1270 	return val;
1271 }
1272 
1273 static void perf_ibs_parse_ld_st_data(__u64 sample_type,
1274 				      struct perf_ibs_data *ibs_data,
1275 				      struct perf_sample_data *data)
1276 {
1277 	union ibs_op_data3 op_data3;
1278 	union ibs_op_data2 op_data2;
1279 	union ibs_op_data op_data;
1280 
1281 	data->data_src.val = PERF_MEM_NA;
1282 	op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
1283 
1284 	perf_ibs_get_mem_op(&op_data3, data);
1285 	if (data->data_src.mem_op != PERF_MEM_OP_LOAD &&
1286 	    data->data_src.mem_op != PERF_MEM_OP_STORE)
1287 		return;
1288 
1289 	op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3);
1290 
1291 	if (sample_type & PERF_SAMPLE_DATA_SRC) {
1292 		perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3);
1293 		data->sample_flags |= PERF_SAMPLE_DATA_SRC;
1294 	}
1295 
1296 	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss &&
1297 	    data->data_src.mem_op == PERF_MEM_OP_LOAD) {
1298 		op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
1299 
1300 		if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
1301 			data->weight.var1_dw = op_data3.dc_miss_lat;
1302 			data->weight.var2_w = op_data.tag_to_ret_ctr;
1303 		} else if (sample_type & PERF_SAMPLE_WEIGHT) {
1304 			data->weight.full = op_data3.dc_miss_lat;
1305 		}
1306 		data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
1307 	}
1308 
1309 	if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) {
1310 		data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
1311 		data->sample_flags |= PERF_SAMPLE_ADDR;
1312 	}
1313 
1314 	if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) {
1315 		data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)];
1316 		data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
1317 	}
1318 }
1319 
1320 static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs,
1321 					struct perf_event *event)
1322 {
1323 	u64 sample_type = event->attr.sample_type;
1324 
1325 	return perf_ibs == &perf_ibs_op &&
1326 	       sample_type & (PERF_SAMPLE_DATA_SRC |
1327 			      PERF_SAMPLE_WEIGHT_TYPE |
1328 			      PERF_SAMPLE_ADDR |
1329 			      PERF_SAMPLE_PHYS_ADDR);
1330 }
1331 
1332 static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs,
1333 				   struct perf_event *event,
1334 				   int check_rip)
1335 {
1336 	if (event->attr.sample_type & PERF_SAMPLE_RAW ||
1337 	    perf_ibs_is_mem_sample_type(perf_ibs, event) ||
1338 	    perf_ibs_ldlat_event(perf_ibs, event) ||
1339 	    perf_ibs_fetch_lat_event(perf_ibs, event))
1340 		return perf_ibs->offset_max;
1341 	else if (check_rip)
1342 		return 3;
1343 	return 1;
1344 }
1345 
1346 static bool perf_ibs_is_kernel_data_addr(struct perf_event *event,
1347 					 struct perf_ibs_data *ibs_data)
1348 {
1349 	u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW;
1350 	union ibs_op_data3 op_data3;
1351 	u64 dc_lin_addr;
1352 
1353 	op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
1354 	dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
1355 
1356 	return unlikely((event->attr.sample_type & sample_type_mask) &&
1357 			op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr));
1358 }
1359 
1360 static bool perf_ibs_is_kernel_br_target(struct perf_event *event,
1361 					 struct perf_ibs_data *ibs_data,
1362 					 int br_target_idx)
1363 {
1364 	union ibs_op_data op_data;
1365 	u64 br_target;
1366 
1367 	op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
1368 	br_target = ibs_data->regs[br_target_idx];
1369 
1370 	return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) &&
1371 			op_data.op_brn_ret && kernel_ip(br_target));
1372 }
1373 
1374 static bool perf_ibs_discard_sample(struct perf_ibs *perf_ibs, struct perf_event *event,
1375 				    struct pt_regs *regs, struct perf_ibs_data *ibs_data,
1376 				    int br_target_idx)
1377 {
1378 	if (perf_exclude_event(event, regs))
1379 		return true;
1380 
1381 	if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel)
1382 		return false;
1383 
1384 	if (perf_ibs_is_kernel_data_addr(event, ibs_data))
1385 		return true;
1386 
1387 	if (br_target_idx != -1 &&
1388 	    perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx))
1389 		return true;
1390 
1391 	return false;
1392 }
1393 
1394 static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs,
1395 				   struct perf_ibs_data *ibs_data)
1396 {
1397 	if (perf_ibs == &perf_ibs_op) {
1398 		ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0;
1399 		return;
1400 	}
1401 
1402 	ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0;
1403 }
1404 
/*
 * Handle a (potential) IBS NMI for one IBS PMU on the current CPU.
 *
 * Reads the IBS control register and, if it holds a valid sample, the
 * further sample registers that the event needs, applies the latency
 * and privilege filters, emits the perf sample and finally re-arms the
 * hardware unless the event got throttled.
 *
 * Return: 1 when the NMI is claimed by this PMU (including a late NMI
 * consumed via the IBS_STOPPED bit), 0 otherwise.
 */
static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
	struct perf_event *event = pcpu->event;
	struct hw_perf_event *hwc;
	struct perf_sample_data data;
	struct perf_raw_record raw;
	struct pt_regs regs;
	struct perf_ibs_data ibs_data;
	int offset, size, check_rip, offset_max, throttle = 0;
	unsigned int msr;
	u64 *buf, *config, period, new_config = 0;
	int br_target_idx = -1;

	if (!test_bit(IBS_STARTED, pcpu->state)) {
fail:
		/*
		 * Catch spurious interrupts after stopping IBS: After
		 * disabling IBS there could be still incoming NMIs
		 * with samples that even have the valid bit cleared.
		 * Mark all this NMIs as handled.
		 */
		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
			return 1;

		return 0;
	}

	if (WARN_ON_ONCE(!event))
		goto fail;

	/* regs[0] receives the control register; bail out if no valid sample. */
	hwc = &event->hw;
	msr = hwc->config_base;
	buf = ibs_data.regs;
	rdmsrq(msr, *buf);
	if (!(*buf++ & perf_ibs->valid_mask))
		goto fail;

	config = &ibs_data.regs[0];
	perf_ibs_event_update(perf_ibs, event, config);
	perf_sample_data_init(&data, 0, hwc->last_period);
	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
		goto out;	/* no sw counter overflow */

	/* size counts the u64 registers captured so far (control register). */
	ibs_data.caps = ibs_caps;
	size = 1;
	offset = 1;
	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));

	offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip);

	/* Read the registers selected by offset_mask, below offset_max. */
	do {
		rdmsrq(msr + offset, *buf++);
		size++;
		offset = find_next_bit(perf_ibs->offset_mask,
				       perf_ibs->offset_max,
				       offset + 1);
	} while (offset < offset_max);

	/* Load latency filter: drop samples that do not qualify. */
	if (perf_ibs_ldlat_event(perf_ibs, event)) {
		union ibs_op_data3 op_data3;

		op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
		/*
		 * Opening event is errored out if load latency threshold is
		 * outside of [128, 2048] range. Since the event has reached
		 * interrupt handler, we can safely assume the threshold is
		 * within [128, 2048] range.
		 */
		if (!op_data3.ld_op || !op_data3.dc_miss ||
		    op_data3.dc_miss_lat <= (event->attr.config1 & IBS_OP_CONFIG1_LDLAT_MASK)) {
			throttle = perf_event_account_interrupt(event);
			goto out;
		}
	}

	/* Fetch latency filter: drop samples below the requested latency. */
	if (perf_ibs_fetch_lat_event(perf_ibs, event)) {
		union ibs_fetch_ctl fetch_ctl;

		fetch_ctl.val = ibs_data.regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)];
		if (fetch_ctl.fetch_lat < (event->attr.config1 & IBS_FETCH_CONFIG1_FETCHLAT_MASK)) {
			throttle = perf_event_account_interrupt(event);
			goto out;
		}
	}

	/*
	 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
	 * depending on their availability.
	 * Can't add to offset_max as they are staggered
	 */
	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
		if (perf_ibs == &perf_ibs_op) {
			if (ibs_caps & IBS_CAPS_BRNTRGT) {
				rdmsrq(MSR_AMD64_IBSBRTARGET, *buf++);
				/* Remember where the branch target landed in regs[]. */
				br_target_idx = size;
				size++;
			}
			if (ibs_caps & IBS_CAPS_OPDATA4) {
				rdmsrq(MSR_AMD64_IBSOPDATA4, *buf++);
				size++;
			}
		}
		if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
			rdmsrq(MSR_AMD64_ICIBSEXTDCTL, *buf++);
			size++;
		}
	}
	ibs_data.size = sizeof(u64) * size;

	/*
	 * Work on a copy of the interrupt regs: the IP is replaced by the
	 * sampled address in regs[1] unless that address is flagged invalid.
	 */
	regs = *iregs;
	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
		regs.flags &= ~PERF_EFLAGS_EXACT;
	} else {
		/* Workaround for erratum #1197 */
		if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1])) {
			throttle = perf_event_account_interrupt(event);
			goto out;
		}

		set_linear_ip(&regs, ibs_data.regs[1]);
		regs.flags |= PERF_EFLAGS_EXACT;
	}

	/* Software privilege filtering, if supported by hw or requested. */
	if (((ibs_caps & IBS_CAPS_BIT63_FILTER) ||
	     (event->attr.config2 & IBS_SW_FILTER_MASK)) &&
	    perf_ibs_discard_sample(perf_ibs, event, &regs, &ibs_data, br_target_idx)) {
		throttle = perf_event_account_interrupt(event);
		goto out;
	}
	/*
	 * Prevent leaking physical addresses to unprivileged users. Skip
	 * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for
	 * unprivileged users.
	 */
	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
	    (hwc->flags & PERF_X86_EVENT_UNPRIVILEGED)) {
		perf_ibs_phyaddr_clear(perf_ibs, &ibs_data);
	}

	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
		raw = (struct perf_raw_record){
			.frag = {
				.size = sizeof(u32) + ibs_data.size,
				.data = ibs_data.data,
			},
		};
		perf_sample_save_raw_data(&data, event, &raw);
	}

	if (perf_ibs == &perf_ibs_op)
		perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data);

	/*
	 * rip recorded by IbsOpRip will not be consistent with rsp and rbp
	 * recorded as part of interrupt regs. Thus we need to use rip from
	 * interrupt regs while unwinding call stack.
	 */
	perf_sample_save_callchain(&data, event, iregs);

	throttle = perf_event_overflow(event, &data, &regs);

	/* In frequency mode, never let the period drop below the minimum. */
	if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
		hwc->sample_period = perf_ibs->min_period;

out:
	/* Re-arm the hardware with the new period unless throttled. */
	if (!throttle) {
		if (ibs_caps & IBS_CAPS_DIS)
			wrmsrq(hwc->extra_reg.reg, perf_ibs->disable_mask);

		if (perf_ibs == &perf_ibs_op) {
			if (ibs_caps & IBS_CAPS_OPCNTEXT) {
				new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
				period &= ~IBS_OP_MAX_CNT_EXT_MASK;
			}
			if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
				new_config |= *config & IBS_OP_CUR_CNT_RAND;
		}
		new_config |= period >> 4;

		perf_ibs_enable_event(perf_ibs, hwc, new_config);
	}

	perf_event_update_userpage(event);

	return 1;
}
1592 
1593 static int
1594 perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1595 {
1596 	u64 stamp = sched_clock();
1597 	int handled = 0;
1598 
1599 	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
1600 	handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
1601 
1602 	if (handled)
1603 		inc_irq_stat(apic_perf_irqs);
1604 
1605 	perf_sample_event_took(sched_clock() - stamp);
1606 
1607 	return handled;
1608 }
1609 NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
1610 
1611 static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
1612 {
1613 	struct cpu_perf_ibs __percpu *pcpu;
1614 	int ret;
1615 
1616 	pcpu = alloc_percpu(struct cpu_perf_ibs);
1617 	if (!pcpu)
1618 		return -ENOMEM;
1619 
1620 	perf_ibs->pcpu = pcpu;
1621 
1622 	ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
1623 	if (ret) {
1624 		perf_ibs->pcpu = NULL;
1625 		free_percpu(pcpu);
1626 	}
1627 
1628 	return ret;
1629 }
1630 
1631 static __init int perf_ibs_fetch_init(void)
1632 {
1633 	/*
1634 	 * Some chips fail to reset the fetch count when it is written; instead
1635 	 * they need a 0-1 transition of IbsFetchEn.
1636 	 */
1637 	if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
1638 		perf_ibs_fetch.fetch_count_reset_broken = 1;
1639 
1640 	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
1641 		perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
1642 
1643 	if (ibs_caps & IBS_CAPS_ZEN4)
1644 		perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;
1645 
1646 	if (ibs_caps & IBS_CAPS_DIS)
1647 		perf_ibs_fetch.disable_mask = IBS_FETCH_2_DIS;
1648 
1649 	perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
1650 	perf_ibs_fetch.pmu.attr_update = fetch_attr_update;
1651 
1652 	return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
1653 }
1654 
1655 static __init int perf_ibs_op_init(void)
1656 {
1657 	if (ibs_caps & IBS_CAPS_OPCNT)
1658 		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
1659 
1660 	if (ibs_caps & IBS_CAPS_OPCNTEXT) {
1661 		perf_ibs_op.max_period  |= IBS_OP_MAX_CNT_EXT_MASK;
1662 		perf_ibs_op.config_mask	|= IBS_OP_MAX_CNT_EXT_MASK;
1663 		perf_ibs_op.cnt_mask    |= (IBS_OP_MAX_CNT_EXT_MASK |
1664 					    IBS_OP_CUR_CNT_EXT_MASK);
1665 	}
1666 
1667 	if (ibs_caps & IBS_CAPS_ZEN4)
1668 		perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
1669 
1670 	if (ibs_caps & IBS_CAPS_DIS)
1671 		perf_ibs_op.disable_mask = IBS_OP_2_DIS;
1672 
1673 	perf_ibs_op.pmu.attr_groups = op_attr_groups;
1674 	perf_ibs_op.pmu.attr_update = op_attr_update;
1675 
1676 	return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
1677 }
1678 
1679 static __init int perf_event_ibs_init(void)
1680 {
1681 	int ret;
1682 
1683 	ret = perf_ibs_fetch_init();
1684 	if (ret)
1685 		return ret;
1686 
1687 	ret = perf_ibs_op_init();
1688 	if (ret)
1689 		goto err_op;
1690 
1691 	ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
1692 	if (ret)
1693 		goto err_nmi;
1694 
1695 	pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
1696 	return 0;
1697 
1698 err_nmi:
1699 	perf_pmu_unregister(&perf_ibs_op.pmu);
1700 	free_percpu(perf_ibs_op.pcpu);
1701 	perf_ibs_op.pcpu = NULL;
1702 err_op:
1703 	perf_pmu_unregister(&perf_ibs_fetch.pmu);
1704 	free_percpu(perf_ibs_fetch.pcpu);
1705 	perf_ibs_fetch.pcpu = NULL;
1706 
1707 	return ret;
1708 }
1709 
1710 #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
1711 
/*
 * Stub used when perf events or AMD CPU support are not configured:
 * there is nothing to register, so report success.
 */
static __init int perf_event_ibs_init(void)
{
	return 0;
}
1716 
1717 #endif
1718 
1719 /* IBS - apic initialization, for perf and oprofile */
1720 
1721 static __init u32 __get_ibs_caps(void)
1722 {
1723 	u32 caps;
1724 	unsigned int max_level;
1725 
1726 	if (!boot_cpu_has(X86_FEATURE_IBS))
1727 		return 0;
1728 
1729 	/* check IBS cpuid feature flags */
1730 	max_level = cpuid_eax(0x80000000);
1731 	if (max_level < IBS_CPUID_FEATURES)
1732 		return IBS_CAPS_DEFAULT;
1733 
1734 	caps = cpuid_eax(IBS_CPUID_FEATURES);
1735 	if (!(caps & IBS_CAPS_AVAIL))
1736 		/* cpuid flags not valid */
1737 		return IBS_CAPS_DEFAULT;
1738 
1739 	return caps;
1740 }
1741 
/*
 * Return the IBS capability bits stored in ibs_caps by amd_ibs_init();
 * the value is 0 as long as that initialization has not stored anything.
 */
u32 get_ibs_caps(void)
{
	return ibs_caps;
}

EXPORT_SYMBOL(get_ibs_caps);
1748 
1749 static inline int get_eilvt(int offset)
1750 {
1751 	return !setup_APIC_eilvt(offset, 0, APIC_DELIVERY_MODE_NMI, 1);
1752 }
1753 
/*
 * Give the extended LVT entry at @offset back by reprogramming it with
 * an all-zero vector/delivery mode (masked). Returns non-zero when
 * setup_APIC_eilvt() reported success (0), and 0 otherwise.
 */
static inline int put_eilvt(int offset)
{
	int err = setup_APIC_eilvt(offset, 0, 0, 1);

	return !err;
}
1758 
1759 /*
1760  * Check and reserve APIC extended interrupt LVT offset for IBS if available.
1761  */
1762 static inline int ibs_eilvt_valid(void)
1763 {
1764 	int offset;
1765 	u64 val;
1766 	int valid = 0;
1767 
1768 	preempt_disable();
1769 
1770 	rdmsrq(MSR_AMD64_IBSCTL, val);
1771 	offset = val & IBSCTL_LVT_OFFSET_MASK;
1772 
1773 	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
1774 		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
1775 		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
1776 		goto out;
1777 	}
1778 
1779 	if (!get_eilvt(offset)) {
1780 		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
1781 		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
1782 		goto out;
1783 	}
1784 
1785 	valid = 1;
1786 out:
1787 	preempt_enable();
1788 
1789 	return valid;
1790 }
1791 
1792 static int setup_ibs_ctl(int ibs_eilvt_off)
1793 {
1794 	struct pci_dev *cpu_cfg;
1795 	int nodes;
1796 	u32 value = 0;
1797 
1798 	nodes = 0;
1799 	cpu_cfg = NULL;
1800 	do {
1801 		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
1802 					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
1803 					 cpu_cfg);
1804 		if (!cpu_cfg)
1805 			break;
1806 		++nodes;
1807 		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
1808 				       | IBSCTL_LVT_OFFSET_VALID);
1809 		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
1810 		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
1811 			pci_dev_put(cpu_cfg);
1812 			pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
1813 				 value);
1814 			return -EINVAL;
1815 		}
1816 	} while (1);
1817 
1818 	if (!nodes) {
1819 		pr_debug("No CPU node configured for IBS\n");
1820 		return -ENODEV;
1821 	}
1822 
1823 	return 0;
1824 }
1825 
1826 /*
1827  * This runs only on the current cpu. We try to find an LVT offset and
1828  * setup the local APIC. For this we must disable preemption. On
1829  * success we initialize all nodes with this offset. This updates then
1830  * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
1831  * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
1832  * is using the new offset.
1833  */
1834 static void force_ibs_eilvt_setup(void)
1835 {
1836 	int offset;
1837 	int ret;
1838 
1839 	preempt_disable();
1840 	/* find the next free available EILVT entry, skip offset 0 */
1841 	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
1842 		if (get_eilvt(offset))
1843 			break;
1844 	}
1845 	preempt_enable();
1846 
1847 	if (offset == APIC_EILVT_NR_MAX) {
1848 		pr_debug("No EILVT entry available\n");
1849 		return;
1850 	}
1851 
1852 	ret = setup_ibs_ctl(offset);
1853 	if (ret)
1854 		goto out;
1855 
1856 	if (!ibs_eilvt_valid())
1857 		goto out;
1858 
1859 	pr_info("LVT offset %d assigned\n", offset);
1860 
1861 	return;
1862 out:
1863 	preempt_disable();
1864 	put_eilvt(offset);
1865 	preempt_enable();
1866 	return;
1867 }
1868 
1869 static void ibs_eilvt_setup(void)
1870 {
1871 	/*
1872 	 * Force LVT offset assignment for family 10h: The offsets are
1873 	 * not assigned by the BIOS for this family, so the OS is
1874 	 * responsible for doing it. If the OS assignment fails, fall
1875 	 * back to BIOS settings and try to setup this.
1876 	 */
1877 	if (boot_cpu_data.x86 == 0x10)
1878 		force_ibs_eilvt_setup();
1879 }
1880 
1881 static inline int get_ibs_lvt_offset(void)
1882 {
1883 	u64 val;
1884 
1885 	rdmsrq(MSR_AMD64_IBSCTL, val);
1886 	if (!(val & IBSCTL_LVT_OFFSET_VALID))
1887 		return -EINVAL;
1888 
1889 	return val & IBSCTL_LVT_OFFSET_MASK;
1890 }
1891 
1892 static void setup_APIC_ibs(void)
1893 {
1894 	int offset;
1895 
1896 	offset = get_ibs_lvt_offset();
1897 	if (offset < 0)
1898 		goto failed;
1899 
1900 	if (!setup_APIC_eilvt(offset, 0, APIC_DELIVERY_MODE_NMI, 0))
1901 		return;
1902 failed:
1903 	pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
1904 		smp_processor_id());
1905 }
1906 
1907 static void clear_APIC_ibs(void)
1908 {
1909 	int offset;
1910 
1911 	offset = get_ibs_lvt_offset();
1912 	if (offset >= 0)
1913 		setup_APIC_eilvt(offset, 0, APIC_DELIVERY_MODE_FIXED, 1);
1914 }
1915 
1916 static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
1917 {
1918 	setup_APIC_ibs();
1919 
1920 	if (ibs_caps & IBS_CAPS_DIS) {
1921 		/*
1922 		 * IBS enable sequence:
1923 		 *   CTL[En] = 1;
1924 		 *   CTL2[Dis] = 0;
1925 		 *
1926 		 * IBS disable sequence:
1927 		 *   CTL2[Dis] = 1;
1928 		 *
1929 		 * Set CTL2[Dis] when CPU comes up. This is needed to make
1930 		 * enable sequence effective.
1931 		 */
1932 		wrmsrq(MSR_AMD64_IBSFETCHCTL2, IBS_FETCH_2_DIS);
1933 		wrmsrq(MSR_AMD64_IBSOPCTL2, IBS_OP_2_DIS);
1934 	}
1935 
1936 	return 0;
1937 }
1938 
1939 #ifdef CONFIG_PM
1940 
/*
 * Syscore suspend hook: tear down the IBS APIC entry of the current
 * CPU. @data is unused. Always returns 0.
 */
static int perf_ibs_suspend(void *data)
{
	clear_APIC_ibs();
	return 0;
}
1946 
/*
 * Syscore resume hook: redo the LVT offset assignment (family 10h only,
 * see ibs_eilvt_setup()) and reprogram the IBS APIC entry of the current
 * CPU. @data is unused.
 */
static void perf_ibs_resume(void *data)
{
	ibs_eilvt_setup();
	setup_APIC_ibs();
}
1952 
/* Suspend/resume callbacks for the IBS APIC setup. */
static const struct syscore_ops perf_ibs_syscore_ops = {
	.resume		= perf_ibs_resume,
	.suspend	= perf_ibs_suspend,
};

/* Syscore object handed to register_syscore() by perf_ibs_pm_init(). */
static struct syscore perf_ibs_syscore = {
	.ops = &perf_ibs_syscore_ops,
};
1961 
/* Register the IBS suspend/resume callbacks with the syscore framework. */
static void perf_ibs_pm_init(void)
{
	register_syscore(&perf_ibs_syscore);
}
1966 
1967 #else
1968 
/* Without CONFIG_PM there are no suspend/resume callbacks to register. */
static inline void perf_ibs_pm_init(void) { }
1970 
1971 #endif
1972 
/*
 * CPU hotplug "dying" callback: tear down the IBS APIC entry of the
 * outgoing CPU. Always returns 0.
 */
static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
{
	clear_APIC_ibs();
	return 0;
}
1978 
1979 static __init int amd_ibs_init(void)
1980 {
1981 	u32 caps;
1982 
1983 	caps = __get_ibs_caps();
1984 	if (!caps)
1985 		return -ENODEV;	/* ibs not supported by the cpu */
1986 
1987 	ibs_eilvt_setup();
1988 
1989 	if (!ibs_eilvt_valid())
1990 		return -EINVAL;
1991 
1992 	perf_ibs_pm_init();
1993 
1994 #ifdef CONFIG_X86_32
1995 	/*
1996 	 * IBS_CAPS_BIT63_FILTER is used for exclude_kernel/user filtering,
1997 	 * which obviously won't work for 32 bit kernel.
1998 	 */
1999 	caps &= ~IBS_CAPS_BIT63_FILTER;
2000 #endif
2001 
2002 	ibs_caps = caps;
2003 	/* make ibs_caps visible to other cpus: */
2004 	smp_mb();
2005 	/*
2006 	 * x86_pmu_amd_ibs_starting_cpu will be called from core on
2007 	 * all online cpus.
2008 	 */
2009 	cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
2010 			  "perf/x86/amd/ibs:starting",
2011 			  x86_pmu_amd_ibs_starting_cpu,
2012 			  x86_pmu_amd_ibs_dying_cpu);
2013 
2014 	return perf_event_ibs_init();
2015 }
2016 
2017 /* Since we need the pci subsystem to init ibs we can't do this earlier: */
2018 device_initcall(amd_ibs_init);
2019