xref: /linux/arch/x86/events/rapl.c (revision c94cd9508b1335b949fd13ebd269313c65492df0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Support Intel/AMD RAPL energy consumption counters
4  * Copyright (C) 2013 Google, Inc., Stephane Eranian
5  *
6  * Intel RAPL interface is specified in the IA-32 Manual Vol3b
7  * section 14.7.1 (September 2013)
8  *
9  * AMD RAPL interface for Fam17h is described in the public PPR:
10  * https://bugzilla.kernel.org/show_bug.cgi?id=206537
11  *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the energy consumption free running
 * counters (pp0, pkg, dram, gpu, psys).
15  *
16  * Each of those counters increments in a power unit defined by the
17  * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
18  * but it can vary.
19  *
20  * Counter to rapl events mappings:
21  *
22  *  pp0 counter: consumption of all physical cores (power plane 0)
23  * 	  event: rapl_energy_cores
24  *    perf code: 0x1
25  *
26  *  pkg counter: consumption of the whole processor package
27  *	  event: rapl_energy_pkg
28  *    perf code: 0x2
29  *
30  * dram counter: consumption of the dram domain (servers only)
31  *	  event: rapl_energy_dram
32  *    perf code: 0x3
33  *
34  * gpu counter: consumption of the builtin-gpu domain (client only)
35  *	  event: rapl_energy_gpu
36  *    perf code: 0x4
37  *
38  *  psys counter: consumption of the builtin-psys domain (client only)
39  *	  event: rapl_energy_psys
40  *    perf code: 0x5
41  *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
44  *
45  * The events only support system-wide mode counting. There is no
46  * sampling support because it does not make sense and is not
47  * supported by the RAPL hardware.
48  *
49  * Because we want to avoid floating-point operations in the kernel,
50  * the events are all reported in fixed point arithmetic (32.32).
51  * Tools must adjust the counts to convert them to Watts using
52  * the duration of the measurement. Tools may use a function such as
53  * ldexp(raw_count, -32);
54  */
55 
56 #define pr_fmt(fmt) "RAPL PMU: " fmt
57 
58 #include <linux/module.h>
59 #include <linux/slab.h>
60 #include <linux/perf_event.h>
61 #include <linux/nospec.h>
62 #include <asm/cpu_device_id.h>
63 #include <asm/intel-family.h>
64 #include "perf_event.h"
65 #include "probe.h"
66 
/* Module metadata. */
MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters");
MODULE_LICENSE("GPL");
69 
/*
 * RAPL energy status counters.
 *
 * Each value is an index into the per-domain tables of this file
 * (rapl_hw_unit[], rapl_domain_names[], the perf_msr tables) and a bit
 * position in rapl_cntr_mask. The perf event code passed in attr->config
 * is the index plus one.
 */
enum perf_rapl_events {
	PERF_RAPL_PP0	= 0,	/* all cores */
	PERF_RAPL_PKG	= 1,	/* entire package */
	PERF_RAPL_RAM	= 2,	/* DRAM */
	PERF_RAPL_PP1	= 3,	/* gpu */
	PERF_RAPL_PSYS	= 4,	/* psys */

	PERF_RAPL_MAX	= 5,	/* number of counters above */
	NR_RAPL_DOMAINS	= PERF_RAPL_MAX,
};
83 
84 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
85 	"pp0-core",
86 	"package",
87 	"dram",
88 	"pp1-gpu",
89 	"psys",
90 };
91 
/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
/* Shift applied to raw counter values when computing deltas (see rapl_event_update()) */
#define RAPL_CNTR_WIDTH 32
98 
/*
 * Define a static perf_pmu_events_attr named event_attr_<v>: a read-only
 * (0444) sysfs attribute called <_name>, shown through
 * perf_event_sysfs_show(), whose event string is <str>.
 */
#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};
105 
/*
 * RAPL Package energy counter scope:
 * 1. AMD/HYGON platforms have a per-PKG package energy counter
 * 2. For Intel platforms
 *	2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
 *	2.2. Other Intel platforms are single die systems so the scope can be
 *	     considered as either pkg-scope or die-scope, and we are considering
 *	     them as die-scope.
 *
 * Evaluates to true when the boot CPU vendor is AMD or Hygon, i.e. when the
 * per-package topology helpers must be used instead of the per-die ones.
 */
#define rapl_pmu_is_pkg_scope()				\
	(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||	\
	 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
118 
/*
 * Per-domain (package or die) RAPL PMU state. One instance is allocated by
 * rapl_cpu_online() for each index returned by get_rapl_pmu_idx().
 */
struct rapl_pmu {
	raw_spinlock_t		lock;		/* held while changing n_active/active_list */
	int			n_active;	/* number of started events */
	int			cpu;		/* CPU collecting events for this domain, -1 if none */
	struct list_head	active_list;	/* started events, walked by the hrtimer handler */
	struct pmu		*pmu;		/* points at rapl_pmus->pmu */
	ktime_t			timer_interval;	/* hrtimer period, derived from rapl_timer_ms */
	struct hrtimer		hrtimer;	/* periodic update of all active events */
};
128 
/*
 * Container for the single registered perf PMU and the array of per-domain
 * rapl_pmu pointers, indexed by get_rapl_pmu_idx(). Entries stay NULL until
 * a CPU of that domain comes online.
 */
struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		nr_rapl_pmu;	/* number of slots in pmus[] */
	struct rapl_pmu		*pmus[] __counted_by(nr_rapl_pmu);
};
134 
/*
 * Model-specific exceptions to "every domain uses the unit from the power
 * unit MSR"; applied by rapl_check_hw_unit().
 */
enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE		= 0,	/* no override */
	RAPL_UNIT_QUIRK_INTEL_HSW	= 1,	/* fixed DRAM energy unit */
	RAPL_UNIT_QUIRK_INTEL_SPR	= 2,	/* fixed Psys energy unit */
};
140 
/* Per-CPU-model description, attached to rapl_model_match[] entries. */
struct rapl_model {
	struct perf_msr *rapl_msrs;		/* MSR table, indexed by enum perf_rapl_events */
	unsigned long	events;			/* bitmask of BIT(PERF_RAPL_*) domains to probe */
	unsigned int	msr_power_unit;		/* MSR read by rapl_check_hw_unit() */
	enum rapl_unit_quirk	unit_quirk;	/* fixed-unit override, if any */
};
147 
/* Per-domain hardware energy unit: one count is 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
/* The registered PMU plus its per-domain state, allocated by init_rapl_pmus() */
static struct rapl_pmus *rapl_pmus;
/* CPUs currently designated to collect RAPL events (at most one per domain) */
static cpumask_t rapl_cpu_mask;
/* Bitmask of available counters (bit index: enum perf_rapl_events), from perf_msr_probe() */
static unsigned int rapl_cntr_mask;
/* Period of the update hrtimer in milliseconds, set by rapl_check_hw_unit() */
static u64 rapl_timer_ms;
/* MSR table of the matched CPU model */
static struct perf_msr *rapl_msrs;
155 
/*
 * Helper functions to get the correct topology macros according to the
 * RAPL PMU scope.
 */

/*
 * Return the index of @cpu's RAPL domain: the logical package id on
 * package-scope platforms, the logical die id otherwise.
 */
static inline unsigned int get_rapl_pmu_idx(int cpu)
{
	if (rapl_pmu_is_pkg_scope())
		return topology_logical_package_id(cpu);

	return topology_logical_die_id(cpu);
}
165 
/*
 * Return the mask of CPUs sharing @cpu's RAPL domain, using the package-level
 * or die-level topology mask depending on the RAPL PMU scope.
 */
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
{
	if (rapl_pmu_is_pkg_scope())
		return topology_core_cpumask(cpu);

	return topology_die_cpumask(cpu);
}
171 
172 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
173 {
174 	unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
175 
176 	/*
177 	 * The unsigned check also catches the '-1' return value for non
178 	 * existent mappings in the topology map.
179 	 */
180 	return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
181 }
182 
183 static inline u64 rapl_read_counter(struct perf_event *event)
184 {
185 	u64 raw;
186 	rdmsrl(event->hw.event_base, raw);
187 	return raw;
188 }
189 
190 static inline u64 rapl_scale(u64 v, int cfg)
191 {
192 	if (cfg > NR_RAPL_DOMAINS) {
193 		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
194 		return v;
195 	}
196 	/*
197 	 * scale delta to smallest unit (1/2^32)
198 	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
199 	 * or use ldexp(count, -32).
200 	 * Watts = Joules/Time delta
201 	 */
202 	return v << (32 - rapl_hw_unit[cfg - 1]);
203 }
204 
205 static u64 rapl_event_update(struct perf_event *event)
206 {
207 	struct hw_perf_event *hwc = &event->hw;
208 	u64 prev_raw_count, new_raw_count;
209 	s64 delta, sdelta;
210 	int shift = RAPL_CNTR_WIDTH;
211 
212 	prev_raw_count = local64_read(&hwc->prev_count);
213 	do {
214 		rdmsrl(event->hw.event_base, new_raw_count);
215 	} while (!local64_try_cmpxchg(&hwc->prev_count,
216 				      &prev_raw_count, new_raw_count));
217 
218 	/*
219 	 * Now we have the new raw value and have updated the prev
220 	 * timestamp already. We can now calculate the elapsed delta
221 	 * (event-)time and add that to the generic event.
222 	 *
223 	 * Careful, not all hw sign-extends above the physical width
224 	 * of the count.
225 	 */
226 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
227 	delta >>= shift;
228 
229 	sdelta = rapl_scale(delta, event->hw.config);
230 
231 	local64_add(sdelta, &event->count);
232 
233 	return new_raw_count;
234 }
235 
236 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
237 {
238        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
239 		     HRTIMER_MODE_REL_PINNED);
240 }
241 
242 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
243 {
244 	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
245 	struct perf_event *event;
246 	unsigned long flags;
247 
248 	if (!pmu->n_active)
249 		return HRTIMER_NORESTART;
250 
251 	raw_spin_lock_irqsave(&pmu->lock, flags);
252 
253 	list_for_each_entry(event, &pmu->active_list, active_entry)
254 		rapl_event_update(event);
255 
256 	raw_spin_unlock_irqrestore(&pmu->lock, flags);
257 
258 	hrtimer_forward_now(hrtimer, pmu->timer_interval);
259 
260 	return HRTIMER_RESTART;
261 }
262 
263 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
264 {
265 	struct hrtimer *hr = &pmu->hrtimer;
266 
267 	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
268 	hr->function = rapl_hrtimer_handle;
269 }
270 
271 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
272 				   struct perf_event *event)
273 {
274 	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
275 		return;
276 
277 	event->hw.state = 0;
278 
279 	list_add_tail(&event->active_entry, &pmu->active_list);
280 
281 	local64_set(&event->hw.prev_count, rapl_read_counter(event));
282 
283 	pmu->n_active++;
284 	if (pmu->n_active == 1)
285 		rapl_start_hrtimer(pmu);
286 }
287 
288 static void rapl_pmu_event_start(struct perf_event *event, int mode)
289 {
290 	struct rapl_pmu *pmu = event->pmu_private;
291 	unsigned long flags;
292 
293 	raw_spin_lock_irqsave(&pmu->lock, flags);
294 	__rapl_pmu_event_start(pmu, event);
295 	raw_spin_unlock_irqrestore(&pmu->lock, flags);
296 }
297 
298 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
299 {
300 	struct rapl_pmu *pmu = event->pmu_private;
301 	struct hw_perf_event *hwc = &event->hw;
302 	unsigned long flags;
303 
304 	raw_spin_lock_irqsave(&pmu->lock, flags);
305 
306 	/* mark event as deactivated and stopped */
307 	if (!(hwc->state & PERF_HES_STOPPED)) {
308 		WARN_ON_ONCE(pmu->n_active <= 0);
309 		pmu->n_active--;
310 		if (pmu->n_active == 0)
311 			hrtimer_cancel(&pmu->hrtimer);
312 
313 		list_del(&event->active_entry);
314 
315 		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
316 		hwc->state |= PERF_HES_STOPPED;
317 	}
318 
319 	/* check if update of sw counter is necessary */
320 	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
321 		/*
322 		 * Drain the remaining delta count out of a event
323 		 * that we are disabling:
324 		 */
325 		rapl_event_update(event);
326 		hwc->state |= PERF_HES_UPTODATE;
327 	}
328 
329 	raw_spin_unlock_irqrestore(&pmu->lock, flags);
330 }
331 
332 static int rapl_pmu_event_add(struct perf_event *event, int mode)
333 {
334 	struct rapl_pmu *pmu = event->pmu_private;
335 	struct hw_perf_event *hwc = &event->hw;
336 	unsigned long flags;
337 
338 	raw_spin_lock_irqsave(&pmu->lock, flags);
339 
340 	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
341 
342 	if (mode & PERF_EF_START)
343 		__rapl_pmu_event_start(pmu, event);
344 
345 	raw_spin_unlock_irqrestore(&pmu->lock, flags);
346 
347 	return 0;
348 }
349 
350 static void rapl_pmu_event_del(struct perf_event *event, int flags)
351 {
352 	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
353 }
354 
355 static int rapl_pmu_event_init(struct perf_event *event)
356 {
357 	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
358 	int bit, ret = 0;
359 	struct rapl_pmu *pmu;
360 
361 	/* only look at RAPL events */
362 	if (event->attr.type != rapl_pmus->pmu.type)
363 		return -ENOENT;
364 
365 	/* check only supported bits are set */
366 	if (event->attr.config & ~RAPL_EVENT_MASK)
367 		return -EINVAL;
368 
369 	if (event->cpu < 0)
370 		return -EINVAL;
371 
372 	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
373 
374 	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
375 		return -EINVAL;
376 
377 	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
378 	bit = cfg - 1;
379 
380 	/* check event supported */
381 	if (!(rapl_cntr_mask & (1 << bit)))
382 		return -EINVAL;
383 
384 	/* unsupported modes and filters */
385 	if (event->attr.sample_period) /* no sampling */
386 		return -EINVAL;
387 
388 	/* must be done before validate_group */
389 	pmu = cpu_to_rapl_pmu(event->cpu);
390 	if (!pmu)
391 		return -EINVAL;
392 	event->cpu = pmu->cpu;
393 	event->pmu_private = pmu;
394 	event->hw.event_base = rapl_msrs[bit].msr;
395 	event->hw.config = cfg;
396 	event->hw.idx = bit;
397 
398 	return ret;
399 }
400 
/* pmu::read callback: refresh @event's count from the hardware counter. */
static void rapl_pmu_event_read(struct perf_event *event)
{
	(void)rapl_event_update(event);
}
405 
406 static ssize_t rapl_get_attr_cpumask(struct device *dev,
407 				struct device_attribute *attr, char *buf)
408 {
409 	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
410 }
411 
412 static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
413 
/* Unnamed attribute group of the PMU device; it holds the "cpumask" file. */
static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};
422 
/* Event encodings: the event code is the enum perf_rapl_events index + 1. */
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

/* Unit reported for every event once the scale below is applied. */
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 * (the scale string is 2^-32, matching the shift done in rapl_scale())
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
443 
/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

/* Initially empty "events" group, listed in rapl_attr_groups[]. */
static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};
457 
/* "format" group: the event code occupies config bits 0-7 (RAPL_EVENT_MASK). */
PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};
468 
/* Attribute groups installed as pmu.attr_groups by init_rapl_pmus(). */
static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};
475 
/*
 * One "events" attribute group per RAPL domain, each holding the event
 * encoding, its unit and its scale. The groups are referenced from the
 * perf_msr tables and from rapl_attr_update[].
 */

/* cores (pp0) domain */
static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

/* package domain */
static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

/* DRAM domain */
static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

/* gpu (pp1) domain */
static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

/* psys domain */
static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};
535 
536 static bool test_msr(int idx, void *data)
537 {
538 	return test_bit(idx, (unsigned long *) data);
539 }
540 
/* Only lower 32bits of the MSR represents the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

/*
 * Default Intel MSR table, indexed by enum perf_rapl_events. Each entry
 * lists the energy status MSR, the sysfs events group of the domain and the
 * test callback; the remaining two fields are consumed by perf_msr_probe()
 * (NOTE(review): presumably a "skip check" flag and a value mask - confirm
 * against struct perf_msr in probe.h).
 */
static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};
551 
/*
 * MSR table used by model_spr. It differs from intel_rapl_msrs only in the
 * fourth field of the PSYS entry, which is true here.
 */
static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};
559 
/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - want to use same event codes across both architectures
 *
 * Only the package entry carries a real MSR and a test callback; the other
 * entries are placeholders with a zero MSR and no callback.
 */
static struct perf_msr amd_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, NULL, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   NULL, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   NULL, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
};
572 
573 static int rapl_cpu_offline(unsigned int cpu)
574 {
575 	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
576 	int target;
577 
578 	/* Check if exiting cpu is used for collecting rapl events */
579 	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
580 		return 0;
581 
582 	pmu->cpu = -1;
583 	/* Find a new cpu to collect rapl events */
584 	target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu);
585 
586 	/* Migrate rapl events to the new target */
587 	if (target < nr_cpu_ids) {
588 		cpumask_set_cpu(target, &rapl_cpu_mask);
589 		pmu->cpu = target;
590 		perf_pmu_migrate_context(pmu->pmu, cpu, target);
591 	}
592 	return 0;
593 }
594 
595 static int rapl_cpu_online(unsigned int cpu)
596 {
597 	s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu);
598 	if (rapl_pmu_idx < 0) {
599 		pr_err("topology_logical_(package/die)_id() returned a negative value");
600 		return -EINVAL;
601 	}
602 	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
603 	int target;
604 
605 	if (!pmu) {
606 		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
607 		if (!pmu)
608 			return -ENOMEM;
609 
610 		raw_spin_lock_init(&pmu->lock);
611 		INIT_LIST_HEAD(&pmu->active_list);
612 		pmu->pmu = &rapl_pmus->pmu;
613 		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
614 		rapl_hrtimer_init(pmu);
615 
616 		rapl_pmus->pmus[rapl_pmu_idx] = pmu;
617 	}
618 
619 	/*
620 	 * Check if there is an online cpu in the package which collects rapl
621 	 * events already.
622 	 */
623 	target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu));
624 	if (target < nr_cpu_ids)
625 		return 0;
626 
627 	cpumask_set_cpu(cpu, &rapl_cpu_mask);
628 	pmu->cpu = cpu;
629 	return 0;
630 }
631 
632 static int rapl_check_hw_unit(struct rapl_model *rm)
633 {
634 	u64 msr_rapl_power_unit_bits;
635 	int i;
636 
637 	/* protect rdmsrl() to handle virtualization */
638 	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
639 		return -1;
640 	for (i = 0; i < NR_RAPL_DOMAINS; i++)
641 		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
642 
643 	switch (rm->unit_quirk) {
644 	/*
645 	 * DRAM domain on HSW server and KNL has fixed energy unit which can be
646 	 * different than the unit from power unit MSR. See
647 	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
648 	 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
649 	 */
650 	case RAPL_UNIT_QUIRK_INTEL_HSW:
651 		rapl_hw_unit[PERF_RAPL_RAM] = 16;
652 		break;
653 	/* SPR uses a fixed energy unit for Psys domain. */
654 	case RAPL_UNIT_QUIRK_INTEL_SPR:
655 		rapl_hw_unit[PERF_RAPL_PSYS] = 0;
656 		break;
657 	default:
658 		break;
659 	}
660 
661 
662 	/*
663 	 * Calculate the timer rate:
664 	 * Use reference of 200W for scaling the timeout to avoid counter
665 	 * overflows. 200W = 200 Joules/sec
666 	 * Divide interval by 2 to avoid lockstep (2 * 100)
667 	 * if hw unit is 32, then we use 2 ms 1/200/2
668 	 */
669 	rapl_timer_ms = 2;
670 	if (rapl_hw_unit[0] < 32) {
671 		rapl_timer_ms = (1000 / (2 * 100));
672 		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
673 	}
674 	return 0;
675 }
676 
677 static void __init rapl_advertise(void)
678 {
679 	int i;
680 
681 	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
682 		hweight32(rapl_cntr_mask), rapl_timer_ms);
683 
684 	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
685 		if (rapl_cntr_mask & (1 << i)) {
686 			pr_info("hw unit of domain %s 2^-%d Joules\n",
687 				rapl_domain_names[i], rapl_hw_unit[i]);
688 		}
689 	}
690 }
691 
692 static void cleanup_rapl_pmus(void)
693 {
694 	int i;
695 
696 	for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
697 		kfree(rapl_pmus->pmus[i]);
698 	kfree(rapl_pmus);
699 }
700 
/*
 * Per-domain "events" groups installed as pmu.attr_update by
 * init_rapl_pmus(); the same groups are referenced from the perf_msr tables.
 */
static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};
709 
710 static int __init init_rapl_pmus(void)
711 {
712 	int nr_rapl_pmu = topology_max_packages();
713 
714 	if (!rapl_pmu_is_pkg_scope())
715 		nr_rapl_pmu *= topology_max_dies_per_package();
716 
717 	rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
718 	if (!rapl_pmus)
719 		return -ENOMEM;
720 
721 	rapl_pmus->nr_rapl_pmu		= nr_rapl_pmu;
722 	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
723 	rapl_pmus->pmu.attr_update	= rapl_attr_update;
724 	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
725 	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
726 	rapl_pmus->pmu.add		= rapl_pmu_event_add;
727 	rapl_pmus->pmu.del		= rapl_pmu_event_del;
728 	rapl_pmus->pmu.start		= rapl_pmu_event_start;
729 	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
730 	rapl_pmus->pmu.read		= rapl_pmu_event_read;
731 	rapl_pmus->pmu.module		= THIS_MODULE;
732 	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
733 	return 0;
734 }
735 
/*
 * Per-model descriptions referenced from rapl_model_match[]: which domains
 * to probe, which MSR holds the energy unit, which MSR table to use and,
 * where needed, a unit quirk.
 */

/* cores, package and gpu domains */
static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package and DRAM domains */
static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package, DRAM and gpu domains */
static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package and DRAM domains, with the fixed DRAM unit quirk */
static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* package and DRAM domains, with the fixed DRAM unit quirk */
static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* all five domains */
static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package, DRAM and psys domains, with the fixed Psys unit quirk */
static struct rapl_model model_spr = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_spr_msrs,
};

/* package domain only, using the AMD power unit MSR and MSR table */
static struct rapl_model model_amd_hygon = {
	.events		= BIT(PERF_RAPL_PKG),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_msrs      = amd_rapl_msrs,
};
803 
/*
 * CPU match table used by rapl_pmu_init(): the driver_data of the matching
 * entry is the struct rapl_model describing that CPU. The first entry
 * matches on the X86_FEATURE_RAPL feature flag and maps to the AMD/Hygon
 * model; the others match individual Intel models.
 */
static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,	&model_amd_hygon),
	X86_MATCH_VFM(INTEL_SANDYBRIDGE,	&model_snb),
	X86_MATCH_VFM(INTEL_SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_VFM(INTEL_IVYBRIDGE,		&model_snb),
	X86_MATCH_VFM(INTEL_IVYBRIDGE_X,	&model_snbep),
	X86_MATCH_VFM(INTEL_HASWELL,		&model_hsw),
	X86_MATCH_VFM(INTEL_HASWELL_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_HASWELL_L,		&model_hsw),
	X86_MATCH_VFM(INTEL_HASWELL_G,		&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL,		&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL_G,	&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL_X,	&model_hsx),
	X86_MATCH_VFM(INTEL_BROADWELL_D,	&model_hsx),
	X86_MATCH_VFM(INTEL_XEON_PHI_KNL,	&model_knl),
	X86_MATCH_VFM(INTEL_XEON_PHI_KNM,	&model_knl),
	X86_MATCH_VFM(INTEL_SKYLAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_SKYLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_SKYLAKE_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_KABYLAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_KABYLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_CANNONLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_VFM(INTEL_ICELAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_ICELAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ICELAKE_D,		&model_hsx),
	X86_MATCH_VFM(INTEL_ICELAKE_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_COMETLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_COMETLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_TIGERLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_TIGERLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ALDERLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ALDERLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,	&model_skl),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X,	&model_spr),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X,	&model_spr),
	X86_MATCH_VFM(INTEL_RAPTORLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_RAPTORLAKE_P,	&model_skl),
	X86_MATCH_VFM(INTEL_RAPTORLAKE_S,	&model_skl),
	X86_MATCH_VFM(INTEL_METEORLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_METEORLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE_H,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_LUNARLAKE_M,	&model_skl),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
853 
854 static int __init rapl_pmu_init(void)
855 {
856 	const struct x86_cpu_id *id;
857 	struct rapl_model *rm;
858 	int ret;
859 
860 	id = x86_match_cpu(rapl_model_match);
861 	if (!id)
862 		return -ENODEV;
863 
864 	rm = (struct rapl_model *) id->driver_data;
865 
866 	rapl_msrs = rm->rapl_msrs;
867 
868 	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
869 					false, (void *) &rm->events);
870 
871 	ret = rapl_check_hw_unit(rm);
872 	if (ret)
873 		return ret;
874 
875 	ret = init_rapl_pmus();
876 	if (ret)
877 		return ret;
878 
879 	/*
880 	 * Install callbacks. Core will call them for each online cpu.
881 	 */
882 	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
883 				"perf/x86/rapl:online",
884 				rapl_cpu_online, rapl_cpu_offline);
885 	if (ret)
886 		goto out;
887 
888 	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
889 	if (ret)
890 		goto out1;
891 
892 	rapl_advertise();
893 	return 0;
894 
895 out1:
896 	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
897 out:
898 	pr_warn("Initialization failed (%d), disabled\n", ret);
899 	cleanup_rapl_pmus();
900 	return ret;
901 }
902 module_init(rapl_pmu_init);
903 
/*
 * Module exit: remove the CPU hotplug state without invoking the offline
 * callbacks, unregister the PMU and free all rapl_pmu state.
 */
static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);
911