// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel/AMD RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * AMD RAPL interface for Fam17h is described in the public PPR:
 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the free running energy consumption
 * counters (pp0, pkg, dram, gpu, psys).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the builtin-gpu domain (client only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * psys counter: consumption of the builtin-psys domain (client only)
 *	  event: rapl_energy_psys
 *    perf code: 0x5
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
 */
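
/*
 * A minimal userspace sketch of that conversion, assuming raw_count was
 * read from the perf event and elapsed_sec was measured by the tool
 * (both names are illustrative, not part of this driver):
 *
 *	double joules = ldexp((double)raw_count, -32);
 *	double watts  = joules / elapsed_sec;
 */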

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"

MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters");
MODULE_LICENSE("GPL");

/*
 * RAPL energy status counters
 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
#define RAPL_CNTR_WIDTH 32

#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};
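
/*
 * For illustration: RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01")
 * below creates a read-only sysfs attribute named "energy-cores" whose show
 * method prints the string "event=0x01". Once the PMU is registered as
 * "power" in rapl_pmu_init(), it is visible as
 * /sys/bus/event_source/devices/power/events/energy-cores.
 */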

struct rapl_pmu {
	raw_spinlock_t		lock;
	int			n_active;
	int			cpu;
	struct list_head	active_list;
	struct pmu		*pmu;
	ktime_t			timer_interval;
	struct hrtimer		hrtimer;
};

struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		nr_rapl_pmu;
	struct rapl_pmu		*pmus[] __counted_by(nr_rapl_pmu);
};

enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,
	RAPL_UNIT_QUIRK_INTEL_HSW,
	RAPL_UNIT_QUIRK_INTEL_SPR,
};

struct rapl_model {
	struct perf_msr *rapl_msrs;
	unsigned long	events;
	unsigned int	msr_power_unit;
	enum rapl_unit_quirk	unit_quirk;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;

static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
	unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);

	/*
	 * The unsigned check also catches the '-1' return value for
	 * non-existent mappings in the topology map.
	 */
	return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
}

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/(2^32) to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
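
/*
 * Worked example (illustrative): with the common SandyBridge unit of
 * 1/(2^16) Joules, rapl_hw_unit[] holds 16, so a delta of N hardware
 * units becomes N << (32 - 16), i.e. the same energy expressed in 2^-32
 * Joule increments, matching the 2^-32 scale advertised below.
 */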

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

	prev_raw_count = local64_read(&hwc->prev_count);
	do {
		rdmsrl(event->hw.event_base, new_raw_count);
	} while (!local64_try_cmpxchg(&hwc->prev_count,
				      &prev_raw_count, new_raw_count));

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
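
/*
 * Illustration of the shift trick above: the energy counters are 32 bits
 * wide, so with shift == 32, a wrapped counter such as prev == 0xfffffffe,
 * new == 0x00000003 yields ((new << 32) - (prev << 32)) >> 32 == 5, the
 * correct unsigned 32-bit delta, regardless of whatever the upper MSR
 * bits contained.
 */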

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}
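
/*
 * The hrtimer above exists only to re-read every active event well before
 * the 32-bit hardware counter can wrap between two reads; the interval is
 * sized in rapl_check_hw_unit() below from a 200 W reference load.
 */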

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			hrtimer_cancel(&pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, ret = 0;
	struct rapl_pmu *pmu;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmus->pmu.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
		return -EINVAL;

	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
	bit = cfg - 1;

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	pmu = cpu_to_rapl_pmu(event->cpu);
	if (!pmu)
		return -EINVAL;
	event->cpu = pmu->cpu;
	event->pmu_private = pmu;
	event->hw.event_base = rapl_msrs[bit].msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}
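
/*
 * Usage sketch from userspace: since sampling and per-task mode are
 * rejected above, these events can only be counted system-wide, e.g.:
 *
 *	perf stat -e power/energy-pkg/ -a sleep 1
 *
 * ("power" is the PMU name registered in rapl_pmu_init() below.)
 */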

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of the MSR unit
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
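
/*
 * That scale string is exactly 2^-32: one increment of the 32.32 fixed
 * point value produced by rapl_scale() is 2.3283064365386962890625e-10
 * Joules, and tools such as perf apply this factor to the raw count
 * automatically.
 */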

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};

static bool test_msr(int idx, void *data)
{
	return test_bit(idx, (unsigned long *) data);
}

/* Only the lower 32 bits of the MSR represent the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF
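
/*
 * Each perf_msr entry below lists, in order: the MSR address, the sysfs
 * attribute group it enables, an optional probe test, whether the probe's
 * zero-value read check is skipped, and the counter mask (see the struct
 * definition in probe.h).
 */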

static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true,  RAPL_MSR_MASK },
};

/*
 * Force the array to PERF_RAPL_MAX size because:
 * - perf_msr_probe() iterates over PERF_RAPL_MAX entries
 * - we want to use the same event codes on both architectures
 */
static struct perf_msr amd_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, NULL, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   NULL, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   NULL, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
};

static int rapl_cpu_offline(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	/* Check if exiting cpu is used for collecting rapl events */
	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
		return 0;

	pmu->cpu = -1;
	/* Find a new cpu to collect rapl events */
	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

	/* Migrate rapl events to the new target */
	if (target < nr_cpu_ids) {
		cpumask_set_cpu(target, &rapl_cpu_mask);
		pmu->cpu = target;
		perf_pmu_migrate_context(pmu->pmu, cpu, target);
	}
	return 0;
}

static int rapl_cpu_online(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	if (!pmu) {
		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
		if (!pmu)
			return -ENOMEM;

		raw_spin_lock_init(&pmu->lock);
		INIT_LIST_HEAD(&pmu->active_list);
		pmu->pmu = &rapl_pmus->pmu;
		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
		rapl_hrtimer_init(pmu);

		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
	}

	/*
	 * Check if there is an online cpu in the package which collects rapl
	 * events already.
	 */
	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
	if (target < nr_cpu_ids)
		return 0;

	cpumask_set_cpu(cpu, &rapl_cpu_mask);
	pmu->cpu = cpu;
	return 0;
}

static int rapl_check_hw_unit(struct rapl_model *rm)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	switch (rm->unit_quirk) {
	/*
	 * DRAM domain on HSW server and KNL has a fixed energy unit which can
	 * differ from the unit in the power unit MSR. See
	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
	 * of 2. Datasheet, September 2014, Reference Number: 330784-001"
	 */
	case RAPL_UNIT_QUIRK_INTEL_HSW:
		rapl_hw_unit[PERF_RAPL_RAM] = 16;
		break;
	/* SPR uses a fixed energy unit for the Psys domain. */
	case RAPL_UNIT_QUIRK_INTEL_SPR:
		rapl_hw_unit[PERF_RAPL_PSYS] = 0;
		break;
	default:
		break;
	}

	/*
	 * Calculate the timer rate:
	 * Use a reference of 200W for scaling the timeout to avoid counter
	 * overflows. 200W = 200 Joules/sec.
	 * Divide the interval by 2 to avoid lockstep (2 * 100).
	 * If the hw unit is 32, then we use 2 ms: 1/200/2.
	 */
	rapl_timer_ms = 2;
	if (rapl_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
	}
	return 0;
}
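
/*
 * Worked example (illustrative): with the common 1/(2^16) Joules unit, a
 * 32-bit counter wraps after 2^32 * 2^-16 J = 65536 J, i.e. ~327 s at the
 * 200 W reference. The code above then yields 5 * 2^15 = 163840 ms
 * (~164 s), half the wrap time, so no wrap can be missed.
 */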

static void __init rapl_advertise(void)
{
	int i;

	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask), rapl_timer_ms);

	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
}

static void cleanup_rapl_pmus(void)
{
	int i;

	for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
		kfree(rapl_pmus->pmus[i]);
	kfree(rapl_pmus);
}

static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};

static int __init init_rapl_pmus(void)
{
	int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();

	rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
	if (!rapl_pmus)
		return -ENOMEM;

	rapl_pmus->nr_rapl_pmu		= nr_rapl_pmu;
	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
	rapl_pmus->pmu.attr_update	= rapl_attr_update;
	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
	rapl_pmus->pmu.add		= rapl_pmu_event_add;
	rapl_pmus->pmu.del		= rapl_pmu_event_del;
	rapl_pmus->pmu.start		= rapl_pmu_event_start;
	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
	rapl_pmus->pmu.read		= rapl_pmu_event_read;
	rapl_pmus->pmu.module		= THIS_MODULE;
	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	return 0;
}

static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_spr = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_spr_msrs,
};

static struct rapl_model model_amd_hygon = {
	.events		= BIT(PERF_RAPL_PKG),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_msrs      = amd_rapl_msrs,
};

static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,	&model_amd_hygon),
	X86_MATCH_VFM(INTEL_SANDYBRIDGE,	&model_snb),
	X86_MATCH_VFM(INTEL_SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_VFM(INTEL_IVYBRIDGE,		&model_snb),
	X86_MATCH_VFM(INTEL_IVYBRIDGE_X,	&model_snbep),
	X86_MATCH_VFM(INTEL_HASWELL,		&model_hsw),
	X86_MATCH_VFM(INTEL_HASWELL_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_HASWELL_L,		&model_hsw),
	X86_MATCH_VFM(INTEL_HASWELL_G,		&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL,		&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL_G,	&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL_X,	&model_hsx),
	X86_MATCH_VFM(INTEL_BROADWELL_D,	&model_hsx),
	X86_MATCH_VFM(INTEL_XEON_PHI_KNL,	&model_knl),
	X86_MATCH_VFM(INTEL_XEON_PHI_KNM,	&model_knl),
	X86_MATCH_VFM(INTEL_SKYLAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_SKYLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_SKYLAKE_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_KABYLAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_KABYLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_CANNONLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_VFM(INTEL_ICELAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_ICELAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ICELAKE_D,		&model_hsx),
	X86_MATCH_VFM(INTEL_ICELAKE_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_COMETLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_COMETLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_TIGERLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_TIGERLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ALDERLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ALDERLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,	&model_skl),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X,	&model_spr),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X,	&model_spr),
	X86_MATCH_VFM(INTEL_RAPTORLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_RAPTORLAKE_P,	&model_skl),
	X86_MATCH_VFM(INTEL_RAPTORLAKE_S,	&model_skl),
	X86_MATCH_VFM(INTEL_METEORLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_METEORLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE_H,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_LUNARLAKE_M,	&model_skl),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

static int __init rapl_pmu_init(void)
{
	const struct x86_cpu_id *id;
	struct rapl_model *rm;
	int ret;

	id = x86_match_cpu(rapl_model_match);
	if (!id)
		return -ENODEV;

	rm = (struct rapl_model *) id->driver_data;

	rapl_msrs = rm->rapl_msrs;

	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
					false, (void *) &rm->events);

	ret = rapl_check_hw_unit(rm);
	if (ret)
		return ret;

	ret = init_rapl_pmus();
	if (ret)
		return ret;

	/*
	 * Install callbacks. Core will call them for each online cpu.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
				"perf/x86/rapl:online",
				rapl_cpu_online, rapl_cpu_offline);
	if (ret)
		goto out;

	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
	if (ret)
		goto out1;

	rapl_advertise();
	return 0;

out1:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus();
	return ret;
}
module_init(rapl_pmu_init);

static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);