// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel/AMD RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * The Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * The AMD RAPL interface for Fam17h is described in the public PPR:
 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
 *
 * RAPL provides more controls than just reporting energy consumption,
 * but here we only expose the 3 energy consumption free running
 * counters (pp0, pkg, dram).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to RAPL event mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the built-in gpu domain (client only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * psys counter: consumption of the built-in psys domain (client only)
 *	  event: rapl_energy_psys
 *    perf code: 0x5
 *
 * core counter: consumption of a single physical core
 *	  event: rapl_energy_core (power_core PMU)
 *    perf code: 0x1
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must scale the counts, e.g. with ldexp(raw_count, -32) to
 * get Joules, and divide by the measurement duration to get Watts.
 */
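
/*
 * Illustrative tool-side conversion (a hedged sketch, not part of this
 * driver; the variables "raw_count" and "seconds" are hypothetical):
 *
 *	#include <math.h>
 *
 *	double joules = ldexp((double)raw_count, -32); // 2^-32 J units
 *	double watts  = joules / seconds;              // average power
 */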

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"

MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters");
MODULE_LICENSE("GPL");

/*
 * RAPL energy status counters
 */
enum perf_rapl_pkg_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_PKG_EVENTS_MAX,
	NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
};

#define PERF_RAPL_CORE			0		/* single core */
#define PERF_RAPL_CORE_EVENTS_MAX	1
#define NR_RAPL_CORE_DOMAINS		PERF_RAPL_CORE_EVENTS_MAX

static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};

static const char *const rapl_core_domain_name __initconst = "core";

/*
 * event code: LSB 8 bits, passed in attr->config
 * all other bits are reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
#define RAPL_CNTR_WIDTH 32
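
/*
 * Userspace sketch (illustrative only; "power_pmu_type" is assumed to
 * be read from /sys/bus/event_source/devices/power/type): counting
 * package energy system-wide on CPU 0 via perf_event_open(2):
 *
 *	struct perf_event_attr attr = {
 *		.type   = power_pmu_type,
 *		.size   = sizeof(attr),
 *		.config = 0x2,	// rapl_energy_pkg
 *	};
 *	int fd = syscall(SYS_perf_event_open, &attr, -1, 0, -1, 0);
 */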

#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};

/*
 * RAPL Package energy counter scope:
 * 1. AMD/HYGON platforms have a per-PKG package energy counter
 * 2. For Intel platforms
 *	2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
 *	2.2. Other Intel platforms are single-die systems, so pkg-scope
 *	     and die-scope are equivalent; we treat them as die-scope.
 */
#define rapl_pkg_pmu_is_pkg_scope()				\
	(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||	\
	 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)

struct rapl_pmu {
	raw_spinlock_t		lock;
	int			n_active;
	int			cpu;
	struct list_head	active_list;
	struct pmu		*pmu;
	ktime_t			timer_interval;
	struct hrtimer		hrtimer;
};

struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		nr_rapl_pmu;
	unsigned int		cntr_mask;
	struct rapl_pmu		*rapl_pmu[] __counted_by(nr_rapl_pmu);
};

enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,
	RAPL_UNIT_QUIRK_INTEL_HSW,
	RAPL_UNIT_QUIRK_INTEL_SPR,
};

struct rapl_model {
	struct perf_msr *rapl_pkg_msrs;
	struct perf_msr *rapl_core_msrs;
	unsigned long	pkg_events;
	unsigned long	core_events;
	unsigned int	msr_power_unit;
	enum rapl_unit_quirk	unit_quirk;
};

/* 1/2^hw_unit Joule */
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
static int rapl_core_hw_unit __read_mostly;
static struct rapl_pmus *rapl_pmus_pkg;
static struct rapl_pmus *rapl_pmus_core;
static u64 rapl_timer_ms;
static struct rapl_model *rapl_model;

/*
 * Helper function to get the correct topology id according to the
 * RAPL PMU scope.
 */
static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
{
	/*
	 * Returns unsigned int, which converts the '-1' return value
	 * (for non-existent mappings in topology map) to UINT_MAX, so
	 * the error check in the caller is simplified.
	 */
	switch (scope) {
	case PERF_PMU_SCOPE_PKG:
		return topology_logical_package_id(cpu);
	case PERF_PMU_SCOPE_DIE:
		return topology_logical_die_id(cpu);
	case PERF_PMU_SCOPE_CORE:
		return topology_logical_core_id(cpu);
	default:
		return -EINVAL;
	}
}

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}

static inline u64 rapl_scale(u64 v, struct perf_event *event)
{
	int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];

	if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
		hw_unit = rapl_core_hw_unit;

	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/(2^32) to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
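	/*
	 * Illustrative numbers (assumed for clarity, not from any
	 * specific CPU): with hw_unit = 16, each raw count is 2^-16 J,
	 * so the shift below is v << 16, i.e. 65536 units of 2^-32 J
	 * per count.
	 */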
	return v << (32 - hw_unit);
}

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

	prev_raw_count = local64_read(&hwc->prev_count);
	do {
		rdmsrl(event->hw.event_base, new_raw_count);
	} while (!local64_try_cmpxchg(&hwc->prev_count,
				      &prev_raw_count, new_raw_count));

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
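	/*
	 * Worked example (illustrative values): if the 32-bit counter
	 * wrapped from 0xFFFFFFF0 to 0x10, the u64 subtraction below
	 * wraps to 0x2000000000, and the arithmetic shift right by 32
	 * recovers the true delta of 0x20.
	 */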
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	if (!rapl_pmu->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);

	list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);

	hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{
	struct hrtimer *hr = &rapl_pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &rapl_pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	rapl_pmu->n_active++;
	if (rapl_pmu->n_active == 1)
		rapl_start_hrtimer(rapl_pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *rapl_pmu = event->pmu_private;
	unsigned long flags;

	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
	__rapl_pmu_event_start(rapl_pmu, event);
	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *rapl_pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(rapl_pmu->n_active <= 0);
		rapl_pmu->n_active--;
		if (rapl_pmu->n_active == 0)
			hrtimer_cancel(&rapl_pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *rapl_pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(rapl_pmu, event);

	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, rapl_pmus_scope, ret = 0;
	struct rapl_pmu *rapl_pmu;
	unsigned int rapl_pmu_idx;
	struct rapl_pmus *rapl_pmus;

	/* only look at RAPL events */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	/* unsupported modes and filters */
	if (event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
	if (!rapl_pmus)
		return -EINVAL;
	rapl_pmus_scope = rapl_pmus->pmu.scope;

	if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
		cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
		if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
			return -EINVAL;

		bit = cfg - 1;
		event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
	} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
		cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
		if (!cfg || cfg >= NR_RAPL_CORE_DOMAINS + 1)
			return -EINVAL;

		bit = cfg - 1;
		event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
	} else {
		return -EINVAL;
	}

	/* check event supported */
	if (!(rapl_pmus->cntr_mask & (1 << bit)))
		return -EINVAL;

	rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
	if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
		return -EINVAL;
	/* must be done before validate_group */
	rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
	if (!rapl_pmu)
		return -EINVAL;

	event->pmu_private = rapl_pmu;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg,   rapl_pkg,   "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram,   rapl_ram,   "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu,   rapl_gpu,   "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,  rapl_psys,  "event=0x05");
RAPL_EVENT_ATTR_STR(energy-core,  rapl_core,  "event=0x01");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit,   rapl_pkg_unit,   "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit,   rapl_ram_unit,   "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit,   rapl_gpu_unit,   "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,  rapl_psys_unit,  "Joules");
RAPL_EVENT_ATTR_STR(energy-core.unit,  rapl_core_unit,  "Joules");

/*
 * We compute in 2^-32 J (~0.23 nJ) increments regardless of the MSR unit.
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,   rapl_pkg_scale,   "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,   rapl_ram_scale,   "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,   rapl_gpu_scale,   "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,  rapl_psys_scale,  "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-core.scale,  rapl_core_scale,  "2.3283064365386962890625e-10");

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static const struct attribute_group *rapl_core_attr_groups[] = {
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};

static struct attribute *rapl_events_core[] = {
	EVENT_PTR(rapl_core),
	EVENT_PTR(rapl_core_unit),
	EVENT_PTR(rapl_core_scale),
	NULL,
};

static struct attribute_group rapl_events_core_group = {
	.name  = "events",
	.attrs = rapl_events_core,
};

static bool test_msr(int idx, void *data)
{
	return test_bit(idx, (unsigned long *) data);
}

/* Only the lower 32 bits of the MSR represent the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true,  RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
 * - want to use same event codes across both architectures
 */
static struct perf_msr amd_rapl_pkg_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, NULL, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   NULL, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   NULL, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
};

static struct perf_msr amd_rapl_core_msrs[] = {
	[PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
				 test_msr, false, RAPL_MSR_MASK },
};

static int rapl_check_hw_unit(void)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
		rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	switch (rapl_model->unit_quirk) {
	/*
	 * The DRAM domain on HSW server and KNL has a fixed energy unit
	 * which can differ from the unit in the power unit MSR. See
	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families,
	 * Volume 2 of 2, Datasheet, September 2014,
	 * Reference Number: 330784-001"
	 */
	case RAPL_UNIT_QUIRK_INTEL_HSW:
		rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
		break;
	/* SPR uses a fixed energy unit for the Psys domain. */
	case RAPL_UNIT_QUIRK_INTEL_SPR:
		rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
		break;
	default:
		break;
	}

	/*
	 * Calculate the timer rate:
	 * Use a reference of 200 W for scaling the timeout to avoid
	 * counter overflows.  200 W = 200 Joules/sec.
	 * Divide the interval by 2 to avoid lockstep (2 * 100).
	 * If the hw unit is 32, we use 2 ms: 1/200/2.
	 */
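	/*
	 * Worked example (illustrative): with hw_unit = 16 the 32-bit
	 * counter spans 2^16 J = 65536 J, which at 200 W overflows in
	 * ~328 s; halving that gives 5 ms * 2^(32 - 16 - 1) = 163840 ms.
	 */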
	rapl_timer_ms = 2;
	if (rapl_pkg_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
	}
	return 0;
}

static void __init rapl_advertise(void)
{
	int i;
	int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);

	if (rapl_pmus_core)
		num_counters += hweight32(rapl_pmus_core->cntr_mask);

	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
		num_counters, rapl_timer_ms);

	for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
		if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
		}
	}

	if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
		pr_info("hw unit of domain %s 2^-%d Joules\n",
			rapl_core_domain_name, rapl_core_hw_unit);
}

static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
{
	int i;

	for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
		kfree(rapl_pmus->rapl_pmu[i]);
	kfree(rapl_pmus);
}

static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};

static const struct attribute_group *rapl_core_attr_update[] = {
	&rapl_events_core_group,
	NULL,
};

static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{
	struct rapl_pmu *rapl_pmu;
	int idx;

	for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
		rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
		if (!rapl_pmu)
			goto free;

		raw_spin_lock_init(&rapl_pmu->lock);
		INIT_LIST_HEAD(&rapl_pmu->active_list);
		rapl_pmu->pmu = &rapl_pmus->pmu;
		rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
		rapl_hrtimer_init(rapl_pmu);

		rapl_pmus->rapl_pmu[idx] = rapl_pmu;
	}

	return 0;
free:
	for (; idx > 0; idx--)
		kfree(rapl_pmus->rapl_pmu[idx - 1]);
	return -ENOMEM;
}

static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
				 const struct attribute_group **rapl_attr_groups,
				 const struct attribute_group **rapl_attr_update)
{
	int nr_rapl_pmu = topology_max_packages();
	struct rapl_pmus *rapl_pmus;

	/*
	 * rapl_pmu_scope must be either PKG, DIE or CORE
	 */
	if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
		nr_rapl_pmu	*= topology_max_dies_per_package();
	else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
		nr_rapl_pmu	*= topology_num_cores_per_package();
	else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
		return -EINVAL;

	rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
	if (!rapl_pmus)
		return -ENOMEM;

	*rapl_pmus_ptr = rapl_pmus;

	rapl_pmus->nr_rapl_pmu		= nr_rapl_pmu;
	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
	rapl_pmus->pmu.attr_update	= rapl_attr_update;
	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
	rapl_pmus->pmu.add		= rapl_pmu_event_add;
	rapl_pmus->pmu.del		= rapl_pmu_event_del;
	rapl_pmus->pmu.start		= rapl_pmu_event_start;
	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
	rapl_pmus->pmu.read		= rapl_pmu_event_read;
	rapl_pmus->pmu.scope		= rapl_pmu_scope;
	rapl_pmus->pmu.module		= THIS_MODULE;
	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;

	return init_rapl_pmu(rapl_pmus);
}

static struct rapl_model model_snb = {
	.pkg_events	= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= intel_rapl_msrs,
};

static struct rapl_model model_snbep = {
	.pkg_events	= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= intel_rapl_msrs,
};

static struct rapl_model model_hsw = {
	.pkg_events	= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= intel_rapl_msrs,
};

static struct rapl_model model_hsx = {
	.pkg_events	= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= intel_rapl_msrs,
};

static struct rapl_model model_knl = {
	.pkg_events	= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= intel_rapl_msrs,
};

static struct rapl_model model_skl = {
	.pkg_events	= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= intel_rapl_msrs,
};

static struct rapl_model model_spr = {
	.pkg_events	= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= intel_rapl_spr_msrs,
};

static struct rapl_model model_amd_hygon = {
	.pkg_events	= BIT(PERF_RAPL_PKG),
	.core_events	= BIT(PERF_RAPL_CORE),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_pkg_msrs	= amd_rapl_pkg_msrs,
	.rapl_core_msrs	= amd_rapl_core_msrs,
};

static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,	&model_amd_hygon),
	X86_MATCH_VFM(INTEL_SANDYBRIDGE,	&model_snb),
	X86_MATCH_VFM(INTEL_SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_VFM(INTEL_IVYBRIDGE,		&model_snb),
	X86_MATCH_VFM(INTEL_IVYBRIDGE_X,	&model_snbep),
	X86_MATCH_VFM(INTEL_HASWELL,		&model_hsw),
	X86_MATCH_VFM(INTEL_HASWELL_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_HASWELL_L,		&model_hsw),
	X86_MATCH_VFM(INTEL_HASWELL_G,		&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL,		&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL_G,	&model_hsw),
	X86_MATCH_VFM(INTEL_BROADWELL_X,	&model_hsx),
	X86_MATCH_VFM(INTEL_BROADWELL_D,	&model_hsx),
	X86_MATCH_VFM(INTEL_XEON_PHI_KNL,	&model_knl),
	X86_MATCH_VFM(INTEL_XEON_PHI_KNM,	&model_knl),
	X86_MATCH_VFM(INTEL_SKYLAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_SKYLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_SKYLAKE_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_KABYLAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_KABYLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_CANNONLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_VFM(INTEL_ICELAKE_L,		&model_skl),
	X86_MATCH_VFM(INTEL_ICELAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ICELAKE_D,		&model_hsx),
	X86_MATCH_VFM(INTEL_ICELAKE_X,		&model_hsx),
	X86_MATCH_VFM(INTEL_COMETLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_COMETLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_TIGERLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_TIGERLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ALDERLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ALDERLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,	&model_skl),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X,	&model_spr),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X,	&model_spr),
	X86_MATCH_VFM(INTEL_RAPTORLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_RAPTORLAKE_P,	&model_skl),
	X86_MATCH_VFM(INTEL_RAPTORLAKE_S,	&model_skl),
	X86_MATCH_VFM(INTEL_METEORLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_METEORLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE_H,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE_U,	&model_skl),
	X86_MATCH_VFM(INTEL_LUNARLAKE_M,	&model_skl),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

static int __init rapl_pmu_init(void)
{
	const struct x86_cpu_id *id;
	int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
	int ret;

	if (rapl_pkg_pmu_is_pkg_scope())
		rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;

	id = x86_match_cpu(rapl_model_match);
	if (!id)
		return -ENODEV;

	rapl_model = (struct rapl_model *) id->driver_data;

	ret = rapl_check_hw_unit();
	if (ret)
		return ret;

	ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
			     rapl_attr_update);
	if (ret)
		return ret;

	rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
						  PERF_RAPL_PKG_EVENTS_MAX, false,
						  (void *) &rapl_model->pkg_events);

	ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
	if (ret)
		goto out;

	if (rapl_model->core_events) {
		ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
				     rapl_core_attr_groups,
				     rapl_core_attr_update);
		if (ret) {
			pr_warn("power-core PMU initialization failed (%d)\n", ret);
			goto core_init_failed;
		}

		rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
						     PERF_RAPL_CORE_EVENTS_MAX, false,
						     (void *) &rapl_model->core_events);

		ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
		if (ret) {
			pr_warn("power-core PMU registration failed (%d)\n", ret);
			cleanup_rapl_pmus(rapl_pmus_core);
		}
	}

core_init_failed:
	rapl_advertise();
	return 0;

out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus(rapl_pmus_pkg);
	return ret;
}
module_init(rapl_pmu_init);

static void __exit intel_rapl_exit(void)
{
	if (rapl_pmus_core) {
		perf_pmu_unregister(&rapl_pmus_core->pmu);
		cleanup_rapl_pmus(rapl_pmus_core);
	}
	perf_pmu_unregister(&rapl_pmus_pkg->pmu);
	cleanup_rapl_pmus(rapl_pmus_pkg);
}
module_exit(intel_rapl_exit);
961