1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Support Intel/AMD RAPL energy consumption counters
4 * Copyright (C) 2013 Google, Inc., Stephane Eranian
5 *
6 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
7 * section 14.7.1 (September 2013)
8 *
9 * AMD RAPL interface for Fam17h is described in the public PPR:
10 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
11 *
 * RAPL provides more controls than just reporting energy consumption
 * however here we only expose the energy consumption free running
 * counters (pp0, pkg, dram, gpu, psys).
15 *
16 * Each of those counters increments in a power unit defined by the
17 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
18 * but it can vary.
19 *
20 * Counter to rapl events mappings:
21 *
22 * pp0 counter: consumption of all physical cores (power plane 0)
23 * event: rapl_energy_cores
24 * perf code: 0x1
25 *
26 * pkg counter: consumption of the whole processor package
27 * event: rapl_energy_pkg
28 * perf code: 0x2
29 *
30 * dram counter: consumption of the dram domain (servers only)
31 * event: rapl_energy_dram
32 * perf code: 0x3
33 *
34 * gpu counter: consumption of the builtin-gpu domain (client only)
35 * event: rapl_energy_gpu
36 * perf code: 0x4
37 *
38 * psys counter: consumption of the builtin-psys domain (client only)
39 * event: rapl_energy_psys
40 * perf code: 0x5
41 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
44 *
45 * The events only support system-wide mode counting. There is no
46 * sampling support because it does not make sense and is not
47 * supported by the RAPL hardware.
48 *
49 * Because we want to avoid floating-point operations in the kernel,
50 * the events are all reported in fixed point arithmetic (32.32).
51 * Tools must adjust the counts to convert them to Watts using
52 * the duration of the measurement. Tools may use a function such as
53 * ldexp(raw_count, -32);
54 */
55
56 #define pr_fmt(fmt) "RAPL PMU: " fmt
57
58 #include <linux/module.h>
59 #include <linux/slab.h>
60 #include <linux/perf_event.h>
61 #include <linux/nospec.h>
62 #include <asm/cpu_device_id.h>
63 #include <asm/intel-family.h>
64 #include "perf_event.h"
65 #include "probe.h"
66
67 MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters");
68 MODULE_LICENSE("GPL");
69
/*
 * RAPL energy status counters.
 *
 * The enumerators are used as indices into the per-domain tables of this
 * file. The perf event code of a domain is its enumerator value plus one.
 */
enum perf_rapl_events {
	PERF_RAPL_PP0	= 0,	/* all cores */
	PERF_RAPL_PKG	= 1,	/* entire package */
	PERF_RAPL_RAM	= 2,	/* DRAM */
	PERF_RAPL_PP1	= 3,	/* gpu */
	PERF_RAPL_PSYS	= 4,	/* psys */

	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS	= PERF_RAPL_MAX,
};
83
84 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
85 "pp0-core",
86 "package",
87 "dram",
88 "pp1-gpu",
89 "psys",
90 };
91
/*
 * Event encoding: the event code lives in the low 8 bits of attr->config,
 * all remaining bits are reserved.
 */
#define RAPL_EVENT_MASK		0xFFULL

/* Width, in bits, used when computing counter deltas. */
#define RAPL_CNTR_WIDTH		32

/*
 * Define a read-only sysfs event attribute called @_name whose content is
 * the string @_str. The backing object is named event_attr_<_var>.
 */
#define RAPL_EVENT_ATTR_STR(_name, _var, _str)				\
static struct perf_pmu_events_attr event_attr_##_var = {		\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
	.id		= 0,						\
	.event_str	= _str,						\
};
105
106 /*
107 * RAPL Package energy counter scope:
108 * 1. AMD/HYGON platforms have a per-PKG package energy counter
109 * 2. For Intel platforms
110 * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
111 * 2.2. Other Intel platforms are single die systems so the scope can be
112 * considered as either pkg-scope or die-scope, and we are considering
113 * them as die-scope.
114 */
/* Evaluates to true when the boot CPU vendor is AMD or Hygon. */
#define rapl_pmu_is_pkg_scope()					\
	(boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||	\
	 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
118
119 struct rapl_pmu {
120 raw_spinlock_t lock;
121 int n_active;
122 int cpu;
123 struct list_head active_list;
124 struct pmu *pmu;
125 ktime_t timer_interval;
126 struct hrtimer hrtimer;
127 };
128
129 struct rapl_pmus {
130 struct pmu pmu;
131 unsigned int nr_rapl_pmu;
132 struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
133 };
134
/* Per-model overrides of the energy unit, applied by rapl_check_hw_unit(). */
enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,		/* no override */
	RAPL_UNIT_QUIRK_INTEL_HSW,	/* fixed DRAM domain unit */
	RAPL_UNIT_QUIRK_INTEL_SPR,	/* fixed Psys domain unit */
};
140
141 struct rapl_model {
142 struct perf_msr *rapl_msrs;
143 unsigned long events;
144 unsigned int msr_power_unit;
145 enum rapl_unit_quirk unit_quirk;
146 };
147
148 /* 1/2^hw_unit Joule */
149 static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
150 static struct rapl_pmus *rapl_pmus;
151 static cpumask_t rapl_cpu_mask;
152 static unsigned int rapl_cntr_mask;
153 static u64 rapl_timer_ms;
154 static struct perf_msr *rapl_msrs;
155
156 /*
157 * Helper functions to get the correct topology macros according to the
158 * RAPL PMU scope.
159 */
/*
 * Return the index of the RAPL PMU instance @cpu belongs to: the logical
 * package id for package scope PMUs, the logical die id otherwise.
 */
static inline unsigned int get_rapl_pmu_idx(int cpu)
{
	if (rapl_pmu_is_pkg_scope())
		return topology_logical_package_id(cpu);

	return topology_logical_die_id(cpu);
}
165
/*
 * Return the mask of CPUs sharing a RAPL PMU instance with @cpu, picked
 * according to the scope of the PMU.
 */
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
{
	if (rapl_pmu_is_pkg_scope())
		return topology_core_cpumask(cpu);

	return topology_die_cpumask(cpu);
}
171
cpu_to_rapl_pmu(unsigned int cpu)172 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
173 {
174 unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
175
176 /*
177 * The unsigned check also catches the '-1' return value for non
178 * existent mappings in the topology map.
179 */
180 return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
181 }
182
rapl_read_counter(struct perf_event * event)183 static inline u64 rapl_read_counter(struct perf_event *event)
184 {
185 u64 raw;
186 rdmsrl(event->hw.event_base, raw);
187 return raw;
188 }
189
/*
 * Convert a raw counter delta into the common 2^-32 Joule unit.
 *
 * @v:   raw delta, in the hardware unit of the domain
 * @cfg: 1-based event code of the domain (1..NR_RAPL_DOMAINS)
 *
 * Returns the scaled value, or @v unchanged (after logging a warning) when
 * @cfg does not name a valid domain.
 */
static inline u64 rapl_scale(u64 v, int cfg)
{
	/*
	 * cfg - 1 indexes rapl_hw_unit[], so values below 1 must be rejected
	 * as well as values above the number of domains.
	 */
	if (cfg < 1 || cfg > NR_RAPL_DOMAINS) {
		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * Scale delta to the smallest unit (1/2^32 Joule).
	 * Users must then scale back: count * 2^-32 to get Joules,
	 * e.g. by using ldexp(count, -32).
	 * Watts = Joules / time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
204
rapl_event_update(struct perf_event * event)205 static u64 rapl_event_update(struct perf_event *event)
206 {
207 struct hw_perf_event *hwc = &event->hw;
208 u64 prev_raw_count, new_raw_count;
209 s64 delta, sdelta;
210 int shift = RAPL_CNTR_WIDTH;
211
212 prev_raw_count = local64_read(&hwc->prev_count);
213 do {
214 rdmsrl(event->hw.event_base, new_raw_count);
215 } while (!local64_try_cmpxchg(&hwc->prev_count,
216 &prev_raw_count, new_raw_count));
217
218 /*
219 * Now we have the new raw value and have updated the prev
220 * timestamp already. We can now calculate the elapsed delta
221 * (event-)time and add that to the generic event.
222 *
223 * Careful, not all hw sign-extends above the physical width
224 * of the count.
225 */
226 delta = (new_raw_count << shift) - (prev_raw_count << shift);
227 delta >>= shift;
228
229 sdelta = rapl_scale(delta, event->hw.config);
230
231 local64_add(sdelta, &event->count);
232
233 return new_raw_count;
234 }
235
rapl_start_hrtimer(struct rapl_pmu * pmu)236 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
237 {
238 hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
239 HRTIMER_MODE_REL_PINNED);
240 }
241
rapl_hrtimer_handle(struct hrtimer * hrtimer)242 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
243 {
244 struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
245 struct perf_event *event;
246 unsigned long flags;
247
248 if (!pmu->n_active)
249 return HRTIMER_NORESTART;
250
251 raw_spin_lock_irqsave(&pmu->lock, flags);
252
253 list_for_each_entry(event, &pmu->active_list, active_entry)
254 rapl_event_update(event);
255
256 raw_spin_unlock_irqrestore(&pmu->lock, flags);
257
258 hrtimer_forward_now(hrtimer, pmu->timer_interval);
259
260 return HRTIMER_RESTART;
261 }
262
rapl_hrtimer_init(struct rapl_pmu * pmu)263 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
264 {
265 struct hrtimer *hr = &pmu->hrtimer;
266
267 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
268 hr->function = rapl_hrtimer_handle;
269 }
270
__rapl_pmu_event_start(struct rapl_pmu * pmu,struct perf_event * event)271 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
272 struct perf_event *event)
273 {
274 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
275 return;
276
277 event->hw.state = 0;
278
279 list_add_tail(&event->active_entry, &pmu->active_list);
280
281 local64_set(&event->hw.prev_count, rapl_read_counter(event));
282
283 pmu->n_active++;
284 if (pmu->n_active == 1)
285 rapl_start_hrtimer(pmu);
286 }
287
rapl_pmu_event_start(struct perf_event * event,int mode)288 static void rapl_pmu_event_start(struct perf_event *event, int mode)
289 {
290 struct rapl_pmu *pmu = event->pmu_private;
291 unsigned long flags;
292
293 raw_spin_lock_irqsave(&pmu->lock, flags);
294 __rapl_pmu_event_start(pmu, event);
295 raw_spin_unlock_irqrestore(&pmu->lock, flags);
296 }
297
rapl_pmu_event_stop(struct perf_event * event,int mode)298 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
299 {
300 struct rapl_pmu *pmu = event->pmu_private;
301 struct hw_perf_event *hwc = &event->hw;
302 unsigned long flags;
303
304 raw_spin_lock_irqsave(&pmu->lock, flags);
305
306 /* mark event as deactivated and stopped */
307 if (!(hwc->state & PERF_HES_STOPPED)) {
308 WARN_ON_ONCE(pmu->n_active <= 0);
309 pmu->n_active--;
310 if (pmu->n_active == 0)
311 hrtimer_cancel(&pmu->hrtimer);
312
313 list_del(&event->active_entry);
314
315 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
316 hwc->state |= PERF_HES_STOPPED;
317 }
318
319 /* check if update of sw counter is necessary */
320 if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
321 /*
322 * Drain the remaining delta count out of a event
323 * that we are disabling:
324 */
325 rapl_event_update(event);
326 hwc->state |= PERF_HES_UPTODATE;
327 }
328
329 raw_spin_unlock_irqrestore(&pmu->lock, flags);
330 }
331
rapl_pmu_event_add(struct perf_event * event,int mode)332 static int rapl_pmu_event_add(struct perf_event *event, int mode)
333 {
334 struct rapl_pmu *pmu = event->pmu_private;
335 struct hw_perf_event *hwc = &event->hw;
336 unsigned long flags;
337
338 raw_spin_lock_irqsave(&pmu->lock, flags);
339
340 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
341
342 if (mode & PERF_EF_START)
343 __rapl_pmu_event_start(pmu, event);
344
345 raw_spin_unlock_irqrestore(&pmu->lock, flags);
346
347 return 0;
348 }
349
rapl_pmu_event_del(struct perf_event * event,int flags)350 static void rapl_pmu_event_del(struct perf_event *event, int flags)
351 {
352 rapl_pmu_event_stop(event, PERF_EF_UPDATE);
353 }
354
rapl_pmu_event_init(struct perf_event * event)355 static int rapl_pmu_event_init(struct perf_event *event)
356 {
357 u64 cfg = event->attr.config & RAPL_EVENT_MASK;
358 int bit, ret = 0;
359 struct rapl_pmu *pmu;
360
361 /* only look at RAPL events */
362 if (event->attr.type != rapl_pmus->pmu.type)
363 return -ENOENT;
364
365 /* check only supported bits are set */
366 if (event->attr.config & ~RAPL_EVENT_MASK)
367 return -EINVAL;
368
369 if (event->cpu < 0)
370 return -EINVAL;
371
372 event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
373
374 if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
375 return -EINVAL;
376
377 cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
378 bit = cfg - 1;
379
380 /* check event supported */
381 if (!(rapl_cntr_mask & (1 << bit)))
382 return -EINVAL;
383
384 /* unsupported modes and filters */
385 if (event->attr.sample_period) /* no sampling */
386 return -EINVAL;
387
388 /* must be done before validate_group */
389 pmu = cpu_to_rapl_pmu(event->cpu);
390 if (!pmu)
391 return -EINVAL;
392 event->cpu = pmu->cpu;
393 event->pmu_private = pmu;
394 event->hw.event_base = rapl_msrs[bit].msr;
395 event->hw.config = cfg;
396 event->hw.idx = bit;
397
398 return ret;
399 }
400
/* pmu::read callback: bring the count of @event up to date. */
static void rapl_pmu_event_read(struct perf_event *event)
{
	(void)rapl_event_update(event);
}
405
rapl_get_attr_cpumask(struct device * dev,struct device_attribute * attr,char * buf)406 static ssize_t rapl_get_attr_cpumask(struct device *dev,
407 struct device_attribute *attr, char *buf)
408 {
409 return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
410 }
411
412 static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
413
414 static struct attribute *rapl_pmu_attrs[] = {
415 &dev_attr_cpumask.attr,
416 NULL,
417 };
418
419 static struct attribute_group rapl_pmu_attr_group = {
420 .attrs = rapl_pmu_attrs,
421 };
422
423 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
424 RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
425 RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
426 RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
427 RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
428
429 RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
430 RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
431 RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
432 RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
433 RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
434
435 /*
436 * we compute in 0.23 nJ increments regardless of MSR
437 */
438 RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
439 RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
440 RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
441 RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
442 RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
443
444 /*
445 * There are no default events, but we need to create
446 * "events" group (with empty attrs) before updating
447 * it with detected events.
448 */
449 static struct attribute *attrs_empty[] = {
450 NULL,
451 };
452
453 static struct attribute_group rapl_pmu_events_group = {
454 .name = "events",
455 .attrs = attrs_empty,
456 };
457
458 PMU_FORMAT_ATTR(event, "config:0-7");
459 static struct attribute *rapl_formats_attr[] = {
460 &format_attr_event.attr,
461 NULL,
462 };
463
464 static struct attribute_group rapl_pmu_format_group = {
465 .name = "format",
466 .attrs = rapl_formats_attr,
467 };
468
469 static const struct attribute_group *rapl_attr_groups[] = {
470 &rapl_pmu_attr_group,
471 &rapl_pmu_format_group,
472 &rapl_pmu_events_group,
473 NULL,
474 };
475
476 static struct attribute *rapl_events_cores[] = {
477 EVENT_PTR(rapl_cores),
478 EVENT_PTR(rapl_cores_unit),
479 EVENT_PTR(rapl_cores_scale),
480 NULL,
481 };
482
483 static struct attribute_group rapl_events_cores_group = {
484 .name = "events",
485 .attrs = rapl_events_cores,
486 };
487
488 static struct attribute *rapl_events_pkg[] = {
489 EVENT_PTR(rapl_pkg),
490 EVENT_PTR(rapl_pkg_unit),
491 EVENT_PTR(rapl_pkg_scale),
492 NULL,
493 };
494
495 static struct attribute_group rapl_events_pkg_group = {
496 .name = "events",
497 .attrs = rapl_events_pkg,
498 };
499
500 static struct attribute *rapl_events_ram[] = {
501 EVENT_PTR(rapl_ram),
502 EVENT_PTR(rapl_ram_unit),
503 EVENT_PTR(rapl_ram_scale),
504 NULL,
505 };
506
507 static struct attribute_group rapl_events_ram_group = {
508 .name = "events",
509 .attrs = rapl_events_ram,
510 };
511
512 static struct attribute *rapl_events_gpu[] = {
513 EVENT_PTR(rapl_gpu),
514 EVENT_PTR(rapl_gpu_unit),
515 EVENT_PTR(rapl_gpu_scale),
516 NULL,
517 };
518
519 static struct attribute_group rapl_events_gpu_group = {
520 .name = "events",
521 .attrs = rapl_events_gpu,
522 };
523
524 static struct attribute *rapl_events_psys[] = {
525 EVENT_PTR(rapl_psys),
526 EVENT_PTR(rapl_psys_unit),
527 EVENT_PTR(rapl_psys_scale),
528 NULL,
529 };
530
531 static struct attribute_group rapl_events_psys_group = {
532 .name = "events",
533 .attrs = rapl_events_psys,
534 };
535
test_msr(int idx,void * data)536 static bool test_msr(int idx, void *data)
537 {
538 return test_bit(idx, (unsigned long *) data);
539 }
540
541 /* Only lower 32bits of the MSR represents the energy counter */
542 #define RAPL_MSR_MASK 0xFFFFFFFF
543
544 static struct perf_msr intel_rapl_msrs[] = {
545 [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
546 [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
547 [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
548 [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
549 [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
550 };
551
552 static struct perf_msr intel_rapl_spr_msrs[] = {
553 [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
554 [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
555 [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
556 [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
557 [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
558 };
559
560 /*
561 * Force to PERF_RAPL_MAX size due to:
562 * - perf_msr_probe(PERF_RAPL_MAX)
563 * - want to use same event codes across both architectures
564 */
565 static struct perf_msr amd_rapl_msrs[] = {
566 [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
567 [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
568 [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
569 [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, NULL, false, 0 },
570 [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
571 };
572
rapl_cpu_offline(unsigned int cpu)573 static int rapl_cpu_offline(unsigned int cpu)
574 {
575 struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
576 int target;
577
578 /* Check if exiting cpu is used for collecting rapl events */
579 if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
580 return 0;
581
582 pmu->cpu = -1;
583 /* Find a new cpu to collect rapl events */
584 target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu);
585
586 /* Migrate rapl events to the new target */
587 if (target < nr_cpu_ids) {
588 cpumask_set_cpu(target, &rapl_cpu_mask);
589 pmu->cpu = target;
590 perf_pmu_migrate_context(pmu->pmu, cpu, target);
591 }
592 return 0;
593 }
594
rapl_cpu_online(unsigned int cpu)595 static int rapl_cpu_online(unsigned int cpu)
596 {
597 s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu);
598 if (rapl_pmu_idx < 0) {
599 pr_err("topology_logical_(package/die)_id() returned a negative value");
600 return -EINVAL;
601 }
602 struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
603 int target;
604
605 if (!pmu) {
606 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
607 if (!pmu)
608 return -ENOMEM;
609
610 raw_spin_lock_init(&pmu->lock);
611 INIT_LIST_HEAD(&pmu->active_list);
612 pmu->pmu = &rapl_pmus->pmu;
613 pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
614 rapl_hrtimer_init(pmu);
615
616 rapl_pmus->pmus[rapl_pmu_idx] = pmu;
617 }
618
619 /*
620 * Check if there is an online cpu in the package which collects rapl
621 * events already.
622 */
623 target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu));
624 if (target < nr_cpu_ids)
625 return 0;
626
627 cpumask_set_cpu(cpu, &rapl_cpu_mask);
628 pmu->cpu = cpu;
629 return 0;
630 }
631
rapl_check_hw_unit(struct rapl_model * rm)632 static int rapl_check_hw_unit(struct rapl_model *rm)
633 {
634 u64 msr_rapl_power_unit_bits;
635 int i;
636
637 /* protect rdmsrl() to handle virtualization */
638 if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
639 return -1;
640 for (i = 0; i < NR_RAPL_DOMAINS; i++)
641 rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
642
643 switch (rm->unit_quirk) {
644 /*
645 * DRAM domain on HSW server and KNL has fixed energy unit which can be
646 * different than the unit from power unit MSR. See
647 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
648 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
649 */
650 case RAPL_UNIT_QUIRK_INTEL_HSW:
651 rapl_hw_unit[PERF_RAPL_RAM] = 16;
652 break;
653 /* SPR uses a fixed energy unit for Psys domain. */
654 case RAPL_UNIT_QUIRK_INTEL_SPR:
655 rapl_hw_unit[PERF_RAPL_PSYS] = 0;
656 break;
657 default:
658 break;
659 }
660
661
662 /*
663 * Calculate the timer rate:
664 * Use reference of 200W for scaling the timeout to avoid counter
665 * overflows. 200W = 200 Joules/sec
666 * Divide interval by 2 to avoid lockstep (2 * 100)
667 * if hw unit is 32, then we use 2 ms 1/200/2
668 */
669 rapl_timer_ms = 2;
670 if (rapl_hw_unit[0] < 32) {
671 rapl_timer_ms = (1000 / (2 * 100));
672 rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
673 }
674 return 0;
675 }
676
rapl_advertise(void)677 static void __init rapl_advertise(void)
678 {
679 int i;
680
681 pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
682 hweight32(rapl_cntr_mask), rapl_timer_ms);
683
684 for (i = 0; i < NR_RAPL_DOMAINS; i++) {
685 if (rapl_cntr_mask & (1 << i)) {
686 pr_info("hw unit of domain %s 2^-%d Joules\n",
687 rapl_domain_names[i], rapl_hw_unit[i]);
688 }
689 }
690 }
691
cleanup_rapl_pmus(void)692 static void cleanup_rapl_pmus(void)
693 {
694 int i;
695
696 for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
697 kfree(rapl_pmus->pmus[i]);
698 kfree(rapl_pmus);
699 }
700
701 static const struct attribute_group *rapl_attr_update[] = {
702 &rapl_events_cores_group,
703 &rapl_events_pkg_group,
704 &rapl_events_ram_group,
705 &rapl_events_gpu_group,
706 &rapl_events_psys_group,
707 NULL,
708 };
709
init_rapl_pmus(void)710 static int __init init_rapl_pmus(void)
711 {
712 int nr_rapl_pmu = topology_max_packages();
713
714 if (!rapl_pmu_is_pkg_scope())
715 nr_rapl_pmu *= topology_max_dies_per_package();
716
717 rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
718 if (!rapl_pmus)
719 return -ENOMEM;
720
721 rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
722 rapl_pmus->pmu.attr_groups = rapl_attr_groups;
723 rapl_pmus->pmu.attr_update = rapl_attr_update;
724 rapl_pmus->pmu.task_ctx_nr = perf_invalid_context;
725 rapl_pmus->pmu.event_init = rapl_pmu_event_init;
726 rapl_pmus->pmu.add = rapl_pmu_event_add;
727 rapl_pmus->pmu.del = rapl_pmu_event_del;
728 rapl_pmus->pmu.start = rapl_pmu_event_start;
729 rapl_pmus->pmu.stop = rapl_pmu_event_stop;
730 rapl_pmus->pmu.read = rapl_pmu_event_read;
731 rapl_pmus->pmu.module = THIS_MODULE;
732 rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
733 return 0;
734 }
735
736 static struct rapl_model model_snb = {
737 .events = BIT(PERF_RAPL_PP0) |
738 BIT(PERF_RAPL_PKG) |
739 BIT(PERF_RAPL_PP1),
740 .msr_power_unit = MSR_RAPL_POWER_UNIT,
741 .rapl_msrs = intel_rapl_msrs,
742 };
743
744 static struct rapl_model model_snbep = {
745 .events = BIT(PERF_RAPL_PP0) |
746 BIT(PERF_RAPL_PKG) |
747 BIT(PERF_RAPL_RAM),
748 .msr_power_unit = MSR_RAPL_POWER_UNIT,
749 .rapl_msrs = intel_rapl_msrs,
750 };
751
752 static struct rapl_model model_hsw = {
753 .events = BIT(PERF_RAPL_PP0) |
754 BIT(PERF_RAPL_PKG) |
755 BIT(PERF_RAPL_RAM) |
756 BIT(PERF_RAPL_PP1),
757 .msr_power_unit = MSR_RAPL_POWER_UNIT,
758 .rapl_msrs = intel_rapl_msrs,
759 };
760
761 static struct rapl_model model_hsx = {
762 .events = BIT(PERF_RAPL_PP0) |
763 BIT(PERF_RAPL_PKG) |
764 BIT(PERF_RAPL_RAM),
765 .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
766 .msr_power_unit = MSR_RAPL_POWER_UNIT,
767 .rapl_msrs = intel_rapl_msrs,
768 };
769
770 static struct rapl_model model_knl = {
771 .events = BIT(PERF_RAPL_PKG) |
772 BIT(PERF_RAPL_RAM),
773 .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
774 .msr_power_unit = MSR_RAPL_POWER_UNIT,
775 .rapl_msrs = intel_rapl_msrs,
776 };
777
778 static struct rapl_model model_skl = {
779 .events = BIT(PERF_RAPL_PP0) |
780 BIT(PERF_RAPL_PKG) |
781 BIT(PERF_RAPL_RAM) |
782 BIT(PERF_RAPL_PP1) |
783 BIT(PERF_RAPL_PSYS),
784 .msr_power_unit = MSR_RAPL_POWER_UNIT,
785 .rapl_msrs = intel_rapl_msrs,
786 };
787
788 static struct rapl_model model_spr = {
789 .events = BIT(PERF_RAPL_PP0) |
790 BIT(PERF_RAPL_PKG) |
791 BIT(PERF_RAPL_RAM) |
792 BIT(PERF_RAPL_PSYS),
793 .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
794 .msr_power_unit = MSR_RAPL_POWER_UNIT,
795 .rapl_msrs = intel_rapl_spr_msrs,
796 };
797
798 static struct rapl_model model_amd_hygon = {
799 .events = BIT(PERF_RAPL_PKG),
800 .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
801 .rapl_msrs = amd_rapl_msrs,
802 };
803
804 static const struct x86_cpu_id rapl_model_match[] __initconst = {
805 X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon),
806 X86_MATCH_VFM(INTEL_SANDYBRIDGE, &model_snb),
807 X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &model_snbep),
808 X86_MATCH_VFM(INTEL_IVYBRIDGE, &model_snb),
809 X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &model_snbep),
810 X86_MATCH_VFM(INTEL_HASWELL, &model_hsw),
811 X86_MATCH_VFM(INTEL_HASWELL_X, &model_hsx),
812 X86_MATCH_VFM(INTEL_HASWELL_L, &model_hsw),
813 X86_MATCH_VFM(INTEL_HASWELL_G, &model_hsw),
814 X86_MATCH_VFM(INTEL_BROADWELL, &model_hsw),
815 X86_MATCH_VFM(INTEL_BROADWELL_G, &model_hsw),
816 X86_MATCH_VFM(INTEL_BROADWELL_X, &model_hsx),
817 X86_MATCH_VFM(INTEL_BROADWELL_D, &model_hsx),
818 X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &model_knl),
819 X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &model_knl),
820 X86_MATCH_VFM(INTEL_SKYLAKE_L, &model_skl),
821 X86_MATCH_VFM(INTEL_SKYLAKE, &model_skl),
822 X86_MATCH_VFM(INTEL_SKYLAKE_X, &model_hsx),
823 X86_MATCH_VFM(INTEL_KABYLAKE_L, &model_skl),
824 X86_MATCH_VFM(INTEL_KABYLAKE, &model_skl),
825 X86_MATCH_VFM(INTEL_CANNONLAKE_L, &model_skl),
826 X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &model_hsw),
827 X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &model_hsw),
828 X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &model_hsw),
829 X86_MATCH_VFM(INTEL_ICELAKE_L, &model_skl),
830 X86_MATCH_VFM(INTEL_ICELAKE, &model_skl),
831 X86_MATCH_VFM(INTEL_ICELAKE_D, &model_hsx),
832 X86_MATCH_VFM(INTEL_ICELAKE_X, &model_hsx),
833 X86_MATCH_VFM(INTEL_COMETLAKE_L, &model_skl),
834 X86_MATCH_VFM(INTEL_COMETLAKE, &model_skl),
835 X86_MATCH_VFM(INTEL_TIGERLAKE_L, &model_skl),
836 X86_MATCH_VFM(INTEL_TIGERLAKE, &model_skl),
837 X86_MATCH_VFM(INTEL_ALDERLAKE, &model_skl),
838 X86_MATCH_VFM(INTEL_ALDERLAKE_L, &model_skl),
839 X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &model_skl),
840 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &model_spr),
841 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &model_spr),
842 X86_MATCH_VFM(INTEL_RAPTORLAKE, &model_skl),
843 X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_skl),
844 X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &model_skl),
845 X86_MATCH_VFM(INTEL_METEORLAKE, &model_skl),
846 X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl),
847 X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl),
848 X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl),
849 X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl),
850 {},
851 };
852 MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
853
rapl_pmu_init(void)854 static int __init rapl_pmu_init(void)
855 {
856 const struct x86_cpu_id *id;
857 struct rapl_model *rm;
858 int ret;
859
860 id = x86_match_cpu(rapl_model_match);
861 if (!id)
862 return -ENODEV;
863
864 rm = (struct rapl_model *) id->driver_data;
865
866 rapl_msrs = rm->rapl_msrs;
867
868 rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
869 false, (void *) &rm->events);
870
871 ret = rapl_check_hw_unit(rm);
872 if (ret)
873 return ret;
874
875 ret = init_rapl_pmus();
876 if (ret)
877 return ret;
878
879 /*
880 * Install callbacks. Core will call them for each online cpu.
881 */
882 ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
883 "perf/x86/rapl:online",
884 rapl_cpu_online, rapl_cpu_offline);
885 if (ret)
886 goto out;
887
888 ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
889 if (ret)
890 goto out1;
891
892 rapl_advertise();
893 return 0;
894
895 out1:
896 cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
897 out:
898 pr_warn("Initialization failed (%d), disabled\n", ret);
899 cleanup_rapl_pmus();
900 return ret;
901 }
902 module_init(rapl_pmu_init);
903
intel_rapl_exit(void)904 static void __exit intel_rapl_exit(void)
905 {
906 cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
907 perf_pmu_unregister(&rapl_pmus->pmu);
908 cleanup_rapl_pmus();
909 }
910 module_exit(intel_rapl_exit);
911