xref: /linux/drivers/powercap/intel_rapl_common.c (revision b45e0c30bc58fb6fcaa42f1d1d813cefb8ab4117)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Common code for Intel Running Average Power Limit (RAPL) support.
4  * Copyright (c) 2019, Intel Corporation.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/kernel.h>
9 #include <linux/module.h>
10 #include <linux/list.h>
11 #include <linux/types.h>
12 #include <linux/device.h>
13 #include <linux/slab.h>
14 #include <linux/log2.h>
15 #include <linux/bitmap.h>
16 #include <linux/delay.h>
17 #include <linux/sysfs.h>
18 #include <linux/cpu.h>
19 #include <linux/powercap.h>
20 #include <linux/suspend.h>
21 #include <linux/intel_rapl.h>
22 #include <linux/processor.h>
23 #include <linux/platform_device.h>
24 
25 #include <asm/iosf_mbi.h>
26 #include <asm/cpu_device_id.h>
27 #include <asm/intel-family.h>
28 
29 /* Local defines */
30 #define MSR_PLATFORM_POWER_LIMIT	0x0000065C
31 
32 /* bitmasks for RAPL MSRs, used by primitive access functions */
33 #define ENERGY_STATUS_MASK      0xffffffff
34 
35 #define POWER_LIMIT1_MASK       0x7FFF
36 #define POWER_LIMIT1_ENABLE     BIT(15)
37 #define POWER_LIMIT1_CLAMP      BIT(16)
38 
39 #define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
40 #define POWER_LIMIT2_ENABLE     BIT_ULL(47)
41 #define POWER_LIMIT2_CLAMP      BIT_ULL(48)
42 #define POWER_HIGH_LOCK         BIT_ULL(63)
43 #define POWER_LOW_LOCK          BIT(31)
44 
45 #define TIME_WINDOW1_MASK       (0x7FULL<<17)
46 #define TIME_WINDOW2_MASK       (0x7FULL<<49)
47 
48 #define POWER_UNIT_OFFSET	0
49 #define POWER_UNIT_MASK		0x0F
50 
51 #define ENERGY_UNIT_OFFSET	0x08
52 #define ENERGY_UNIT_MASK	0x1F00
53 
54 #define TIME_UNIT_OFFSET	0x10
55 #define TIME_UNIT_MASK		0xF0000
56 
57 #define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
58 #define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
59 #define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
60 #define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
61 
62 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
63 #define PP_POLICY_MASK         0x1F
64 
65 /* Non HW constants */
66 #define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
67 #define RAPL_PRIMITIVE_DUMMY         BIT(2)
68 
69 #define TIME_WINDOW_MAX_MSEC 40000
70 #define TIME_WINDOW_MIN_MSEC 250
71 #define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
/* Selects the conversion rapl_unit_xlate() applies to a primitive's value */
enum unit_type {
	ARBITRARY_UNIT,		/* no translation */
	POWER_UNIT,		/* scaled by the package power unit */
	ENERGY_UNIT,		/* scaled by the domain or package energy unit */
	TIME_UNIT,		/* converted by the compute_time_window() hook */
};
78 
79 /* per domain data, some are optional */
80 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
81 
82 #define	DOMAIN_STATE_INACTIVE           BIT(0)
83 #define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
84 #define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
85 
86 static const char pl1_name[] = "long_term";
87 static const char pl2_name[] = "short_term";
88 
89 #define power_zone_to_rapl_domain(_zone) \
90 	container_of(_zone, struct rapl_domain, power_zone)
91 
/*
 * CPU model specific parameters and hooks. The instance in use is reached
 * through the rapl_defaults pointer declared right after the structure.
 */
struct rapl_defaults {
	/* IOSF register offset used by set_floor_freq_atom(); 0 means none */
	u8 floor_freq_reg_addr;
	/* reads the unit register and fills the energy/power/time units of rp */
	int (*check_unit)(struct rapl_package *rp, int cpu);
	/* optional hook (may be NULL) called when a domain is enabled/disabled */
	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
	/* converts a time window between its raw encoding and translated value */
	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
				    bool to_raw);
	/* non-zero value is used as the DRAM domain's own energy unit */
	unsigned int dram_domain_energy_unit;
};
static struct rapl_defaults *rapl_defaults;
101 
102 /* Sideband MBI registers */
103 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
104 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
105 
106 #define PACKAGE_PLN_INT_SAVED   BIT(0)
107 #define MAX_PRIM_NAME (32)
108 
/*
 * Per-primitive description. It describes each individual knob (which
 * register, which bits, which unit) so that register access can be
 * consolidated into one read and one write function instead of many
 * inline helpers.
 */
struct rapl_primitive_info {
	const char *name;		/* primitive name; NULL terminates the table */
	u64 mask;			/* bits of the register occupied by the field */
	int shift;			/* position of the field's lowest bit */
	enum rapl_domain_reg_id id;	/* index into the domain's regs[] array */
	enum unit_type unit;		/* conversion applied by rapl_unit_xlate() */
	u32 flag;			/* RAPL_PRIMITIVE_* flags, 0 for plain fields */
};
120 
121 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
122 		.name = #p,			\
123 		.mask = m,			\
124 		.shift = s,			\
125 		.id = i,			\
126 		.unit = u,			\
127 		.flag = f			\
128 	}
129 
130 static void rapl_init_domains(struct rapl_package *rp);
131 static int rapl_read_data_raw(struct rapl_domain *rd,
132 			      enum rapl_primitives prim,
133 			      bool xlate, u64 *data);
134 static int rapl_write_data_raw(struct rapl_domain *rd,
135 			       enum rapl_primitives prim,
136 			       unsigned long long value);
137 static u64 rapl_unit_xlate(struct rapl_domain *rd,
138 			   enum unit_type type, u64 value, int to_raw);
139 static void package_power_limit_irq_save(struct rapl_package *rp);
140 
141 static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
142 
/*
 * Domain names, indexed by enum rapl_domain_type (rapl_init_domains() looks
 * up the name with the domain type as index).
 */
static const char *const rapl_domain_names[] = {
	"package",
	"core",
	"uncore",
	"dram",
	"psys",
};
150 
151 static int get_energy_counter(struct powercap_zone *power_zone,
152 			      u64 *energy_raw)
153 {
154 	struct rapl_domain *rd;
155 	u64 energy_now;
156 
157 	/* prevent CPU hotplug, make sure the RAPL domain does not go
158 	 * away while reading the counter.
159 	 */
160 	get_online_cpus();
161 	rd = power_zone_to_rapl_domain(power_zone);
162 
163 	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
164 		*energy_raw = energy_now;
165 		put_online_cpus();
166 
167 		return 0;
168 	}
169 	put_online_cpus();
170 
171 	return -EIO;
172 }
173 
174 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
175 {
176 	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
177 
178 	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
179 	return 0;
180 }
181 
182 static int release_zone(struct powercap_zone *power_zone)
183 {
184 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
185 	struct rapl_package *rp = rd->rp;
186 
187 	/* package zone is the last zone of a package, we can free
188 	 * memory here since all children has been unregistered.
189 	 */
190 	if (rd->id == RAPL_DOMAIN_PACKAGE) {
191 		kfree(rd);
192 		rp->domains = NULL;
193 	}
194 
195 	return 0;
196 
197 }
198 
199 static int find_nr_power_limit(struct rapl_domain *rd)
200 {
201 	int i, nr_pl = 0;
202 
203 	for (i = 0; i < NR_POWER_LIMITS; i++) {
204 		if (rd->rpl[i].name)
205 			nr_pl++;
206 	}
207 
208 	return nr_pl;
209 }
210 
211 static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
212 {
213 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
214 
215 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
216 		return -EACCES;
217 
218 	get_online_cpus();
219 	rapl_write_data_raw(rd, PL1_ENABLE, mode);
220 	if (rapl_defaults->set_floor_freq)
221 		rapl_defaults->set_floor_freq(rd, mode);
222 	put_online_cpus();
223 
224 	return 0;
225 }
226 
227 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
228 {
229 	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
230 	u64 val;
231 
232 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
233 		*mode = false;
234 		return 0;
235 	}
236 	get_online_cpus();
237 	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
238 		put_online_cpus();
239 		return -EIO;
240 	}
241 	*mode = val;
242 	put_online_cpus();
243 
244 	return 0;
245 }
246 
247 /* per RAPL domain ops, in the order of rapl_domain_type */
248 static const struct powercap_zone_ops zone_ops[] = {
249 	/* RAPL_DOMAIN_PACKAGE */
250 	{
251 	 .get_energy_uj = get_energy_counter,
252 	 .get_max_energy_range_uj = get_max_energy_counter,
253 	 .release = release_zone,
254 	 .set_enable = set_domain_enable,
255 	 .get_enable = get_domain_enable,
256 	 },
257 	/* RAPL_DOMAIN_PP0 */
258 	{
259 	 .get_energy_uj = get_energy_counter,
260 	 .get_max_energy_range_uj = get_max_energy_counter,
261 	 .release = release_zone,
262 	 .set_enable = set_domain_enable,
263 	 .get_enable = get_domain_enable,
264 	 },
265 	/* RAPL_DOMAIN_PP1 */
266 	{
267 	 .get_energy_uj = get_energy_counter,
268 	 .get_max_energy_range_uj = get_max_energy_counter,
269 	 .release = release_zone,
270 	 .set_enable = set_domain_enable,
271 	 .get_enable = get_domain_enable,
272 	 },
273 	/* RAPL_DOMAIN_DRAM */
274 	{
275 	 .get_energy_uj = get_energy_counter,
276 	 .get_max_energy_range_uj = get_max_energy_counter,
277 	 .release = release_zone,
278 	 .set_enable = set_domain_enable,
279 	 .get_enable = get_domain_enable,
280 	 },
281 	/* RAPL_DOMAIN_PLATFORM */
282 	{
283 	 .get_energy_uj = get_energy_counter,
284 	 .get_max_energy_range_uj = get_max_energy_counter,
285 	 .release = release_zone,
286 	 .set_enable = set_domain_enable,
287 	 .get_enable = get_domain_enable,
288 	 },
289 };
290 
291 /*
292  * Constraint index used by powercap can be different than power limit (PL)
293  * index in that some  PLs maybe missing due to non-existent MSRs. So we
294  * need to convert here by finding the valid PLs only (name populated).
295  */
296 static int contraint_to_pl(struct rapl_domain *rd, int cid)
297 {
298 	int i, j;
299 
300 	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
301 		if ((rd->rpl[i].name) && j++ == cid) {
302 			pr_debug("%s: index %d\n", __func__, i);
303 			return i;
304 		}
305 	}
306 	pr_err("Cannot find matching power limit for constraint %d\n", cid);
307 
308 	return -EINVAL;
309 }
310 
311 static int set_power_limit(struct powercap_zone *power_zone, int cid,
312 			   u64 power_limit)
313 {
314 	struct rapl_domain *rd;
315 	struct rapl_package *rp;
316 	int ret = 0;
317 	int id;
318 
319 	get_online_cpus();
320 	rd = power_zone_to_rapl_domain(power_zone);
321 	id = contraint_to_pl(rd, cid);
322 	if (id < 0) {
323 		ret = id;
324 		goto set_exit;
325 	}
326 
327 	rp = rd->rp;
328 
329 	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
330 		dev_warn(&power_zone->dev,
331 			 "%s locked by BIOS, monitoring only\n", rd->name);
332 		ret = -EACCES;
333 		goto set_exit;
334 	}
335 
336 	switch (rd->rpl[id].prim_id) {
337 	case PL1_ENABLE:
338 		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
339 		break;
340 	case PL2_ENABLE:
341 		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
342 		break;
343 	default:
344 		ret = -EINVAL;
345 	}
346 	if (!ret)
347 		package_power_limit_irq_save(rp);
348 set_exit:
349 	put_online_cpus();
350 	return ret;
351 }
352 
353 static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
354 				   u64 *data)
355 {
356 	struct rapl_domain *rd;
357 	u64 val;
358 	int prim;
359 	int ret = 0;
360 	int id;
361 
362 	get_online_cpus();
363 	rd = power_zone_to_rapl_domain(power_zone);
364 	id = contraint_to_pl(rd, cid);
365 	if (id < 0) {
366 		ret = id;
367 		goto get_exit;
368 	}
369 
370 	switch (rd->rpl[id].prim_id) {
371 	case PL1_ENABLE:
372 		prim = POWER_LIMIT1;
373 		break;
374 	case PL2_ENABLE:
375 		prim = POWER_LIMIT2;
376 		break;
377 	default:
378 		put_online_cpus();
379 		return -EINVAL;
380 	}
381 	if (rapl_read_data_raw(rd, prim, true, &val))
382 		ret = -EIO;
383 	else
384 		*data = val;
385 
386 get_exit:
387 	put_online_cpus();
388 
389 	return ret;
390 }
391 
392 static int set_time_window(struct powercap_zone *power_zone, int cid,
393 			   u64 window)
394 {
395 	struct rapl_domain *rd;
396 	int ret = 0;
397 	int id;
398 
399 	get_online_cpus();
400 	rd = power_zone_to_rapl_domain(power_zone);
401 	id = contraint_to_pl(rd, cid);
402 	if (id < 0) {
403 		ret = id;
404 		goto set_time_exit;
405 	}
406 
407 	switch (rd->rpl[id].prim_id) {
408 	case PL1_ENABLE:
409 		rapl_write_data_raw(rd, TIME_WINDOW1, window);
410 		break;
411 	case PL2_ENABLE:
412 		rapl_write_data_raw(rd, TIME_WINDOW2, window);
413 		break;
414 	default:
415 		ret = -EINVAL;
416 	}
417 
418 set_time_exit:
419 	put_online_cpus();
420 	return ret;
421 }
422 
423 static int get_time_window(struct powercap_zone *power_zone, int cid,
424 			   u64 *data)
425 {
426 	struct rapl_domain *rd;
427 	u64 val;
428 	int ret = 0;
429 	int id;
430 
431 	get_online_cpus();
432 	rd = power_zone_to_rapl_domain(power_zone);
433 	id = contraint_to_pl(rd, cid);
434 	if (id < 0) {
435 		ret = id;
436 		goto get_time_exit;
437 	}
438 
439 	switch (rd->rpl[id].prim_id) {
440 	case PL1_ENABLE:
441 		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
442 		break;
443 	case PL2_ENABLE:
444 		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
445 		break;
446 	default:
447 		put_online_cpus();
448 		return -EINVAL;
449 	}
450 	if (!ret)
451 		*data = val;
452 
453 get_time_exit:
454 	put_online_cpus();
455 
456 	return ret;
457 }
458 
459 static const char *get_constraint_name(struct powercap_zone *power_zone,
460 				       int cid)
461 {
462 	struct rapl_domain *rd;
463 	int id;
464 
465 	rd = power_zone_to_rapl_domain(power_zone);
466 	id = contraint_to_pl(rd, cid);
467 	if (id >= 0)
468 		return rd->rpl[id].name;
469 
470 	return NULL;
471 }
472 
473 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
474 {
475 	struct rapl_domain *rd;
476 	u64 val;
477 	int prim;
478 	int ret = 0;
479 
480 	get_online_cpus();
481 	rd = power_zone_to_rapl_domain(power_zone);
482 	switch (rd->rpl[id].prim_id) {
483 	case PL1_ENABLE:
484 		prim = THERMAL_SPEC_POWER;
485 		break;
486 	case PL2_ENABLE:
487 		prim = MAX_POWER;
488 		break;
489 	default:
490 		put_online_cpus();
491 		return -EINVAL;
492 	}
493 	if (rapl_read_data_raw(rd, prim, true, &val))
494 		ret = -EIO;
495 	else
496 		*data = val;
497 
498 	put_online_cpus();
499 
500 	return ret;
501 }
502 
/*
 * Constraint callbacks shared by all RAPL zones; passed to
 * powercap_register_zone() together with the number of power limits.
 */
static const struct powercap_zone_constraint_ops constraint_ops = {
	.set_power_limit_uw = set_power_limit,
	.get_power_limit_uw = get_current_power_limit,
	.set_time_window_us = set_time_window,
	.get_time_window_us = get_time_window,
	.get_max_power_uw = get_max_power,
	.get_name = get_constraint_name,
};
511 
512 /* called after domain detection and package level data are set */
513 static void rapl_init_domains(struct rapl_package *rp)
514 {
515 	enum rapl_domain_type i;
516 	enum rapl_domain_reg_id j;
517 	struct rapl_domain *rd = rp->domains;
518 
519 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
520 		unsigned int mask = rp->domain_map & (1 << i);
521 
522 		if (!mask)
523 			continue;
524 
525 		rd->rp = rp;
526 		rd->name = rapl_domain_names[i];
527 		rd->id = i;
528 		rd->rpl[0].prim_id = PL1_ENABLE;
529 		rd->rpl[0].name = pl1_name;
530 		/* some domain may support two power limits */
531 		if (rp->priv->limits[i] == 2) {
532 			rd->rpl[1].prim_id = PL2_ENABLE;
533 			rd->rpl[1].name = pl2_name;
534 		}
535 
536 		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
537 			rd->regs[j] = rp->priv->regs[i][j];
538 
539 		if (i == RAPL_DOMAIN_DRAM) {
540 			rd->domain_energy_unit =
541 			    rapl_defaults->dram_domain_energy_unit;
542 			if (rd->domain_energy_unit)
543 				pr_info("DRAM domain energy unit %dpj\n",
544 					rd->domain_energy_unit);
545 		}
546 		rd++;
547 	}
548 }
549 
550 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
551 			   u64 value, int to_raw)
552 {
553 	u64 units = 1;
554 	struct rapl_package *rp = rd->rp;
555 	u64 scale = 1;
556 
557 	switch (type) {
558 	case POWER_UNIT:
559 		units = rp->power_unit;
560 		break;
561 	case ENERGY_UNIT:
562 		scale = ENERGY_UNIT_SCALE;
563 		/* per domain unit takes precedence */
564 		if (rd->domain_energy_unit)
565 			units = rd->domain_energy_unit;
566 		else
567 			units = rp->energy_unit;
568 		break;
569 	case TIME_UNIT:
570 		return rapl_defaults->compute_time_window(rp, value, to_raw);
571 	case ARBITRARY_UNIT:
572 	default:
573 		return value;
574 	};
575 
576 	if (to_raw)
577 		return div64_u64(value, units) * scale;
578 
579 	value *= units;
580 
581 	return div64_u64(value, scale);
582 }
583 
/*
 * Description of every RAPL primitive, in the order of enum rapl_primitives
 * (the table is indexed with the primitive id). The list ends with an entry
 * whose name is NULL.
 */
static struct rapl_primitive_info rpi[] = {
	/* primitive (also its name), mask, shift, register id, unit, flags */
	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
	/* non-hardware: value is not read from a register field */
	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
			    RAPL_PRIMITIVE_DERIVED),
	/* terminator: name == NULL */
	{NULL, 0, 0, 0},
};
624 
625 /* Read primitive data based on its related struct rapl_primitive_info.
626  * if xlate flag is set, return translated data based on data units, i.e.
627  * time, energy, and power.
628  * RAPL MSRs are non-architectual and are laid out not consistently across
629  * domains. Here we use primitive info to allow writing consolidated access
630  * functions.
631  * For a given primitive, it is processed by MSR mask and shift. Unit conversion
632  * is pre-assigned based on RAPL unit MSRs read at init time.
633  * 63-------------------------- 31--------------------------- 0
634  * |                           xxxxx (mask)                   |
635  * |                                |<- shift ----------------|
636  * 63-------------------------- 31--------------------------- 0
637  */
638 static int rapl_read_data_raw(struct rapl_domain *rd,
639 			      enum rapl_primitives prim, bool xlate, u64 *data)
640 {
641 	u64 value;
642 	struct rapl_primitive_info *rp = &rpi[prim];
643 	struct reg_action ra;
644 	int cpu;
645 
646 	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
647 		return -EINVAL;
648 
649 	ra.reg = rd->regs[rp->id];
650 	if (!ra.reg)
651 		return -EINVAL;
652 
653 	cpu = rd->rp->lead_cpu;
654 
655 	/* domain with 2 limits has different bit */
656 	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
657 		rp->mask = POWER_HIGH_LOCK;
658 		rp->shift = 63;
659 	}
660 	/* non-hardware data are collected by the polling thread */
661 	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
662 		*data = rd->rdd.primitives[prim];
663 		return 0;
664 	}
665 
666 	ra.mask = rp->mask;
667 
668 	if (rd->rp->priv->read_raw(cpu, &ra)) {
669 		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
670 		return -EIO;
671 	}
672 
673 	value = ra.value >> rp->shift;
674 
675 	if (xlate)
676 		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
677 	else
678 		*data = value;
679 
680 	return 0;
681 }
682 
683 /* Similar use of primitive info in the read counterpart */
684 static int rapl_write_data_raw(struct rapl_domain *rd,
685 			       enum rapl_primitives prim,
686 			       unsigned long long value)
687 {
688 	struct rapl_primitive_info *rp = &rpi[prim];
689 	int cpu;
690 	u64 bits;
691 	struct reg_action ra;
692 	int ret;
693 
694 	cpu = rd->rp->lead_cpu;
695 	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
696 	bits <<= rp->shift;
697 	bits &= rp->mask;
698 
699 	memset(&ra, 0, sizeof(ra));
700 
701 	ra.reg = rd->regs[rp->id];
702 	ra.mask = rp->mask;
703 	ra.value = bits;
704 
705 	ret = rd->rp->priv->write_raw(cpu, &ra);
706 
707 	return ret;
708 }
709 
710 /*
711  * Raw RAPL data stored in MSRs are in certain scales. We need to
712  * convert them into standard units based on the units reported in
713  * the RAPL unit MSRs. This is specific to CPUs as the method to
714  * calculate units differ on different CPUs.
715  * We convert the units to below format based on CPUs.
716  * i.e.
717  * energy unit: picoJoules  : Represented in picoJoules by default
718  * power unit : microWatts  : Represented in milliWatts by default
719  * time unit  : microseconds: Represented in seconds by default
720  */
721 static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
722 {
723 	struct reg_action ra;
724 	u32 value;
725 
726 	ra.reg = rp->priv->reg_unit;
727 	ra.mask = ~0;
728 	if (rp->priv->read_raw(cpu, &ra)) {
729 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
730 		       rp->priv->reg_unit, cpu);
731 		return -ENODEV;
732 	}
733 
734 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
735 	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
736 
737 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
738 	rp->power_unit = 1000000 / (1 << value);
739 
740 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
741 	rp->time_unit = 1000000 / (1 << value);
742 
743 	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
744 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
745 
746 	return 0;
747 }
748 
749 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
750 {
751 	struct reg_action ra;
752 	u32 value;
753 
754 	ra.reg = rp->priv->reg_unit;
755 	ra.mask = ~0;
756 	if (rp->priv->read_raw(cpu, &ra)) {
757 		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
758 		       rp->priv->reg_unit, cpu);
759 		return -ENODEV;
760 	}
761 
762 	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
763 	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
764 
765 	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
766 	rp->power_unit = (1 << value) * 1000;
767 
768 	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
769 	rp->time_unit = 1000000 / (1 << value);
770 
771 	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
772 		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
773 
774 	return 0;
775 }
776 
777 static void power_limit_irq_save_cpu(void *info)
778 {
779 	u32 l, h = 0;
780 	struct rapl_package *rp = (struct rapl_package *)info;
781 
782 	/* save the state of PLN irq mask bit before disabling it */
783 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
784 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
785 		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
786 		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
787 	}
788 	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
789 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
790 }
791 
792 /* REVISIT:
793  * When package power limit is set artificially low by RAPL, LVT
794  * thermal interrupt for package power limit should be ignored
795  * since we are not really exceeding the real limit. The intention
796  * is to avoid excessive interrupts while we are trying to save power.
797  * A useful feature might be routing the package_power_limit interrupt
798  * to userspace via eventfd. once we have a usecase, this is simple
799  * to do by adding an atomic notifier.
800  */
801 
802 static void package_power_limit_irq_save(struct rapl_package *rp)
803 {
804 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
805 		return;
806 
807 	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
808 }
809 
810 /*
811  * Restore per package power limit interrupt enable state. Called from cpu
812  * hotplug code on package removal.
813  */
814 static void package_power_limit_irq_restore(struct rapl_package *rp)
815 {
816 	u32 l, h;
817 
818 	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
819 		return;
820 
821 	/* irq enable state not saved, nothing to restore */
822 	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
823 		return;
824 
825 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
826 
827 	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
828 		l |= PACKAGE_THERM_INT_PLN_ENABLE;
829 	else
830 		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
831 
832 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
833 }
834 
835 static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
836 {
837 	int nr_powerlimit = find_nr_power_limit(rd);
838 
839 	/* always enable clamp such that p-state can go below OS requested
840 	 * range. power capping priority over guranteed frequency.
841 	 */
842 	rapl_write_data_raw(rd, PL1_CLAMP, mode);
843 
844 	/* some domains have pl2 */
845 	if (nr_powerlimit > 1) {
846 		rapl_write_data_raw(rd, PL2_ENABLE, mode);
847 		rapl_write_data_raw(rd, PL2_CLAMP, mode);
848 	}
849 }
850 
851 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
852 {
853 	static u32 power_ctrl_orig_val;
854 	u32 mdata;
855 
856 	if (!rapl_defaults->floor_freq_reg_addr) {
857 		pr_err("Invalid floor frequency config register\n");
858 		return;
859 	}
860 
861 	if (!power_ctrl_orig_val)
862 		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
863 			      rapl_defaults->floor_freq_reg_addr,
864 			      &power_ctrl_orig_val);
865 	mdata = power_ctrl_orig_val;
866 	if (enable) {
867 		mdata &= ~(0x7f << 8);
868 		mdata |= 1 << 8;
869 	}
870 	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
871 		       rapl_defaults->floor_freq_reg_addr, mdata);
872 }
873 
874 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
875 					 bool to_raw)
876 {
877 	u64 f, y;		/* fraction and exp. used for time unit */
878 
879 	/*
880 	 * Special processing based on 2^Y*(1+F/4), refer
881 	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
882 	 */
883 	if (!to_raw) {
884 		f = (value & 0x60) >> 5;
885 		y = value & 0x1f;
886 		value = (1 << y) * (4 + f) * rp->time_unit / 4;
887 	} else {
888 		do_div(value, rp->time_unit);
889 		y = ilog2(value);
890 		f = div64_u64(4 * (value - (1 << y)), 1 << y);
891 		value = (y & 0x1f) | ((f & 0x3) << 5);
892 	}
893 	return value;
894 }
895 
896 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
897 					 bool to_raw)
898 {
899 	/*
900 	 * Atom time unit encoding is straight forward val * time_unit,
901 	 * where time_unit is default to 1 sec. Never 0.
902 	 */
903 	if (!to_raw)
904 		return (value) ? value *= rp->time_unit : rp->time_unit;
905 
906 	value = div64_u64(value, rp->time_unit);
907 
908 	return value;
909 }
910 
/* Core style units and time window encoding, no DRAM energy unit override */
static const struct rapl_defaults rapl_defaults_core = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
};

/* Same as core, but the DRAM domain uses a fixed energy unit of its own */
static const struct rapl_defaults rapl_defaults_hsw_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.dram_domain_energy_unit = 15300,
};

/* Atom units; floor frequency controlled through the BYT IOSF register */
static const struct rapl_defaults rapl_defaults_byt = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom units; floor frequency controlled through the TNG IOSF register */
static const struct rapl_defaults rapl_defaults_tng = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom units, no floor frequency hook */
static const struct rapl_defaults rapl_defaults_ann = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom units, no floor frequency hook */
static const struct rapl_defaults rapl_defaults_cht = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};
952 
/*
 * CPU models known to this driver, each paired with the rapl_defaults
 * instance describing its unit handling. The empty entry ends the list.
 */
static const struct x86_cpu_id rapl_ids[] __initconst = {
	INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
	INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),

	INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
	INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),

	INTEL_CPU_FAM6(HASWELL, rapl_defaults_core),
	INTEL_CPU_FAM6(HASWELL_L, rapl_defaults_core),
	INTEL_CPU_FAM6(HASWELL_G, rapl_defaults_core),
	INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),

	INTEL_CPU_FAM6(BROADWELL, rapl_defaults_core),
	INTEL_CPU_FAM6(BROADWELL_G, rapl_defaults_core),
	INTEL_CPU_FAM6(BROADWELL_D, rapl_defaults_core),
	INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),

	INTEL_CPU_FAM6(SKYLAKE, rapl_defaults_core),
	INTEL_CPU_FAM6(SKYLAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
	INTEL_CPU_FAM6(KABYLAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(KABYLAKE, rapl_defaults_core),
	INTEL_CPU_FAM6(CANNONLAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE_L, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core),
	INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server),
	INTEL_CPU_FAM6(ICELAKE_D, rapl_defaults_hsw_server),

	/* Atom family */
	INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
	INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
	INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
	INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
	INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
	INTEL_CPU_FAM6(ATOM_GOLDMONT_D, rapl_defaults_core),
	INTEL_CPU_FAM6(ATOM_TREMONT_D, rapl_defaults_core),

	INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
	INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
	{}
};
995 
996 MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
997 
998 /* Read once for all raw primitive data for domains */
999 static void rapl_update_domain_data(struct rapl_package *rp)
1000 {
1001 	int dmn, prim;
1002 	u64 val;
1003 
1004 	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1005 		pr_debug("update %s domain %s data\n", rp->name,
1006 			 rp->domains[dmn].name);
1007 		/* exclude non-raw primitives */
1008 		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1009 			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1010 						rpi[prim].unit, &val))
1011 				rp->domains[dmn].rdd.primitives[prim] = val;
1012 		}
1013 	}
1014 
1015 }
1016 
1017 static int rapl_package_register_powercap(struct rapl_package *rp)
1018 {
1019 	struct rapl_domain *rd;
1020 	struct powercap_zone *power_zone = NULL;
1021 	int nr_pl, ret;
1022 
1023 	/* Update the domain data of the new package */
1024 	rapl_update_domain_data(rp);
1025 
1026 	/* first we register package domain as the parent zone */
1027 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1028 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1029 			nr_pl = find_nr_power_limit(rd);
1030 			pr_debug("register package domain %s\n", rp->name);
1031 			power_zone = powercap_register_zone(&rd->power_zone,
1032 					    rp->priv->control_type, rp->name,
1033 					    NULL, &zone_ops[rd->id], nr_pl,
1034 					    &constraint_ops);
1035 			if (IS_ERR(power_zone)) {
1036 				pr_debug("failed to register power zone %s\n",
1037 					 rp->name);
1038 				return PTR_ERR(power_zone);
1039 			}
1040 			/* track parent zone in per package/socket data */
1041 			rp->power_zone = power_zone;
1042 			/* done, only one package domain per socket */
1043 			break;
1044 		}
1045 	}
1046 	if (!power_zone) {
1047 		pr_err("no package domain found, unknown topology!\n");
1048 		return -ENODEV;
1049 	}
1050 	/* now register domains as children of the socket/package */
1051 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1052 		if (rd->id == RAPL_DOMAIN_PACKAGE)
1053 			continue;
1054 		/* number of power limits per domain varies */
1055 		nr_pl = find_nr_power_limit(rd);
1056 		power_zone = powercap_register_zone(&rd->power_zone,
1057 						    rp->priv->control_type,
1058 						    rd->name, rp->power_zone,
1059 						    &zone_ops[rd->id], nr_pl,
1060 						    &constraint_ops);
1061 
1062 		if (IS_ERR(power_zone)) {
1063 			pr_debug("failed to register power_zone, %s:%s\n",
1064 				 rp->name, rd->name);
1065 			ret = PTR_ERR(power_zone);
1066 			goto err_cleanup;
1067 		}
1068 	}
1069 	return 0;
1070 
1071 err_cleanup:
1072 	/*
1073 	 * Clean up previously initialized domains within the package if we
1074 	 * failed after the first domain setup.
1075 	 */
1076 	while (--rd >= rp->domains) {
1077 		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1078 		powercap_unregister_zone(rp->priv->control_type,
1079 					 &rd->power_zone);
1080 	}
1081 
1082 	return ret;
1083 }
1084 
1085 int rapl_add_platform_domain(struct rapl_if_priv *priv)
1086 {
1087 	struct rapl_domain *rd;
1088 	struct powercap_zone *power_zone;
1089 	struct reg_action ra;
1090 	int ret;
1091 
1092 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1093 	ra.mask = ~0;
1094 	ret = priv->read_raw(0, &ra);
1095 	if (ret || !ra.value)
1096 		return -ENODEV;
1097 
1098 	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1099 	ra.mask = ~0;
1100 	ret = priv->read_raw(0, &ra);
1101 	if (ret || !ra.value)
1102 		return -ENODEV;
1103 
1104 	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
1105 	if (!rd)
1106 		return -ENOMEM;
1107 
1108 	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
1109 	rd->id = RAPL_DOMAIN_PLATFORM;
1110 	rd->regs[RAPL_DOMAIN_REG_LIMIT] =
1111 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1112 	rd->regs[RAPL_DOMAIN_REG_STATUS] =
1113 	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1114 	rd->rpl[0].prim_id = PL1_ENABLE;
1115 	rd->rpl[0].name = pl1_name;
1116 	rd->rpl[1].prim_id = PL2_ENABLE;
1117 	rd->rpl[1].name = pl2_name;
1118 	rd->rp = rapl_find_package_domain(0, priv);
1119 
1120 	power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
1121 					    "psys", NULL,
1122 					    &zone_ops[RAPL_DOMAIN_PLATFORM],
1123 					    2, &constraint_ops);
1124 
1125 	if (IS_ERR(power_zone)) {
1126 		kfree(rd);
1127 		return PTR_ERR(power_zone);
1128 	}
1129 
1130 	priv->platform_rapl_domain = rd;
1131 
1132 	return 0;
1133 }
1134 EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
1135 
1136 void rapl_remove_platform_domain(struct rapl_if_priv *priv)
1137 {
1138 	if (priv->platform_rapl_domain) {
1139 		powercap_unregister_zone(priv->control_type,
1140 				 &priv->platform_rapl_domain->power_zone);
1141 		kfree(priv->platform_rapl_domain);
1142 	}
1143 }
1144 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
1145 
1146 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1147 {
1148 	struct reg_action ra;
1149 
1150 	switch (domain) {
1151 	case RAPL_DOMAIN_PACKAGE:
1152 	case RAPL_DOMAIN_PP0:
1153 	case RAPL_DOMAIN_PP1:
1154 	case RAPL_DOMAIN_DRAM:
1155 		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1156 		break;
1157 	case RAPL_DOMAIN_PLATFORM:
1158 		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
1159 		return -EINVAL;
1160 	default:
1161 		pr_err("invalid domain id %d\n", domain);
1162 		return -EINVAL;
1163 	}
1164 	/* make sure domain counters are available and contains non-zero
1165 	 * values, otherwise skip it.
1166 	 */
1167 
1168 	ra.mask = ~0;
1169 	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1170 		return -ENODEV;
1171 
1172 	return 0;
1173 }
1174 
1175 /*
1176  * Check if power limits are available. Two cases when they are not available:
1177  * 1. Locked by BIOS, in this case we still provide read-only access so that
1178  *    users can see what limit is set by the BIOS.
1179  * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1180  *    exist at all. In this case, we do not show the constraints in powercap.
1181  *
1182  * Called after domains are detected and initialized.
1183  */
1184 static void rapl_detect_powerlimit(struct rapl_domain *rd)
1185 {
1186 	u64 val64;
1187 	int i;
1188 
1189 	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1190 	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1191 		if (val64) {
1192 			pr_info("RAPL %s domain %s locked by BIOS\n",
1193 				rd->rp->name, rd->name);
1194 			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1195 		}
1196 	}
1197 	/* check if power limit MSR exists, otherwise domain is monitoring only */
1198 	for (i = 0; i < NR_POWER_LIMITS; i++) {
1199 		int prim = rd->rpl[i].prim_id;
1200 
1201 		if (rapl_read_data_raw(rd, prim, false, &val64))
1202 			rd->rpl[i].name = NULL;
1203 	}
1204 }
1205 
1206 /* Detect active and valid domains for the given CPU, caller must
1207  * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1208  */
1209 static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1210 {
1211 	struct rapl_domain *rd;
1212 	int i;
1213 
1214 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1215 		/* use physical package id to read counters */
1216 		if (!rapl_check_domain(cpu, i, rp)) {
1217 			rp->domain_map |= 1 << i;
1218 			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1219 		}
1220 	}
1221 	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1222 	if (!rp->nr_domains) {
1223 		pr_debug("no valid rapl domains found in %s\n", rp->name);
1224 		return -ENODEV;
1225 	}
1226 	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1227 
1228 	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1229 			      GFP_KERNEL);
1230 	if (!rp->domains)
1231 		return -ENOMEM;
1232 
1233 	rapl_init_domains(rp);
1234 
1235 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1236 		rapl_detect_powerlimit(rd);
1237 
1238 	return 0;
1239 }
1240 
1241 /* called from CPU hotplug notifier, hotplug lock held */
1242 void rapl_remove_package(struct rapl_package *rp)
1243 {
1244 	struct rapl_domain *rd, *rd_package = NULL;
1245 
1246 	package_power_limit_irq_restore(rp);
1247 
1248 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1249 		rapl_write_data_raw(rd, PL1_ENABLE, 0);
1250 		rapl_write_data_raw(rd, PL1_CLAMP, 0);
1251 		if (find_nr_power_limit(rd) > 1) {
1252 			rapl_write_data_raw(rd, PL2_ENABLE, 0);
1253 			rapl_write_data_raw(rd, PL2_CLAMP, 0);
1254 		}
1255 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
1256 			rd_package = rd;
1257 			continue;
1258 		}
1259 		pr_debug("remove package, undo power limit on %s: %s\n",
1260 			 rp->name, rd->name);
1261 		powercap_unregister_zone(rp->priv->control_type,
1262 					 &rd->power_zone);
1263 	}
1264 	/* do parent zone last */
1265 	powercap_unregister_zone(rp->priv->control_type,
1266 				 &rd_package->power_zone);
1267 	list_del(&rp->plist);
1268 	kfree(rp);
1269 }
1270 EXPORT_SYMBOL_GPL(rapl_remove_package);
1271 
1272 /* caller to ensure CPU hotplug lock is held */
1273 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1274 {
1275 	int id = topology_logical_die_id(cpu);
1276 	struct rapl_package *rp;
1277 
1278 	list_for_each_entry(rp, &rapl_packages, plist) {
1279 		if (rp->id == id
1280 		    && rp->priv->control_type == priv->control_type)
1281 			return rp;
1282 	}
1283 
1284 	return NULL;
1285 }
1286 EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1287 
1288 /* called from CPU hotplug notifier, hotplug lock held */
1289 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1290 {
1291 	int id = topology_logical_die_id(cpu);
1292 	struct rapl_package *rp;
1293 	struct cpuinfo_x86 *c = &cpu_data(cpu);
1294 	int ret;
1295 
1296 	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1297 	if (!rp)
1298 		return ERR_PTR(-ENOMEM);
1299 
1300 	/* add the new package to the list */
1301 	rp->id = id;
1302 	rp->lead_cpu = cpu;
1303 	rp->priv = priv;
1304 
1305 	if (topology_max_die_per_package() > 1)
1306 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1307 			 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
1308 	else
1309 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1310 			 c->phys_proc_id);
1311 
1312 	/* check if the package contains valid domains */
1313 	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1314 		ret = -ENODEV;
1315 		goto err_free_package;
1316 	}
1317 	ret = rapl_package_register_powercap(rp);
1318 	if (!ret) {
1319 		INIT_LIST_HEAD(&rp->plist);
1320 		list_add(&rp->plist, &rapl_packages);
1321 		return rp;
1322 	}
1323 
1324 err_free_package:
1325 	kfree(rp->domains);
1326 	kfree(rp);
1327 	return ERR_PTR(ret);
1328 }
1329 EXPORT_SYMBOL_GPL(rapl_add_package);
1330 
1331 static void power_limit_state_save(void)
1332 {
1333 	struct rapl_package *rp;
1334 	struct rapl_domain *rd;
1335 	int nr_pl, ret, i;
1336 
1337 	get_online_cpus();
1338 	list_for_each_entry(rp, &rapl_packages, plist) {
1339 		if (!rp->power_zone)
1340 			continue;
1341 		rd = power_zone_to_rapl_domain(rp->power_zone);
1342 		nr_pl = find_nr_power_limit(rd);
1343 		for (i = 0; i < nr_pl; i++) {
1344 			switch (rd->rpl[i].prim_id) {
1345 			case PL1_ENABLE:
1346 				ret = rapl_read_data_raw(rd,
1347 						 POWER_LIMIT1, true,
1348 						 &rd->rpl[i].last_power_limit);
1349 				if (ret)
1350 					rd->rpl[i].last_power_limit = 0;
1351 				break;
1352 			case PL2_ENABLE:
1353 				ret = rapl_read_data_raw(rd,
1354 						 POWER_LIMIT2, true,
1355 						 &rd->rpl[i].last_power_limit);
1356 				if (ret)
1357 					rd->rpl[i].last_power_limit = 0;
1358 				break;
1359 			}
1360 		}
1361 	}
1362 	put_online_cpus();
1363 }
1364 
1365 static void power_limit_state_restore(void)
1366 {
1367 	struct rapl_package *rp;
1368 	struct rapl_domain *rd;
1369 	int nr_pl, i;
1370 
1371 	get_online_cpus();
1372 	list_for_each_entry(rp, &rapl_packages, plist) {
1373 		if (!rp->power_zone)
1374 			continue;
1375 		rd = power_zone_to_rapl_domain(rp->power_zone);
1376 		nr_pl = find_nr_power_limit(rd);
1377 		for (i = 0; i < nr_pl; i++) {
1378 			switch (rd->rpl[i].prim_id) {
1379 			case PL1_ENABLE:
1380 				if (rd->rpl[i].last_power_limit)
1381 					rapl_write_data_raw(rd, POWER_LIMIT1,
1382 					    rd->rpl[i].last_power_limit);
1383 				break;
1384 			case PL2_ENABLE:
1385 				if (rd->rpl[i].last_power_limit)
1386 					rapl_write_data_raw(rd, POWER_LIMIT2,
1387 					    rd->rpl[i].last_power_limit);
1388 				break;
1389 			}
1390 		}
1391 	}
1392 	put_online_cpus();
1393 }
1394 
1395 static int rapl_pm_callback(struct notifier_block *nb,
1396 			    unsigned long mode, void *_unused)
1397 {
1398 	switch (mode) {
1399 	case PM_SUSPEND_PREPARE:
1400 		power_limit_state_save();
1401 		break;
1402 	case PM_POST_SUSPEND:
1403 		power_limit_state_restore();
1404 		break;
1405 	}
1406 	return NOTIFY_OK;
1407 }
1408 
1409 static struct notifier_block rapl_pm_notifier = {
1410 	.notifier_call = rapl_pm_callback,
1411 };
1412 
1413 static struct platform_device *rapl_msr_platdev;
1414 
1415 static int __init rapl_init(void)
1416 {
1417 	const struct x86_cpu_id *id;
1418 	int ret;
1419 
1420 	id = x86_match_cpu(rapl_ids);
1421 	if (!id) {
1422 		pr_err("driver does not support CPU family %d model %d\n",
1423 		       boot_cpu_data.x86, boot_cpu_data.x86_model);
1424 
1425 		return -ENODEV;
1426 	}
1427 
1428 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
1429 
1430 	ret = register_pm_notifier(&rapl_pm_notifier);
1431 	if (ret)
1432 		return ret;
1433 
1434 	rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1435 	if (!rapl_msr_platdev) {
1436 		ret = -ENOMEM;
1437 		goto end;
1438 	}
1439 
1440 	ret = platform_device_add(rapl_msr_platdev);
1441 	if (ret)
1442 		platform_device_put(rapl_msr_platdev);
1443 
1444 end:
1445 	if (ret)
1446 		unregister_pm_notifier(&rapl_pm_notifier);
1447 
1448 	return ret;
1449 }
1450 
1451 static void __exit rapl_exit(void)
1452 {
1453 	platform_device_unregister(rapl_msr_platdev);
1454 	unregister_pm_notifier(&rapl_pm_notifier);
1455 }
1456 
1457 fs_initcall(rapl_init);
1458 module_exit(rapl_exit);
1459 
1460 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
1461 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1462 MODULE_LICENSE("GPL v2");
1463