xref: /linux/drivers/thermal/intel/intel_powerclamp.c (revision 0340dc4c82590d8735c58cf904a8aa1173273ab5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_powerclamp.c - package c-state idle injection
4  *
5  * Copyright (c) 2012-2023, Intel Corporation.
6  *
7  * Authors:
8  *     Arjan van de Ven <arjan@linux.intel.com>
9  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10  *
11  *	TODO:
12  *           1. better handle wakeup from external interrupts, currently a fixed
13  *              compensation is added to clamping duration when excessive amount
14  *              of wakeups are observed during idle time. the reason is that in
15  *              case of external interrupts without need for ack, clamping down
16  *              cpu in non-irq context does not reduce irq. for majority of the
17  *              cases, clamping down cpu does help reduce irq as well, we should
18  *              be able to differentiate the two cases and give a quantitative
19  *              solution for the irqs that we can control. perhaps based on
20  *              get_cpu_iowait_time_us()
21  *
22  *	     2. synchronization with other hw blocks
23  */
24 
25 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
26 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/math64.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/idle_inject.h>

#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
39 
/*
 * Number of per-ratio calibration slots; also the maximum state reported
 * to the thermal core.
 */
#define MAX_TARGET_RATIO (100U)

/*
 * Each clamping period that completes undisturbed (no extra wakeups during
 * idle time) bumps the confidence counter of the target ratio in use.
 * Runtime calibration results are considered valid once that counter
 * reaches CONFIDENCE_OK.
 */
#define CONFIDENCE_OK (3)

/*
 * Default idle injection duration, in jiffies. The driver adjusts the
 * sleep time to meet the target idle ratio, similar to frequency
 * modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
51 
/* debugfs directory holding the calibration dump */
static struct dentry *debug_dir;
/* True when package C-state residency is polled and used for compensation */
static bool poll_pkg_cstate_enable;

/* Idle ratio observed using package C-state counters */
static unsigned int current_ratio;

/* While true, idle_inject_update() returns false to skip idle injection */
static bool should_skip;
60 
/* Runtime state of the (single) powerclamp instance. */
struct powerclamp_data {
	/* not referenced by the code in this file */
	unsigned int cpu;
	/* incremented on every idle_inject_update() call */
	unsigned int count;
	/* margin above target_ratio at which injection is skipped */
	unsigned int guard;
	/* snapshot of window_size taken by get_run_time() */
	unsigned int window_size_now;
	/* requested idle ratio; 0 means clamping is off */
	unsigned int target_ratio;
	/* true between idle injection start and stop */
	bool clamping;
};

static struct powerclamp_data powerclamp_data;
71 
/* Cooling device handle obtained in powerclamp_init() */
static struct thermal_cooling_device *cooling_dev;

/* Held while the clamping state and most module parameters are changed */
static DEFINE_MUTEX(powerclamp_lock);

/* Idle injection duration; this value is in microseconds */
static unsigned int duration;
/* Package C-state vs. TSC ratio computed by poll_pkg_cstate() */
static unsigned int pkg_cstate_ratio_cur;
/* Control window length, in clamping cycles */
static unsigned int window_size;
80 
81 static int duration_set(const char *arg, const struct kernel_param *kp)
82 {
83 	int ret = 0;
84 	unsigned long new_duration;
85 
86 	ret = kstrtoul(arg, 10, &new_duration);
87 	if (ret)
88 		goto exit;
89 	if (new_duration > 25 || new_duration < 6) {
90 		pr_err("Out of recommended range %lu, between 6-25ms\n",
91 			new_duration);
92 		ret = -EINVAL;
93 		goto exit;
94 	}
95 
96 	mutex_lock(&powerclamp_lock);
97 	duration = clamp(new_duration, 6ul, 25ul) * 1000;
98 	mutex_unlock(&powerclamp_lock);
99 exit:
100 
101 	return ret;
102 }
103 
104 static int duration_get(char *buf, const struct kernel_param *kp)
105 {
106 	int ret;
107 
108 	mutex_lock(&powerclamp_lock);
109 	ret = sysfs_emit(buf, "%d\n", duration / 1000);
110 	mutex_unlock(&powerclamp_lock);
111 
112 	return ret;
113 }
114 
115 static const struct kernel_param_ops duration_ops = {
116 	.set = duration_set,
117 	.get = duration_get,
118 };
119 
120 module_param_cb(duration, &duration_ops, NULL, 0644);
121 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
122 
123 #define DEFAULT_MAX_IDLE	50
124 #define MAX_ALL_CPU_IDLE	75
125 
126 static u8 max_idle = DEFAULT_MAX_IDLE;
127 
128 static cpumask_var_t idle_injection_cpu_mask;
129 
130 static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
131 {
132 	if (cpumask_available(idle_injection_cpu_mask))
133 		goto copy_mask;
134 
135 	/* This mask is allocated only one time and freed during module exit */
136 	if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
137 		return -ENOMEM;
138 
139 copy_mask:
140 	cpumask_copy(idle_injection_cpu_mask, copy_mask);
141 
142 	return 0;
143 }
144 
145 /* Return true if the cpumask and idle percent combination is invalid */
146 static bool check_invalid(cpumask_var_t mask, u8 idle)
147 {
148 	if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
149 		return true;
150 
151 	return false;
152 }
153 
154 static int cpumask_set(const char *arg, const struct kernel_param *kp)
155 {
156 	cpumask_var_t new_mask;
157 	int ret;
158 
159 	mutex_lock(&powerclamp_lock);
160 
161 	/* Can't set mask when cooling device is in use */
162 	if (powerclamp_data.clamping) {
163 		ret = -EAGAIN;
164 		goto skip_cpumask_set;
165 	}
166 
167 	ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
168 	if (!ret)
169 		goto skip_cpumask_set;
170 
171 	ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
172 			   nr_cpumask_bits);
173 	if (ret)
174 		goto free_cpumask_set;
175 
176 	if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
177 		ret = -EINVAL;
178 		goto free_cpumask_set;
179 	}
180 
181 	/*
182 	 * When module parameters are passed from kernel command line
183 	 * during insmod, the module parameter callback is called
184 	 * before powerclamp_init(), so we can't assume that some
185 	 * cpumask can be allocated and copied before here. Also
186 	 * in this case this cpumask is used as the default mask.
187 	 */
188 	ret = allocate_copy_idle_injection_mask(new_mask);
189 
190 free_cpumask_set:
191 	free_cpumask_var(new_mask);
192 skip_cpumask_set:
193 	mutex_unlock(&powerclamp_lock);
194 
195 	return ret;
196 }
197 
198 static int cpumask_get(char *buf, const struct kernel_param *kp)
199 {
200 	if (!cpumask_available(idle_injection_cpu_mask))
201 		return -ENODEV;
202 
203 	return bitmap_print_to_pagebuf(false, buf, cpumask_bits(idle_injection_cpu_mask),
204 				       nr_cpumask_bits);
205 }
206 
207 static const struct kernel_param_ops cpumask_ops = {
208 	.set = cpumask_set,
209 	.get = cpumask_get,
210 };
211 
212 module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
213 MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");
214 
215 static int max_idle_set(const char *arg, const struct kernel_param *kp)
216 {
217 	u8 new_max_idle;
218 	int ret = 0;
219 
220 	mutex_lock(&powerclamp_lock);
221 
222 	/* Can't set mask when cooling device is in use */
223 	if (powerclamp_data.clamping) {
224 		ret = -EAGAIN;
225 		goto skip_limit_set;
226 	}
227 
228 	ret = kstrtou8(arg, 10, &new_max_idle);
229 	if (ret)
230 		goto skip_limit_set;
231 
232 	if (new_max_idle > MAX_TARGET_RATIO) {
233 		ret = -EINVAL;
234 		goto skip_limit_set;
235 	}
236 
237 	if (!cpumask_available(idle_injection_cpu_mask)) {
238 		ret = allocate_copy_idle_injection_mask(cpu_present_mask);
239 		if (ret)
240 			goto skip_limit_set;
241 	}
242 
243 	if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
244 		ret = -EINVAL;
245 		goto skip_limit_set;
246 	}
247 
248 	max_idle = new_max_idle;
249 
250 skip_limit_set:
251 	mutex_unlock(&powerclamp_lock);
252 
253 	return ret;
254 }
255 
256 static const struct kernel_param_ops max_idle_ops = {
257 	.set = max_idle_set,
258 	.get = param_get_byte,
259 };
260 
261 module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
262 MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
263 
264 struct powerclamp_calibration_data {
265 	unsigned long confidence;  /* used for calibration, basically a counter
266 				    * gets incremented each time a clamping
267 				    * period is completed without extra wakeups
268 				    * once that counter is reached given level,
269 				    * compensation is deemed usable.
270 				    */
271 	unsigned long steady_comp; /* steady state compensation used when
272 				    * no extra wakeups occurred.
273 				    */
274 	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
275 				     * mostly from external interrupts.
276 				     */
277 };
278 
279 static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
280 
281 static int window_size_set(const char *arg, const struct kernel_param *kp)
282 {
283 	int ret = 0;
284 	unsigned long new_window_size;
285 
286 	ret = kstrtoul(arg, 10, &new_window_size);
287 	if (ret)
288 		goto exit_win;
289 	if (new_window_size > 10 || new_window_size < 2) {
290 		pr_err("Out of recommended window size %lu, between 2-10\n",
291 			new_window_size);
292 		ret = -EINVAL;
293 	}
294 
295 	window_size = clamp(new_window_size, 2ul, 10ul);
296 	smp_mb();
297 
298 exit_win:
299 
300 	return ret;
301 }
302 
303 static const struct kernel_param_ops window_size_ops = {
304 	.set = window_size_set,
305 	.get = param_get_int,
306 };
307 
308 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
309 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
310 	"\tpowerclamp controls idle ratio within this window. larger\n"
311 	"\twindow size results in slower response time but more smooth\n"
312 	"\tclamping results. default to 2.");
313 
/* Describes one package C-state residency counter. */
struct pkg_cstate_info {
	/* set once reading the MSR has failed, so it is not read again */
	bool skip;
	/* MSR holding the residency counter; 0 terminates a table */
	int msr_index;
	/* numeric id of the package C-state */
	int cstate_id;
};

/* Build the table entry for package C-state @id from its MSR name */
#define PKG_CSTATE_INIT(id) {					\
		.cstate_id = id,				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY,		\
	}
324 
325 static struct pkg_cstate_info pkg_cstates[] = {
326 	PKG_CSTATE_INIT(2),
327 	PKG_CSTATE_INIT(3),
328 	PKG_CSTATE_INIT(6),
329 	PKG_CSTATE_INIT(7),
330 	PKG_CSTATE_INIT(8),
331 	PKG_CSTATE_INIT(9),
332 	PKG_CSTATE_INIT(10),
333 	{NULL},
334 };
335 
336 static bool has_pkg_state_counter(void)
337 {
338 	u64 val;
339 	struct pkg_cstate_info *info = pkg_cstates;
340 
341 	/* check if any one of the counter msrs exists */
342 	while (info->msr_index) {
343 		if (!rdmsrl_safe(info->msr_index, &val))
344 			return true;
345 		info++;
346 	}
347 
348 	return false;
349 }
350 
351 static u64 pkg_state_counter(void)
352 {
353 	u64 val;
354 	u64 count = 0;
355 	struct pkg_cstate_info *info = pkg_cstates;
356 
357 	while (info->msr_index) {
358 		if (!info->skip) {
359 			if (!rdmsrl_safe(info->msr_index, &val))
360 				count += val;
361 			else
362 				info->skip = true;
363 		}
364 		info++;
365 	}
366 
367 	return count;
368 }
369 
370 static unsigned int get_compensation(int ratio)
371 {
372 	unsigned int comp = 0;
373 
374 	if (!poll_pkg_cstate_enable)
375 		return 0;
376 
377 	/* we only use compensation if all adjacent ones are good */
378 	if (ratio == 1 &&
379 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
380 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
381 		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
382 		comp = (cal_data[ratio].steady_comp +
383 			cal_data[ratio + 1].steady_comp +
384 			cal_data[ratio + 2].steady_comp) / 3;
385 	} else if (ratio == MAX_TARGET_RATIO - 1 &&
386 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
387 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
388 		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
389 		comp = (cal_data[ratio].steady_comp +
390 			cal_data[ratio - 1].steady_comp +
391 			cal_data[ratio - 2].steady_comp) / 3;
392 	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
393 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
394 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
395 		comp = (cal_data[ratio].steady_comp +
396 			cal_data[ratio - 1].steady_comp +
397 			cal_data[ratio + 1].steady_comp) / 3;
398 	}
399 
400 	/* do not exceed limit */
401 	if (comp + ratio >= MAX_TARGET_RATIO)
402 		comp = MAX_TARGET_RATIO - ratio - 1;
403 
404 	return comp;
405 }
406 
407 static void adjust_compensation(int target_ratio, unsigned int win)
408 {
409 	int delta;
410 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
411 
412 	/*
413 	 * adjust compensations if confidence level has not been reached.
414 	 */
415 	if (d->confidence >= CONFIDENCE_OK)
416 		return;
417 
418 	delta = powerclamp_data.target_ratio - current_ratio;
419 	/* filter out bad data */
420 	if (delta >= 0 && delta <= (1+target_ratio/10)) {
421 		if (d->steady_comp)
422 			d->steady_comp =
423 				roundup(delta+d->steady_comp, 2)/2;
424 		else
425 			d->steady_comp = delta;
426 		d->confidence++;
427 	}
428 }
429 
430 static bool powerclamp_adjust_controls(unsigned int target_ratio,
431 				unsigned int guard, unsigned int win)
432 {
433 	static u64 msr_last, tsc_last;
434 	u64 msr_now, tsc_now;
435 	u64 val64;
436 
437 	/* check result for the last window */
438 	msr_now = pkg_state_counter();
439 	tsc_now = rdtsc();
440 
441 	/* calculate pkg cstate vs tsc ratio */
442 	if (!msr_last || !tsc_last)
443 		current_ratio = 1;
444 	else if (tsc_now-tsc_last) {
445 		val64 = 100*(msr_now-msr_last);
446 		do_div(val64, (tsc_now-tsc_last));
447 		current_ratio = val64;
448 	}
449 
450 	/* update record */
451 	msr_last = msr_now;
452 	tsc_last = tsc_now;
453 
454 	adjust_compensation(target_ratio, win);
455 
456 	/* if we are above target+guard, skip */
457 	return powerclamp_data.target_ratio + guard <= current_ratio;
458 }
459 
460 /*
461  * This function calculates runtime from the current target ratio.
462  * This function gets called under powerclamp_lock.
463  */
464 static unsigned int get_run_time(void)
465 {
466 	unsigned int compensated_ratio;
467 	unsigned int runtime;
468 
469 	/*
470 	 * make sure user selected ratio does not take effect until
471 	 * the next round. adjust target_ratio if user has changed
472 	 * target such that we can converge quickly.
473 	 */
474 	powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
475 	powerclamp_data.window_size_now = window_size;
476 
477 	/*
478 	 * systems may have different ability to enter package level
479 	 * c-states, thus we need to compensate the injected idle ratio
480 	 * to achieve the actual target reported by the HW.
481 	 */
482 	compensated_ratio = powerclamp_data.target_ratio +
483 		get_compensation(powerclamp_data.target_ratio);
484 	if (compensated_ratio <= 0)
485 		compensated_ratio = 1;
486 
487 	runtime = duration * 100 / compensated_ratio - duration;
488 
489 	return runtime;
490 }
491 
492 /*
493  * 1 HZ polling while clamping is active, useful for userspace
494  * to monitor actual idle ratio.
495  */
496 static void poll_pkg_cstate(struct work_struct *dummy);
497 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
498 static void poll_pkg_cstate(struct work_struct *dummy)
499 {
500 	static u64 msr_last;
501 	static u64 tsc_last;
502 
503 	u64 msr_now;
504 	u64 tsc_now;
505 	u64 val64;
506 
507 	msr_now = pkg_state_counter();
508 	tsc_now = rdtsc();
509 
510 	/* calculate pkg cstate vs tsc ratio */
511 	if (!msr_last || !tsc_last)
512 		pkg_cstate_ratio_cur = 1;
513 	else {
514 		if (tsc_now - tsc_last) {
515 			val64 = 100 * (msr_now - msr_last);
516 			do_div(val64, (tsc_now - tsc_last));
517 			pkg_cstate_ratio_cur = val64;
518 		}
519 	}
520 
521 	/* update record */
522 	msr_last = msr_now;
523 	tsc_last = tsc_now;
524 
525 	mutex_lock(&powerclamp_lock);
526 	if (powerclamp_data.clamping)
527 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
528 	mutex_unlock(&powerclamp_lock);
529 }
530 
531 static struct idle_inject_device *ii_dev;
532 
533 /*
534  * This function is called from idle injection core on timer expiry
535  * for the run duration. This allows powerclamp to readjust or skip
536  * injecting idle for this cycle.
537  */
538 static bool idle_inject_update(void)
539 {
540 	bool update = false;
541 
542 	/* We can't sleep in this callback */
543 	if (!mutex_trylock(&powerclamp_lock))
544 		return true;
545 
546 	if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {
547 
548 		should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
549 							 powerclamp_data.guard,
550 							 powerclamp_data.window_size_now);
551 		update = true;
552 	}
553 
554 	if (update) {
555 		unsigned int runtime = get_run_time();
556 
557 		idle_inject_set_duration(ii_dev, runtime, duration);
558 	}
559 
560 	powerclamp_data.count++;
561 
562 	mutex_unlock(&powerclamp_lock);
563 
564 	if (should_skip)
565 		return false;
566 
567 	return true;
568 }
569 
570 /* This function starts idle injection by calling idle_inject_start() */
571 static void trigger_idle_injection(void)
572 {
573 	unsigned int runtime = get_run_time();
574 
575 	idle_inject_set_duration(ii_dev, runtime, duration);
576 	idle_inject_start(ii_dev);
577 	powerclamp_data.clamping = true;
578 }
579 
580 /*
581  * This function is called from start_power_clamp() to register
582  * CPUS with powercap idle injection register and set default
583  * idle duration and latency.
584  */
585 static int powerclamp_idle_injection_register(void)
586 {
587 	poll_pkg_cstate_enable = false;
588 	if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
589 		ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
590 		if (topology_max_packages() == 1 && topology_max_dies_per_package() == 1)
591 			poll_pkg_cstate_enable = true;
592 	} else {
593 		ii_dev = idle_inject_register(idle_injection_cpu_mask);
594 	}
595 
596 	if (!ii_dev) {
597 		pr_err("powerclamp: idle_inject_register failed\n");
598 		return -EAGAIN;
599 	}
600 
601 	idle_inject_set_duration(ii_dev, TICK_USEC, duration);
602 	idle_inject_set_latency(ii_dev, UINT_MAX);
603 
604 	return 0;
605 }
606 
607 /*
608  * This function is called from end_power_clamp() to stop idle injection
609  * and unregister CPUS from powercap idle injection core.
610  */
611 static void remove_idle_injection(void)
612 {
613 	if (!powerclamp_data.clamping)
614 		return;
615 
616 	powerclamp_data.clamping = false;
617 	idle_inject_stop(ii_dev);
618 }
619 
620 /*
621  * This function is called when user change the cooling device
622  * state from zero to some other value.
623  */
624 static int start_power_clamp(void)
625 {
626 	int ret;
627 
628 	ret = powerclamp_idle_injection_register();
629 	if (!ret) {
630 		trigger_idle_injection();
631 		if (poll_pkg_cstate_enable)
632 			schedule_delayed_work(&poll_pkg_cstate_work, 0);
633 	}
634 
635 	return ret;
636 }
637 
638 /*
639  * This function is called when user change the cooling device
640  * state from non zero value zero.
641  */
642 static void end_power_clamp(void)
643 {
644 	if (powerclamp_data.clamping) {
645 		remove_idle_injection();
646 		idle_inject_unregister(ii_dev);
647 	}
648 }
649 
650 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
651 				 unsigned long *state)
652 {
653 	*state = MAX_TARGET_RATIO;
654 
655 	return 0;
656 }
657 
658 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
659 				 unsigned long *state)
660 {
661 	mutex_lock(&powerclamp_lock);
662 	*state = powerclamp_data.target_ratio;
663 	mutex_unlock(&powerclamp_lock);
664 
665 	return 0;
666 }
667 
668 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
669 				 unsigned long new_target_ratio)
670 {
671 	int ret = 0;
672 
673 	mutex_lock(&powerclamp_lock);
674 
675 	new_target_ratio = clamp(new_target_ratio, 0UL,
676 				(unsigned long) (max_idle - 1));
677 
678 	if (powerclamp_data.target_ratio == new_target_ratio)
679 		goto exit_set;
680 
681 	if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
682 		pr_info("Start idle injection to reduce power\n");
683 		powerclamp_data.target_ratio = new_target_ratio;
684 		ret = start_power_clamp();
685 		if (ret)
686 			powerclamp_data.target_ratio = 0;
687 		goto exit_set;
688 	} else	if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
689 		pr_info("Stop forced idle injection\n");
690 		end_power_clamp();
691 		powerclamp_data.target_ratio = 0;
692 	} else	/* adjust currently running */ {
693 		unsigned int runtime;
694 
695 		powerclamp_data.target_ratio = new_target_ratio;
696 		runtime = get_run_time();
697 		idle_inject_set_duration(ii_dev, runtime, duration);
698 	}
699 
700 exit_set:
701 	mutex_unlock(&powerclamp_lock);
702 
703 	return ret;
704 }
705 
706 /* bind to generic thermal layer as cooling device*/
707 static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
708 	.get_max_state = powerclamp_get_max_state,
709 	.get_cur_state = powerclamp_get_cur_state,
710 	.set_cur_state = powerclamp_set_cur_state,
711 };
712 
713 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
714 	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
715 	{}
716 };
717 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
718 
719 static int __init powerclamp_probe(void)
720 {
721 
722 	if (!x86_match_cpu(intel_powerclamp_ids)) {
723 		pr_err("CPU does not support MWAIT\n");
724 		return -ENODEV;
725 	}
726 
727 	/* The goal for idle time alignment is to achieve package cstate. */
728 	if (!has_pkg_state_counter()) {
729 		pr_info("No package C-state available\n");
730 		return -ENODEV;
731 	}
732 
733 	return 0;
734 }
735 
736 static int powerclamp_debug_show(struct seq_file *m, void *unused)
737 {
738 	int i = 0;
739 
740 	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
741 	for (i = 0; i < MAX_TARGET_RATIO; i++) {
742 		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
743 			i,
744 			cal_data[i].confidence,
745 			cal_data[i].steady_comp,
746 			cal_data[i].dynamic_comp);
747 	}
748 
749 	return 0;
750 }
751 
752 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
753 
754 static inline void powerclamp_create_debug_files(void)
755 {
756 	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
757 
758 	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
759 			    &powerclamp_debug_fops);
760 }
761 
762 static int __init powerclamp_init(void)
763 {
764 	int retval;
765 
766 	/* probe cpu features and ids here */
767 	retval = powerclamp_probe();
768 	if (retval)
769 		return retval;
770 
771 	mutex_lock(&powerclamp_lock);
772 	if (!cpumask_available(idle_injection_cpu_mask))
773 		retval = allocate_copy_idle_injection_mask(cpu_present_mask);
774 	mutex_unlock(&powerclamp_lock);
775 
776 	if (retval)
777 		return retval;
778 
779 	/* set default limit, maybe adjusted during runtime based on feedback */
780 	window_size = 2;
781 
782 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
783 						      &powerclamp_cooling_ops);
784 	if (IS_ERR(cooling_dev))
785 		return -ENODEV;
786 
787 	if (!duration)
788 		duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);
789 
790 	powerclamp_create_debug_files();
791 
792 	return 0;
793 }
794 module_init(powerclamp_init);
795 
796 static void __exit powerclamp_exit(void)
797 {
798 	mutex_lock(&powerclamp_lock);
799 	end_power_clamp();
800 	mutex_unlock(&powerclamp_lock);
801 
802 	thermal_cooling_device_unregister(cooling_dev);
803 
804 	cancel_delayed_work_sync(&poll_pkg_cstate_work);
805 	debugfs_remove_recursive(debug_dir);
806 
807 	if (cpumask_available(idle_injection_cpu_mask))
808 		free_cpumask_var(idle_injection_cpu_mask);
809 }
810 module_exit(powerclamp_exit);
811 
812 MODULE_IMPORT_NS(IDLE_INJECT);
813 
814 MODULE_LICENSE("GPL");
815 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
816 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
817 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
818