xref: /linux/drivers/thermal/intel/intel_powerclamp.c (revision 4c0c5bbc89cda1c57ce0fb36d917693396b8b065)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_powerclamp.c - package c-state idle injection
4  *
5  * Copyright (c) 2012, Intel Corporation.
6  *
7  * Authors:
8  *     Arjan van de Ven <arjan@linux.intel.com>
9  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10  *
11  *	TODO:
12  *           1. better handle wakeup from external interrupts, currently a fixed
13  *              compensation is added to clamping duration when excessive amount
14  *              of wakeups are observed during idle time. the reason is that in
15  *              case of external interrupts without need for ack, clamping down
16  *              cpu in non-irq context does not reduce irq. for majority of the
17  *              cases, clamping down cpu does help reduce irq as well, we should
18  *              be able to differentiate the two cases and give a quantitative
19  *              solution for the irqs that we can control. perhaps based on
20  *              get_cpu_iowait_time_us()
21  *
22  *	     2. synchronization with other hw blocks
23  */
24 
/* Prefix every pr_*() message emitted from this file with the module name. */
#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
26 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>
45 
/*
 * Exclusive upper bound for the target idle ratio. Requested ratios are
 * clamped to MAX_TARGET_RATIO - 1 and the calibration table holds one
 * entry per ratio below this bound.
 */
#define MAX_TARGET_RATIO (50U)
/*
 * For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/*
 * Default idle injection duration in jiffies, used when the "duration"
 * parameter was not set. The driver adjusts the sleep time between
 * injections to meet the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
57 
/*
 * MWAIT hint computed by find_target_mwait(). It is not read anywhere else
 * in the code visible in this file.
 */
static unsigned int target_mwait;
/* debugfs directory created by powerclamp_create_debug_files(). */
static struct dentry *debug_dir;

/* user selected target idle ratio, written by powerclamp_set_cur_state() */
static unsigned int set_target_ratio;
/* ratio last measured by powerclamp_adjust_controls() */
static unsigned int current_ratio;
/*
 * Result of the last powerclamp_adjust_controls() run on the controlling
 * cpu; while true the workers skip the idle injection step.
 */
static bool should_skip;

/*
 * The cpu assigned to collect stat and update control parameters.
 * Defaults to the BSP, but the BSP can be offlined.
 */
static unsigned int control_cpu;
/* Global on/off switch; workers stop requeueing themselves once false. */
static bool clamping;
71 
/*
 * Per-cpu state of one idle injection worker.
 *
 * The "balancing" work computes when the next injection should start and
 * queues the delayed "idle injection" work, which in turn requeues the
 * balancing work.
 */
struct powerclamp_worker_data {
	/* kthread worker created for @cpu, NULL when not running */
	struct kthread_worker *worker;
	/* handled by clamp_balancing_func() */
	struct kthread_work balancing_work;
	/* handled by clamp_idle_injection_func() */
	struct kthread_delayed_work idle_injection_work;
	/* cpu this worker was started for */
	unsigned int cpu;
	/* number of balancing rounds since the worker was started */
	unsigned int count;
	/* tolerance added to the target, 1 + target_ratio / 20 */
	unsigned int guard;
	/* snapshot of window_size taken in the balancing round */
	unsigned int window_size_now;
	/* snapshot of set_target_ratio taken in the balancing round */
	unsigned int target_ratio;
	/* "duration" parameter converted to jiffies */
	unsigned int duration_jiffies;
	/* per-worker enable flag, cleared by stop_power_clamp_worker() */
	bool clamping;
};
84 
/* Per-cpu worker state, allocated in powerclamp_init(). */
static struct powerclamp_worker_data __percpu *worker_data;
/* Cooling device registered with the thermal core in powerclamp_init(). */
static struct thermal_cooling_device *cooling_dev;
/* Bitmap tracking which cpus have a clamping kthread worker. */
static unsigned long *cpu_clamping_mask;

/* "duration" module parameter: forced idle time per attempt in msec. */
static unsigned int duration;
/* Ratio last computed by poll_pkg_cstate(), reported as the current state. */
static unsigned int pkg_cstate_ratio_cur;
/* "window_size" module parameter: number of clamping cycles per window. */
static unsigned int window_size;
94 
95 static int duration_set(const char *arg, const struct kernel_param *kp)
96 {
97 	int ret = 0;
98 	unsigned long new_duration;
99 
100 	ret = kstrtoul(arg, 10, &new_duration);
101 	if (ret)
102 		goto exit;
103 	if (new_duration > 25 || new_duration < 6) {
104 		pr_err("Out of recommended range %lu, between 6-25ms\n",
105 			new_duration);
106 		ret = -EINVAL;
107 	}
108 
109 	duration = clamp(new_duration, 6ul, 25ul);
110 	smp_mb();
111 
112 exit:
113 
114 	return ret;
115 }
116 
/*
 * Operations for the "duration" parameter: writes are validated by
 * duration_set(), reads use the generic param_get_int() formatter.
 */
static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


/* Expose "duration" with mode 0644, backed by the duration variable. */
module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
125 
/* Runtime calibration record kept for one target ratio. */
struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period is completed without extra wakeups.
				    * Once the counter has reached CONFIDENCE_OK,
				    * the compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
				     * mostly from external interrupts.
				     * Only reported via debugfs in the code
				     * visible in this file.
				     */
};

/* One calibration record per target ratio, indexed by the ratio itself. */
static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
142 
143 static int window_size_set(const char *arg, const struct kernel_param *kp)
144 {
145 	int ret = 0;
146 	unsigned long new_window_size;
147 
148 	ret = kstrtoul(arg, 10, &new_window_size);
149 	if (ret)
150 		goto exit_win;
151 	if (new_window_size > 10 || new_window_size < 2) {
152 		pr_err("Out of recommended window size %lu, between 2-10\n",
153 			new_window_size);
154 		ret = -EINVAL;
155 	}
156 
157 	window_size = clamp(new_window_size, 2ul, 10ul);
158 	smp_mb();
159 
160 exit_win:
161 
162 	return ret;
163 }
164 
/*
 * Operations for the "window_size" parameter: writes are validated by
 * window_size_set(), reads use the generic param_get_int() formatter.
 */
static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

/* Expose "window_size" with mode 0644, backed by the window_size variable. */
module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");
175 
176 static void find_target_mwait(void)
177 {
178 	unsigned int eax, ebx, ecx, edx;
179 	unsigned int highest_cstate = 0;
180 	unsigned int highest_subcstate = 0;
181 	int i;
182 
183 	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
184 		return;
185 
186 	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
187 
188 	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
189 	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
190 		return;
191 
192 	edx >>= MWAIT_SUBSTATE_SIZE;
193 	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
194 		if (edx & MWAIT_SUBSTATE_MASK) {
195 			highest_cstate = i;
196 			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
197 		}
198 	}
199 	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
200 		(highest_subcstate - 1);
201 
202 }
203 
204 struct pkg_cstate_info {
205 	bool skip;
206 	int msr_index;
207 	int cstate_id;
208 };
209 
210 #define PKG_CSTATE_INIT(id) {				\
211 		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
212 		.cstate_id = id				\
213 			}
214 
215 static struct pkg_cstate_info pkg_cstates[] = {
216 	PKG_CSTATE_INIT(2),
217 	PKG_CSTATE_INIT(3),
218 	PKG_CSTATE_INIT(6),
219 	PKG_CSTATE_INIT(7),
220 	PKG_CSTATE_INIT(8),
221 	PKG_CSTATE_INIT(9),
222 	PKG_CSTATE_INIT(10),
223 	{NULL},
224 };
225 
226 static bool has_pkg_state_counter(void)
227 {
228 	u64 val;
229 	struct pkg_cstate_info *info = pkg_cstates;
230 
231 	/* check if any one of the counter msrs exists */
232 	while (info->msr_index) {
233 		if (!rdmsrl_safe(info->msr_index, &val))
234 			return true;
235 		info++;
236 	}
237 
238 	return false;
239 }
240 
241 static u64 pkg_state_counter(void)
242 {
243 	u64 val;
244 	u64 count = 0;
245 	struct pkg_cstate_info *info = pkg_cstates;
246 
247 	while (info->msr_index) {
248 		if (!info->skip) {
249 			if (!rdmsrl_safe(info->msr_index, &val))
250 				count += val;
251 			else
252 				info->skip = true;
253 		}
254 		info++;
255 	}
256 
257 	return count;
258 }
259 
260 static unsigned int get_compensation(int ratio)
261 {
262 	unsigned int comp = 0;
263 
264 	/* we only use compensation if all adjacent ones are good */
265 	if (ratio == 1 &&
266 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
267 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
268 		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
269 		comp = (cal_data[ratio].steady_comp +
270 			cal_data[ratio + 1].steady_comp +
271 			cal_data[ratio + 2].steady_comp) / 3;
272 	} else if (ratio == MAX_TARGET_RATIO - 1 &&
273 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
274 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
275 		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
276 		comp = (cal_data[ratio].steady_comp +
277 			cal_data[ratio - 1].steady_comp +
278 			cal_data[ratio - 2].steady_comp) / 3;
279 	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
280 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
281 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
282 		comp = (cal_data[ratio].steady_comp +
283 			cal_data[ratio - 1].steady_comp +
284 			cal_data[ratio + 1].steady_comp) / 3;
285 	}
286 
287 	/* do not exceed limit */
288 	if (comp + ratio >= MAX_TARGET_RATIO)
289 		comp = MAX_TARGET_RATIO - ratio - 1;
290 
291 	return comp;
292 }
293 
294 static void adjust_compensation(int target_ratio, unsigned int win)
295 {
296 	int delta;
297 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
298 
299 	/*
300 	 * adjust compensations if confidence level has not been reached.
301 	 */
302 	if (d->confidence >= CONFIDENCE_OK)
303 		return;
304 
305 	delta = set_target_ratio - current_ratio;
306 	/* filter out bad data */
307 	if (delta >= 0 && delta <= (1+target_ratio/10)) {
308 		if (d->steady_comp)
309 			d->steady_comp =
310 				roundup(delta+d->steady_comp, 2)/2;
311 		else
312 			d->steady_comp = delta;
313 		d->confidence++;
314 	}
315 }
316 
317 static bool powerclamp_adjust_controls(unsigned int target_ratio,
318 				unsigned int guard, unsigned int win)
319 {
320 	static u64 msr_last, tsc_last;
321 	u64 msr_now, tsc_now;
322 	u64 val64;
323 
324 	/* check result for the last window */
325 	msr_now = pkg_state_counter();
326 	tsc_now = rdtsc();
327 
328 	/* calculate pkg cstate vs tsc ratio */
329 	if (!msr_last || !tsc_last)
330 		current_ratio = 1;
331 	else if (tsc_now-tsc_last) {
332 		val64 = 100*(msr_now-msr_last);
333 		do_div(val64, (tsc_now-tsc_last));
334 		current_ratio = val64;
335 	}
336 
337 	/* update record */
338 	msr_last = msr_now;
339 	tsc_last = tsc_now;
340 
341 	adjust_compensation(target_ratio, win);
342 
343 	/* if we are above target+guard, skip */
344 	return set_target_ratio + guard <= current_ratio;
345 }
346 
347 static void clamp_balancing_func(struct kthread_work *work)
348 {
349 	struct powerclamp_worker_data *w_data;
350 	int sleeptime;
351 	unsigned long target_jiffies;
352 	unsigned int compensated_ratio;
353 	int interval; /* jiffies to sleep for each attempt */
354 
355 	w_data = container_of(work, struct powerclamp_worker_data,
356 			      balancing_work);
357 
358 	/*
359 	 * make sure user selected ratio does not take effect until
360 	 * the next round. adjust target_ratio if user has changed
361 	 * target such that we can converge quickly.
362 	 */
363 	w_data->target_ratio = READ_ONCE(set_target_ratio);
364 	w_data->guard = 1 + w_data->target_ratio / 20;
365 	w_data->window_size_now = window_size;
366 	w_data->duration_jiffies = msecs_to_jiffies(duration);
367 	w_data->count++;
368 
369 	/*
370 	 * systems may have different ability to enter package level
371 	 * c-states, thus we need to compensate the injected idle ratio
372 	 * to achieve the actual target reported by the HW.
373 	 */
374 	compensated_ratio = w_data->target_ratio +
375 		get_compensation(w_data->target_ratio);
376 	if (compensated_ratio <= 0)
377 		compensated_ratio = 1;
378 	interval = w_data->duration_jiffies * 100 / compensated_ratio;
379 
380 	/* align idle time */
381 	target_jiffies = roundup(jiffies, interval);
382 	sleeptime = target_jiffies - jiffies;
383 	if (sleeptime <= 0)
384 		sleeptime = 1;
385 
386 	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
387 		kthread_queue_delayed_work(w_data->worker,
388 					   &w_data->idle_injection_work,
389 					   sleeptime);
390 }
391 
392 static void clamp_idle_injection_func(struct kthread_work *work)
393 {
394 	struct powerclamp_worker_data *w_data;
395 
396 	w_data = container_of(work, struct powerclamp_worker_data,
397 			      idle_injection_work.work);
398 
399 	/*
400 	 * only elected controlling cpu can collect stats and update
401 	 * control parameters.
402 	 */
403 	if (w_data->cpu == control_cpu &&
404 	    !(w_data->count % w_data->window_size_now)) {
405 		should_skip =
406 			powerclamp_adjust_controls(w_data->target_ratio,
407 						   w_data->guard,
408 						   w_data->window_size_now);
409 		smp_mb();
410 	}
411 
412 	if (should_skip)
413 		goto balance;
414 
415 	play_idle(jiffies_to_usecs(w_data->duration_jiffies));
416 
417 balance:
418 	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
419 		kthread_queue_work(w_data->worker, &w_data->balancing_work);
420 }
421 
422 /*
423  * 1 HZ polling while clamping is active, useful for userspace
424  * to monitor actual idle ratio.
425  */
426 static void poll_pkg_cstate(struct work_struct *dummy);
427 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
428 static void poll_pkg_cstate(struct work_struct *dummy)
429 {
430 	static u64 msr_last;
431 	static u64 tsc_last;
432 
433 	u64 msr_now;
434 	u64 tsc_now;
435 	u64 val64;
436 
437 	msr_now = pkg_state_counter();
438 	tsc_now = rdtsc();
439 
440 	/* calculate pkg cstate vs tsc ratio */
441 	if (!msr_last || !tsc_last)
442 		pkg_cstate_ratio_cur = 1;
443 	else {
444 		if (tsc_now - tsc_last) {
445 			val64 = 100 * (msr_now - msr_last);
446 			do_div(val64, (tsc_now - tsc_last));
447 			pkg_cstate_ratio_cur = val64;
448 		}
449 	}
450 
451 	/* update record */
452 	msr_last = msr_now;
453 	tsc_last = tsc_now;
454 
455 	if (true == clamping)
456 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
457 }
458 
459 static void start_power_clamp_worker(unsigned long cpu)
460 {
461 	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
462 	struct kthread_worker *worker;
463 
464 	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
465 	if (IS_ERR(worker))
466 		return;
467 
468 	w_data->worker = worker;
469 	w_data->count = 0;
470 	w_data->cpu = cpu;
471 	w_data->clamping = true;
472 	set_bit(cpu, cpu_clamping_mask);
473 	sched_set_fifo(worker->task);
474 	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
475 	kthread_init_delayed_work(&w_data->idle_injection_work,
476 				  clamp_idle_injection_func);
477 	kthread_queue_work(w_data->worker, &w_data->balancing_work);
478 }
479 
/*
 * stop_power_clamp_worker - stop and destroy the clamping worker of a cpu
 * @cpu: cpu whose worker should be stopped
 *
 * Does nothing when no worker is recorded for @cpu. Otherwise the per-worker
 * clamping flag is cleared, both work items are cancelled, the cpu is
 * removed from cpu_clamping_mask and the kthread worker is destroyed.
 */
static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counterpart is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work still might be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroying the kthread
	 * worker will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	/* mark the slot as free so that a second stop is a no-op */
	w_data->worker = NULL;
}
509 
510 static int start_power_clamp(void)
511 {
512 	unsigned long cpu;
513 
514 	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
515 	/* prevent cpu hotplug */
516 	cpus_read_lock();
517 
518 	/* prefer BSP */
519 	control_cpu = 0;
520 	if (!cpu_online(control_cpu)) {
521 		control_cpu = get_cpu();
522 		put_cpu();
523 	}
524 
525 	clamping = true;
526 	schedule_delayed_work(&poll_pkg_cstate_work, 0);
527 
528 	/* start one kthread worker per online cpu */
529 	for_each_online_cpu(cpu) {
530 		start_power_clamp_worker(cpu);
531 	}
532 	cpus_read_unlock();
533 
534 	return 0;
535 }
536 
537 static void end_power_clamp(void)
538 {
539 	int i;
540 
541 	/*
542 	 * Block requeuing in all the kthread workers. They will flush and
543 	 * stop faster.
544 	 */
545 	clamping = false;
546 	for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
547 		pr_debug("clamping worker for cpu %d alive, destroy\n", i);
548 		stop_power_clamp_worker(i);
549 	}
550 }
551 
552 static int powerclamp_cpu_online(unsigned int cpu)
553 {
554 	if (clamping == false)
555 		return 0;
556 	start_power_clamp_worker(cpu);
557 	/* prefer BSP as controlling CPU */
558 	if (cpu == 0) {
559 		control_cpu = 0;
560 		smp_mb();
561 	}
562 	return 0;
563 }
564 
565 static int powerclamp_cpu_predown(unsigned int cpu)
566 {
567 	if (clamping == false)
568 		return 0;
569 
570 	stop_power_clamp_worker(cpu);
571 	if (cpu != control_cpu)
572 		return 0;
573 
574 	control_cpu = cpumask_first(cpu_online_mask);
575 	if (control_cpu == cpu)
576 		control_cpu = cpumask_next(cpu, cpu_online_mask);
577 	smp_mb();
578 	return 0;
579 }
580 
581 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
582 				 unsigned long *state)
583 {
584 	*state = MAX_TARGET_RATIO;
585 
586 	return 0;
587 }
588 
589 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
590 				 unsigned long *state)
591 {
592 	if (true == clamping)
593 		*state = pkg_cstate_ratio_cur;
594 	else
595 		/* to save power, do not poll idle ratio while not clamping */
596 		*state = -1; /* indicates invalid state */
597 
598 	return 0;
599 }
600 
601 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
602 				 unsigned long new_target_ratio)
603 {
604 	int ret = 0;
605 
606 	new_target_ratio = clamp(new_target_ratio, 0UL,
607 				(unsigned long) (MAX_TARGET_RATIO-1));
608 	if (set_target_ratio == 0 && new_target_ratio > 0) {
609 		pr_info("Start idle injection to reduce power\n");
610 		set_target_ratio = new_target_ratio;
611 		ret = start_power_clamp();
612 		goto exit_set;
613 	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
614 		pr_info("Stop forced idle injection\n");
615 		end_power_clamp();
616 		set_target_ratio = 0;
617 	} else	/* adjust currently running */ {
618 		set_target_ratio = new_target_ratio;
619 		/* make new set_target_ratio visible to other cpus */
620 		smp_mb();
621 	}
622 
623 exit_set:
624 	return ret;
625 }
626 
/*
 * Bind to the generic thermal layer as a cooling device: the state exposed
 * through these callbacks is the target idle ratio.
 */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};
633 
/* Match any Intel cpu that advertises the MWAIT feature. */
static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
639 
640 static int __init powerclamp_probe(void)
641 {
642 
643 	if (!x86_match_cpu(intel_powerclamp_ids)) {
644 		pr_err("CPU does not support MWAIT\n");
645 		return -ENODEV;
646 	}
647 
648 	/* The goal for idle time alignment is to achieve package cstate. */
649 	if (!has_pkg_state_counter()) {
650 		pr_info("No package C-state available\n");
651 		return -ENODEV;
652 	}
653 
654 	/* find the deepest mwait value */
655 	find_target_mwait();
656 
657 	return 0;
658 }
659 
660 static int powerclamp_debug_show(struct seq_file *m, void *unused)
661 {
662 	int i = 0;
663 
664 	seq_printf(m, "controlling cpu: %d\n", control_cpu);
665 	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
666 	for (i = 0; i < MAX_TARGET_RATIO; i++) {
667 		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
668 			i,
669 			cal_data[i].confidence,
670 			cal_data[i].steady_comp,
671 			cal_data[i].dynamic_comp);
672 	}
673 
674 	return 0;
675 }
676 
677 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
678 
679 static inline void powerclamp_create_debug_files(void)
680 {
681 	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
682 
683 	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
684 			    &powerclamp_debug_fops);
685 }
686 
687 static enum cpuhp_state hp_state;
688 
689 static int __init powerclamp_init(void)
690 {
691 	int retval;
692 
693 	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
694 	if (!cpu_clamping_mask)
695 		return -ENOMEM;
696 
697 	/* probe cpu features and ids here */
698 	retval = powerclamp_probe();
699 	if (retval)
700 		goto exit_free;
701 
702 	/* set default limit, maybe adjusted during runtime based on feedback */
703 	window_size = 2;
704 	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
705 					   "thermal/intel_powerclamp:online",
706 					   powerclamp_cpu_online,
707 					   powerclamp_cpu_predown);
708 	if (retval < 0)
709 		goto exit_free;
710 
711 	hp_state = retval;
712 
713 	worker_data = alloc_percpu(struct powerclamp_worker_data);
714 	if (!worker_data) {
715 		retval = -ENOMEM;
716 		goto exit_unregister;
717 	}
718 
719 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
720 						&powerclamp_cooling_ops);
721 	if (IS_ERR(cooling_dev)) {
722 		retval = -ENODEV;
723 		goto exit_free_thread;
724 	}
725 
726 	if (!duration)
727 		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
728 
729 	powerclamp_create_debug_files();
730 
731 	return 0;
732 
733 exit_free_thread:
734 	free_percpu(worker_data);
735 exit_unregister:
736 	cpuhp_remove_state_nocalls(hp_state);
737 exit_free:
738 	bitmap_free(cpu_clamping_mask);
739 	return retval;
740 }
741 module_init(powerclamp_init);
742 
743 static void __exit powerclamp_exit(void)
744 {
745 	end_power_clamp();
746 	cpuhp_remove_state_nocalls(hp_state);
747 	free_percpu(worker_data);
748 	thermal_cooling_device_unregister(cooling_dev);
749 	bitmap_free(cpu_clamping_mask);
750 
751 	cancel_delayed_work_sync(&poll_pkg_cstate_work);
752 	debugfs_remove_recursive(debug_dir);
753 }
754 module_exit(powerclamp_exit);
755 
/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
760