1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_powerclamp.c - package c-state idle injection
4  *
5  * Copyright (c) 2012, Intel Corporation.
6  *
7  * Authors:
8  *     Arjan van de Ven <arjan@linux.intel.com>
9  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10  *
11  *	TODO:
 12  *           1. Better handle wakeups from external interrupts. Currently a
 13  *              fixed compensation is added to the clamping duration when an
 14  *              excessive number of wakeups is observed during idle time.
 15  *              The reason is that, for external interrupts that need no ack,
 16  *              clamping down the cpu in non-irq context does not reduce the
 17  *              irq rate. For the majority of cases clamping down the cpu does
 18  *              help reduce irqs as well, so we should be able to tell the two
 19  *              cases apart and give a quantitative solution for the irqs we
 20  *              can control, perhaps based on get_cpu_iowait_time_us().
 21  *
 22  *	     2. Synchronization with other hw blocks.
23  */
24 
25 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
26 
27 #include <linux/module.h>
28 #include <linux/kernel.h>
29 #include <linux/delay.h>
30 #include <linux/kthread.h>
31 #include <linux/cpu.h>
32 #include <linux/thermal.h>
33 #include <linux/slab.h>
34 #include <linux/tick.h>
35 #include <linux/debugfs.h>
36 #include <linux/seq_file.h>
37 #include <linux/sched/rt.h>
38 #include <uapi/linux/sched/types.h>
39 
40 #include <asm/nmi.h>
41 #include <asm/msr.h>
42 #include <asm/mwait.h>
43 #include <asm/cpu_device_id.h>
44 #include <asm/hardirq.h>
45 
46 #define MAX_TARGET_RATIO (50U)	/* maximum injected idle ratio, in percent */
47 /* For each undisturbed clamping period (no extra wake ups during idle time),
48  * we increment the confidence counter for the given target ratio.
49  * CONFIDENCE_OK defines the level where runtime calibration results are
50  * valid.
51  */
52 #define CONFIDENCE_OK (3)
53 /* Default idle injection duration. The driver adjusts the sleep time to
54  * meet the target idle ratio, similar to frequency modulation.
55  */
56 #define DEFAULT_DURATION_JIFFIES (6)
57 
58 static unsigned int target_mwait;
59 static struct dentry *debug_dir;
60 
61 /* user selected target */
62 static unsigned int set_target_ratio;
63 static unsigned int current_ratio;
64 static bool should_skip;
65 static bool reduce_irq;
66 static atomic_t idle_wakeup_counter;
67 static unsigned int control_cpu; /* The cpu assigned to collect stats and
68 				  * update control parameters. Defaults to the
69 				  * BSP, but the BSP can be offlined.
70 				  */
71 static bool clamping;
72 
73 static const struct sched_param sparam = {
74 	.sched_priority = MAX_USER_RT_PRIO / 2,
75 };
76 struct powerclamp_worker_data {
77 	struct kthread_worker *worker;
78 	struct kthread_work balancing_work;
79 	struct kthread_delayed_work idle_injection_work;
80 	unsigned int cpu;
81 	unsigned int count;
82 	unsigned int guard;
83 	unsigned int window_size_now;
84 	unsigned int target_ratio;
85 	unsigned int duration_jiffies;
86 	bool clamping;
87 };
88 
89 static struct powerclamp_worker_data __percpu *worker_data;
90 static struct thermal_cooling_device *cooling_dev;
91 static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
92 					   * clamping kthread worker
93 					   */
94 
95 static unsigned int duration;
96 static unsigned int pkg_cstate_ratio_cur;
97 static unsigned int window_size;
98 
99 static int duration_set(const char *arg, const struct kernel_param *kp)
100 {
101 	int ret = 0;
102 	unsigned long new_duration;
103 
104 	ret = kstrtoul(arg, 10, &new_duration);
105 	if (ret)
106 		goto exit;
107 	if (new_duration > 25 || new_duration < 6) {
108 		pr_err("Out of recommended range %lu, between 6-25ms\n",
109 			new_duration);
110 		return -EINVAL;
111 	}
112 
113 	duration = clamp(new_duration, 6ul, 25ul);
114 	smp_mb();
115 
116 exit:
117 
118 	return ret;
119 }
120 
121 static const struct kernel_param_ops duration_ops = {
122 	.set = duration_set,
123 	.get = param_get_int,
124 };
125 
126 
127 module_param_cb(duration, &duration_ops, &duration, 0644);
128 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
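/*
 * Note: as a writable module parameter, the duration can also be changed at
 * run time, e.g. (path assumes the standard module-parameter sysfs layout):
 *
 *	echo 10 > /sys/module/intel_powerclamp/parameters/duration
 *
 * Values outside the recommended 6..25 ms range are rejected by
 * duration_set() above.
 */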
129 
130 struct powerclamp_calibration_data {
131 	unsigned long confidence;  /* used for calibration: a counter that is
132 				    * incremented each time a clamping period
133 				    * completes without extra wakeups. Once the
134 				    * counter reaches a given level, the
135 				    * compensation is deemed usable.
136 				    */
137 	unsigned long steady_comp; /* steady state compensation used when
138 				    * no extra wakeups occurred.
139 				    */
140 	unsigned long dynamic_comp; /* compensation for excessive wakeups from
141 				     * idle, mostly due to external interrupts.
142 				     */
143 };
144 
145 static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
146 
147 static int window_size_set(const char *arg, const struct kernel_param *kp)
148 {
149 	int ret = 0;
150 	unsigned long new_window_size;
151 
152 	ret = kstrtoul(arg, 10, &new_window_size);
153 	if (ret)
154 		goto exit_win;
155 	if (new_window_size > 10 || new_window_size < 2) {
156 		pr_err("Out of recommended window size %lu, between 2-10\n",
157 			new_window_size);
158 		return -EINVAL;
159 	}
160 
161 	window_size = clamp(new_window_size, 2ul, 10ul);
162 	smp_mb();
163 
164 exit_win:
165 
166 	return ret;
167 }
168 
169 static const struct kernel_param_ops window_size_ops = {
170 	.set = window_size_set,
171 	.get = param_get_int,
172 };
173 
174 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
175 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
176 	"\tpowerclamp controls the idle ratio within this window. A larger\n"
177 	"\twindow size results in a slower response time but smoother\n"
178 	"\tclamping results. Defaults to 2.");
179 
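/*
 * Find the deepest MWAIT hint advertised by CPUID leaf 5 (the MWAIT leaf).
 * A hint encodes the target C-state in the upper nibble and the sub-state in
 * the lower nibble (MWAIT_SUBSTATE_SIZE bits each); the deepest advertised
 * combination is recorded in target_mwait.
 */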
180 static void find_target_mwait(void)
181 {
182 	unsigned int eax, ebx, ecx, edx;
183 	unsigned int highest_cstate = 0;
184 	unsigned int highest_subcstate = 0;
185 	int i;
186 
187 	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
188 		return;
189 
190 	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
191 
192 	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
193 	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
194 		return;
195 
196 	edx >>= MWAIT_SUBSTATE_SIZE;
197 	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
198 		if (edx & MWAIT_SUBSTATE_MASK) {
199 			highest_cstate = i;
200 			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
201 		}
202 	}
203 	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
204 		(highest_subcstate - 1);
205 
206 }
207 
208 struct pkg_cstate_info {
209 	bool skip;
210 	int msr_index;
211 	int cstate_id;
212 };
213 
214 #define PKG_CSTATE_INIT(id) {				\
215 		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
216 		.cstate_id = id				\
217 			}
218 
219 static struct pkg_cstate_info pkg_cstates[] = {
220 	PKG_CSTATE_INIT(2),
221 	PKG_CSTATE_INIT(3),
222 	PKG_CSTATE_INIT(6),
223 	PKG_CSTATE_INIT(7),
224 	PKG_CSTATE_INIT(8),
225 	PKG_CSTATE_INIT(9),
226 	PKG_CSTATE_INIT(10),
227 	{},
228 };
229 
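/*
 * The package C-state residency MSRs are model specific. Probe them with
 * rdmsrl_safe(): has_pkg_state_counter() only requires that at least one of
 * them exists, and pkg_state_counter() skips the ones that fault.
 */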
230 static bool has_pkg_state_counter(void)
231 {
232 	u64 val;
233 	struct pkg_cstate_info *info = pkg_cstates;
234 
235 	/* check if any one of the counter msrs exists */
236 	while (info->msr_index) {
237 		if (!rdmsrl_safe(info->msr_index, &val))
238 			return true;
239 		info++;
240 	}
241 
242 	return false;
243 }
244 
245 static u64 pkg_state_counter(void)
246 {
247 	u64 val;
248 	u64 count = 0;
249 	struct pkg_cstate_info *info = pkg_cstates;
250 
251 	while (info->msr_index) {
252 		if (!info->skip) {
253 			if (!rdmsrl_safe(info->msr_index, &val))
254 				count += val;
255 			else
256 				info->skip = true;
257 		}
258 		info++;
259 	}
260 
261 	return count;
262 }
263 
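/*
 * Look up the runtime-calibrated compensation for a given target idle ratio.
 * Compensation is only applied when the calibration entries for the ratio and
 * its two adjacent ratios have all reached CONFIDENCE_OK; their steady_comp
 * values are then averaged. E.g. for a target ratio of 20, entries 19, 20
 * and 21 must all be confident before their average is used.
 */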
264 static unsigned int get_compensation(int ratio)
265 {
266 	unsigned int comp = 0;
267 
268 	/* we only use compensation if all adjacent ones are good */
269 	if (ratio == 1 &&
270 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
271 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
272 		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
273 		comp = (cal_data[ratio].steady_comp +
274 			cal_data[ratio + 1].steady_comp +
275 			cal_data[ratio + 2].steady_comp) / 3;
276 	} else if (ratio == MAX_TARGET_RATIO - 1 &&
277 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
278 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
279 		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
280 		comp = (cal_data[ratio].steady_comp +
281 			cal_data[ratio - 1].steady_comp +
282 			cal_data[ratio - 2].steady_comp) / 3;
283 	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
284 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
285 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
286 		comp = (cal_data[ratio].steady_comp +
287 			cal_data[ratio - 1].steady_comp +
288 			cal_data[ratio + 1].steady_comp) / 3;
289 	}
290 
291 	/* REVISIT: simple penalty of doubling the idle injection */
292 	if (reduce_irq)
293 		comp = ratio;
294 	/* do not exceed limit */
295 	if (comp + ratio >= MAX_TARGET_RATIO)
296 		comp = MAX_TARGET_RATIO - ratio - 1;
297 
298 	return comp;
299 }
300 
301 static void adjust_compensation(int target_ratio, unsigned int win)
302 {
303 	int delta;
304 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
305 
306 	/*
307 	 * Do not adjust the compensation if the confidence level has already
308 	 * been reached, or if there were too many wakeups during the last
309 	 * idle injection period; in that case the data cannot be trusted.
310 	 */
311 	if (d->confidence >= CONFIDENCE_OK ||
312 		atomic_read(&idle_wakeup_counter) >
313 		win * num_online_cpus())
314 		return;
315 
316 	delta = set_target_ratio - current_ratio;
317 	/* filter out bad data */
318 	if (delta >= 0 && delta <= (1+target_ratio/10)) {
319 		if (d->steady_comp)
320 			d->steady_comp =
321 				roundup(delta+d->steady_comp, 2)/2;
322 		else
323 			d->steady_comp = delta;
324 		d->confidence++;
325 	}
326 }
327 
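/*
 * Check the result of the last control window: the achieved package C-state
 * ratio is computed as the increase of the package C-state residency counter
 * relative to the TSC delta over the same interval, expressed in percent.
 * Returns true when the measured ratio already reaches the target plus the
 * guard band, in which case the next idle injection should be skipped.
 */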
328 static bool powerclamp_adjust_controls(unsigned int target_ratio,
329 				unsigned int guard, unsigned int win)
330 {
331 	static u64 msr_last, tsc_last;
332 	u64 msr_now, tsc_now;
333 	u64 val64;
334 
335 	/* check result for the last window */
336 	msr_now = pkg_state_counter();
337 	tsc_now = rdtsc();
338 
339 	/* calculate pkg cstate vs tsc ratio */
340 	if (!msr_last || !tsc_last)
341 		current_ratio = 1;
342 	else if (tsc_now-tsc_last) {
343 		val64 = 100*(msr_now-msr_last);
344 		do_div(val64, (tsc_now-tsc_last));
345 		current_ratio = val64;
346 	}
347 
348 	/* update record */
349 	msr_last = msr_now;
350 	tsc_last = tsc_now;
351 
352 	adjust_compensation(target_ratio, win);
353 	/*
354 	 * If there are too many external interrupts, set a flag so that
355 	 * countermeasures can be taken later.
356 	 */
357 	reduce_irq = atomic_read(&idle_wakeup_counter) >=
358 		2 * win * num_online_cpus();
359 
360 	atomic_set(&idle_wakeup_counter, 0);
361 	/* if we are above target+guard, skip */
362 	return set_target_ratio + guard <= current_ratio;
363 }
364 
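/*
 * Balancing work: runs once per injection period on every clamped cpu and
 * recomputes the injection interval so that
 *
 *	duration_jiffies / interval == compensated_ratio / 100
 *
 * e.g. with the default 6-jiffy duration and a compensated target of 25%,
 * idle is injected for 6 jiffies out of every 24. The next idle injection is
 * then scheduled on an interval boundary so that all cpus inject idle at the
 * same time, which is needed for the package to enter a package C-state.
 */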
365 static void clamp_balancing_func(struct kthread_work *work)
366 {
367 	struct powerclamp_worker_data *w_data;
368 	int sleeptime;
369 	unsigned long target_jiffies;
370 	unsigned int compensated_ratio;
371 	int interval; /* jiffies to sleep for each attempt */
372 
373 	w_data = container_of(work, struct powerclamp_worker_data,
374 			      balancing_work);
375 
376 	/*
377 	 * Make sure a user-selected ratio does not take effect until the
378 	 * next round. Pick up a changed target here so that we can
379 	 * converge quickly.
380 	 */
381 	w_data->target_ratio = READ_ONCE(set_target_ratio);
382 	w_data->guard = 1 + w_data->target_ratio / 20;
383 	w_data->window_size_now = window_size;
384 	w_data->duration_jiffies = msecs_to_jiffies(duration);
385 	w_data->count++;
386 
387 	/*
388 	 * Systems differ in their ability to enter package-level c-states,
389 	 * so the injected idle ratio needs to be compensated in order to
390 	 * achieve the actual target reported by the HW.
391 	 */
392 	compensated_ratio = w_data->target_ratio +
393 		get_compensation(w_data->target_ratio);
394 	if (compensated_ratio <= 0)
395 		compensated_ratio = 1;
396 	interval = w_data->duration_jiffies * 100 / compensated_ratio;
397 
398 	/* align idle injection across cpus so the package can enter a c-state */
399 	target_jiffies = roundup(jiffies, interval);
400 	sleeptime = target_jiffies - jiffies;
401 	if (sleeptime <= 0)
402 		sleeptime = 1;
403 
404 	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
405 		kthread_queue_delayed_work(w_data->worker,
406 					   &w_data->idle_injection_work,
407 					   sleeptime);
408 }
409 
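/*
 * Idle injection work: forces the cpu idle for duration_jiffies via
 * play_idle(), then queues the balancing work again. Every window_size
 * periods the controlling cpu also re-evaluates the controls and may set
 * should_skip when the package already spends enough time in C-states.
 */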
410 static void clamp_idle_injection_func(struct kthread_work *work)
411 {
412 	struct powerclamp_worker_data *w_data;
413 
414 	w_data = container_of(work, struct powerclamp_worker_data,
415 			      idle_injection_work.work);
416 
417 	/*
418 	 * only elected controlling cpu can collect stats and update
419 	 * control parameters.
420 	 */
421 	if (w_data->cpu == control_cpu &&
422 	    !(w_data->count % w_data->window_size_now)) {
423 		should_skip =
424 			powerclamp_adjust_controls(w_data->target_ratio,
425 						   w_data->guard,
426 						   w_data->window_size_now);
427 		smp_mb();
428 	}
429 
430 	if (should_skip)
431 		goto balance;
432 
433 	play_idle(jiffies_to_msecs(w_data->duration_jiffies));
434 
435 balance:
436 	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
437 		kthread_queue_work(w_data->worker, &w_data->balancing_work);
438 }
439 
440 /*
441  * 1 HZ polling while clamping is active, useful for userspace
442  * to monitor actual idle ratio.
443  */
444 static void poll_pkg_cstate(struct work_struct *dummy);
445 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
446 static void poll_pkg_cstate(struct work_struct *dummy)
447 {
448 	static u64 msr_last;
449 	static u64 tsc_last;
450 
451 	u64 msr_now;
452 	u64 tsc_now;
453 	u64 val64;
454 
455 	msr_now = pkg_state_counter();
456 	tsc_now = rdtsc();
457 
458 	/* calculate pkg cstate vs tsc ratio */
459 	if (!msr_last || !tsc_last)
460 		pkg_cstate_ratio_cur = 1;
461 	else {
462 		if (tsc_now - tsc_last) {
463 			val64 = 100 * (msr_now - msr_last);
464 			do_div(val64, (tsc_now - tsc_last));
465 			pkg_cstate_ratio_cur = val64;
466 		}
467 	}
468 
469 	/* update record */
470 	msr_last = msr_now;
471 	tsc_last = tsc_now;
472 
473 	if (clamping)
474 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
475 }
476 
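/*
 * Create a cpu-bound kthread worker for the given cpu, raise it to SCHED_FIFO
 * so idle injection is not delayed by normal tasks, and kick off the first
 * balancing work.
 */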
477 static void start_power_clamp_worker(unsigned long cpu)
478 {
479 	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
480 	struct kthread_worker *worker;
481 
482 	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
483 	if (IS_ERR(worker))
484 		return;
485 
486 	w_data->worker = worker;
487 	w_data->count = 0;
488 	w_data->cpu = cpu;
489 	w_data->clamping = true;
490 	set_bit(cpu, cpu_clamping_mask);
491 	sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
492 	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
493 	kthread_init_delayed_work(&w_data->idle_injection_work,
494 				  clamp_idle_injection_func);
495 	kthread_queue_work(w_data->worker, &w_data->balancing_work);
496 }
497 
498 static void stop_power_clamp_worker(unsigned long cpu)
499 {
500 	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
501 
502 	if (!w_data->worker)
503 		return;
504 
505 	w_data->clamping = false;
506 	/*
507 	 * Make sure that all work items queued after this point see
508 	 * clamping disabled. The counterpart barrier is not needed because
509 	 * there is an implicit memory barrier when the queued work
510 	 * is processed.
511 	 */
512 	smp_wmb();
513 	kthread_cancel_work_sync(&w_data->balancing_work);
514 	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
515 	/*
516 	 * The balancing work might still be queued here because
517 	 * the handling of the "clamping" variable, cancel, and queue
518 	 * operations are not synchronized via a lock. But that is not
519 	 * a big deal: the balancing work is fast and kthread_destroy_worker()
520 	 * will wait for it.
521 	 */
522 	clear_bit(w_data->cpu, cpu_clamping_mask);
523 	kthread_destroy_worker(w_data->worker);
524 
525 	w_data->worker = NULL;
526 }
527 
528 static int start_power_clamp(void)
529 {
530 	unsigned long cpu;
531 
532 	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
533 	/* prevent cpu hotplug */
534 	get_online_cpus();
535 
536 	/* prefer BSP */
537 	control_cpu = 0;
538 	if (!cpu_online(control_cpu))
539 		control_cpu = smp_processor_id();
540 
541 	clamping = true;
542 	schedule_delayed_work(&poll_pkg_cstate_work, 0);
543 
544 	/* start one kthread worker per online cpu */
545 	for_each_online_cpu(cpu) {
546 		start_power_clamp_worker(cpu);
547 	}
548 	put_online_cpus();
549 
550 	return 0;
551 }
552 
553 static void end_power_clamp(void)
554 {
555 	int i;
556 
557 	/*
558 	 * Block requeuing in all the kthread workers. They will flush and
559 	 * stop faster.
560 	 */
561 	clamping = false;
562 	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
563 		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
564 			pr_debug("clamping worker for cpu %d alive, destroy\n",
565 				 i);
566 			stop_power_clamp_worker(i);
567 		}
568 	}
569 }
570 
571 static int powerclamp_cpu_online(unsigned int cpu)
572 {
573 	if (!clamping)
574 		return 0;
575 	start_power_clamp_worker(cpu);
576 	/* prefer BSP as controlling CPU */
577 	if (cpu == 0) {
578 		control_cpu = 0;
579 		smp_mb();
580 	}
581 	return 0;
582 }
583 
584 static int powerclamp_cpu_predown(unsigned int cpu)
585 {
586 	if (!clamping)
587 		return 0;
588 
589 	stop_power_clamp_worker(cpu);
590 	if (cpu != control_cpu)
591 		return 0;
592 
593 	control_cpu = cpumask_first(cpu_online_mask);
594 	if (control_cpu == cpu)
595 		control_cpu = cpumask_next(cpu, cpu_online_mask);
596 	smp_mb();
597 	return 0;
598 }
599 
600 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
601 				 unsigned long *state)
602 {
603 	*state = MAX_TARGET_RATIO;
604 
605 	return 0;
606 }
607 
608 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
609 				 unsigned long *state)
610 {
611 	if (clamping)
612 		*state = pkg_cstate_ratio_cur;
613 	else
614 		/* to save power, do not poll idle ratio while not clamping */
615 		*state = -1; /* indicates invalid state */
616 
617 	return 0;
618 }
619 
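/*
 * Cooling device callback used by the thermal core. The cooling state maps
 * directly to the target idle ratio in percent, so e.g. (with X being the
 * cooling device index assigned at registration):
 *
 *	echo 30 > /sys/class/thermal/cooling_deviceX/cur_state
 *
 * requests roughly 30% injected idle, and writing 0 stops idle injection.
 */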
620 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
621 				 unsigned long new_target_ratio)
622 {
623 	int ret = 0;
624 
625 	new_target_ratio = clamp(new_target_ratio, 0UL,
626 				(unsigned long) (MAX_TARGET_RATIO-1));
627 	if (set_target_ratio == 0 && new_target_ratio > 0) {
628 		pr_info("Start idle injection to reduce power\n");
629 		set_target_ratio = new_target_ratio;
630 		ret = start_power_clamp();
631 		goto exit_set;
632 	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
633 		pr_info("Stop forced idle injection\n");
634 		end_power_clamp();
635 		set_target_ratio = 0;
636 	} else	/* adjust currently running */ {
637 		set_target_ratio = new_target_ratio;
638 		/* make new set_target_ratio visible to other cpus */
639 		smp_mb();
640 	}
641 
642 exit_set:
643 	return ret;
644 }
645 
646 /* bind to the generic thermal layer as a cooling device */
647 static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
648 	.get_max_state = powerclamp_get_max_state,
649 	.get_cur_state = powerclamp_get_cur_state,
650 	.set_cur_state = powerclamp_set_cur_state,
651 };
652 
653 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
654 	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
655 	{}
656 };
657 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
658 
659 static int __init powerclamp_probe(void)
660 {
661 
662 	if (!x86_match_cpu(intel_powerclamp_ids)) {
663 		pr_err("CPU does not support MWAIT\n");
664 		return -ENODEV;
665 	}
666 
667 	/* Aligned idle injection targets package C-states, so a residency counter is required. */
668 	if (!has_pkg_state_counter()) {
669 		pr_info("No package C-state available\n");
670 		return -ENODEV;
671 	}
672 
673 	/* find the deepest mwait value */
674 	find_target_mwait();
675 
676 	return 0;
677 }
678 
679 static int powerclamp_debug_show(struct seq_file *m, void *unused)
680 {
681 	int i = 0;
682 
683 	seq_printf(m, "controlling cpu: %d\n", control_cpu);
684 	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
685 	for (i = 0; i < MAX_TARGET_RATIO; i++) {
686 		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
687 			i,
688 			cal_data[i].confidence,
689 			cal_data[i].steady_comp,
690 			cal_data[i].dynamic_comp);
691 	}
692 
693 	return 0;
694 }
695 
696 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
697 
698 static inline void powerclamp_create_debug_files(void)
699 {
700 	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
701 	if (!debug_dir)
702 		return;
703 
704 	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
705 					cal_data, &powerclamp_debug_fops))
706 		goto file_error;
707 
708 	return;
709 
710 file_error:
711 	debugfs_remove_recursive(debug_dir);
712 }
713 
714 static enum cpuhp_state hp_state;
715 
716 static int __init powerclamp_init(void)
717 {
718 	int retval;
719 	int bitmap_size;
720 
721 	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
722 	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
723 	if (!cpu_clamping_mask)
724 		return -ENOMEM;
725 
726 	/* probe cpu features and ids here */
727 	retval = powerclamp_probe();
728 	if (retval)
729 		goto exit_free;
730 
731 	/* set default limit, maybe adjusted during runtime based on feedback */
732 	window_size = 2;
733 	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
734 					   "thermal/intel_powerclamp:online",
735 					   powerclamp_cpu_online,
736 					   powerclamp_cpu_predown);
737 	if (retval < 0)
738 		goto exit_free;
739 
740 	hp_state = retval;
741 
742 	worker_data = alloc_percpu(struct powerclamp_worker_data);
743 	if (!worker_data) {
744 		retval = -ENOMEM;
745 		goto exit_unregister;
746 	}
747 
748 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
749 						&powerclamp_cooling_ops);
750 	if (IS_ERR(cooling_dev)) {
751 		retval = -ENODEV;
752 		goto exit_free_thread;
753 	}
754 
755 	if (!duration)
756 		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
757 
758 	powerclamp_create_debug_files();
759 
760 	return 0;
761 
762 exit_free_thread:
763 	free_percpu(worker_data);
764 exit_unregister:
765 	cpuhp_remove_state_nocalls(hp_state);
766 exit_free:
767 	kfree(cpu_clamping_mask);
768 	return retval;
769 }
770 module_init(powerclamp_init);
771 
772 static void __exit powerclamp_exit(void)
773 {
774 	end_power_clamp();
775 	cpuhp_remove_state_nocalls(hp_state);
776 	free_percpu(worker_data);
777 	thermal_cooling_device_unregister(cooling_dev);
778 	kfree(cpu_clamping_mask);
779 
780 	cancel_delayed_work_sync(&poll_pkg_cstate_work);
781 	debugfs_remove_recursive(debug_dir);
782 }
783 module_exit(powerclamp_exit);
784 
785 MODULE_LICENSE("GPL");
786 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
787 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
788 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
789