xref: /linux/drivers/cpufreq/powernv-cpufreq.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
/*
 * POWERNV cpufreq driver for the IBM POWER processors
 *
 * (C) Copyright IBM 2014
 *
 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#define pr_fmt(fmt)	"powernv-cpufreq: " fmt

#include <linux/kernel.h>
#include <linux/sysfs.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/of.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <trace/events/power.h>

#include <asm/cputhreads.h>
#include <asm/firmware.h>
#include <asm/reg.h>
#include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
#include <asm/opal.h>

#define POWERNV_MAX_PSTATES	256
#define PMSR_PSAFE_ENABLE	(1UL << 30)
#define PMSR_SPR_EM_DISABLE	(1UL << 31)
#define PMSR_MAX(x)		((x >> 32) & 0xFF)
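/*
 * Bit layout notes (inferred from the accessors below, counting from the
 * least-significant bit): the PMSR carries the local pstate in bits
 * 48..55 and Pmax in bits 32..39, which is why PMSR_MAX() shifts by 32;
 * PMSR_PSAFE_ENABLE and PMSR_SPR_EM_DISABLE are the Psafe and
 * SPR-write-disable status bits at positions 30 and 31.
 */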

#define MAX_RAMP_DOWN_TIME				5120
/*
 * On an idle system we want the global pstate to ramp-down from max value to
 * min over a span of ~5 secs. Also we want it to initially ramp-down slowly and
 * then ramp-down rapidly later on.
 *
 * This gives a percentage rampdown for time elapsed in milliseconds.
 * ramp_down_percent = ((ms * ms) >> 18)
 *		    ~= 3.8 * (sec * sec)
 *
 * At 0 ms	ramp_down_percent = 0
 * At 5120 ms	ramp_down_percent = 100
 */
#define ramp_down_percent(time)		((time * time) >> 18)
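/*
 * Worked example: at t = 2560 ms,
 *	ramp_down_percent(2560) = (2560 * 2560) >> 18
 *				= 6553600 / 262144 = 25,
 * i.e. a quarter of the ramp after half the window, while at
 * t = 5120 ms it reaches (5120 * 5120) >> 18 = 100, completing the
 * ramp exactly at MAX_RAMP_DOWN_TIME.
 */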

/* Interval after which the timer is queued to bring down global pstate */
#define GPSTATE_TIMER_INTERVAL				2000

/**
 * struct global_pstate_info -	Per policy data structure to maintain history of
 *				global pstates
 * @highest_lpstate:		The local pstate from which we are ramping down
 * @elapsed_time:		Time in ms spent in ramping down from
 *				highest_lpstate
 * @last_sampled_time:		Time from boot in ms when global pstates were
 *				last set
 * @last_lpstate,last_gpstate:	Last set values for local and global pstates
 * @timer:			Is used for ramping down if cpu goes idle for
 *				a long time with global pstate held high
 * @gpstate_lock:		A spinlock to maintain synchronization between
 *				routines called by the timer handler and
 *				governor's target_index calls
 */
struct global_pstate_info {
	int highest_lpstate;
	unsigned int elapsed_time;
	unsigned int last_sampled_time;
	int last_lpstate;
	int last_gpstate;
	spinlock_t gpstate_lock;
	struct timer_list timer;
};

static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
static bool rebooting, throttled, occ_reset;

static const char * const throttle_reason[] = {
	"No throttling",
	"Power Cap",
	"Processor Over Temperature",
	"Power Supply Failure",
	"Over Current",
	"OCC Reset"
};

enum throttle_reason_type {
	NO_THROTTLE = 0,
	POWERCAP,
	CPU_OVERTEMP,
	POWER_SUPPLY_FAILURE,
	OVERCURRENT,
	OCC_RESET_THROTTLE,
	OCC_MAX_REASON
};

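/*
 * Per-chip throttling state (summarized from the code below): @id is the
 * hardware chip id, @mask the CPUs on that chip, and @throttle the work
 * item used to re-check throttling from process context. @throttled
 * tracks whether Pmax is currently capped, @throttle_reason holds the
 * last OCC-reported reason, and @restore asks the worker to re-set the
 * frequency once throttling ends. @throttle_turbo and
 * @throttle_sub_turbo count Pmax-capping events above and below the
 * nominal pstate, while @reason[] counts events per throttle_reason_type.
 */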
static struct chip {
	unsigned int id;
	bool throttled;
	bool restore;
	u8 throttle_reason;
	cpumask_t mask;
	struct work_struct throttle;
	int throttle_turbo;
	int throttle_sub_turbo;
	int reason[OCC_MAX_REASON];
} *chips;

static int nr_chips;
static DEFINE_PER_CPU(struct chip *, chip_info);

/*
 * Note: The set of pstates consists of contiguous integers, the
 * smallest of which is indicated by powernv_pstate_info.min, the
 * largest of which is indicated by powernv_pstate_info.max.
 *
 * The nominal pstate is the highest non-turbo pstate in this
 * platform. This is indicated by powernv_pstate_info.nominal.
 */
static struct powernv_pstate_info {
	int min;
	int max;
	int nominal;
	int nr_pstates;
} powernv_pstate_info;
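/*
 * Illustrative (hypothetical) numbering: with min = -7, max = 0 and
 * nominal = -1, the frequency table holds nr_pstates = 8 entries and
 * pstate id P lives at table index (max - P), so id 0 maps to index 0
 * and id -7 to index 7; see pstate_id_to_freq() below.
 */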

static inline void reset_gpstates(struct cpufreq_policy *policy)
{
	struct global_pstate_info *gpstates = policy->driver_data;

	gpstates->highest_lpstate = 0;
	gpstates->elapsed_time = 0;
	gpstates->last_sampled_time = 0;
	gpstates->last_lpstate = 0;
	gpstates->last_gpstate = 0;
}

/*
 * Initialize the freq table based on data obtained
 * from the firmware passed via device-tree
 */
static int init_powernv_pstates(void)
{
	struct device_node *power_mgt;
	int i, pstate_min, pstate_max, pstate_nominal, nr_pstates = 0;
	const __be32 *pstate_ids, *pstate_freqs;
	u32 len_ids, len_freqs;

	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
	if (!power_mgt) {
		pr_warn("power-mgt node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
		pr_warn("ibm,pstate-min node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
		pr_warn("ibm,pstate-max node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
				 &pstate_nominal)) {
		pr_warn("ibm,pstate-nominal not found\n");
		return -ENODEV;
	}
	pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
		pstate_nominal, pstate_max);

	pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
	if (!pstate_ids) {
		pr_warn("ibm,pstate-ids not found\n");
		return -ENODEV;
	}

	pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
				       &len_freqs);
	if (!pstate_freqs) {
		pr_warn("ibm,pstate-frequencies-mhz not found\n");
		return -ENODEV;
	}

	if (len_ids != len_freqs) {
		pr_warn("Entries in ibm,pstate-ids and ibm,pstate-frequencies-mhz do not match\n");
	}

	nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
	if (!nr_pstates) {
		pr_warn("No PStates found\n");
		return -ENODEV;
	}

	pr_debug("NR PStates %d\n", nr_pstates);
	for (i = 0; i < nr_pstates; i++) {
		u32 id = be32_to_cpu(pstate_ids[i]);
		u32 freq = be32_to_cpu(pstate_freqs[i]);

		pr_debug("PState id %d freq %d MHz\n", id, freq);
		powernv_freqs[i].frequency = freq * 1000; /* kHz */
		powernv_freqs[i].driver_data = id;
	}
	/* End of list marker entry */
	powernv_freqs[i].frequency = CPUFREQ_TABLE_END;

	powernv_pstate_info.min = pstate_min;
	powernv_pstate_info.max = pstate_max;
	powernv_pstate_info.nominal = pstate_nominal;
	powernv_pstate_info.nr_pstates = nr_pstates;

	return 0;
}

/* Returns the CPU frequency corresponding to the pstate_id. */
static unsigned int pstate_id_to_freq(int pstate_id)
{
	int i;

	i = powernv_pstate_info.max - pstate_id;
	if (i >= powernv_pstate_info.nr_pstates || i < 0) {
		pr_warn("PState id %d outside of PState table, reporting nominal id %d instead\n",
			pstate_id, powernv_pstate_info.nominal);
		i = powernv_pstate_info.max - powernv_pstate_info.nominal;
	}

	return powernv_freqs[i].frequency;
}

/*
 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
 * the firmware
 */
static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
					char *buf)
{
	return sprintf(buf, "%u\n",
		pstate_id_to_freq(powernv_pstate_info.nominal));
}

struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
	__ATTR_RO(cpuinfo_nominal_freq);

static struct freq_attr *powernv_cpu_freq_attr[] = {
	&cpufreq_freq_attr_scaling_available_freqs,
	&cpufreq_freq_attr_cpuinfo_nominal_freq,
	NULL,
};

#define throttle_attr(name, member)					\
static ssize_t name##_show(struct cpufreq_policy *policy, char *buf)	\
{									\
	struct chip *chip = per_cpu(chip_info, policy->cpu);		\
									\
	return sprintf(buf, "%u\n", chip->member);			\
}									\
									\
static struct freq_attr throttle_attr_##name = __ATTR_RO(name)		\

throttle_attr(unthrottle, reason[NO_THROTTLE]);
throttle_attr(powercap, reason[POWERCAP]);
throttle_attr(overtemp, reason[CPU_OVERTEMP]);
throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
throttle_attr(overcurrent, reason[OVERCURRENT]);
throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
throttle_attr(turbo_stat, throttle_turbo);
throttle_attr(sub_turbo_stat, throttle_sub_turbo);

static struct attribute *throttle_attrs[] = {
	&throttle_attr_unthrottle.attr,
	&throttle_attr_powercap.attr,
	&throttle_attr_overtemp.attr,
	&throttle_attr_supply_fault.attr,
	&throttle_attr_overcurrent.attr,
	&throttle_attr_occ_reset.attr,
	&throttle_attr_turbo_stat.attr,
	&throttle_attr_sub_turbo_stat.attr,
	NULL,
};

static const struct attribute_group throttle_attr_grp = {
	.name	= "throttle_stats",
	.attrs	= throttle_attrs,
};
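/*
 * The group above is created under the policy's sysfs directory, so the
 * counters can be read with something like (exact path may vary by
 * kernel version):
 *
 *	cat /sys/devices/system/cpu/cpu0/cpufreq/throttle_stats/powercap
 *
 * each file reporting how many times the chip was throttled for that
 * reason.
 */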

/* Helper routines */

/* Access helpers to power mgt SPR */

static inline unsigned long get_pmspr(unsigned long sprn)
{
	switch (sprn) {
	case SPRN_PMCR:
		return mfspr(SPRN_PMCR);

	case SPRN_PMICR:
		return mfspr(SPRN_PMICR);

	case SPRN_PMSR:
		return mfspr(SPRN_PMSR);
	}
	BUG();
}

static inline void set_pmspr(unsigned long sprn, unsigned long val)
{
	switch (sprn) {
	case SPRN_PMCR:
		mtspr(SPRN_PMCR, val);
		return;

	case SPRN_PMICR:
		mtspr(SPRN_PMICR, val);
		return;
	}
	BUG();
}

/*
 * Use objects of this type to query/update
 * pstates on a remote CPU via smp_call_function.
 */
struct powernv_smp_call_data {
	unsigned int freq;
	int pstate_id;
	int gpstate_id;
};

/*
 * powernv_read_cpu_freq: Reads the current frequency on this CPU.
 *
 * Called via smp_call_function.
 *
 * Note: The caller of the smp_call_function should pass an argument of
 * the type 'struct powernv_smp_call_data *' along with this function.
 *
 * The current frequency on this CPU will be returned via
 * ((struct powernv_smp_call_data *)arg)->freq;
 */
static void powernv_read_cpu_freq(void *arg)
{
	unsigned long pmspr_val;
	s8 local_pstate_id;
	struct powernv_smp_call_data *freq_data = arg;

	pmspr_val = get_pmspr(SPRN_PMSR);

	/*
	 * The local pstate id corresponds to bits 48..55 in the PMSR.
	 * Note: Watch out for the sign! The id is signed, so going
	 * through an s8 sign-extends it correctly.
	 */
	local_pstate_id = (pmspr_val >> 48) & 0xFF;
	freq_data->pstate_id = local_pstate_id;
	freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);

	pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n",
		raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
		freq_data->freq);
}

/*
 * powernv_cpufreq_get: Returns the CPU frequency as reported by the
 * firmware for CPU 'cpu'. This value is reported through the sysfs
 * file cpuinfo_cur_freq. All threads of a core run at the same pstate,
 * so reading it on any sibling of 'cpu' is sufficient.
 */
static unsigned int powernv_cpufreq_get(unsigned int cpu)
{
	struct powernv_smp_call_data freq_data;

	smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq,
			&freq_data, 1);

	return freq_data.freq;
}

/*
 * set_pstate: Sets the pstate on this CPU.
 *
 * This is called via an smp_call_function.
 *
 * The caller must ensure that freq_data is of the type
 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
 * on this CPU should be present in freq_data->pstate_id.
 */
static void set_pstate(void *data)
{
	unsigned long val;
	struct powernv_smp_call_data *freq_data = data;
	unsigned long pstate_ul = freq_data->pstate_id;
	unsigned long gpstate_ul = freq_data->gpstate_id;

	val = get_pmspr(SPRN_PMCR);
	val = val & 0x0000FFFFFFFFFFFFULL;

	pstate_ul = pstate_ul & 0xFF;
	gpstate_ul = gpstate_ul & 0xFF;

	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
	val = val | (gpstate_ul << 56) | (pstate_ul << 48);

	pr_debug("Setting cpu %d pmcr to %016lX\n",
			raw_smp_processor_id(), val);
	set_pmspr(SPRN_PMCR, val);
}
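/*
 * Worked example (hypothetical ids): for pstate_id = -10 (0xF6) and
 * gpstate_id = -5 (0xFB), the two values are masked to their low bytes
 * and OR'd in, leaving PMCR looking like 0xFBF6xxxxxxxxxxxx: the global
 * pstate in bits 56..63, the local pstate in bits 48..55, and the
 * preserved lower 48 bits untouched.
 */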

/*
 * get_nominal_index: Returns the index corresponding to the nominal
 * pstate in the cpufreq table
 */
static inline unsigned int get_nominal_index(void)
{
	return powernv_pstate_info.max - powernv_pstate_info.nominal;
}

static void powernv_cpufreq_throttle_check(void *data)
{
	struct chip *chip;
	unsigned int cpu = smp_processor_id();
	unsigned long pmsr;
	int pmsr_pmax;

	pmsr = get_pmspr(SPRN_PMSR);
	chip = this_cpu_read(chip_info);

	/* Check for Pmax Capping */
	pmsr_pmax = (s8)PMSR_MAX(pmsr);
	if (pmsr_pmax != powernv_pstate_info.max) {
		if (chip->throttled)
			goto next;
		chip->throttled = true;
		if (pmsr_pmax < powernv_pstate_info.nominal) {
			pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
				     cpu, chip->id, pmsr_pmax,
				     powernv_pstate_info.nominal);
			chip->throttle_sub_turbo++;
		} else {
			chip->throttle_turbo++;
		}
		trace_powernv_throttle(chip->id,
				      throttle_reason[chip->throttle_reason],
				      pmsr_pmax);
	} else if (chip->throttled) {
		chip->throttled = false;
		trace_powernv_throttle(chip->id,
				      throttle_reason[chip->throttle_reason],
				      pmsr_pmax);
	}

	/* Check if Psafe_mode_active is set in PMSR. */
next:
	if (pmsr & PMSR_PSAFE_ENABLE) {
		throttled = true;
		pr_info("Pstate set to safe frequency\n");
	}

	/* Check if SPR_EM_DISABLE is set in PMSR */
	if (pmsr & PMSR_SPR_EM_DISABLE) {
		throttled = true;
		pr_info("Frequency Control disabled from OS\n");
	}

	if (throttled) {
		pr_info("PMSR = %16lx\n", pmsr);
		pr_warn("CPU Frequency could be throttled\n");
	}
}

/**
 * calc_global_pstate - Calculate global pstate
 * @elapsed_time:	Elapsed time in milliseconds
 * @local_pstate:	New local pstate
 * @highest_lpstate:	pstate from which we are ramping down
 *
 * Finds the appropriate global pstate based on the pstate from which we are
 * ramping down and the time elapsed while doing so. It follows a quadratic
 * equation which ensures that the ramp down to pmin completes in ~5 seconds.
 */
static inline int calc_global_pstate(unsigned int elapsed_time,
				     int highest_lpstate, int local_pstate)
{
	int pstate_diff;

	/*
	 * ramp_down_percent() gives the percentage of the ramp down we
	 * are expected to have covered by now. The difference between
	 * highest_lpstate and powernv_pstate_info.min is the absolute
	 * number of pstates we will eventually drop by the end of the
	 * 5 seconds; scaling it by that percentage gives the number of
	 * pstates to drop at this point.
	 */
	pstate_diff =  ((int)ramp_down_percent(elapsed_time) *
			(highest_lpstate - powernv_pstate_info.min)) / 100;

	/* Ensure that the global pstate is >= the local pstate */
	if (highest_lpstate - pstate_diff < local_pstate)
		return local_pstate;
	else
		return highest_lpstate - pstate_diff;
}
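/*
 * Worked example (hypothetical pstates): with highest_lpstate = 0,
 * powernv_pstate_info.min = -7 and elapsed_time = 2560 ms,
 * ramp_down_percent() yields 25, so pstate_diff = (25 * 7) / 100 = 1
 * and the global pstate steps down to -1, unless the local pstate is
 * already above that, in which case it is simply clamped to it.
 */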

static inline void queue_gpstate_timer(struct global_pstate_info *gpstates)
{
	unsigned int timer_interval;

	/*
	 * Set the timer to fire after GPSTATE_TIMER_INTERVAL ms, unless
	 * that would push the total ramp-down time past MAX_RAMP_DOWN_TIME
	 * ms. In that case set it up to fire exactly when
	 * MAX_RAMP_DOWN_TIME ms of ramp-down time have elapsed.
	 */
	if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
	     > MAX_RAMP_DOWN_TIME)
		timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
	else
		timer_interval = GPSTATE_TIMER_INTERVAL;

	mod_timer_pinned(&gpstates->timer, jiffies +
			msecs_to_jiffies(timer_interval));
}

/**
 * gpstate_timer_handler
 *
 * @data: pointer to cpufreq_policy on which timer was queued
 *
 * This handler brings down the global pstate closer to the local pstate
 * according to the quadratic equation. It queues a new timer if the global
 * pstate is still not equal to the local pstate.
 */
void gpstate_timer_handler(unsigned long data)
{
	struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
	struct global_pstate_info *gpstates = policy->driver_data;
	int gpstate_id;
	unsigned int time_diff = jiffies_to_msecs(jiffies)
					- gpstates->last_sampled_time;
	struct powernv_smp_call_data freq_data;

	/*
	 * If the lock is contended, target_index() is updating the
	 * gpstate data concurrently; skip this sample rather than
	 * spinning in timer context.
	 */
	if (!spin_trylock(&gpstates->gpstate_lock))
		return;

	gpstates->last_sampled_time += time_diff;
	gpstates->elapsed_time += time_diff;
	freq_data.pstate_id = gpstates->last_lpstate;

	if ((gpstates->last_gpstate == freq_data.pstate_id) ||
	    (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
		gpstate_id = freq_data.pstate_id;
		reset_gpstates(policy);
		gpstates->highest_lpstate = freq_data.pstate_id;
	} else {
		gpstate_id = calc_global_pstate(gpstates->elapsed_time,
						gpstates->highest_lpstate,
						freq_data.pstate_id);
	}

	/*
	 * If the local pstate is equal to the global pstate, the ramp down
	 * is over, so the timer need not be requeued.
	 */
	if (gpstate_id != freq_data.pstate_id)
		queue_gpstate_timer(gpstates);

	freq_data.gpstate_id = gpstate_id;
	gpstates->last_gpstate = freq_data.gpstate_id;
	gpstates->last_lpstate = freq_data.pstate_id;

	spin_unlock(&gpstates->gpstate_lock);

	/* Timer may get migrated to a different cpu on cpu hot unplug */
	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
}

/*
 * powernv_cpufreq_target_index: Sets the frequency corresponding to
 * the cpufreq table entry indexed by new_index on the cpus in the
 * mask policy->cpus
 */
static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
					unsigned int new_index)
{
	struct powernv_smp_call_data freq_data;
	unsigned int cur_msec, gpstate_id;
	struct global_pstate_info *gpstates = policy->driver_data;

	if (unlikely(rebooting) && new_index != get_nominal_index())
		return 0;

	if (!throttled)
		powernv_cpufreq_throttle_check(NULL);

	cur_msec = jiffies_to_msecs(get_jiffies_64());

	spin_lock(&gpstates->gpstate_lock);
	freq_data.pstate_id = powernv_freqs[new_index].driver_data;

	if (!gpstates->last_sampled_time) {
		gpstate_id = freq_data.pstate_id;
		gpstates->highest_lpstate = freq_data.pstate_id;
		goto gpstates_done;
	}

	if (gpstates->last_gpstate > freq_data.pstate_id) {
		gpstates->elapsed_time += cur_msec -
						 gpstates->last_sampled_time;

		/*
		 * If it has been ramping down for more than
		 * MAX_RAMP_DOWN_TIME, reset all the global pstate history
		 * and start fresh from the local pstate.
		 */
		if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
			reset_gpstates(policy);
			gpstates->highest_lpstate = freq_data.pstate_id;
			gpstate_id = freq_data.pstate_id;
		} else {
			/* Elapsed time is under 5 seconds; continue the ramp down */
			gpstate_id = calc_global_pstate(gpstates->elapsed_time,
							gpstates->highest_lpstate,
							freq_data.pstate_id);
		}
	} else {
		reset_gpstates(policy);
		gpstates->highest_lpstate = freq_data.pstate_id;
		gpstate_id = freq_data.pstate_id;
	}

	/*
	 * If the local pstate is equal to the global pstate, the ramp down
	 * is over, so the timer need not be queued.
	 */
	if (gpstate_id != freq_data.pstate_id)
		queue_gpstate_timer(gpstates);
	else
		del_timer_sync(&gpstates->timer);

gpstates_done:
	freq_data.gpstate_id = gpstate_id;
	gpstates->last_sampled_time = cur_msec;
	gpstates->last_gpstate = freq_data.gpstate_id;
	gpstates->last_lpstate = freq_data.pstate_id;

	spin_unlock(&gpstates->gpstate_lock);

	/*
	 * Use smp_call_function to send an IPI and execute the
	 * mtspr on the target CPU. We could do this without an IPI
	 * if the current CPU is within policy->cpus (core).
	 */
	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
	return 0;
}

static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
	int base, i, ret;
	struct kernfs_node *kn;
	struct global_pstate_info *gpstates;

	base = cpu_first_thread_sibling(policy->cpu);

	for (i = 0; i < threads_per_core; i++)
		cpumask_set_cpu(base + i, policy->cpus);

	kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
	if (!kn) {
		ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
		if (ret) {
			pr_info("Failed to create throttle stats directory for cpu %d\n",
				policy->cpu);
			return ret;
		}
	} else {
		kernfs_put(kn);
	}

	gpstates = kzalloc(sizeof(*gpstates), GFP_KERNEL);
	if (!gpstates)
		return -ENOMEM;

	policy->driver_data = gpstates;

	/* initialize timer */
	init_timer_deferrable(&gpstates->timer);
	gpstates->timer.data = (unsigned long)policy;
	gpstates->timer.function = gpstate_timer_handler;
	gpstates->timer.expires = jiffies +
				msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
	spin_lock_init(&gpstates->gpstate_lock);
	ret = cpufreq_table_validate_and_show(policy, powernv_freqs);

	if (ret < 0)
		kfree(policy->driver_data);

	return ret;
}

static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
{
	/* timer is deleted in the stop_cpu() callback */
	kfree(policy->driver_data);

	return 0;
}

static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
				unsigned long action, void *unused)
{
	int cpu;
	struct cpufreq_policy cpu_policy;

	rebooting = true;
	for_each_online_cpu(cpu) {
		cpufreq_get_policy(&cpu_policy, cpu);
		powernv_cpufreq_target_index(&cpu_policy, get_nominal_index());
	}

	return NOTIFY_DONE;
}

static struct notifier_block powernv_cpufreq_reboot_nb = {
	.notifier_call = powernv_cpufreq_reboot_notifier,
};

void powernv_cpufreq_work_fn(struct work_struct *work)
{
	struct chip *chip = container_of(work, struct chip, throttle);
	unsigned int cpu;
	cpumask_t mask;

	get_online_cpus();
	cpumask_and(&mask, &chip->mask, cpu_online_mask);
	smp_call_function_any(&mask,
			      powernv_cpufreq_throttle_check, NULL, 0);

	if (!chip->restore)
		goto out;

	chip->restore = false;
	for_each_cpu(cpu, &mask) {
		int index;
		struct cpufreq_policy policy;

		cpufreq_get_policy(&policy, cpu);
		cpufreq_frequency_table_target(&policy, policy.freq_table,
					       policy.cur,
					       CPUFREQ_RELATION_C, &index);
		powernv_cpufreq_target_index(&policy, index);
		cpumask_andnot(&mask, &mask, policy.cpus);
	}
out:
	put_online_cpus();
}

static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
				   unsigned long msg_type, void *_msg)
{
	struct opal_msg *msg = _msg;
	struct opal_occ_msg omsg;
	int i;

	if (msg_type != OPAL_MSG_OCC)
		return 0;

	omsg.type = be64_to_cpu(msg->params[0]);

	switch (omsg.type) {
	case OCC_RESET:
		occ_reset = true;
		pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n");
		/*
		 * powernv_cpufreq_throttle_check() is called from the
		 * target() callback, which detects the throttle state
		 * for governors like ondemand. Static governors,
		 * however, rarely call target(), so report throttling
		 * here as well.
		 */
		if (!throttled) {
			throttled = true;
			pr_warn("CPU frequency is throttled for the duration\n");
		}

		break;
	case OCC_LOAD:
		pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n");
		break;
	case OCC_THROTTLE:
		omsg.chip = be64_to_cpu(msg->params[1]);
		omsg.throttle_status = be64_to_cpu(msg->params[2]);

		if (occ_reset) {
			occ_reset = false;
			throttled = false;
			pr_info("OCC Active, CPU frequency is no longer throttled\n");

			for (i = 0; i < nr_chips; i++) {
				chips[i].restore = true;
				schedule_work(&chips[i].throttle);
			}

			return 0;
		}

		for (i = 0; i < nr_chips; i++)
			if (chips[i].id == omsg.chip)
				break;

		/* Ignore the message if the chip id is unknown */
		if (i == nr_chips)
			return 0;

		if (omsg.throttle_status >= 0 &&
		    omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
			chips[i].throttle_reason = omsg.throttle_status;
			chips[i].reason[omsg.throttle_status]++;
		}

		if (!omsg.throttle_status)
			chips[i].restore = true;

		schedule_work(&chips[i].throttle);
	}
	return 0;
}

static struct notifier_block powernv_cpufreq_opal_nb = {
	.notifier_call	= powernv_cpufreq_occ_msg,
	.next		= NULL,
	.priority	= 0,
};

static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
{
	struct powernv_smp_call_data freq_data;
	struct global_pstate_info *gpstates = policy->driver_data;

	freq_data.pstate_id = powernv_pstate_info.min;
	freq_data.gpstate_id = powernv_pstate_info.min;
	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
	del_timer_sync(&gpstates->timer);
}

static struct cpufreq_driver powernv_cpufreq_driver = {
	.name		= "powernv-cpufreq",
	.flags		= CPUFREQ_CONST_LOOPS,
	.init		= powernv_cpufreq_cpu_init,
	.exit		= powernv_cpufreq_cpu_exit,
	.verify		= cpufreq_generic_frequency_table_verify,
	.target_index	= powernv_cpufreq_target_index,
	.get		= powernv_cpufreq_get,
	.stop_cpu	= powernv_cpufreq_stop_cpu,
	.attr		= powernv_cpu_freq_attr,
};

static int init_chip_info(void)
{
	unsigned int chip[256];
	unsigned int cpu, i;
	unsigned int prev_chip_id = UINT_MAX;

	for_each_possible_cpu(cpu) {
		unsigned int id = cpu_to_chip_id(cpu);

		/*
		 * This relies on the CPUs of a chip being enumerated
		 * contiguously, so a change in chip id marks the start
		 * of a new chip.
		 */
		if (prev_chip_id != id) {
			prev_chip_id = id;
			chip[nr_chips++] = id;
		}
	}

	chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
	if (!chips)
		return -ENOMEM;

	for (i = 0; i < nr_chips; i++) {
		chips[i].id = chip[i];
		cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
		for_each_cpu(cpu, &chips[i].mask)
			per_cpu(chip_info, cpu) = &chips[i];
	}

	return 0;
}

static inline void clean_chip_info(void)
{
	kfree(chips);
}

static inline void unregister_all_notifiers(void)
{
	opal_message_notifier_unregister(OPAL_MSG_OCC,
					 &powernv_cpufreq_opal_nb);
	unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
}

static int __init powernv_cpufreq_init(void)
{
	int rc = 0;

	/* Don't probe on pseries (guest) platforms */
	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return -ENODEV;

	/* Discover pstates from device tree and init */
	rc = init_powernv_pstates();
	if (rc)
		goto out;

	/* Populate chip info */
	rc = init_chip_info();
	if (rc)
		goto out;

	register_reboot_notifier(&powernv_cpufreq_reboot_nb);
	opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);

	rc = cpufreq_register_driver(&powernv_cpufreq_driver);
	if (!rc)
		return 0;

	pr_info("Failed to register the cpufreq driver (%d)\n", rc);
	unregister_all_notifiers();
	clean_chip_info();
out:
	pr_info("Platform driver disabled. System does not support PState control\n");
	return rc;
}
module_init(powernv_cpufreq_init);

static void __exit powernv_cpufreq_exit(void)
{
	cpufreq_unregister_driver(&powernv_cpufreq_driver);
	unregister_all_notifiers();
	clean_chip_info();
}
module_exit(powernv_cpufreq_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");
955