xref: /illumos-gate/usr/src/uts/common/os/cpu_pm.c (revision 4c28a617e3922d92a58e813a5b955eb526b9c386)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpu_pm.h>
27 #include <sys/cmn_err.h>
28 #include <sys/time.h>
29 #include <sys/sdt.h>
30 
31 /*
32  * Solaris Event Based CPU Power Manager
33  *
34  * This file implements platform independent event based CPU power management.
35  * When CPUs are configured into the system, the CMT scheduling subsystem will
36  * query the platform to determine if the CPU belongs to any power management
37  * domains. That is, sets of CPUs that share power management states.
38  *
39  * Active Power Management domains represent a group of CPUs across which the
40  * Operating System can request speed changes (which may in turn result
41  * in voltage changes). This allows the operating system to trade off
42  * performance for power savings.
43  *
44  * Idle Power Management domains can enter power savings states when they are
45  * unutilized. These states allow the Operating System to trade off power
46  * for performance (in the form of latency to transition from the idle state
47  * to an active one).
48  *
49  * For each active and idle power domain the CMT subsystem instantiates, a
50  * cpupm_domain_t structure is created. As the dispatcher schedules threads
51  * to run on the system's CPUs, it will also track the utilization of the
52  * enumerated power domains. Significant changes in utilization will result
53  * in the dispatcher sending the power manager events that relate to the
54  * utilization of the power domain. The power manager receives the events,
55  * and in the context of the policy objectives in force, may decide to request
56  * the domain's power/performance state be changed.
57  *
58  * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
59  * manager will request the CPUs in the domain run at their fastest (and most
60  * power consuming) state. When the domain becomes idle (utilization at zero),
61  * the power manager will request that the CPUs run at a speed that saves the
62  * most power.
63  *
64  * The advantage of this scheme, is that the CPU power manager working with the
65  * dispatcher can be extremely responsive to changes in utilization. Optimizing
66  * for performance in the presence of utilization, and power savings in the
67  * presence of idleness. Such close collaboration with the dispatcher has other
68  * benefits that will play out in the form of more sophisticated power /
69  * performance policy in the near future.
70  *
71  * Avoiding state thrashing in the presence of transient periods of utilization
72  * and idleness while still being responsive to non-transient periods is key.
73  * The power manager implements a "governor" that is used to throttle
74  * state transitions when a significant amount of transient idle or transient
75  * work is detected.
76  *
77  * Kernel background activity (e.g. taskq threads) is by far the most common
78  * form of transient utilization. Ungoverned in the face of this utilization,
79  * hundreds of state transitions per second would result on an idle system.
80  *
81  * Transient idleness is common when a thread briefly yields the CPU to
82  * wait for an event elsewhere in the system. Where the idle period is short
83  * enough, the overhead associated with making the state transition doesn't
84  * justify the power savings.
85  *
86  * The following is the state machine for the governor implemented by
87  * cpupm_utilization_event():
88  *
89  *         ----->---tw---->-----
90  *        /                     \
91  *      (I)-<-ti-<-     -<-ntw-<(W)
92  *       |         \   /         |
93  *       \          \ /          /
94  *        >-nti/rm->(D)--->-tw->-
95  * Key:
96  *
97  * States
98  * - (D): Default (ungoverned)
99  * - (W): Transient work governed
100  * - (I): Transient idle governed
101  * State Transitions
102  * - tw: transient work
103  * - ti: transient idleness
104  * - ntw: non-transient work
105  * - nti: non-transient idleness
106  * - rm: thread remain event
107  */
108 
109 static cpupm_domain_t *cpupm_domains = NULL;
110 
111 /*
112  * Uninitialized state of CPU power management is disabled
113  */
114 cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
115 
116 /*
117  * Periods of utilization lasting less than this time interval are characterized
118  * as transient. State changes associated with transient work are considered
119  * to be mispredicted. That is, it's not worth raising and lower power states
120  * where the utilization lasts for less than this interval.
121  */
122 hrtime_t cpupm_tw_predict_interval;
123 
124 /*
125  * Periods of idleness lasting less than this time interval are characterized
126  * as transient. State changes associated with transient idle are considered
127  * to be mispredicted. That is, it's not worth lowering and raising power
128  * states where the idleness lasts for less than this interval.
129  */
130 hrtime_t cpupm_ti_predict_interval;
131 
132 /*
133  * Number of mispredictions after which future transitions will be governed.
134  */
135 int cpupm_mispredict_thresh = 4;
136 
137 /*
138  * Likewise, the number of mispredicted governed transitions after which the
139  * governor will be removed.
140  */
141 int cpupm_mispredict_gov_thresh = 4;
142 
143 /*
144  * The transient work and transient idle prediction intervals are specified
145  * here. Tuning them higher will result in the transient work, and transient
146  * idle governors being used more aggresively, which limits the frequency of
147  * state transitions at the expense of performance and power savings,
148  * respectively. The intervals are specified in nanoseconds.
149  */
150 /*
151  * 400 usec
152  */
153 #define	CPUPM_DEFAULT_TI_INTERVAL	400000
154 /*
155  * 400 usec
156  */
157 #define	CPUPM_DEFAULT_TW_INTERVAL	400000
158 
159 hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
160 hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;
161 
162 
163 static void	cpupm_governor_initialize(void);
164 static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
165 
166 cpupm_policy_t
167 cpupm_get_policy(void)
168 {
169 	return (cpupm_policy);
170 }
171 
172 int
173 cpupm_set_policy(cpupm_policy_t new_policy)
174 {
175 	static int	gov_init = 0;
176 	int		result = 0;
177 
178 	mutex_enter(&cpu_lock);
179 	if (new_policy == cpupm_policy) {
180 		mutex_exit(&cpu_lock);
181 		return (result);
182 	}
183 
184 	/*
185 	 * Pausing CPUs causes a high priority thread to be scheduled
186 	 * on all other CPUs (besides the current one). This locks out
187 	 * other CPUs from making CPUPM state transitions.
188 	 */
189 	switch (new_policy) {
190 	case CPUPM_POLICY_DISABLED:
191 		pause_cpus(NULL, NULL);
192 		cpupm_policy = CPUPM_POLICY_DISABLED;
193 		start_cpus();
194 
195 		result = cmt_pad_disable(PGHW_POW_ACTIVE);
196 
197 		/*
198 		 * Once PAD has been enabled, it should always be possible
199 		 * to disable it.
200 		 */
201 		ASSERT(result == 0);
202 
203 		/*
204 		 * Bring all the active power domains to the maximum
205 		 * performance state.
206 		 */
207 		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
208 		    CPUPM_STATE_MAX_PERF);
209 
210 		break;
211 	case CPUPM_POLICY_ELASTIC:
212 
213 		result = cmt_pad_enable(PGHW_POW_ACTIVE);
214 		if (result < 0) {
215 			/*
216 			 * Failed to enable PAD across the active power
217 			 * domains, which may well be because none were
218 			 * enumerated.
219 			 */
220 			break;
221 		}
222 
223 		/*
224 		 * Initialize the governor parameters the first time through.
225 		 */
226 		if (gov_init == 0) {
227 			cpupm_governor_initialize();
228 			gov_init = 1;
229 		}
230 
231 		pause_cpus(NULL, NULL);
232 		cpupm_policy = CPUPM_POLICY_ELASTIC;
233 		start_cpus();
234 
235 		break;
236 	default:
237 		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
238 		    new_policy);
239 		ASSERT(0);
240 		break;
241 	}
242 	mutex_exit(&cpu_lock);
243 
244 	return (result);
245 }
246 
247 /*
248  * Look for an existing power domain
249  */
250 static cpupm_domain_t *
251 cpupm_domain_find(id_t id, cpupm_dtype_t type)
252 {
253 	ASSERT(MUTEX_HELD(&cpu_lock));
254 
255 	cpupm_domain_t *dom;
256 
257 	dom = cpupm_domains;
258 	while (dom != NULL) {
259 		if (id == dom->cpd_id && type == dom->cpd_type)
260 			return (dom);
261 		dom = dom->cpd_next;
262 	}
263 	return (NULL);
264 }
265 
266 /*
267  * Create a new domain
268  */
269 static cpupm_domain_t *
270 cpupm_domain_create(id_t id, cpupm_dtype_t type)
271 {
272 	cpupm_domain_t *dom;
273 
274 	ASSERT(MUTEX_HELD(&cpu_lock));
275 
276 	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
277 	dom->cpd_id = id;
278 	dom->cpd_type = type;
279 
280 	/* Link into the known domain list */
281 	dom->cpd_next = cpupm_domains;
282 	cpupm_domains = dom;
283 
284 	return (dom);
285 }
286 
287 static void
288 cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
289 {
290 	/*
291 	 * In the envent we're enumerating because the domain's state
292 	 * configuration has changed, toss any existing states.
293 	 */
294 	if (dom->cpd_nstates > 0) {
295 		kmem_free(dom->cpd_states,
296 		    sizeof (cpupm_state_t) * dom->cpd_nstates);
297 		dom->cpd_nstates = 0;
298 	}
299 
300 	/*
301 	 * Query to determine the number of states, allocate storage
302 	 * large enough to hold the state information, and pass it back
303 	 * to the platform driver to complete the enumeration.
304 	 */
305 	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
306 
307 	if (dom->cpd_nstates == 0)
308 		return;
309 
310 	dom->cpd_states =
311 	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
312 	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
313 }
314 
315 /*
316  * Initialize the specified type of power domain on behalf of the CPU
317  */
318 cpupm_domain_t *
319 cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
320 {
321 	cpupm_domain_t	*dom;
322 	id_t		did;
323 
324 	ASSERT(MUTEX_HELD(&cpu_lock));
325 
326 	/*
327 	 * Instantiate the domain if it doesn't already exist
328 	 * and enumerate its power states.
329 	 */
330 	did = cpupm_domain_id(cp, type);
331 	dom = cpupm_domain_find(did, type);
332 	if (dom == NULL) {
333 		dom = cpupm_domain_create(did, type);
334 		cpupm_domain_state_enum(cp, dom);
335 	}
336 
337 	/*
338 	 * Named state initialization
339 	 */
340 	if (type == CPUPM_DTYPE_ACTIVE) {
341 		/*
342 		 * For active power domains, the highest performance
343 		 * state is defined as first state returned from
344 		 * the domain enumeration.
345 		 */
346 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
347 		    &dom->cpd_states[0];
348 		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
349 		    &dom->cpd_states[dom->cpd_nstates - 1];
350 
351 		/*
352 		 * Begin by assuming CPU is running at the max perf state.
353 		 */
354 		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
355 	}
356 
357 	return (dom);
358 }
359 
360 /*
361  * Return the id associated with the given type of domain
362  * to which cp belongs
363  */
364 id_t
365 cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
366 {
367 	return (cpupm_plat_domain_id(cp, type));
368 }
369 
370 /*
371  * Initiate a state change for the specified domain on behalf of cp
372  */
373 int
374 cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
375 {
376 	if (cpupm_plat_change_state(cp, state) < 0)
377 		return (-1);
378 
379 	DTRACE_PROBE2(cpupm__change__state,
380 	    cpupm_domain_t *, dom,
381 	    cpupm_state_t *, state);
382 
383 	dom->cpd_state = state;
384 	return (0);
385 }
386 
387 /*
388  * Interface into the CPU power manager to indicate a significant change
389  * in utilization of the specified active power domain
390  */
391 void
392 cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
393 			    cpupm_util_event_t event)
394 {
395 	cpupm_state_t	*new_state = NULL;
396 	hrtime_t	last;
397 
398 	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
399 		return;
400 	}
401 
402 	/*
403 	 * What follows is a simple elastic power state management policy.
404 	 *
405 	 * If the utilization has become non-zero, and the domain was
406 	 * previously at it's lowest power state, then transition it
407 	 * to the highest state in the spirit of "race to idle".
408 	 *
409 	 * If the utilization has dropped to zero, then transition the
410 	 * domain to its lowest power state.
411 	 *
412 	 * Statistics are maintained to implement a governor to reduce state
413 	 * transitions resulting from either transient work, or periods of
414 	 * transient idleness on the domain.
415 	 */
416 	switch (event) {
417 	case CPUPM_DOM_REMAIN_BUSY:
418 
419 		/*
420 		 * We've received an event that the domain is running a thread
421 		 * that's made it to the end of it's time slice. If we are at
422 		 * low power, then raise it. If the transient work governor
423 		 * is engaged, then remove it.
424 		 */
425 		if (dom->cpd_state ==
426 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
427 			new_state =
428 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
429 			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
430 				dom->cpd_governor = CPUPM_GOV_DISENGAGED;
431 				dom->cpd_tw = 0;
432 			}
433 		}
434 		break;
435 
436 	case CPUPM_DOM_BUSY_FROM_IDLE:
437 		last = dom->cpd_last_lower;
438 		dom->cpd_last_raise = now;
439 
440 		DTRACE_PROBE3(cpupm__raise__req,
441 		    cpupm_domain_t *, dom,
442 		    hrtime_t, last,
443 		    hrtime_t, now);
444 
445 		if (dom->cpd_state ==
446 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
447 
448 			/*
449 			 * There's non-zero utilization, and the domain is
450 			 * running in the lower power state. Before we
451 			 * consider raising power, check if the preceeding
452 			 * idle period was transient in duration.
453 			 *
454 			 * If the domain is already transient work governed,
455 			 * then we don't bother maintaining transient idle
456 			 * statistics, as the presence of enough transient work
457 			 * can also make the domain frequently transiently idle.
458 			 * In this case, we still want to remain transient work
459 			 * governed.
460 			 */
461 			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
462 				if ((now - last) < cpupm_ti_predict_interval) {
463 					/*
464 					 * We're raising the domain power and
465 					 * we *just* lowered it. Consider
466 					 * this a mispredicted power state
467 					 * transition due to a transient
468 					 * idle period.
469 					 */
470 					if (++dom->cpd_ti >=
471 					    cpupm_mispredict_thresh) {
472 						/*
473 						 * There's enough transient
474 						 * idle transitions to
475 						 * justify governing future
476 						 * lowering requests.
477 						 */
478 						dom->cpd_governor =
479 						    CPUPM_GOV_TRANS_IDLE;
480 						dom->cpd_ti = 0;
481 						DTRACE_PROBE1(
482 						    cpupm__ti__governed,
483 						    cpupm_domain_t *, dom);
484 					}
485 				} else {
486 					/*
487 					 * We correctly predicted the last
488 					 * lowering.
489 					 */
490 					dom->cpd_ti = 0;
491 				}
492 			}
493 			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
494 				/*
495 				 * Raise requests are governed due to
496 				 * transient work.
497 				 */
498 				DTRACE_PROBE1(cpupm__raise__governed,
499 				    cpupm_domain_t *, dom);
500 
501 				return;
502 			}
503 			/*
504 			 * Prepare to transition to the higher power state
505 			 */
506 			new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
507 
508 		} else if (dom->cpd_state ==
509 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
510 
511 			/*
512 			 * Utilization is non-zero, and we're already running
513 			 * in the higher power state. Take this opportunity to
514 			 * perform some book keeping if the last lowering
515 			 * request was governed.
516 			 */
517 			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
518 
519 				if ((now - last) >= cpupm_ti_predict_interval) {
520 					/*
521 					 * The domain is transient idle
522 					 * governed, and we mispredicted
523 					 * governing the last lowering request.
524 					 */
525 					if (++dom->cpd_ti >=
526 					    cpupm_mispredict_gov_thresh) {
527 						/*
528 						 * There's enough non-transient
529 						 * idle periods to justify
530 						 * removing the governor.
531 						 */
532 						dom->cpd_governor =
533 						    CPUPM_GOV_DISENGAGED;
534 						dom->cpd_ti = 0;
535 						DTRACE_PROBE1(
536 						    cpupm__ti__ungoverned,
537 						    cpupm_domain_t *, dom);
538 					}
539 				} else {
540 					/*
541 					 * Correctly predicted governing the
542 					 * last lowering request.
543 					 */
544 					dom->cpd_ti = 0;
545 				}
546 			}
547 		}
548 		break;
549 
550 	case CPUPM_DOM_IDLE_FROM_BUSY:
551 		last = dom->cpd_last_raise;
552 		dom->cpd_last_lower = now;
553 
554 		DTRACE_PROBE3(cpupm__lower__req,
555 		    cpupm_domain_t *, dom,
556 		    hrtime_t, last,
557 		    hrtime_t, now);
558 
559 		if (dom->cpd_state ==
560 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
561 
562 			/*
563 			 * The domain is idle, and is running in the highest
564 			 * performance state. Before we consider lowering power,
565 			 * perform some book keeping for the transient work
566 			 * governor.
567 			 */
568 			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
569 				if ((now - last) < cpupm_tw_predict_interval) {
570 					/*
571 					 * We're lowering the domain power and
572 					 * we *just* raised it. Consider the
573 					 * last raise mispredicted due to
574 					 * transient work.
575 					 */
576 					if (++dom->cpd_tw >=
577 					    cpupm_mispredict_thresh) {
578 						/*
579 						 * There's enough transient work
580 						 * transitions to justify
581 						 * governing future raise
582 						 * requests.
583 						 */
584 						dom->cpd_governor =
585 						    CPUPM_GOV_TRANS_WORK;
586 						dom->cpd_tw = 0;
587 						DTRACE_PROBE1(
588 						    cpupm__tw__governed,
589 						    cpupm_domain_t *, dom);
590 					}
591 				} else {
592 					/*
593 					 * We correctly predicted during the
594 					 * last raise.
595 					 */
596 					dom->cpd_tw = 0;
597 				}
598 			}
599 			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
600 				/*
601 				 * Lowering requests are governed due to
602 				 * transient idleness.
603 				 */
604 				DTRACE_PROBE1(cpupm__lowering__governed,
605 				    cpupm_domain_t *, dom);
606 
607 				return;
608 			}
609 
610 			/*
611 			 * Prepare to transition to a lower power state.
612 			 */
613 			new_state =
614 			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
615 
616 		} else if (dom->cpd_state ==
617 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
618 
619 			/*
620 			 * The domain is idle, and we're already running in
621 			 * the lower power state. Take this opportunity to
622 			 * perform some book keeping if the last raising
623 			 * request was governed.
624 			 */
625 			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
626 				if ((now - last) >= cpupm_tw_predict_interval) {
627 					/*
628 					 * The domain is transient work
629 					 * governed, and we mispredicted
630 					 * governing the last raising request.
631 					 */
632 					if (++dom->cpd_tw >=
633 					    cpupm_mispredict_gov_thresh) {
634 						/*
635 						 * There's enough non-transient
636 						 * work to justify removing
637 						 * the governor.
638 						 */
639 						dom->cpd_governor =
640 						    CPUPM_GOV_DISENGAGED;
641 						dom->cpd_tw = 0;
642 						DTRACE_PROBE1(
643 						    cpupm__tw__ungoverned,
644 						    cpupm_domain_t *, dom);
645 					}
646 				} else {
647 					/*
648 					 * We correctly predicted governing
649 					 * the last raise.
650 					 */
651 					dom->cpd_tw = 0;
652 				}
653 			}
654 		}
655 		break;
656 	}
657 	/*
658 	 * Change the power state
659 	 * Not much currently done if this doesn't succeed
660 	 */
661 	if (new_state)
662 		(void) cpupm_change_state(cp, dom, new_state);
663 }
664 
665 
666 /*
667  * Interface called by platforms to dynamically change the
668  * MAX performance cpupm state
669  */
670 void
671 cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
672 {
673 	cpupm_domain_t	*dom;
674 	id_t		did;
675 	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
676 	boolean_t	change_state = B_FALSE;
677 	cpupm_state_t	*new_state = NULL;
678 
679 	did = cpupm_domain_id(cp, type);
680 	if (MUTEX_HELD(&cpu_lock)) {
681 		dom = cpupm_domain_find(did, type);
682 	} else {
683 		mutex_enter(&cpu_lock);
684 		dom = cpupm_domain_find(did, type);
685 		mutex_exit(&cpu_lock);
686 	}
687 
688 	/*
689 	 * Can use a lock to avoid changing the power state of the cpu when
690 	 * CPUPM_STATE_MAX_PERF is getting changed.
691 	 * Since the occurance of events to change MAX_PERF is not frequent,
692 	 * it may not be a good idea to overburden with locks. In the worst
693 	 * case, for one cycle the power may not get changed to the required
694 	 * level
695 	 */
696 	if (dom != NULL) {
697 		if (dom->cpd_state ==
698 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
699 			change_state = B_TRUE;
700 		}
701 
702 		/*
703 		 * If an out of range level is passed, use the lowest supported
704 		 * speed.
705 		 */
706 		if (max_perf_level >= dom->cpd_nstates &&
707 		    dom->cpd_nstates > 1) {
708 			max_perf_level = dom->cpd_nstates - 1;
709 		}
710 
711 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
712 		    &dom->cpd_states[max_perf_level];
713 
714 		/*
715 		 * If the current state is MAX_PERF, change the current state
716 		 * to the new MAX_PERF
717 		 */
718 		if (change_state) {
719 			new_state =
720 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
721 			if (new_state) {
722 				(void) cpupm_change_state(cp, dom, new_state);
723 			}
724 		}
725 	}
726 }
727 
728 /*
729  * Initialize the parameters for the transience governor state machine
730  */
731 static void
732 cpupm_governor_initialize(void)
733 {
734 	/*
735 	 * The default prediction intervals are specified in nanoseconds.
736 	 * Convert these to the equivalent in unscaled hrtime, which is the
737 	 * format of the timestamps passed to cpupm_utilization_event()
738 	 */
739 	cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
740 	cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
741 }
742 
743 /*
744  * Initiate a state change in all CPUPM domain instances of the specified type
745  */
746 static void
747 cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
748 {
749 	cpu_t		*cp;
750 	pg_cmt_t	*pwr_pg;
751 	cpupm_domain_t	*dom;
752 	group_t		*hwset;
753 	group_iter_t	giter;
754 	pg_cpu_itr_t	cpu_iter;
755 	pghw_type_t	hw;
756 
757 	ASSERT(MUTEX_HELD(&cpu_lock));
758 
759 	switch (type) {
760 	case CPUPM_DTYPE_ACTIVE:
761 		hw = PGHW_POW_ACTIVE;
762 		break;
763 	default:
764 		/*
765 		 * Power domain types other than "active" unsupported.
766 		 */
767 		ASSERT(type == CPUPM_DTYPE_ACTIVE);
768 		return;
769 	}
770 
771 	if ((hwset = pghw_set_lookup(hw)) == NULL)
772 		return;
773 
774 	/*
775 	 * Iterate over the power domains
776 	 */
777 	group_iter_init(&giter);
778 	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
779 
780 		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
781 
782 		/*
783 		 * Iterate over the CPUs in each domain
784 		 */
785 		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
786 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
787 			(void) cpupm_change_state(cp, dom,
788 			    dom->cpd_named_states[state]);
789 		}
790 	}
791 }
792