xref: /illumos-gate/usr/src/uts/common/os/cpu_pm.c (revision 635216b673cf196ac523ff2a7ab715717e553292)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpu_pm.h>
27 #include <sys/cmn_err.h>
28 #include <sys/sdt.h>
29 
30 /*
31  * Solaris Event Based CPU Power Manager
32  *
33  * This file implements platform independent event based CPU power management.
34  * When CPUs are configured into the system, the CMT scheduling subsystem will
35  * query the platform to determine if the CPU belongs to any power management
36  * domains. That is, sets of CPUs that share power management states.
37  *
38  * Active Power Management domains represent a group of CPUs across which the
39  * Operating System can request speed changes (which may in turn result
40  * in voltage changes). This allows the operating system to trade off
41  * performance for power savings.
42  *
43  * Idle Power Management domains can enter power savings states when they are
44  * unutilized. These states allow the Operating System to trade off power
45  * for performance (in the form of latency to transition from the idle state
46  * to an active one).
47  *
48  * For each active and idle power domain the CMT subsystem instantiates, a
49  * cpupm_domain_t structure is created. As the dispatcher schedules threads
50  * to run on the system's CPUs, it will also track the utilization of the
51  * enumerated power domains. Significant changes in utilization will result
52  * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
54  * and in the context of the policy objectives in force, may decide to request
55  * the domain's power/performance state be changed.
56  *
57  * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
58  * manager will request the CPUs in the domain run at their fastest (and most
59  * power consuming) state. When the domain becomes idle (utilization at zero),
60  * the power manager will request that the CPUs run at a speed that saves the
61  * most power.
62  *
63  * The advantage of this scheme, is that the CPU power manager working with the
64  * dispatcher can be extremely responsive to changes in utilization. Optimizing
65  * for performance in the presence of utilization, and power savings in the
66  * presence of idleness. Such close collaboration with the dispatcher has other
67  * benefits that will play out in the form of more sophisticated power /
68  * performance policy in the near future.
69  *
70  * Avoiding state thrashing in the presence of transient periods of utilization
71  * and idleness while still being responsive to non-transient periods is key.
 * The power manager implements several "governors" that are used to throttle
73  * state transitions when a significant amount of transient idle or transient
74  * work is detected.
75  *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Ungoverned in the face of this utilization,
78  * hundreds of state transitions per second would result on an idle system.
79  *
80  * Transient idleness is common when a thread briefly yields the CPU to
81  * wait for an event elsewhere in the system. Where the idle period is short
82  * enough, the overhead associated with making the state transition doesn't
83  * justify the power savings.
84  */
85 
/*
 * Head of the singly linked list (chained through cpd_next) of every power
 * domain created by this file. The list is walked and extended with
 * cpu_lock held.
 */
static cpupm_domain_t *cpupm_domains = NULL;

/*
 * Policy currently in force. CPU power management starts out disabled
 * and stays that way until cpupm_set_policy() selects another policy.
 */
cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;

/*
 * Periods of utilization lasting less than this time interval are characterized
 * as transient. State changes associated with transient work are considered
 * to be mispredicted. That is, it's not worth raising and lowering power
 * states where the utilization lasts for less than this interval.
 *
 * Computed by cpupm_governor_initialize() as the measured transition
 * latency multiplied by cpupm_tw_gov_multiple.
 */
hrtime_t cpupm_tw_predict_interval;

/*
 * Periods of idleness lasting less than this time interval are characterized
 * as transient. State changes associated with transient idle are considered
 * to be mispredicted. That is, it's not worth lowering and raising power
 * states where the idleness lasts for less than this interval.
 *
 * Computed by cpupm_governor_initialize() as the measured transition
 * latency multiplied by cpupm_ti_gov_multiple.
 */
hrtime_t cpupm_ti_predict_interval;

/*
 * Number of mispredictions after which future transitions will be governed.
 */
int cpupm_mispredict_thresh = 2;

/*
 * Likewise, the number of mispredicted governed transitions after which the
 * governor will be removed.
 */
int cpupm_mispredict_gov_thresh = 10;

/*
 * The transient work and transient idle prediction intervals are initialized
 * to be some multiple of the amount of time it takes to transition a power
 * domain from the highest to the lowest power state, and back again, which
 * is measured.
 *
 * The default values of those multiples are specified here. Tuning them higher
 * will result in the transient work, and transient idle governors being used
 * more aggressively, which limits the frequency of state transitions at the
 * expense of performance and power savings, respectively.
 */
#define	CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
#define	CPUPM_TW_GOV_DEFAULT_MULTIPLE 25

/*
 * Number of high=>low=>high measurements performed, of which the average
 * is taken.
 */
#define	CPUPM_BENCHMARK_ITERS 5

/*
 * Multiples actually applied to the measured latency; they start out at
 * the defaults above.
 */
int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;


/*
 * Forward declarations for helpers defined after their first use.
 */
static int	cpupm_governor_initialize(void);
static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
146 
147 cpupm_policy_t
148 cpupm_get_policy(void)
149 {
150 	return (cpupm_policy);
151 }
152 
153 int
154 cpupm_set_policy(cpupm_policy_t new_policy)
155 {
156 	static int	gov_init = 0;
157 	int		result = 0;
158 
159 	mutex_enter(&cpu_lock);
160 	if (new_policy == cpupm_policy) {
161 		mutex_exit(&cpu_lock);
162 		return (result);
163 	}
164 
165 	/*
166 	 * Pausing CPUs causes a high priority thread to be scheduled
167 	 * on all other CPUs (besides the current one). This locks out
168 	 * other CPUs from making CPUPM state transitions.
169 	 */
170 	switch (new_policy) {
171 	case CPUPM_POLICY_DISABLED:
172 		pause_cpus(NULL);
173 		cpupm_policy = CPUPM_POLICY_DISABLED;
174 		start_cpus();
175 
176 		result = cmt_pad_disable(PGHW_POW_ACTIVE);
177 
178 		/*
179 		 * Once PAD has been enabled, it should always be possible
180 		 * to disable it.
181 		 */
182 		ASSERT(result == 0);
183 
184 		/*
185 		 * Bring all the active power domains to the maximum
186 		 * performance state.
187 		 */
188 		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
189 		    CPUPM_STATE_MAX_PERF);
190 
191 		break;
192 	case CPUPM_POLICY_ELASTIC:
193 
194 		result = cmt_pad_enable(PGHW_POW_ACTIVE);
195 		if (result < 0) {
196 			/*
197 			 * Failed to enable PAD across the active power
198 			 * domains, which may well be because none were
199 			 * enumerated.
200 			 */
201 			break;
202 		}
203 
204 		pause_cpus(NULL);
205 		/*
206 		 * Attempt to initialize the governor parameters the first
207 		 * time through.
208 		 */
209 		if (gov_init == 0) {
210 			result = cpupm_governor_initialize();
211 			if (result == 0) {
212 				gov_init = 1;
213 			} else {
214 				/*
215 				 * Failed to initialize the governor parameters
216 				 */
217 				start_cpus();
218 				break;
219 			}
220 		}
221 		cpupm_policy = CPUPM_POLICY_ELASTIC;
222 		start_cpus();
223 
224 		break;
225 	default:
226 		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
227 		    new_policy);
228 		ASSERT(0);
229 		break;
230 	}
231 	mutex_exit(&cpu_lock);
232 
233 	return (result);
234 }
235 
236 /*
237  * Look for an existing power domain
238  */
239 static cpupm_domain_t *
240 cpupm_domain_find(id_t id, cpupm_dtype_t type)
241 {
242 	ASSERT(MUTEX_HELD(&cpu_lock));
243 
244 	cpupm_domain_t *dom;
245 
246 	dom = cpupm_domains;
247 	while (dom != NULL) {
248 		if (id == dom->cpd_id && type == dom->cpd_type)
249 			return (dom);
250 		dom = dom->cpd_next;
251 	}
252 	return (NULL);
253 }
254 
255 /*
256  * Create a new domain
257  */
258 static cpupm_domain_t *
259 cpupm_domain_create(id_t id, cpupm_dtype_t type)
260 {
261 	cpupm_domain_t *dom;
262 
263 	ASSERT(MUTEX_HELD(&cpu_lock));
264 
265 	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
266 	dom->cpd_id = id;
267 	dom->cpd_type = type;
268 
269 	/* Link into the known domain list */
270 	dom->cpd_next = cpupm_domains;
271 	cpupm_domains = dom;
272 
273 	return (dom);
274 }
275 
276 static void
277 cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
278 {
279 	/*
280 	 * In the envent we're enumerating because the domain's state
281 	 * configuration has changed, toss any existing states.
282 	 */
283 	if (dom->cpd_nstates > 0) {
284 		kmem_free(dom->cpd_states,
285 		    sizeof (cpupm_state_t) * dom->cpd_nstates);
286 		dom->cpd_nstates = 0;
287 	}
288 
289 	/*
290 	 * Query to determine the number of states, allocate storage
291 	 * large enough to hold the state information, and pass it back
292 	 * to the platform driver to complete the enumeration.
293 	 */
294 	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
295 
296 	if (dom->cpd_nstates == 0)
297 		return;
298 
299 	dom->cpd_states =
300 	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
301 	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
302 }
303 
304 /*
305  * Initialize the specified type of power domain on behalf of the CPU
306  */
307 cpupm_domain_t *
308 cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
309 {
310 	cpupm_domain_t	*dom;
311 	id_t		did;
312 
313 	ASSERT(MUTEX_HELD(&cpu_lock));
314 
315 	/*
316 	 * Instantiate the domain if it doesn't already exist
317 	 * and enumerate its power states.
318 	 */
319 	did = cpupm_domain_id(cp, type);
320 	dom = cpupm_domain_find(did, type);
321 	if (dom == NULL) {
322 		dom = cpupm_domain_create(did, type);
323 		cpupm_domain_state_enum(cp, dom);
324 	}
325 
326 	/*
327 	 * Named state initialization
328 	 */
329 	if (type == CPUPM_DTYPE_ACTIVE) {
330 		/*
331 		 * For active power domains, the highest performance
332 		 * state is defined as first state returned from
333 		 * the domain enumeration.
334 		 */
335 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
336 		    &dom->cpd_states[0];
337 		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
338 		    &dom->cpd_states[dom->cpd_nstates - 1];
339 
340 		/*
341 		 * Begin by assuming CPU is running at the max perf state.
342 		 */
343 		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
344 	}
345 
346 	return (dom);
347 }
348 
349 /*
350  * Return the id associated with the given type of domain
351  * to which cp belongs
352  */
353 id_t
354 cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
355 {
356 	return (cpupm_plat_domain_id(cp, type));
357 }
358 
359 /*
360  * Initiate a state change for the specified domain on behalf of cp
361  */
362 int
363 cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
364 {
365 	if (cpupm_plat_change_state(cp, state) < 0)
366 		return (-1);
367 
368 	DTRACE_PROBE2(cpupm__change__state,
369 	    cpupm_domain_t *, dom,
370 	    cpupm_state_t *, state);
371 
372 	dom->cpd_state = state;
373 	return (0);
374 }
375 
/*
 * Interface into the CPU power manager to indicate a significant change
 * in utilization of the specified active power domain
 *
 *	cp	CPU on whose behalf any resulting state change is requested
 *	now	time of the event; differences against the domain's last
 *		raise / lower time are compared with the prediction intervals
 *		(NOTE(review): presumably in the same unscaled hrtime units
 *		cpupm_governor_initialize() uses for the intervals - confirm
 *		against the callers)
 *	dom	power domain whose utilization changed
 *	event	the kind of utilization change being reported
 *
 * Does nothing while the policy is CPUPM_POLICY_DISABLED. Otherwise the
 * governor book keeping in dom is updated and, unless the transition is
 * governed, a change to the max performance or low power named state is
 * requested. A failure to change state is ignored.
 */
void
cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
			    cpupm_util_event_t event)
{
	cpupm_state_t	*new_state = NULL;	/* state to request, if any */
	hrtime_t	last;	/* time of the preceding opposite request */

	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
		return;
	}

	/*
	 * What follows is a simple elastic power state management policy.
	 *
	 * If the utilization has become non-zero, and the domain was
	 * previously at its lowest power state, then transition it
	 * to the highest state in the spirit of "race to idle".
	 *
	 * If the utilization has dropped to zero, then transition the
	 * domain to its lowest power state.
	 *
	 * Statistics are maintained to implement governors to reduce state
	 * transitions resulting from either transient work, or periods of
	 * transient idleness on the domain.
	 */
	switch (event) {
	case CPUPM_DOM_REMAIN_BUSY:

		/*
		 * We've received an event that the domain is running a thread
		 * that's made it to the end of its time slice. If we are at
		 * low power, then raise it. If the transient work governor
		 * is engaged, then remove it.
		 */
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (dom->cpd_tw_governed == B_TRUE) {
				dom->cpd_tw_governed = B_FALSE;
				dom->cpd_tw = 0;
			}
		}
		break;

	case CPUPM_DOM_BUSY_FROM_IDLE:
		/*
		 * Remember when the domain last went idle, and note the
		 * time of this raise request. (now - last) is the length
		 * of the idle period that just ended.
		 */
		last = dom->cpd_last_lower;
		dom->cpd_last_raise = now;

		DTRACE_PROBE3(cpupm__raise__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * There's non-zero utilization, and the domain is
			 * running in the lower power state. Before we
			 * consider raising power, perform some book keeping
			 * for the transient idle governor.
			 */
			if (dom->cpd_ti_governed == B_FALSE) {
				if ((now - last) < cpupm_ti_predict_interval) {
					/*
					 * We're raising the domain power and
					 * we *just* lowered it. Consider
					 * this a mispredicted power state
					 * transition due to a transient
					 * idle period.
					 *
					 * Note: The presence of enough
					 * transient work across the domain can
					 * result in frequent transient idle
					 * periods. We don't want the ti
					 * governor being installed as a side
					 * effect of transient work, so the ti
					 * governor is left alone if the tw
					 * governor is already installed.
					 */
					if (dom->cpd_tw_governed == B_FALSE &&
					    ++dom->cpd_ti >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient
						 * idle transitions to
						 * justify governing future
						 * lowering requests.
						 */
						dom->cpd_ti_governed = B_TRUE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted the last
					 * lowering.
					 */
					dom->cpd_ti = 0;
				}
			}
			if (dom->cpd_tw_governed == B_TRUE) {
				/*
				 * Raise requests are governed due to
				 * transient work.
				 */
				DTRACE_PROBE1(cpupm__raise__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient idle governor is
				 * also in place, examine the preceding idle
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_ti_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_ti_predict_interval)) {
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_ti_governed =
						    B_FALSE;
						dom->cpd_ti = 0;
					}
				}
				/*
				 * Governed: leave the domain in the low
				 * power state.
				 */
				return;
			}
			/*
			 * Prepare to transition to the higher power state
			 */
			new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * Utilization is non-zero, and we're already running
			 * in the higher power state. Take this opportunity to
			 * perform some book keeping if the last lowering
			 * request was governed.
			 */
			if (dom->cpd_ti_governed == B_TRUE) {
				if ((now - last) >= cpupm_ti_predict_interval) {
					/*
					 * The domain is transient idle
					 * governed, and we mispredicted
					 * governing the last lowering request.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * idle periods to justify
						 * removing the governor.
						 */
						dom->cpd_ti_governed = B_FALSE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * Correctly predicted governing the
					 * last lowering request.
					 */
					dom->cpd_ti = 0;
				}
			}
		}
		break;

	case CPUPM_DOM_IDLE_FROM_BUSY:
		/*
		 * Remember when the domain last became busy, and note the
		 * time of this lowering request. (now - last) is the length
		 * of the busy period that just ended.
		 */
		last = dom->cpd_last_raise;
		dom->cpd_last_lower = now;

		DTRACE_PROBE3(cpupm__lower__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * The domain is idle, and is running in the highest
			 * performance state. Before we consider lowering power,
			 * perform some book keeping for the transient work
			 * governor.
			 */
			if (dom->cpd_tw_governed == B_FALSE) {
				if ((now - last) < cpupm_tw_predict_interval) {
					/*
					 * We're lowering the domain power and
					 * we *just* raised it. Consider the
					 * last raise mispredicted due to
					 * transient work.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient work
						 * transitions to justify
						 * governing future raise
						 * requests.
						 */
						dom->cpd_tw_governed = B_TRUE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted during the
					 * last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
			if (dom->cpd_ti_governed == B_TRUE) {
				/*
				 * Lowering requests are governed due to
				 * transient idleness.
				 */
				DTRACE_PROBE1(cpupm__lowering__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient work governor is
				 * also in place, examine the preceding busy
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_tw_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_tw_predict_interval)) {
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_tw_governed =
						    B_FALSE;
						dom->cpd_tw = 0;
					}
				}
				/*
				 * Governed: leave the domain in the max
				 * performance state.
				 */
				return;
			}

			/*
			 * Prepare to transition to a lower power state.
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * The domain is idle, and we're already running in
			 * the lower power state. Take this opportunity to
			 * perform some book keeping if the last raising
			 * request was governed.
			 */
			if (dom->cpd_tw_governed == B_TRUE) {
				if ((now - last) >= cpupm_tw_predict_interval) {
					/*
					 * The domain is transient work
					 * governed, and we mispredicted
					 * governing the last raising request.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * work to justify removing
						 * the governor.
						 */
						dom->cpd_tw_governed = B_FALSE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted governing
					 * the last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
		}
		break;
	}
	/*
	 * Change the power state
	 * Not much currently done if this doesn't succeed
	 */
	if (new_state)
		(void) cpupm_change_state(cp, dom, new_state);
}
683 
684 
685 /*
686  * Interface called by platforms to dynamically change the
687  * MAX performance cpupm state
688  */
689 void
690 cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
691 {
692 	cpupm_domain_t	*dom;
693 	id_t		did;
694 	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
695 	boolean_t	change_state = B_FALSE;
696 	cpupm_state_t	*new_state = NULL;
697 
698 	did = cpupm_domain_id(cp, type);
699 	mutex_enter(&cpu_lock);
700 	dom = cpupm_domain_find(did, type);
701 	mutex_exit(&cpu_lock);
702 
703 	/*
704 	 * Can use a lock to avoid changing the power state of the cpu when
705 	 * CPUPM_STATE_MAX_PERF is getting changed.
706 	 * Since the occurance of events to change MAX_PERF is not frequent,
707 	 * it may not be a good idea to overburden with locks. In the worst
708 	 * case, for one cycle the power may not get changed to the required
709 	 * level
710 	 */
711 	if (dom != NULL) {
712 		if (dom->cpd_state ==
713 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
714 			change_state = B_TRUE;
715 		}
716 
717 		/*
718 		 * If an out of range level is passed, use the lowest supported
719 		 * speed.
720 		 */
721 		if (max_perf_level >= dom->cpd_nstates &&
722 		    dom->cpd_nstates > 1) {
723 			max_perf_level = dom->cpd_nstates - 1;
724 		}
725 
726 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
727 		    &dom->cpd_states[max_perf_level];
728 
729 		/*
730 		 * If the current state is MAX_PERF, change the current state
731 		 * to the new MAX_PERF
732 		 */
733 		if (change_state) {
734 			new_state =
735 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
736 			if (new_state) {
737 				(void) cpupm_change_state(cp, dom, new_state);
738 			}
739 		}
740 	}
741 }
742 
743 /*
744  * Benchmark some power state transitions and use the transition latencies as
745  * a basis for initializing parameters for the transient idle and transient
746  * work governors.
747  *
748  * Returns 0 on success or -1 if the governor parameters could not be
749  * initialized.
750  */
751 static int
752 cpupm_governor_initialize(void)
753 {
754 	cpu_t		*cp = CPU;
755 	cpupm_domain_t	*dom;
756 	cpupm_state_t	*low, *high;
757 	id_t		did;
758 	hrtime_t	start, delta, deltas = 0;
759 	int		iterations;
760 
761 	did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
762 	if (did == CPUPM_NO_DOMAIN)
763 		return (-1);
764 
765 	dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
766 	if (dom == NULL)
767 		return (-1);
768 
769 	low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
770 	high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
771 
772 	for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
773 
774 		/*
775 		 * Measure the amount of time it takes to transition the
776 		 * domain down to the lowest, and back to the highest power
777 		 * state.
778 		 */
779 		start = gethrtime_unscaled();
780 		(void) cpupm_change_state(cp, dom, low);
781 		(void) cpupm_change_state(cp, dom, high);
782 		delta = gethrtime_unscaled() - start;
783 
784 		DTRACE_PROBE1(cpupm__benchmark__latency,
785 		    hrtime_t, delta);
786 
787 		deltas += delta;
788 	}
789 
790 	/*
791 	 * Figure the average latency, and tune the transient work and
792 	 * transient idle prediction intervals accordingly.
793 	 */
794 	delta = deltas / iterations;
795 
796 	cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
797 	cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
798 
799 	return (0);
800 }
801 
802 /*
803  * Initiate a state change in all CPUPM domain instances of the specified type
804  */
805 static void
806 cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
807 {
808 	cpu_t		*cp;
809 	pg_cmt_t	*pwr_pg;
810 	cpupm_domain_t	*dom;
811 	group_t		*hwset;
812 	group_iter_t	giter;
813 	pg_cpu_itr_t	cpu_iter;
814 	pghw_type_t	hw;
815 
816 	ASSERT(MUTEX_HELD(&cpu_lock));
817 
818 	switch (type) {
819 	case CPUPM_DTYPE_ACTIVE:
820 		hw = PGHW_POW_ACTIVE;
821 		break;
822 	default:
823 		/*
824 		 * Power domain types other than "active" unsupported.
825 		 */
826 		ASSERT(type == CPUPM_DTYPE_ACTIVE);
827 		return;
828 	}
829 
830 	if ((hwset = pghw_set_lookup(hw)) == NULL)
831 		return;
832 
833 	/*
834 	 * Iterate over the power domains
835 	 */
836 	group_iter_init(&giter);
837 	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
838 
839 		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
840 
841 		/*
842 		 * Iterate over the CPUs in each domain
843 		 */
844 		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
845 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
846 			(void) cpupm_change_state(cp, dom,
847 			    dom->cpd_named_states[state]);
848 		}
849 	}
850 }
851