xref: /titanic_51/usr/src/uts/common/os/cpu_pm.c (revision 54d5ddcceae506b00e8889ad38c9d15489f670c5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cpu_pm.h>
27 #include <sys/cmn_err.h>
28 #include <sys/sdt.h>
29 
30 /*
31  * Solaris Event Based CPU Power Manager
32  *
33  * This file implements platform independent event based CPU power management.
34  * When CPUs are configured into the system, the CMT scheduling subsystem will
35  * query the platform to determine if the CPU belongs to any power management
36  * domains. That is, sets of CPUs that share power management states.
37  *
38  * Active Power Management domains represent a group of CPUs across which the
39  * Operating System can request speed changes (which may in turn result
40  * in voltage changes). This allows the operating system to trade off
41  * performance for power savings.
42  *
43  * Idle Power Management domains can enter power savings states when they are
44  * unutilized. These states allow the Operating System to trade off power
45  * for performance (in the form of latency to transition from the idle state
46  * to an active one).
47  *
48  * For each active and idle power domain the CMT subsystem instantiates, a
49  * cpupm_domain_t structure is created. As the dispatcher schedules threads
50  * to run on the system's CPUs, it will also track the utilization of the
51  * enumerated power domains. Significant changes in utilization will result
52  * in the dispatcher sending the power manager events that relate to the
53  * utilization of the power domain. The power manager receives the events,
54  * and in the context of the policy objectives in force, may decide to request
55  * the domain's power/performance state be changed.
56  *
57  * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
58  * manager will request the CPUs in the domain run at their fastest (and most
59  * power consuming) state. When the domain becomes idle (utilization at zero),
60  * the power manager will request that the CPUs run at a speed that saves the
61  * most power.
62  *
63  * The advantage of this scheme, is that the CPU power manager working with the
64  * dispatcher can be extremely responsive to changes in utilization. Optimizing
65  * for performance in the presence of utilization, and power savings in the
66  * presence of idleness. Such close collaboration with the dispatcher has other
67  * benefits that will play out in the form of more sophisticated power /
68  * performance policy in the near future.
69  *
70  * Avoiding state thrashing in the presence of transient periods of utilization
71  * and idleness while still being responsive to non-transient periods is key.
72  * The power manager implements several "governors" that are used to throttle
73  * state transitions when a significant amount of transient idle or transient
74  * work is detected.
75  *
76  * Kernel background activity (e.g. taskq threads) is by far the most common
77  * form of transient utilization. Ungoverned in the face of this utilization,
78  * hundreds of state transitions per second would result on an idle system.
79  *
80  * Transient idleness is common when a thread briefly yields the CPU to
81  * wait for an event elsewhere in the system. Where the idle period is short
82  * enough, the overhead associated with making the state transition doesn't
83  * justify the power savings.
84  */
85 
86 static cpupm_domain_t *cpupm_domains = NULL;
87 
88 /*
89  * Uninitialized state of CPU power management is disabled
90  */
91 cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
92 
93 /*
94  * Periods of utilization lasting less than this time interval are characterized
95  * as transient. State changes associated with transient work are considered
96  * to be mispredicted. That is, it's not worth raising and lower power states
97  * where the utilization lasts for less than this interval.
98  */
99 hrtime_t cpupm_tw_predict_interval;
100 
101 /*
102  * Periods of idleness lasting less than this time interval are characterized
103  * as transient. State changes associated with transient idle are considered
104  * to be mispredicted. That is, it's not worth lowering and raising power
105  * states where the idleness lasts for less than this interval.
106  */
107 hrtime_t cpupm_ti_predict_interval;
108 
109 /*
110  * Number of mispredictions after which future transitions will be governed.
111  */
112 int cpupm_mispredict_thresh = 2;
113 
114 /*
115  * Likewise, the number of mispredicted governed transitions after which the
116  * governor will be removed.
117  */
118 int cpupm_mispredict_gov_thresh = 10;
119 
120 /*
121  * The transient work and transient idle prediction intervals are initialized
122  * to be some multiple of the amount of time it takes to transition a power
123  * domain from the highest to the lowest power state, and back again, which
124  * is measured.
125  *
126  * The default values of those multiples are specified here. Tuning them higher
127  * will result in the transient work, and transient idle governors being used
128  * more aggresively, which limits the frequency of state transitions at the
129  * expense of performance and power savings, respectively.
130  */
131 #define	CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
132 #define	CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
133 
134 /*
135  * Number of high=>low=>high measurements performed, of which the average
136  * is taken.
137  */
138 #define	CPUPM_BENCHMARK_ITERS 5
139 
140 int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
141 int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
142 
143 
144 static int	cpupm_governor_initialize(void);
145 static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
146 
147 cpupm_policy_t
148 cpupm_get_policy(void)
149 {
150 	return (cpupm_policy);
151 }
152 
153 int
154 cpupm_set_policy(cpupm_policy_t new_policy)
155 {
156 	static int	gov_init = 0;
157 	int		result = 0;
158 
159 	mutex_enter(&cpu_lock);
160 	if (new_policy == cpupm_policy) {
161 		mutex_exit(&cpu_lock);
162 		return (result);
163 	}
164 
165 	/*
166 	 * Pausing CPUs causes a high priority thread to be scheduled
167 	 * on all other CPUs (besides the current one). This locks out
168 	 * other CPUs from making CPUPM state transitions.
169 	 */
170 	switch (new_policy) {
171 	case CPUPM_POLICY_DISABLED:
172 		pause_cpus(NULL);
173 		cpupm_policy = CPUPM_POLICY_DISABLED;
174 		start_cpus();
175 
176 		result = cmt_pad_disable(PGHW_POW_ACTIVE);
177 
178 		/*
179 		 * Once PAD has been enabled, it should always be possible
180 		 * to disable it.
181 		 */
182 		ASSERT(result == 0);
183 
184 		/*
185 		 * Bring all the active power domains to the maximum
186 		 * performance state.
187 		 */
188 		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
189 		    CPUPM_STATE_MAX_PERF);
190 
191 		break;
192 	case CPUPM_POLICY_ELASTIC:
193 
194 		result = cmt_pad_enable(PGHW_POW_ACTIVE);
195 		if (result < 0) {
196 			/*
197 			 * Failed to enable PAD across the active power
198 			 * domains, which may well be because none were
199 			 * enumerated.
200 			 */
201 			break;
202 		}
203 
204 		pause_cpus(NULL);
205 		/*
206 		 * Attempt to initialize the governor parameters the first
207 		 * time through.
208 		 */
209 		if (gov_init == 0) {
210 			result = cpupm_governor_initialize();
211 			if (result == 0) {
212 				gov_init = 1;
213 			} else {
214 				/*
215 				 * Failed to initialize the governor parameters
216 				 */
217 				start_cpus();
218 				break;
219 			}
220 		}
221 		cpupm_policy = CPUPM_POLICY_ELASTIC;
222 		start_cpus();
223 
224 		break;
225 	default:
226 		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
227 		    new_policy);
228 		ASSERT(0);
229 		break;
230 	}
231 	mutex_exit(&cpu_lock);
232 
233 	return (result);
234 }
235 
236 /*
237  * Look for an existing power domain
238  */
239 static cpupm_domain_t *
240 cpupm_domain_find(id_t id, cpupm_dtype_t type)
241 {
242 	ASSERT(MUTEX_HELD(&cpu_lock));
243 
244 	cpupm_domain_t *dom;
245 
246 	dom = cpupm_domains;
247 	while (dom != NULL) {
248 		if (id == dom->cpd_id && type == dom->cpd_type)
249 			return (dom);
250 		dom = dom->cpd_next;
251 	}
252 	return (NULL);
253 }
254 
255 /*
256  * Create a new domain
257  */
258 static cpupm_domain_t *
259 cpupm_domain_create(id_t id, cpupm_dtype_t type)
260 {
261 	cpupm_domain_t *dom;
262 
263 	ASSERT(MUTEX_HELD(&cpu_lock));
264 
265 	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
266 	dom->cpd_id = id;
267 	dom->cpd_type = type;
268 
269 	/* Link into the known domain list */
270 	dom->cpd_next = cpupm_domains;
271 	cpupm_domains = dom;
272 
273 	return (dom);
274 }
275 
276 static void
277 cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
278 {
279 	/*
280 	 * In the envent we're enumerating because the domain's state
281 	 * configuration has changed, toss any existing states.
282 	 */
283 	if (dom->cpd_nstates > 0) {
284 		kmem_free(dom->cpd_states,
285 		    sizeof (cpupm_state_t) * dom->cpd_nstates);
286 		dom->cpd_nstates = 0;
287 	}
288 
289 	/*
290 	 * Query to determine the number of states, allocate storage
291 	 * large enough to hold the state information, and pass it back
292 	 * to the platform driver to complete the enumeration.
293 	 */
294 	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
295 
296 	if (dom->cpd_nstates == 0)
297 		return;
298 
299 	dom->cpd_states =
300 	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
301 	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
302 }
303 
304 /*
305  * Initialize the specified type of power domain on behalf of the CPU
306  */
307 cpupm_domain_t *
308 cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
309 {
310 	cpupm_domain_t	*dom;
311 	id_t		did;
312 
313 	ASSERT(MUTEX_HELD(&cpu_lock));
314 
315 	/*
316 	 * Instantiate the domain if it doesn't already exist
317 	 * and enumerate its power states.
318 	 */
319 	did = cpupm_domain_id(cp, type);
320 	dom = cpupm_domain_find(did, type);
321 	if (dom == NULL) {
322 		dom = cpupm_domain_create(did, type);
323 		cpupm_domain_state_enum(cp, dom);
324 	}
325 
326 	/*
327 	 * Named state initialization
328 	 */
329 	if (type == CPUPM_DTYPE_ACTIVE) {
330 		/*
331 		 * For active power domains, the highest performance
332 		 * state is defined as first state returned from
333 		 * the domain enumeration.
334 		 */
335 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
336 		    &dom->cpd_states[0];
337 		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
338 		    &dom->cpd_states[dom->cpd_nstates - 1];
339 
340 		/*
341 		 * Begin by assuming CPU is running at the max perf state.
342 		 */
343 		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
344 	}
345 
346 	return (dom);
347 }
348 
349 /*
350  * Return the id associated with the given type of domain
351  * to which cp belongs
352  */
353 id_t
354 cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
355 {
356 	return (cpupm_plat_domain_id(cp, type));
357 }
358 
359 /*
360  * Initiate a state change for the specified domain on behalf of cp
361  */
362 int
363 cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
364 {
365 	if (cpupm_plat_change_state(cp, state) < 0)
366 		return (-1);
367 
368 	DTRACE_PROBE2(cpupm__change__state,
369 	    cpupm_domain_t *, dom,
370 	    cpupm_state_t *, state);
371 
372 	dom->cpd_state = state;
373 	return (0);
374 }
375 
376 /*
377  * Interface into the CPU power manager to indicate a significant change
378  * in utilization of the specified active power domain
379  */
380 void
381 cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
382 			    cpupm_util_event_t event)
383 {
384 	cpupm_state_t	*new_state = NULL;
385 	hrtime_t	last;
386 
387 	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
388 		return;
389 	}
390 
391 	/*
392 	 * What follows is a simple elastic power state management policy.
393 	 *
394 	 * If the utilization has become non-zero, and the domain was
395 	 * previously at it's lowest power state, then transition it
396 	 * to the highest state in the spirit of "race to idle".
397 	 *
398 	 * If the utilization has dropped to zero, then transition the
399 	 * domain to its lowest power state.
400 	 *
401 	 * Statistics are maintained to implement governors to reduce state
402 	 * transitions resulting from either transient work, or periods of
403 	 * transient idleness on the domain.
404 	 */
405 	switch (event) {
406 	case CPUPM_DOM_REMAIN_BUSY:
407 
408 		/*
409 		 * We've received an event that the domain is running a thread
410 		 * that's made it to the end of it's time slice. If we are at
411 		 * low power, then raise it. If the transient work governor
412 		 * is engaged, then remove it.
413 		 */
414 		if (dom->cpd_state ==
415 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
416 			new_state =
417 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
418 			if (dom->cpd_tw_governed == B_TRUE) {
419 				dom->cpd_tw_governed = B_FALSE;
420 				dom->cpd_tw = 0;
421 			}
422 		}
423 		break;
424 
425 	case CPUPM_DOM_BUSY_FROM_IDLE:
426 		last = dom->cpd_last_lower;
427 		dom->cpd_last_raise = now;
428 
429 		DTRACE_PROBE3(cpupm__raise__req,
430 		    cpupm_domain_t *, dom,
431 		    hrtime_t, last,
432 		    hrtime_t, now);
433 
434 		if (dom->cpd_state ==
435 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
436 
437 			/*
438 			 * There's non-zero utilization, and the domain is
439 			 * running in the lower power state. Before we
440 			 * consider raising power, perform some book keeping
441 			 * for the transient idle governor.
442 			 */
443 			if (dom->cpd_ti_governed == B_FALSE) {
444 				if ((now - last) < cpupm_ti_predict_interval) {
445 					/*
446 					 * We're raising the domain power and
447 					 * we *just* lowered it. Consider
448 					 * this a mispredicted power state
449 					 * transition due to a transient
450 					 * idle period.
451 					 */
452 					if (++dom->cpd_ti >=
453 					    cpupm_mispredict_thresh) {
454 						/*
455 						 * There's enough transient
456 						 * idle transitions to
457 						 * justify governing future
458 						 * lowering requests.
459 						 */
460 						dom->cpd_ti_governed = B_TRUE;
461 						dom->cpd_ti = 0;
462 						DTRACE_PROBE1(
463 						    cpupm__ti__governed,
464 						    cpupm_domain_t *, dom);
465 					}
466 				} else {
467 					/*
468 					 * We correctly predicted the last
469 					 * lowering.
470 					 */
471 					dom->cpd_ti = 0;
472 				}
473 			}
474 			if (dom->cpd_tw_governed == B_TRUE) {
475 				/*
476 				 * Raise requests are governed due to
477 				 * transient work.
478 				 */
479 				DTRACE_PROBE1(cpupm__raise__governed,
480 				    cpupm_domain_t *, dom);
481 
482 				/*
483 				 * It's likely that we'll be governed for a
484 				 * while. If the transient idle governor is
485 				 * also in place, examine the preceeding idle
486 				 * interval to see if that still makes sense.
487 				 */
488 				if (dom->cpd_ti_governed == B_TRUE &&
489 				    ((now - last) >=
490 				    cpupm_ti_predict_interval)) {
491 					if (++dom->cpd_ti >=
492 					    cpupm_mispredict_gov_thresh) {
493 						dom->cpd_ti_governed =
494 						    B_FALSE;
495 						dom->cpd_ti = 0;
496 					}
497 				}
498 				return;
499 			}
500 			/*
501 			 * Prepare to transition to the higher power state
502 			 */
503 			new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
504 
505 		} else if (dom->cpd_state ==
506 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
507 
508 			/*
509 			 * Utilization is non-zero, and we're already running
510 			 * in the higher power state. Take this opportunity to
511 			 * perform some book keeping if the last lowering
512 			 * request was governed.
513 			 */
514 			if (dom->cpd_ti_governed == B_TRUE) {
515 				if ((now - last) >= cpupm_ti_predict_interval) {
516 					/*
517 					 * The domain is transient idle
518 					 * governed, and we mispredicted
519 					 * governing the last lowering request.
520 					 */
521 					if (++dom->cpd_ti >=
522 					    cpupm_mispredict_gov_thresh) {
523 						/*
524 						 * There's enough non-transient
525 						 * idle periods to justify
526 						 * removing the governor.
527 						 */
528 						dom->cpd_ti_governed = B_FALSE;
529 						dom->cpd_ti = 0;
530 						DTRACE_PROBE1(
531 						    cpupm__ti__ungoverned,
532 						    cpupm_domain_t *, dom);
533 					}
534 				} else {
535 					/*
536 					 * Correctly predicted governing the
537 					 * last lowering request.
538 					 */
539 					dom->cpd_ti = 0;
540 				}
541 			}
542 		}
543 		break;
544 
545 	case CPUPM_DOM_IDLE_FROM_BUSY:
546 		last = dom->cpd_last_raise;
547 		dom->cpd_last_lower = now;
548 
549 		DTRACE_PROBE3(cpupm__lower__req,
550 		    cpupm_domain_t *, dom,
551 		    hrtime_t, last,
552 		    hrtime_t, now);
553 
554 		if (dom->cpd_state ==
555 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
556 
557 			/*
558 			 * The domain is idle, and is running in the highest
559 			 * performance state. Before we consider lowering power,
560 			 * perform some book keeping for the transient work
561 			 * governor.
562 			 */
563 			if (dom->cpd_tw_governed == B_FALSE) {
564 				if ((now - last) < cpupm_tw_predict_interval) {
565 					/*
566 					 * We're lowering the domain power and
567 					 * we *just* raised it. Consider the
568 					 * last raise mispredicted due to
569 					 * transient work.
570 					 */
571 					if (++dom->cpd_tw >=
572 					    cpupm_mispredict_thresh) {
573 						/*
574 						 * There's enough transient idle
575 						 * transitions to justify
576 						 * governing future lowering
577 						 * requests.
578 						 */
579 						dom->cpd_tw_governed = B_TRUE;
580 						dom->cpd_tw = 0;
581 						DTRACE_PROBE1(
582 						    cpupm__tw__governed,
583 						    cpupm_domain_t *, dom);
584 					}
585 				} else {
586 					/*
587 					 * We correctly predicted during the
588 					 * last raise.
589 					 */
590 					dom->cpd_tw = 0;
591 				}
592 			}
593 			if (dom->cpd_ti_governed == B_TRUE) {
594 				/*
595 				 * Lowering requests are governed due to
596 				 * transient idleness.
597 				 */
598 				DTRACE_PROBE1(cpupm__lowering__governed,
599 				    cpupm_domain_t *, dom);
600 
601 				/*
602 				 * It's likely that we'll be governed for a
603 				 * while. If the transient work governor is
604 				 * also in place, examine the preceeding busy
605 				 * interval to see if that still makes sense.
606 				 */
607 				if (dom->cpd_tw_governed == B_TRUE &&
608 				    ((now - last) >=
609 				    cpupm_tw_predict_interval)) {
610 					if (++dom->cpd_tw >=
611 					    cpupm_mispredict_gov_thresh) {
612 						dom->cpd_tw_governed =
613 						    B_FALSE;
614 						dom->cpd_tw = 0;
615 					}
616 				}
617 				return;
618 			}
619 
620 			/*
621 			 * Prepare to transition to a lower power state.
622 			 */
623 			new_state =
624 			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
625 
626 		} else if (dom->cpd_state ==
627 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
628 
629 			/*
630 			 * The domain is idle, and we're already running in
631 			 * the lower power state. Take this opportunity to
632 			 * perform some book keeping if the last raising
633 			 * request was governed.
634 			 */
635 			if (dom->cpd_tw_governed == B_TRUE) {
636 				if ((now - last) >= cpupm_tw_predict_interval) {
637 					/*
638 					 * The domain is transient work
639 					 * governed, and we mispredicted
640 					 * governing the last raising request.
641 					 */
642 					if (++dom->cpd_tw >=
643 					    cpupm_mispredict_gov_thresh) {
644 						/*
645 						 * There's enough non-transient
646 						 * work to justify removing
647 						 * the governor.
648 						 */
649 						dom->cpd_tw_governed = B_FALSE;
650 						dom->cpd_tw = 0;
651 						DTRACE_PROBE1(
652 						    cpupm__tw__ungoverned,
653 						    cpupm_domain_t *, dom);
654 					}
655 				} else {
656 					/*
657 					 * We correctly predicted governing
658 					 * the last raise.
659 					 */
660 					dom->cpd_tw = 0;
661 				}
662 			}
663 		}
664 		break;
665 	}
666 	/*
667 	 * Change the power state
668 	 * Not much currently done if this doesn't succeed
669 	 */
670 	if (new_state)
671 		(void) cpupm_change_state(cp, dom, new_state);
672 }
673 
674 
675 /*
676  * Interface called by platforms to dynamically change the
677  * MAX performance cpupm state
678  */
679 void
680 cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
681 {
682 	cpupm_domain_t	*dom;
683 	id_t		did;
684 	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
685 	boolean_t	change_state = B_FALSE;
686 	cpupm_state_t	*new_state = NULL;
687 
688 	did = cpupm_domain_id(cp, type);
689 	mutex_enter(&cpu_lock);
690 	dom = cpupm_domain_find(did, type);
691 	mutex_exit(&cpu_lock);
692 
693 	/*
694 	 * Can use a lock to avoid changing the power state of the cpu when
695 	 * CPUPM_STATE_MAX_PERF is getting changed.
696 	 * Since the occurance of events to change MAX_PERF is not frequent,
697 	 * it may not be a good idea to overburden with locks. In the worst
698 	 * case, for one cycle the power may not get changed to the required
699 	 * level
700 	 */
701 	if (dom != NULL) {
702 		if (dom->cpd_state ==
703 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
704 			change_state = B_TRUE;
705 		}
706 
707 		/*
708 		 * If an out of range level is passed, use the lowest supported
709 		 * speed.
710 		 */
711 		if (max_perf_level >= dom->cpd_nstates &&
712 		    dom->cpd_nstates > 1) {
713 			max_perf_level = dom->cpd_nstates - 1;
714 		}
715 
716 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
717 		    &dom->cpd_states[max_perf_level];
718 
719 		/*
720 		 * If the current state is MAX_PERF, change the current state
721 		 * to the new MAX_PERF
722 		 */
723 		if (change_state) {
724 			new_state =
725 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
726 			if (new_state) {
727 				(void) cpupm_change_state(cp, dom, new_state);
728 			}
729 		}
730 	}
731 }
732 
733 /*
734  * Benchmark some power state transitions and use the transition latencies as
735  * a basis for initializing parameters for the transient idle and transient
736  * work governors.
737  *
738  * Returns 0 on success or -1 if the governor parameters could not be
739  * initialized.
740  */
741 static int
742 cpupm_governor_initialize(void)
743 {
744 	cpu_t		*cp = CPU;
745 	cpupm_domain_t	*dom;
746 	cpupm_state_t	*low, *high;
747 	id_t		did;
748 	hrtime_t	start, delta, deltas = 0;
749 	int		iterations;
750 
751 	did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
752 	if (did == CPUPM_NO_DOMAIN)
753 		return (-1);
754 
755 	dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
756 	if (dom == NULL)
757 		return (-1);
758 
759 	low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
760 	high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
761 
762 	for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
763 
764 		/*
765 		 * Measure the amount of time it takes to transition the
766 		 * domain down to the lowest, and back to the highest power
767 		 * state.
768 		 */
769 		start = gethrtime_unscaled();
770 		(void) cpupm_change_state(cp, dom, low);
771 		(void) cpupm_change_state(cp, dom, high);
772 		delta = gethrtime_unscaled() - start;
773 
774 		DTRACE_PROBE1(cpupm__benchmark__latency,
775 		    hrtime_t, delta);
776 
777 		deltas += delta;
778 	}
779 
780 	/*
781 	 * Figure the average latency, and tune the transient work and
782 	 * transient idle prediction intervals accordingly.
783 	 */
784 	delta = deltas / iterations;
785 
786 	cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
787 	cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
788 
789 	return (0);
790 }
791 
792 /*
793  * Initiate a state change in all CPUPM domain instances of the specified type
794  */
795 static void
796 cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
797 {
798 	cpu_t		*cp;
799 	pg_cmt_t	*pwr_pg;
800 	cpupm_domain_t	*dom;
801 	group_t		*hwset;
802 	group_iter_t	giter;
803 	pg_cpu_itr_t	cpu_iter;
804 	pghw_type_t	hw;
805 
806 	ASSERT(MUTEX_HELD(&cpu_lock));
807 
808 	switch (type) {
809 	case CPUPM_DTYPE_ACTIVE:
810 		hw = PGHW_POW_ACTIVE;
811 		break;
812 	default:
813 		/*
814 		 * Power domain types other than "active" unsupported.
815 		 */
816 		ASSERT(type == CPUPM_DTYPE_ACTIVE);
817 		return;
818 	}
819 
820 	if ((hwset = pghw_set_lookup(hw)) == NULL)
821 		return;
822 
823 	/*
824 	 * Iterate over the power domains
825 	 */
826 	group_iter_init(&giter);
827 	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
828 
829 		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
830 
831 		/*
832 		 * Iterate over the CPUs in each domain
833 		 */
834 		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
835 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
836 			(void) cpupm_change_state(cp, dom,
837 			    dom->cpd_named_states[state]);
838 		}
839 	}
840 }
841