10e751525SEric Saxe /* 20e751525SEric Saxe * CDDL HEADER START 30e751525SEric Saxe * 40e751525SEric Saxe * The contents of this file are subject to the terms of the 50e751525SEric Saxe * Common Development and Distribution License (the "License"). 60e751525SEric Saxe * You may not use this file except in compliance with the License. 70e751525SEric Saxe * 80e751525SEric Saxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90e751525SEric Saxe * or http://www.opensolaris.org/os/licensing. 100e751525SEric Saxe * See the License for the specific language governing permissions 110e751525SEric Saxe * and limitations under the License. 120e751525SEric Saxe * 130e751525SEric Saxe * When distributing Covered Code, include this CDDL HEADER in each 140e751525SEric Saxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150e751525SEric Saxe * If applicable, add the following below this CDDL HEADER, with the 160e751525SEric Saxe * fields enclosed by brackets "[]" replaced with your own identifying 170e751525SEric Saxe * information: Portions Copyright [yyyy] [name of copyright owner] 180e751525SEric Saxe * 190e751525SEric Saxe * CDDL HEADER END 200e751525SEric Saxe */ 210e751525SEric Saxe /* 220e751525SEric Saxe * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 230e751525SEric Saxe * Use is subject to license terms. 240e751525SEric Saxe */ 250e751525SEric Saxe 260e751525SEric Saxe #include <sys/cpu_pm.h> 270e751525SEric Saxe #include <sys/cmn_err.h> 28113b131bSEric Saxe #include <sys/time.h> 290e751525SEric Saxe #include <sys/sdt.h> 300e751525SEric Saxe 310e751525SEric Saxe /* 320e751525SEric Saxe * Solaris Event Based CPU Power Manager 330e751525SEric Saxe * 340e751525SEric Saxe * This file implements platform independent event based CPU power management. 
 * When CPUs are configured into the system, the CMT scheduling subsystem will
 * query the platform to determine if the CPU belongs to any power management
 * domains. That is, sets of CPUs that share power management states.
 *
 * Active Power Management domains represent a group of CPUs across which the
 * Operating System can request speed changes (which may in turn result
 * in voltage changes). This allows the operating system to trade off
 * performance for power savings.
 *
 * Idle Power Management domains can enter power savings states when they are
 * unutilized. These states allow the Operating System to trade off power
 * for performance (in the form of latency to transition from the idle state
 * to an active one).
 *
 * For each active and idle power domain the CMT subsystem instantiates, a
 * cpupm_domain_t structure is created. As the dispatcher schedules threads
 * to run on the system's CPUs, it will also track the utilization of the
 * enumerated power domains. Significant changes in utilization will result
 * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
 * and in the context of the policy objectives in force, may decide to request
 * the domain's power/performance state be changed.
 *
 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
 * manager will request the CPUs in the domain run at their fastest (and most
 * power consuming) state. When the domain becomes idle (utilization at zero),
 * the power manager will request that the CPUs run at a speed that saves the
 * most power.
 *
 * The advantage of this scheme is that the CPU power manager, working with
 * the dispatcher, can be extremely responsive to changes in utilization:
 * optimizing for performance in the presence of utilization, and power
 * savings in the presence of idleness. Such close collaboration with the
 * dispatcher has other benefits that will play out in the form of more
 * sophisticated power / performance policy in the near future.
 *
 * Avoiding state thrashing in the presence of transient periods of utilization
 * and idleness while still being responsive to non-transient periods is key.
 * The power manager implements a "governor" that is used to throttle
 * state transitions when a significant amount of transient idle or transient
 * work is detected.
 *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Ungoverned in the face of this utilization,
 * hundreds of state transitions per second would result on an idle system.
 *
 * Transient idleness is common when a thread briefly yields the CPU to
 * wait for an event elsewhere in the system.
Where the idle period is short 830e751525SEric Saxe * enough, the overhead associated with making the state transition doesn't 840e751525SEric Saxe * justify the power savings. 85113b131bSEric Saxe * 86113b131bSEric Saxe * The following is the state machine for the governor implemented by 87113b131bSEric Saxe * cpupm_utilization_event(): 88113b131bSEric Saxe * 89113b131bSEric Saxe * ----->---tw---->----- 90113b131bSEric Saxe * / \ 91113b131bSEric Saxe * (I)-<-ti-<- -<-ntw-<(W) 92113b131bSEric Saxe * | \ / | 93113b131bSEric Saxe * \ \ / / 94113b131bSEric Saxe * >-nti/rm->(D)--->-tw->- 95113b131bSEric Saxe * Key: 96113b131bSEric Saxe * 97113b131bSEric Saxe * States 98113b131bSEric Saxe * - (D): Default (ungoverned) 99113b131bSEric Saxe * - (W): Transient work governed 100113b131bSEric Saxe * - (I): Transient idle governed 101113b131bSEric Saxe * State Transitions 102113b131bSEric Saxe * - tw: transient work 103113b131bSEric Saxe * - ti: transient idleness 104113b131bSEric Saxe * - ntw: non-transient work 105113b131bSEric Saxe * - nti: non-transient idleness 106113b131bSEric Saxe * - rm: thread remain event 1070e751525SEric Saxe */ 1080e751525SEric Saxe 1090e751525SEric Saxe static cpupm_domain_t *cpupm_domains = NULL; 1100e751525SEric Saxe 1110e751525SEric Saxe /* 1120e751525SEric Saxe * Uninitialized state of CPU power management is disabled 1130e751525SEric Saxe */ 1140e751525SEric Saxe cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED; 1150e751525SEric Saxe 1160e751525SEric Saxe /* 1170e751525SEric Saxe * Periods of utilization lasting less than this time interval are characterized 1180e751525SEric Saxe * as transient. State changes associated with transient work are considered 1190e751525SEric Saxe * to be mispredicted. That is, it's not worth raising and lower power states 1200e751525SEric Saxe * where the utilization lasts for less than this interval. 
1210e751525SEric Saxe */ 1220e751525SEric Saxe hrtime_t cpupm_tw_predict_interval; 1230e751525SEric Saxe 1240e751525SEric Saxe /* 1250e751525SEric Saxe * Periods of idleness lasting less than this time interval are characterized 1260e751525SEric Saxe * as transient. State changes associated with transient idle are considered 1270e751525SEric Saxe * to be mispredicted. That is, it's not worth lowering and raising power 1280e751525SEric Saxe * states where the idleness lasts for less than this interval. 1290e751525SEric Saxe */ 1300e751525SEric Saxe hrtime_t cpupm_ti_predict_interval; 1310e751525SEric Saxe 1320e751525SEric Saxe /* 1330e751525SEric Saxe * Number of mispredictions after which future transitions will be governed. 1340e751525SEric Saxe */ 135113b131bSEric Saxe int cpupm_mispredict_thresh = 4; 1360e751525SEric Saxe 1370e751525SEric Saxe /* 1380e751525SEric Saxe * Likewise, the number of mispredicted governed transitions after which the 1390e751525SEric Saxe * governor will be removed. 1400e751525SEric Saxe */ 141113b131bSEric Saxe int cpupm_mispredict_gov_thresh = 4; 1420e751525SEric Saxe 1430e751525SEric Saxe /* 144113b131bSEric Saxe * The transient work and transient idle prediction intervals are specified 145113b131bSEric Saxe * here. Tuning them higher will result in the transient work, and transient 146113b131bSEric Saxe * idle governors being used more aggresively, which limits the frequency of 147113b131bSEric Saxe * state transitions at the expense of performance and power savings, 148113b131bSEric Saxe * respectively. The intervals are specified in nanoseconds. 
1490e751525SEric Saxe */ 1500e751525SEric Saxe /* 151113b131bSEric Saxe * 400 usec 1520e751525SEric Saxe */ 153113b131bSEric Saxe #define CPUPM_DEFAULT_TI_INTERVAL 400000 154113b131bSEric Saxe /* 155113b131bSEric Saxe * 400 usec 156113b131bSEric Saxe */ 157113b131bSEric Saxe #define CPUPM_DEFAULT_TW_INTERVAL 400000 1580e751525SEric Saxe 159113b131bSEric Saxe hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL; 160113b131bSEric Saxe hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL; 1610e751525SEric Saxe 1620e751525SEric Saxe 163113b131bSEric Saxe static void cpupm_governor_initialize(void); 1640e751525SEric Saxe static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t); 1650e751525SEric Saxe 1660e751525SEric Saxe cpupm_policy_t 1670e751525SEric Saxe cpupm_get_policy(void) 1680e751525SEric Saxe { 1690e751525SEric Saxe return (cpupm_policy); 1700e751525SEric Saxe } 1710e751525SEric Saxe 1720e751525SEric Saxe int 1730e751525SEric Saxe cpupm_set_policy(cpupm_policy_t new_policy) 1740e751525SEric Saxe { 1750e751525SEric Saxe static int gov_init = 0; 1760e751525SEric Saxe int result = 0; 1770e751525SEric Saxe 1780e751525SEric Saxe mutex_enter(&cpu_lock); 1790e751525SEric Saxe if (new_policy == cpupm_policy) { 1800e751525SEric Saxe mutex_exit(&cpu_lock); 1810e751525SEric Saxe return (result); 1820e751525SEric Saxe } 1830e751525SEric Saxe 1840e751525SEric Saxe /* 1850e751525SEric Saxe * Pausing CPUs causes a high priority thread to be scheduled 1860e751525SEric Saxe * on all other CPUs (besides the current one). This locks out 1870e751525SEric Saxe * other CPUs from making CPUPM state transitions. 
1880e751525SEric Saxe */ 1890e751525SEric Saxe switch (new_policy) { 1900e751525SEric Saxe case CPUPM_POLICY_DISABLED: 191*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL); 1920e751525SEric Saxe cpupm_policy = CPUPM_POLICY_DISABLED; 1930e751525SEric Saxe start_cpus(); 1940e751525SEric Saxe 1950e751525SEric Saxe result = cmt_pad_disable(PGHW_POW_ACTIVE); 1960e751525SEric Saxe 1970e751525SEric Saxe /* 1980e751525SEric Saxe * Once PAD has been enabled, it should always be possible 1990e751525SEric Saxe * to disable it. 2000e751525SEric Saxe */ 2010e751525SEric Saxe ASSERT(result == 0); 2020e751525SEric Saxe 2030e751525SEric Saxe /* 2040e751525SEric Saxe * Bring all the active power domains to the maximum 2050e751525SEric Saxe * performance state. 2060e751525SEric Saxe */ 2070e751525SEric Saxe cpupm_state_change_global(CPUPM_DTYPE_ACTIVE, 2080e751525SEric Saxe CPUPM_STATE_MAX_PERF); 2090e751525SEric Saxe 2100e751525SEric Saxe break; 2110e751525SEric Saxe case CPUPM_POLICY_ELASTIC: 2120e751525SEric Saxe 2130e751525SEric Saxe result = cmt_pad_enable(PGHW_POW_ACTIVE); 2140e751525SEric Saxe if (result < 0) { 2150e751525SEric Saxe /* 2160e751525SEric Saxe * Failed to enable PAD across the active power 2170e751525SEric Saxe * domains, which may well be because none were 2180e751525SEric Saxe * enumerated. 2190e751525SEric Saxe */ 2200e751525SEric Saxe break; 2210e751525SEric Saxe } 2220e751525SEric Saxe 2230e751525SEric Saxe /* 224113b131bSEric Saxe * Initialize the governor parameters the first time through. 
2250e751525SEric Saxe */ 2260e751525SEric Saxe if (gov_init == 0) { 227113b131bSEric Saxe cpupm_governor_initialize(); 2280e751525SEric Saxe gov_init = 1; 2290e751525SEric Saxe } 230113b131bSEric Saxe 231*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL); 2320e751525SEric Saxe cpupm_policy = CPUPM_POLICY_ELASTIC; 2330e751525SEric Saxe start_cpus(); 2340e751525SEric Saxe 2350e751525SEric Saxe break; 2360e751525SEric Saxe default: 2370e751525SEric Saxe cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n", 2380e751525SEric Saxe new_policy); 2390e751525SEric Saxe ASSERT(0); 2400e751525SEric Saxe break; 2410e751525SEric Saxe } 2420e751525SEric Saxe mutex_exit(&cpu_lock); 2430e751525SEric Saxe 2440e751525SEric Saxe return (result); 2450e751525SEric Saxe } 2460e751525SEric Saxe 2470e751525SEric Saxe /* 2480e751525SEric Saxe * Look for an existing power domain 2490e751525SEric Saxe */ 2500e751525SEric Saxe static cpupm_domain_t * 2510e751525SEric Saxe cpupm_domain_find(id_t id, cpupm_dtype_t type) 2520e751525SEric Saxe { 2530e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 2540e751525SEric Saxe 2550e751525SEric Saxe cpupm_domain_t *dom; 2560e751525SEric Saxe 2570e751525SEric Saxe dom = cpupm_domains; 2580e751525SEric Saxe while (dom != NULL) { 2590e751525SEric Saxe if (id == dom->cpd_id && type == dom->cpd_type) 2600e751525SEric Saxe return (dom); 2610e751525SEric Saxe dom = dom->cpd_next; 2620e751525SEric Saxe } 2630e751525SEric Saxe return (NULL); 2640e751525SEric Saxe } 2650e751525SEric Saxe 2660e751525SEric Saxe /* 2670e751525SEric Saxe * Create a new domain 2680e751525SEric Saxe */ 2690e751525SEric Saxe static cpupm_domain_t * 2700e751525SEric Saxe cpupm_domain_create(id_t id, cpupm_dtype_t type) 2710e751525SEric Saxe { 2720e751525SEric Saxe cpupm_domain_t *dom; 2730e751525SEric Saxe 2740e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 2750e751525SEric Saxe 2760e751525SEric Saxe dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP); 2770e751525SEric Saxe 
dom->cpd_id = id; 2780e751525SEric Saxe dom->cpd_type = type; 2790e751525SEric Saxe 2800e751525SEric Saxe /* Link into the known domain list */ 2810e751525SEric Saxe dom->cpd_next = cpupm_domains; 2820e751525SEric Saxe cpupm_domains = dom; 2830e751525SEric Saxe 2840e751525SEric Saxe return (dom); 2850e751525SEric Saxe } 2860e751525SEric Saxe 2870e751525SEric Saxe static void 2880e751525SEric Saxe cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom) 2890e751525SEric Saxe { 2900e751525SEric Saxe /* 2910e751525SEric Saxe * In the envent we're enumerating because the domain's state 2920e751525SEric Saxe * configuration has changed, toss any existing states. 2930e751525SEric Saxe */ 2940e751525SEric Saxe if (dom->cpd_nstates > 0) { 2950e751525SEric Saxe kmem_free(dom->cpd_states, 2960e751525SEric Saxe sizeof (cpupm_state_t) * dom->cpd_nstates); 2970e751525SEric Saxe dom->cpd_nstates = 0; 2980e751525SEric Saxe } 2990e751525SEric Saxe 3000e751525SEric Saxe /* 3010e751525SEric Saxe * Query to determine the number of states, allocate storage 3020e751525SEric Saxe * large enough to hold the state information, and pass it back 3030e751525SEric Saxe * to the platform driver to complete the enumeration. 
3040e751525SEric Saxe */ 3050e751525SEric Saxe dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL); 3060e751525SEric Saxe 3070e751525SEric Saxe if (dom->cpd_nstates == 0) 3080e751525SEric Saxe return; 3090e751525SEric Saxe 3100e751525SEric Saxe dom->cpd_states = 3110e751525SEric Saxe kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP); 3120e751525SEric Saxe (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states); 3130e751525SEric Saxe } 3140e751525SEric Saxe 3150e751525SEric Saxe /* 3160e751525SEric Saxe * Initialize the specified type of power domain on behalf of the CPU 3170e751525SEric Saxe */ 3180e751525SEric Saxe cpupm_domain_t * 3190e751525SEric Saxe cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type) 3200e751525SEric Saxe { 3210e751525SEric Saxe cpupm_domain_t *dom; 3220e751525SEric Saxe id_t did; 3230e751525SEric Saxe 3240e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 3250e751525SEric Saxe 3260e751525SEric Saxe /* 3270e751525SEric Saxe * Instantiate the domain if it doesn't already exist 3280e751525SEric Saxe * and enumerate its power states. 3290e751525SEric Saxe */ 3300e751525SEric Saxe did = cpupm_domain_id(cp, type); 3310e751525SEric Saxe dom = cpupm_domain_find(did, type); 3320e751525SEric Saxe if (dom == NULL) { 3330e751525SEric Saxe dom = cpupm_domain_create(did, type); 3340e751525SEric Saxe cpupm_domain_state_enum(cp, dom); 3350e751525SEric Saxe } 3360e751525SEric Saxe 3370e751525SEric Saxe /* 3380e751525SEric Saxe * Named state initialization 3390e751525SEric Saxe */ 3400e751525SEric Saxe if (type == CPUPM_DTYPE_ACTIVE) { 3410e751525SEric Saxe /* 3420e751525SEric Saxe * For active power domains, the highest performance 3430e751525SEric Saxe * state is defined as first state returned from 3440e751525SEric Saxe * the domain enumeration. 
3450e751525SEric Saxe */ 3460e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF] = 3470e751525SEric Saxe &dom->cpd_states[0]; 3480e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER] = 3490e751525SEric Saxe &dom->cpd_states[dom->cpd_nstates - 1]; 3500e751525SEric Saxe 3510e751525SEric Saxe /* 3520e751525SEric Saxe * Begin by assuming CPU is running at the max perf state. 3530e751525SEric Saxe */ 3540e751525SEric Saxe dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; 3550e751525SEric Saxe } 3560e751525SEric Saxe 3570e751525SEric Saxe return (dom); 3580e751525SEric Saxe } 3590e751525SEric Saxe 3600e751525SEric Saxe /* 3610e751525SEric Saxe * Return the id associated with the given type of domain 3620e751525SEric Saxe * to which cp belongs 3630e751525SEric Saxe */ 3640e751525SEric Saxe id_t 3650e751525SEric Saxe cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type) 3660e751525SEric Saxe { 3670e751525SEric Saxe return (cpupm_plat_domain_id(cp, type)); 3680e751525SEric Saxe } 3690e751525SEric Saxe 3700e751525SEric Saxe /* 3710e751525SEric Saxe * Initiate a state change for the specified domain on behalf of cp 3720e751525SEric Saxe */ 3730e751525SEric Saxe int 3740e751525SEric Saxe cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state) 3750e751525SEric Saxe { 3760e751525SEric Saxe if (cpupm_plat_change_state(cp, state) < 0) 3770e751525SEric Saxe return (-1); 3780e751525SEric Saxe 3790e751525SEric Saxe DTRACE_PROBE2(cpupm__change__state, 3800e751525SEric Saxe cpupm_domain_t *, dom, 3810e751525SEric Saxe cpupm_state_t *, state); 3820e751525SEric Saxe 3830e751525SEric Saxe dom->cpd_state = state; 3840e751525SEric Saxe return (0); 3850e751525SEric Saxe } 3860e751525SEric Saxe 3870e751525SEric Saxe /* 3880e751525SEric Saxe * Interface into the CPU power manager to indicate a significant change 3890e751525SEric Saxe * in utilization of the specified active power domain 3900e751525SEric Saxe */ 3910e751525SEric Saxe void 
3920e751525SEric Saxe cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, 3930e751525SEric Saxe cpupm_util_event_t event) 3940e751525SEric Saxe { 3950e751525SEric Saxe cpupm_state_t *new_state = NULL; 3960e751525SEric Saxe hrtime_t last; 3970e751525SEric Saxe 3980e751525SEric Saxe if (cpupm_policy == CPUPM_POLICY_DISABLED) { 3990e751525SEric Saxe return; 4000e751525SEric Saxe } 4010e751525SEric Saxe 4020e751525SEric Saxe /* 4030e751525SEric Saxe * What follows is a simple elastic power state management policy. 4040e751525SEric Saxe * 4050e751525SEric Saxe * If the utilization has become non-zero, and the domain was 4060e751525SEric Saxe * previously at it's lowest power state, then transition it 4070e751525SEric Saxe * to the highest state in the spirit of "race to idle". 4080e751525SEric Saxe * 4090e751525SEric Saxe * If the utilization has dropped to zero, then transition the 4100e751525SEric Saxe * domain to its lowest power state. 4110e751525SEric Saxe * 412113b131bSEric Saxe * Statistics are maintained to implement a governor to reduce state 4130e751525SEric Saxe * transitions resulting from either transient work, or periods of 4140e751525SEric Saxe * transient idleness on the domain. 4150e751525SEric Saxe */ 4160e751525SEric Saxe switch (event) { 4170e751525SEric Saxe case CPUPM_DOM_REMAIN_BUSY: 4180e751525SEric Saxe 4190e751525SEric Saxe /* 4200e751525SEric Saxe * We've received an event that the domain is running a thread 4210e751525SEric Saxe * that's made it to the end of it's time slice. If we are at 4220e751525SEric Saxe * low power, then raise it. If the transient work governor 4230e751525SEric Saxe * is engaged, then remove it. 
4240e751525SEric Saxe */ 4250e751525SEric Saxe if (dom->cpd_state == 4260e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) { 4270e751525SEric Saxe new_state = 4280e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; 429113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) { 430113b131bSEric Saxe dom->cpd_governor = CPUPM_GOV_DISENGAGED; 4310e751525SEric Saxe dom->cpd_tw = 0; 4320e751525SEric Saxe } 4330e751525SEric Saxe } 4340e751525SEric Saxe break; 4350e751525SEric Saxe 4360e751525SEric Saxe case CPUPM_DOM_BUSY_FROM_IDLE: 4370e751525SEric Saxe last = dom->cpd_last_lower; 4380e751525SEric Saxe dom->cpd_last_raise = now; 4390e751525SEric Saxe 4400e751525SEric Saxe DTRACE_PROBE3(cpupm__raise__req, 4410e751525SEric Saxe cpupm_domain_t *, dom, 4420e751525SEric Saxe hrtime_t, last, 4430e751525SEric Saxe hrtime_t, now); 4440e751525SEric Saxe 4450e751525SEric Saxe if (dom->cpd_state == 4460e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) { 4470e751525SEric Saxe 4480e751525SEric Saxe /* 4490e751525SEric Saxe * There's non-zero utilization, and the domain is 4500e751525SEric Saxe * running in the lower power state. Before we 451113b131bSEric Saxe * consider raising power, check if the preceeding 452113b131bSEric Saxe * idle period was transient in duration. 453113b131bSEric Saxe * 454113b131bSEric Saxe * If the domain is already transient work governed, 455113b131bSEric Saxe * then we don't bother maintaining transient idle 456113b131bSEric Saxe * statistics, as the presence of enough transient work 457113b131bSEric Saxe * can also make the domain frequently transiently idle. 458113b131bSEric Saxe * In this case, we still want to remain transient work 459113b131bSEric Saxe * governed. 
4600e751525SEric Saxe */ 461113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) { 4620e751525SEric Saxe if ((now - last) < cpupm_ti_predict_interval) { 4630e751525SEric Saxe /* 4640e751525SEric Saxe * We're raising the domain power and 4650e751525SEric Saxe * we *just* lowered it. Consider 4660e751525SEric Saxe * this a mispredicted power state 4670e751525SEric Saxe * transition due to a transient 4680e751525SEric Saxe * idle period. 4690e751525SEric Saxe */ 470113b131bSEric Saxe if (++dom->cpd_ti >= 4710e751525SEric Saxe cpupm_mispredict_thresh) { 4720e751525SEric Saxe /* 4730e751525SEric Saxe * There's enough transient 4740e751525SEric Saxe * idle transitions to 4750e751525SEric Saxe * justify governing future 4760e751525SEric Saxe * lowering requests. 4770e751525SEric Saxe */ 478113b131bSEric Saxe dom->cpd_governor = 479113b131bSEric Saxe CPUPM_GOV_TRANS_IDLE; 4800e751525SEric Saxe dom->cpd_ti = 0; 4810e751525SEric Saxe DTRACE_PROBE1( 4820e751525SEric Saxe cpupm__ti__governed, 4830e751525SEric Saxe cpupm_domain_t *, dom); 4840e751525SEric Saxe } 4850e751525SEric Saxe } else { 4860e751525SEric Saxe /* 4870e751525SEric Saxe * We correctly predicted the last 4880e751525SEric Saxe * lowering. 4890e751525SEric Saxe */ 4900e751525SEric Saxe dom->cpd_ti = 0; 4910e751525SEric Saxe } 4920e751525SEric Saxe } 493113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) { 4940e751525SEric Saxe /* 4950e751525SEric Saxe * Raise requests are governed due to 4960e751525SEric Saxe * transient work. 
4970e751525SEric Saxe */ 4980e751525SEric Saxe DTRACE_PROBE1(cpupm__raise__governed, 4990e751525SEric Saxe cpupm_domain_t *, dom); 5000e751525SEric Saxe 5010e751525SEric Saxe return; 5020e751525SEric Saxe } 5030e751525SEric Saxe /* 5040e751525SEric Saxe * Prepare to transition to the higher power state 5050e751525SEric Saxe */ 5060e751525SEric Saxe new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; 5070e751525SEric Saxe 5080e751525SEric Saxe } else if (dom->cpd_state == 5090e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) { 5100e751525SEric Saxe 5110e751525SEric Saxe /* 5120e751525SEric Saxe * Utilization is non-zero, and we're already running 5130e751525SEric Saxe * in the higher power state. Take this opportunity to 5140e751525SEric Saxe * perform some book keeping if the last lowering 5150e751525SEric Saxe * request was governed. 5160e751525SEric Saxe */ 517113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) { 518113b131bSEric Saxe 5190e751525SEric Saxe if ((now - last) >= cpupm_ti_predict_interval) { 5200e751525SEric Saxe /* 5210e751525SEric Saxe * The domain is transient idle 5220e751525SEric Saxe * governed, and we mispredicted 5230e751525SEric Saxe * governing the last lowering request. 5240e751525SEric Saxe */ 5250e751525SEric Saxe if (++dom->cpd_ti >= 5260e751525SEric Saxe cpupm_mispredict_gov_thresh) { 5270e751525SEric Saxe /* 5280e751525SEric Saxe * There's enough non-transient 5290e751525SEric Saxe * idle periods to justify 5300e751525SEric Saxe * removing the governor. 
5310e751525SEric Saxe */ 532113b131bSEric Saxe dom->cpd_governor = 533113b131bSEric Saxe CPUPM_GOV_DISENGAGED; 5340e751525SEric Saxe dom->cpd_ti = 0; 5350e751525SEric Saxe DTRACE_PROBE1( 5360e751525SEric Saxe cpupm__ti__ungoverned, 5370e751525SEric Saxe cpupm_domain_t *, dom); 5380e751525SEric Saxe } 5390e751525SEric Saxe } else { 5400e751525SEric Saxe /* 5410e751525SEric Saxe * Correctly predicted governing the 5420e751525SEric Saxe * last lowering request. 5430e751525SEric Saxe */ 5440e751525SEric Saxe dom->cpd_ti = 0; 5450e751525SEric Saxe } 5460e751525SEric Saxe } 5470e751525SEric Saxe } 5480e751525SEric Saxe break; 5490e751525SEric Saxe 5500e751525SEric Saxe case CPUPM_DOM_IDLE_FROM_BUSY: 5510e751525SEric Saxe last = dom->cpd_last_raise; 5520e751525SEric Saxe dom->cpd_last_lower = now; 5530e751525SEric Saxe 5540e751525SEric Saxe DTRACE_PROBE3(cpupm__lower__req, 5550e751525SEric Saxe cpupm_domain_t *, dom, 5560e751525SEric Saxe hrtime_t, last, 5570e751525SEric Saxe hrtime_t, now); 5580e751525SEric Saxe 5590e751525SEric Saxe if (dom->cpd_state == 5600e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) { 5610e751525SEric Saxe 5620e751525SEric Saxe /* 5630e751525SEric Saxe * The domain is idle, and is running in the highest 5640e751525SEric Saxe * performance state. Before we consider lowering power, 5650e751525SEric Saxe * perform some book keeping for the transient work 5660e751525SEric Saxe * governor. 5670e751525SEric Saxe */ 568113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) { 5690e751525SEric Saxe if ((now - last) < cpupm_tw_predict_interval) { 5700e751525SEric Saxe /* 5710e751525SEric Saxe * We're lowering the domain power and 5720e751525SEric Saxe * we *just* raised it. Consider the 5730e751525SEric Saxe * last raise mispredicted due to 5740e751525SEric Saxe * transient work. 
5750e751525SEric Saxe */ 5760e751525SEric Saxe if (++dom->cpd_tw >= 5770e751525SEric Saxe cpupm_mispredict_thresh) { 5780e751525SEric Saxe /* 579113b131bSEric Saxe * There's enough transient work 5800e751525SEric Saxe * transitions to justify 581113b131bSEric Saxe * governing future raise 5820e751525SEric Saxe * requests. 5830e751525SEric Saxe */ 584113b131bSEric Saxe dom->cpd_governor = 585113b131bSEric Saxe CPUPM_GOV_TRANS_WORK; 5860e751525SEric Saxe dom->cpd_tw = 0; 5870e751525SEric Saxe DTRACE_PROBE1( 5880e751525SEric Saxe cpupm__tw__governed, 5890e751525SEric Saxe cpupm_domain_t *, dom); 5900e751525SEric Saxe } 5910e751525SEric Saxe } else { 5920e751525SEric Saxe /* 5930e751525SEric Saxe * We correctly predicted during the 5940e751525SEric Saxe * last raise. 5950e751525SEric Saxe */ 5960e751525SEric Saxe dom->cpd_tw = 0; 5970e751525SEric Saxe } 5980e751525SEric Saxe } 599113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) { 6000e751525SEric Saxe /* 6010e751525SEric Saxe * Lowering requests are governed due to 6020e751525SEric Saxe * transient idleness. 6030e751525SEric Saxe */ 6040e751525SEric Saxe DTRACE_PROBE1(cpupm__lowering__governed, 6050e751525SEric Saxe cpupm_domain_t *, dom); 6060e751525SEric Saxe 6070e751525SEric Saxe return; 6080e751525SEric Saxe } 6090e751525SEric Saxe 6100e751525SEric Saxe /* 6110e751525SEric Saxe * Prepare to transition to a lower power state. 6120e751525SEric Saxe */ 6130e751525SEric Saxe new_state = 6140e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER]; 6150e751525SEric Saxe 6160e751525SEric Saxe } else if (dom->cpd_state == 6170e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) { 6180e751525SEric Saxe 6190e751525SEric Saxe /* 6200e751525SEric Saxe * The domain is idle, and we're already running in 6210e751525SEric Saxe * the lower power state. Take this opportunity to 6220e751525SEric Saxe * perform some book keeping if the last raising 6230e751525SEric Saxe * request was governed. 
6240e751525SEric Saxe */ 625113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) { 6260e751525SEric Saxe if ((now - last) >= cpupm_tw_predict_interval) { 6270e751525SEric Saxe /* 6280e751525SEric Saxe * The domain is transient work 6290e751525SEric Saxe * governed, and we mispredicted 6300e751525SEric Saxe * governing the last raising request. 6310e751525SEric Saxe */ 6320e751525SEric Saxe if (++dom->cpd_tw >= 6330e751525SEric Saxe cpupm_mispredict_gov_thresh) { 6340e751525SEric Saxe /* 6350e751525SEric Saxe * There's enough non-transient 6360e751525SEric Saxe * work to justify removing 6370e751525SEric Saxe * the governor. 6380e751525SEric Saxe */ 639113b131bSEric Saxe dom->cpd_governor = 640113b131bSEric Saxe CPUPM_GOV_DISENGAGED; 6410e751525SEric Saxe dom->cpd_tw = 0; 6420e751525SEric Saxe DTRACE_PROBE1( 6430e751525SEric Saxe cpupm__tw__ungoverned, 6440e751525SEric Saxe cpupm_domain_t *, dom); 6450e751525SEric Saxe } 6460e751525SEric Saxe } else { 6470e751525SEric Saxe /* 6480e751525SEric Saxe * We correctly predicted governing 6490e751525SEric Saxe * the last raise. 
6500e751525SEric Saxe */ 6510e751525SEric Saxe dom->cpd_tw = 0; 6520e751525SEric Saxe } 6530e751525SEric Saxe } 6540e751525SEric Saxe } 6550e751525SEric Saxe break; 6560e751525SEric Saxe } 6570e751525SEric Saxe /* 6580e751525SEric Saxe * Change the power state 6590e751525SEric Saxe * Not much currently done if this doesn't succeed 6600e751525SEric Saxe */ 6610e751525SEric Saxe if (new_state) 6620e751525SEric Saxe (void) cpupm_change_state(cp, dom, new_state); 6630e751525SEric Saxe } 6640e751525SEric Saxe 6650e751525SEric Saxe 6660e751525SEric Saxe /* 6670e751525SEric Saxe * Interface called by platforms to dynamically change the 6680e751525SEric Saxe * MAX performance cpupm state 6690e751525SEric Saxe */ 6700e751525SEric Saxe void 6710e751525SEric Saxe cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level) 6720e751525SEric Saxe { 6730e751525SEric Saxe cpupm_domain_t *dom; 6740e751525SEric Saxe id_t did; 6750e751525SEric Saxe cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE; 6760e751525SEric Saxe boolean_t change_state = B_FALSE; 6770e751525SEric Saxe cpupm_state_t *new_state = NULL; 6780e751525SEric Saxe 6790e751525SEric Saxe did = cpupm_domain_id(cp, type); 680a3114836SGerry Liu if (MUTEX_HELD(&cpu_lock)) { 681a3114836SGerry Liu dom = cpupm_domain_find(did, type); 682a3114836SGerry Liu } else { 6830e751525SEric Saxe mutex_enter(&cpu_lock); 6840e751525SEric Saxe dom = cpupm_domain_find(did, type); 6850e751525SEric Saxe mutex_exit(&cpu_lock); 686a3114836SGerry Liu } 6870e751525SEric Saxe 6880e751525SEric Saxe /* 6890e751525SEric Saxe * Can use a lock to avoid changing the power state of the cpu when 6900e751525SEric Saxe * CPUPM_STATE_MAX_PERF is getting changed. 6910e751525SEric Saxe * Since the occurance of events to change MAX_PERF is not frequent, 6920e751525SEric Saxe * it may not be a good idea to overburden with locks. 
In the worst 6930e751525SEric Saxe * case, for one cycle the power may not get changed to the required 6940e751525SEric Saxe * level 6950e751525SEric Saxe */ 6960e751525SEric Saxe if (dom != NULL) { 6970e751525SEric Saxe if (dom->cpd_state == 6980e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) { 6990e751525SEric Saxe change_state = B_TRUE; 7000e751525SEric Saxe } 7010e751525SEric Saxe 7020e751525SEric Saxe /* 7030e751525SEric Saxe * If an out of range level is passed, use the lowest supported 7040e751525SEric Saxe * speed. 7050e751525SEric Saxe */ 7060e751525SEric Saxe if (max_perf_level >= dom->cpd_nstates && 7070e751525SEric Saxe dom->cpd_nstates > 1) { 7080e751525SEric Saxe max_perf_level = dom->cpd_nstates - 1; 7090e751525SEric Saxe } 7100e751525SEric Saxe 7110e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF] = 7120e751525SEric Saxe &dom->cpd_states[max_perf_level]; 7130e751525SEric Saxe 7140e751525SEric Saxe /* 7150e751525SEric Saxe * If the current state is MAX_PERF, change the current state 7160e751525SEric Saxe * to the new MAX_PERF 7170e751525SEric Saxe */ 7180e751525SEric Saxe if (change_state) { 7190e751525SEric Saxe new_state = 7200e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; 7210e751525SEric Saxe if (new_state) { 7220e751525SEric Saxe (void) cpupm_change_state(cp, dom, new_state); 7230e751525SEric Saxe } 7240e751525SEric Saxe } 7250e751525SEric Saxe } 7260e751525SEric Saxe } 7270e751525SEric Saxe 7280e751525SEric Saxe /* 729113b131bSEric Saxe * Initialize the parameters for the transience governor state machine 7300e751525SEric Saxe */ 731113b131bSEric Saxe static void 7320e751525SEric Saxe cpupm_governor_initialize(void) 7330e751525SEric Saxe { 7340e751525SEric Saxe /* 735113b131bSEric Saxe * The default prediction intervals are specified in nanoseconds. 
736113b131bSEric Saxe * Convert these to the equivalent in unscaled hrtime, which is the 737113b131bSEric Saxe * format of the timestamps passed to cpupm_utilization_event() 7380e751525SEric Saxe */ 739113b131bSEric Saxe cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval); 740113b131bSEric Saxe cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval); 7410e751525SEric Saxe } 7420e751525SEric Saxe 7430e751525SEric Saxe /* 7440e751525SEric Saxe * Initiate a state change in all CPUPM domain instances of the specified type 7450e751525SEric Saxe */ 7460e751525SEric Saxe static void 7470e751525SEric Saxe cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state) 7480e751525SEric Saxe { 7490e751525SEric Saxe cpu_t *cp; 7500e751525SEric Saxe pg_cmt_t *pwr_pg; 7510e751525SEric Saxe cpupm_domain_t *dom; 7520e751525SEric Saxe group_t *hwset; 7530e751525SEric Saxe group_iter_t giter; 7540e751525SEric Saxe pg_cpu_itr_t cpu_iter; 7550e751525SEric Saxe pghw_type_t hw; 7560e751525SEric Saxe 7570e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 7580e751525SEric Saxe 7590e751525SEric Saxe switch (type) { 7600e751525SEric Saxe case CPUPM_DTYPE_ACTIVE: 7610e751525SEric Saxe hw = PGHW_POW_ACTIVE; 7620e751525SEric Saxe break; 7630e751525SEric Saxe default: 7640e751525SEric Saxe /* 7650e751525SEric Saxe * Power domain types other than "active" unsupported. 
7660e751525SEric Saxe */ 7670e751525SEric Saxe ASSERT(type == CPUPM_DTYPE_ACTIVE); 7680e751525SEric Saxe return; 7690e751525SEric Saxe } 7700e751525SEric Saxe 7710e751525SEric Saxe if ((hwset = pghw_set_lookup(hw)) == NULL) 7720e751525SEric Saxe return; 7730e751525SEric Saxe 7740e751525SEric Saxe /* 7750e751525SEric Saxe * Iterate over the power domains 7760e751525SEric Saxe */ 7770e751525SEric Saxe group_iter_init(&giter); 7780e751525SEric Saxe while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) { 7790e751525SEric Saxe 7800e751525SEric Saxe dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle; 7810e751525SEric Saxe 7820e751525SEric Saxe /* 7830e751525SEric Saxe * Iterate over the CPUs in each domain 7840e751525SEric Saxe */ 7850e751525SEric Saxe PG_CPU_ITR_INIT(pwr_pg, cpu_iter); 7860e751525SEric Saxe while ((cp = pg_cpu_next(&cpu_iter)) != NULL) { 7870e751525SEric Saxe (void) cpupm_change_state(cp, dom, 7880e751525SEric Saxe dom->cpd_named_states[state]); 7890e751525SEric Saxe } 7900e751525SEric Saxe } 7910e751525SEric Saxe } 792