/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * CPU Device driver. The driver is not DDI-compliant.
 *
 * The driver supports the following features:
 *    - Power management.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>
#include <sys/epm.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/cpudrv_mach.h>
#include <sys/msacct.h>

/*
 * CPU power management
 *
 * The supported power-saving model is to slow down the CPU (on SPARC by
 * dividing the CPU clock and on x86 by dropping down a P-state).
 * Periodically we determine how much time the CPU spent running the idle
 * thread and user-mode threads during the last quantum. If the idle thread
 * was running below its low water mark for the current speed for a number
 * of consecutive sampling periods, or the number of threads running in
 * user mode is above the high water mark, we arrange to go to a higher
 * speed. If the idle thread was running above its high water mark without
 * dropping below the mark for a number of consecutive sampling periods,
 * and the number of threads running in user mode is below the low water
 * mark, we arrange to go to the next lower speed. While going down, we
 * step through all the speeds. While going up, we go directly to the
 * maximum speed to minimize impact on the user, but the driver has
 * provisions to go to intermediate speeds.
 *
 * The driver has no knowledge of a particular implementation of this
 * scheme and will work with all CPUs supporting this model. On SPARC, the
 * driver determines the supported speeds by looking at the 'clock-divisors'
 * property created by OBP. On x86, the driver retrieves the supported
 * speeds from ACPI.
 */
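/*
 * The decision rule described above can be summarized with the following
 * illustrative sketch. It is compiled out and is not part of the driver:
 * the guard macro, the example_spd type, and example_next_speed() are all
 * hypothetical, and the consecutive-sample counters (idle_bhwm_cnt and
 * idle_blwm_cnt) that cpudrv_monitor() also consults are omitted for
 * brevity.
 */
#ifdef CPUDRV_EXAMPLE_SKETCHES	/* hypothetical guard; never defined */
typedef struct example_spd {
    struct example_spd *up_spd;    /* points at the top speed */
    struct example_spd *down_spd;  /* next slower speed, or NULL */
    uint_t idle_hwm;    /* idle above this: candidate to slow down */
    uint_t idle_lwm;    /* idle below this: candidate to speed up */
    uint_t user_hwm;    /* user threads above this: speed up */
    uint_t user_lwm;    /* user threads at/below this: may slow down */
} example_spd_t;

/*
 * The top speed sets user_hwm = UINT_MAX and idle_lwm = 0 so that it never
 * tries to go up; the slowest speed sets idle_hwm = UINT_MAX so that it
 * never tries to go down (see cpudrv_init() below).
 */
static example_spd_t *
example_next_speed(example_spd_t *cur, uint_t idle_cnt, uint_t user_cnt)
{
    if (user_cnt > cur->user_hwm || idle_cnt < cur->idle_lwm)
        return (cur->up_spd);      /* jump straight to the top speed */
    if (user_cnt <= cur->user_lwm && idle_cnt >= cur->idle_hwm)
        return (cur->down_spd);    /* step down one speed */
    return (cur);                  /* stay at the current speed */
}
#endif	/* CPUDRV_EXAMPLE_SKETCHES */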
/*
 * Configuration function prototypes and data structures
 */
static int cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int cpudrv_power(dev_info_t *dip, int comp, int level);

struct dev_ops cpudrv_ops = {
    DEVO_REV,			/* rev */
    0,				/* refcnt */
    nodev,			/* getinfo */
    nulldev,			/* identify */
    nulldev,			/* probe */
    cpudrv_attach,		/* attach */
    cpudrv_detach,		/* detach */
    nodev,			/* reset */
    (struct cb_ops *)NULL,	/* cb_ops */
    (struct bus_ops *)NULL,	/* bus_ops */
    cpudrv_power,		/* power */
    ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
    &mod_driverops,		/* modops */
    "CPU Driver",		/* linkinfo */
    &cpudrv_ops,		/* dev_ops */
};

static struct modlinkage modlinkage = {
    MODREV_1,			/* rev */
    &modldrv,			/* linkage */
    NULL
};

/*
 * Function prototypes
 */
static int cpudrv_init(cpudrv_devstate_t *cpudsp);
static void cpudrv_free(cpudrv_devstate_t *cpudsp);
static int cpudrv_comp_create(cpudrv_devstate_t *cpudsp);
static void cpudrv_monitor_disp(void *arg);
static void cpudrv_monitor(void *arg);

/*
 * Driver global variables
 */
uint_t cpudrv_debug = 0;
void *cpudrv_state;
static uint_t cpudrv_idle_hwm = CPUDRV_IDLE_HWM;
static uint_t cpudrv_idle_lwm = CPUDRV_IDLE_LWM;
static uint_t cpudrv_idle_buf_zone = CPUDRV_IDLE_BUF_ZONE;
static uint_t cpudrv_idle_bhwm_cnt_max = CPUDRV_IDLE_BHWM_CNT_MAX;
static uint_t cpudrv_idle_blwm_cnt_max = CPUDRV_IDLE_BLWM_CNT_MAX;
static uint_t cpudrv_user_hwm = CPUDRV_USER_HWM;

boolean_t cpudrv_enabled = B_TRUE;

/*
 * cpudrv_direct_pm allows user applications to directly control the
 * power state transitions (direct pm) without following the normal
 * direct pm protocol. This is needed because the normal protocol
 * requires that a device only be lowered when it is idle, and only be
 * brought up when it requests to do so by calling pm_raise_power().
 * Ignoring this protocol is harmless for the CPU (other than speed).
 * Moreover, it might be the case that the CPU is never idle, or wants
 * to be at a higher speed because of the additional CPU cycles required
 * to run the user application.
 *
 * The driver will still report idle/busy status to the framework. Although
 * the framework will ignore this information for direct pm devices and not
 * try to bring them down when idle, user applications can still use this
 * information if they want.
 *
 * In the future, provide an ioctl to control setting of this mode. In
 * that case, this variable should move to the state structure and
 * be protected by the lock in the state structure.
 */
int cpudrv_direct_pm = 0;

/*
 * Arranges for the handler function to be called at the interval suitable
 * for the current speed.
 */
#define	CPUDRV_MONITOR_INIT(cpudsp) { \
    if (cpudrv_is_enabled(cpudsp)) { \
        ASSERT(mutex_owned(&(cpudsp)->lock)); \
        (cpudsp)->cpudrv_pm.timeout_id = \
            timeout(cpudrv_monitor_disp, \
            (cpudsp), (((cpudsp)->cpudrv_pm.cur_spd == NULL) ? \
            CPUDRV_QUANT_CNT_OTHR : \
            (cpudsp)->cpudrv_pm.cur_spd->quant_cnt)); \
    } \
}

/*
 * Arranges for the handler function not to be called back.
 */
#define	CPUDRV_MONITOR_FINI(cpudsp) { \
    timeout_id_t tmp_tid; \
    ASSERT(mutex_owned(&(cpudsp)->lock)); \
    tmp_tid = (cpudsp)->cpudrv_pm.timeout_id; \
    (cpudsp)->cpudrv_pm.timeout_id = 0; \
    mutex_exit(&(cpudsp)->lock); \
    if (tmp_tid != 0) { \
        (void) untimeout(tmp_tid); \
        mutex_enter(&(cpudsp)->cpudrv_pm.timeout_lock); \
        while ((cpudsp)->cpudrv_pm.timeout_count != 0) \
            cv_wait(&(cpudsp)->cpudrv_pm.timeout_cv, \
                &(cpudsp)->cpudrv_pm.timeout_lock); \
        mutex_exit(&(cpudsp)->cpudrv_pm.timeout_lock); \
    } \
    mutex_enter(&(cpudsp)->lock); \
}
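/*
 * A note on the teardown protocol above: untimeout(9F) only prevents a
 * pending callout from firing; a cpudrv_monitor() invocation that has
 * already been dispatched to the taskq can still be running. That is why
 * cpudrv_monitor_disp() bumps timeout_count under timeout_lock when it
 * successfully dispatches, cpudrv_monitor() decrements the count and
 * signals timeout_cv when it finishes, and CPUDRV_MONITOR_FINI waits for
 * the count to drain to zero. The state lock is dropped around the
 * untimeout() and the wait because the callout path and the monitor both
 * take it, and holding it here could deadlock.
 */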
int
_init(void)
{
    int error;

    DPRINTF(D_INIT, (" _init: function called\n"));
    if ((error = ddi_soft_state_init(&cpudrv_state,
        sizeof (cpudrv_devstate_t), 0)) != 0) {
        return (error);
    }

    if ((error = mod_install(&modlinkage)) != 0) {
        ddi_soft_state_fini(&cpudrv_state);
    }

    /*
     * Callbacks used by the PPM driver.
     */
    CPUDRV_SET_PPM_CALLBACKS();
    return (error);
}

int
_fini(void)
{
    int error;

    DPRINTF(D_FINI, (" _fini: function called\n"));
    if ((error = mod_remove(&modlinkage)) == 0) {
        ddi_soft_state_fini(&cpudrv_state);
    }

    return (error);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&modlinkage, modinfop));
}

/*
 * Driver attach(9e) entry point.
 */
static int
cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    int instance;
    cpudrv_devstate_t *cpudsp;
    extern pri_t maxclsyspri;

    instance = ddi_get_instance(dip);

    switch (cmd) {
    case DDI_ATTACH:
        DPRINTF(D_ATTACH, ("cpudrv_attach: instance %d: "
            "DDI_ATTACH called\n", instance));
        if (!cpudrv_is_enabled(NULL))
            return (DDI_FAILURE);
        if (ddi_soft_state_zalloc(cpudrv_state, instance) !=
            DDI_SUCCESS) {
            cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
                "can't allocate state", instance);
            cpudrv_enabled = B_FALSE;
            return (DDI_FAILURE);
        }
        if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) ==
            NULL) {
            cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
                "can't get state", instance);
            ddi_soft_state_free(cpudrv_state, instance);
            cpudrv_enabled = B_FALSE;
            return (DDI_FAILURE);
        }
        cpudsp->dip = dip;

        /*
         * Find CPU number for this dev_info node.
         */
        if (!cpudrv_get_cpu_id(dip, &(cpudsp->cpu_id))) {
            cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
                "can't convert dip to cpu_id", instance);
            ddi_soft_state_free(cpudrv_state, instance);
            cpudrv_enabled = B_FALSE;
            return (DDI_FAILURE);
        }
        if (!cpudrv_mach_init(cpudsp)) {
            cpudrv_enabled = B_FALSE;
            return (DDI_FAILURE);
        }

        mutex_init(&cpudsp->lock, NULL, MUTEX_DRIVER, NULL);
        if (cpudrv_is_enabled(cpudsp)) {
            if (cpudrv_init(cpudsp) != DDI_SUCCESS) {
                cpudrv_enabled = B_FALSE;
                cpudrv_free(cpudsp);
                ddi_soft_state_free(cpudrv_state, instance);
                return (DDI_FAILURE);
            }
            if (cpudrv_comp_create(cpudsp) != DDI_SUCCESS) {
                cpudrv_enabled = B_FALSE;
                cpudrv_free(cpudsp);
                ddi_soft_state_free(cpudrv_state, instance);
                return (DDI_FAILURE);
            }
            if (ddi_prop_update_string(DDI_DEV_T_NONE,
                dip, "pm-class", "CPU") != DDI_PROP_SUCCESS) {
                cpudrv_enabled = B_FALSE;
                cpudrv_free(cpudsp);
                ddi_soft_state_free(cpudrv_state, instance);
                return (DDI_FAILURE);
            }

            /*
             * A taskq is used to dispatch the routine that
             * monitors CPU activities.
             */
            cpudsp->cpudrv_pm.tq = taskq_create_instance(
                "cpudrv_monitor",
                ddi_get_instance(dip), CPUDRV_TASKQ_THREADS,
                (maxclsyspri - 1), CPUDRV_TASKQ_MIN,
                CPUDRV_TASKQ_MAX,
                TASKQ_PREPOPULATE|TASKQ_CPR_SAFE);

            mutex_init(&cpudsp->cpudrv_pm.timeout_lock, NULL,
                MUTEX_DRIVER, NULL);
            cv_init(&cpudsp->cpudrv_pm.timeout_cv, NULL,
                CV_DEFAULT, NULL);

            /*
             * The driver needs to assume that the CPU is running
             * at an unknown speed at DDI_ATTACH and switch it to
             * the needed speed. We assume that the initial needed
             * speed is full speed.
             */
            /*
             * We need to take the lock because cpudrv_monitor()
             * will start running in parallel with attach().
             */
            mutex_enter(&cpudsp->lock);
            cpudsp->cpudrv_pm.cur_spd = NULL;
            cpudsp->cpudrv_pm.pm_started = B_FALSE;
            /*
             * We don't call pm_raise_power() directly from attach
             * because driver attach for a slave CPU node can
             * happen before the CPU is even initialized. We just
             * start the monitoring system, which understands
             * unknown speed and moves the CPU to top speed when it
             * has been initialized.
             */
            CPUDRV_MONITOR_INIT(cpudsp);
            mutex_exit(&cpudsp->lock);

        }

        CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp);

        ddi_report_dev(dip);
        return (DDI_SUCCESS);

    case DDI_RESUME:
        DPRINTF(D_ATTACH, ("cpudrv_attach: instance %d: "
            "DDI_RESUME called\n", instance));

        cpudsp = ddi_get_soft_state(cpudrv_state, instance);
        ASSERT(cpudsp != NULL);

        /*
         * Nothing to do for resume, if not doing active PM.
         */
        if (!cpudrv_is_enabled(cpudsp))
            return (DDI_SUCCESS);

        mutex_enter(&cpudsp->lock);
        /*
         * The driver needs to assume that the CPU is running at an
         * unknown speed at DDI_RESUME and switch it to the needed
         * speed. We assume that the needed speed is full speed.
         */
        cpudsp->cpudrv_pm.cur_spd = NULL;
        CPUDRV_MONITOR_INIT(cpudsp);
        mutex_exit(&cpudsp->lock);
        CPUDRV_REDEFINE_TOPSPEED(dip);
        return (DDI_SUCCESS);

    default:
        return (DDI_FAILURE);
    }
}

/*
 * Driver detach(9e) entry point.
 */
static int
cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    int instance;
    cpudrv_devstate_t *cpudsp;
    cpudrv_pm_t *cpupm;

    instance = ddi_get_instance(dip);

    switch (cmd) {
    case DDI_DETACH:
        DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: "
            "DDI_DETACH called\n", instance));
        /*
         * If the only thing supported by the driver is power
         * management, we can, in the future, enhance the driver and
         * the framework that loads it to unload the driver when the
         * user has disabled CPU power management.
         */
        return (DDI_FAILURE);

    case DDI_SUSPEND:
        DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: "
            "DDI_SUSPEND called\n", instance));

        cpudsp = ddi_get_soft_state(cpudrv_state, instance);
        ASSERT(cpudsp != NULL);

        /*
         * Nothing to do for suspend, if not doing active PM.
         */
        if (!cpudrv_is_enabled(cpudsp))
            return (DDI_SUCCESS);

        /*
         * During a checkpoint-resume sequence, the framework will
         * stop interrupts to quiesce kernel activity. This will
         * leave our monitoring system ineffective. Handle this
         * by stopping our monitoring system and bringing the CPU
         * to full speed. In case we are in special direct pm
         * mode, we leave the CPU at whatever speed it is at. This
         * is harmless other than speed.
         */
        mutex_enter(&cpudsp->lock);
        cpupm = &(cpudsp->cpudrv_pm);

        DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: DDI_SUSPEND - "
            "cur_spd %d, topspeed %d\n", instance,
            cpupm->cur_spd->pm_level,
            CPUDRV_TOPSPEED(cpupm)->pm_level));

        CPUDRV_MONITOR_FINI(cpudsp);

        if (!cpudrv_direct_pm && (cpupm->cur_spd !=
            CPUDRV_TOPSPEED(cpupm))) {
            if (cpupm->pm_busycnt < 1) {
                if ((pm_busy_component(dip, CPUDRV_COMP_NUM)
                    == DDI_SUCCESS)) {
                    cpupm->pm_busycnt++;
                } else {
                    CPUDRV_MONITOR_INIT(cpudsp);
                    mutex_exit(&cpudsp->lock);
                    cmn_err(CE_WARN, "cpudrv_detach: "
                        "instance %d: can't busy CPU "
                        "component", instance);
                    return (DDI_FAILURE);
                }
            }
            mutex_exit(&cpudsp->lock);
            if (pm_raise_power(dip, CPUDRV_COMP_NUM,
                CPUDRV_TOPSPEED(cpupm)->pm_level) !=
                DDI_SUCCESS) {
                mutex_enter(&cpudsp->lock);
                CPUDRV_MONITOR_INIT(cpudsp);
                mutex_exit(&cpudsp->lock);
                cmn_err(CE_WARN, "cpudrv_detach: instance %d: "
                    "can't raise CPU power level to %d",
                    instance,
                    CPUDRV_TOPSPEED(cpupm)->pm_level);
                return (DDI_FAILURE);
            } else {
                return (DDI_SUCCESS);
            }
        } else {
            mutex_exit(&cpudsp->lock);
            return (DDI_SUCCESS);
        }

    default:
        return (DDI_FAILURE);
    }
}

/*
 * Driver power(9e) entry point.
 *
 * The driver's notion of the current power level is set *only* in the
 * power(9e) entry point, after the actual power-change operation has
 * completed successfully.
 */
/* ARGSUSED */
static int
cpudrv_power(dev_info_t *dip, int comp, int level)
{
    int instance;
    cpudrv_devstate_t *cpudsp;
    cpudrv_pm_t *cpudrvpm;
    cpudrv_pm_spd_t *new_spd;
    boolean_t is_ready;
    int ret;

    instance = ddi_get_instance(dip);

    DPRINTF(D_POWER, ("cpudrv_power: instance %d: level %d\n",
        instance, level));

    if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) == NULL) {
        cmn_err(CE_WARN, "cpudrv_power: instance %d: can't "
            "get state", instance);
        return (DDI_FAILURE);
    }

    mutex_enter(&cpudsp->lock);
    cpudrvpm = &(cpudsp->cpudrv_pm);

    /*
     * In normal operation, we fail if we are busy and the request is
     * to lower the power level. We let this go through if the driver
     * is in special direct pm mode. On x86, we also let this through
     * if the change is due to a request to govern the max speed.
     */
    if (!cpudrv_direct_pm && (cpudrvpm->pm_busycnt >= 1) &&
        !cpudrv_is_governor_thread(cpudrvpm)) {
        if ((cpudrvpm->cur_spd != NULL) &&
            (level < cpudrvpm->cur_spd->pm_level)) {
            mutex_exit(&cpudsp->lock);
            return (DDI_FAILURE);
        }
    }

    for (new_spd = cpudrvpm->head_spd; new_spd; new_spd =
        new_spd->down_spd) {
        if (new_spd->pm_level == level)
            break;
    }
    if (!new_spd) {
        CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
        mutex_exit(&cpudsp->lock);
        cmn_err(CE_WARN, "cpudrv_power: instance %d: "
            "can't locate new CPU speed", instance);
        return (DDI_FAILURE);
    }

    /*
     * We currently refuse to power manage if the CPU is not ready to
     * take cross calls (cross calls fail silently if the CPU is not
     * ready for them).
     *
     * Additionally, for x86 platforms we cannot power manage any one
     * instance until all instances have been initialized. That's
     * because we don't know what the CPU domains look like until all
     * instances have been initialized.
     */
    is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
    if (!is_ready) {
        DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
            "CPU not ready for x-calls\n", instance));
    } else if (!(is_ready = cpudrv_power_ready())) {
        DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
            "waiting for all CPUs to be power manageable\n",
            instance));
    }
    if (!is_ready) {
        CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
        mutex_exit(&cpudsp->lock);
        return (DDI_FAILURE);
    }

    /*
     * Execute the CPU-specific routine on the requested CPU to
     * change its speed to normal-speed/divisor.
     */
    if ((ret = cpudrv_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) {
        cmn_err(CE_WARN, "cpudrv_power: "
            "cpudrv_change_speed() return = %d", ret);
        mutex_exit(&cpudsp->lock);
        return (DDI_FAILURE);
    }

    /*
     * Reset the idle threshold time for the new power level.
     */
    if ((cpudrvpm->cur_spd != NULL) && (level <
        cpudrvpm->cur_spd->pm_level)) {
        if (pm_idle_component(dip, CPUDRV_COMP_NUM) ==
            DDI_SUCCESS) {
            if (cpudrvpm->pm_busycnt >= 1)
                cpudrvpm->pm_busycnt--;
        } else {
            cmn_err(CE_WARN, "cpudrv_power: instance %d: "
                "can't idle CPU component",
                ddi_get_instance(dip));
        }
    }
    /*
     * Reset various parameters because we are now running at the new
     * speed.
     */
    cpudrvpm->lastquan_mstate[CMS_IDLE] = 0;
    cpudrvpm->lastquan_mstate[CMS_SYSTEM] = 0;
    cpudrvpm->lastquan_mstate[CMS_USER] = 0;
    cpudrvpm->lastquan_ticks = 0;
    cpudrvpm->cur_spd = new_spd;
    CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
    mutex_exit(&cpudsp->lock);

    return (DDI_SUCCESS);
}

/*
 * Initialize power management data.
 */
static int
cpudrv_init(cpudrv_devstate_t *cpudsp)
{
    cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
    cpudrv_pm_spd_t *cur_spd;
    cpudrv_pm_spd_t *prev_spd = NULL;
    int *speeds;
    uint_t nspeeds;
    int idle_cnt_percent;
    int user_cnt_percent;
    int i;

    CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
    if (nspeeds < 2) {
        /* Need at least two speeds to power manage */
        CPUDRV_FREE_SPEEDS(speeds, nspeeds);
        return (DDI_FAILURE);
    }
    cpupm->num_spd = nspeeds;

    /*
     * Calculate the watermarks and other parameters based on the
     * supplied speeds.
     *
     * One of the basic assumptions is that for X amount of CPU work,
     * if the CPU is slowed down by a factor of N, the time it takes to
     * do the same work will be N * X.
     *
     * The driver declares that a CPU is idle and ready to be slowed
     * down if the idle thread count is above the current speed's
     * idle_hwm, without dropping below idle_hwm, for a number of
     * consecutive sampling intervals, and the number of threads running
     * in user mode is below user_lwm. We want to set the current
     * user_lwm such that if we just switched to the next slower speed
     * with no change in the real workload, the number of user threads
     * at the slower speed would fall below that speed's user_hwm. If we
     * didn't do that, we would jump right back to the higher speed as
     * soon as we went down, even with no change in workload.
     * The user_hwm is a fixed percentage and not calculated dynamically.
     *
     * We bring the CPU up if the idle thread count at the current speed
     * is below the current speed's idle_lwm for a number of consecutive
     * sampling intervals, or the user thread count is above the
     * user_hwm for the current speed.
     */
    for (i = 0; i < nspeeds; i++) {
        cur_spd = kmem_zalloc(sizeof (cpudrv_pm_spd_t), KM_SLEEP);
        cur_spd->speed = speeds[i];
        if (i == 0) {	/* normal speed */
            cpupm->head_spd = cur_spd;
            CPUDRV_TOPSPEED(cpupm) = cur_spd;
            cur_spd->quant_cnt = CPUDRV_QUANT_CNT_NORMAL;
            cur_spd->idle_hwm =
                (cpudrv_idle_hwm * cur_spd->quant_cnt) / 100;
            /* can't speed up any further */
            cur_spd->idle_lwm = 0;
            cur_spd->user_hwm = UINT_MAX;
        } else {
            cur_spd->quant_cnt = CPUDRV_QUANT_CNT_OTHR;
            ASSERT(prev_spd != NULL);
            prev_spd->down_spd = cur_spd;
            cur_spd->up_spd = cpupm->head_spd;

            /*
             * Let's assume a CPU is considered idle at full speed
             * when it is spending I% of its time running the idle
             * thread. At full speed, the CPU will then be busy
             * (100 - I)% of the time. This busy percentage
             * increases by a factor of N as the CPU slows down.
             * A CPU that is idle I% of the time at full speed is
             * idle (100 - ((100 - I) * N))% of the time at 1/N
             * speed. The idle_lwm is a fixed percentage. A large
             * value of N may result in idle_hwm going below
             * idle_lwm. We need to make sure that there is at
             * least a buffer zone separation between the idle_lwm
             * and idle_hwm values.
             */
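            /*
             * Worked example with illustrative numbers (not the
             * driver's actual defaults): if cpudrv_idle_hwm were
             * I = 98 and the divisor N = 4, the equivalent idle
             * percentage at 1/4 speed is 100 - ((100 - 98) * 4)
             * = 92%. With N = 16 it would be 100 - (2 * 16) = 68%,
             * and a large enough N would drive it below idle_lwm,
             * which is why the result is clamped to at least
             * idle_lwm + buf_zone just below.
             */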
            idle_cnt_percent = CPUDRV_IDLE_CNT_PERCENT(
                cpudrv_idle_hwm, speeds, i);
            idle_cnt_percent = max(idle_cnt_percent,
                (cpudrv_idle_lwm + cpudrv_idle_buf_zone));
            cur_spd->idle_hwm =
                (idle_cnt_percent * cur_spd->quant_cnt) / 100;
            cur_spd->idle_lwm =
                (cpudrv_idle_lwm * cur_spd->quant_cnt) / 100;

            /*
             * The lwm for user threads is determined such that
             * if the CPU slows down, the workload at the
             * new speed would still keep the CPU at or below the
             * user_hwm of the new speed. This is to prevent
             * a quick jump back up to the higher speed.
             */
            cur_spd->user_hwm = (cpudrv_user_hwm *
                cur_spd->quant_cnt) / 100;
            user_cnt_percent = CPUDRV_USER_CNT_PERCENT(
                cpudrv_user_hwm, speeds, i);
            prev_spd->user_lwm =
                (user_cnt_percent * prev_spd->quant_cnt) / 100;
        }
        prev_spd = cur_spd;
    }
    /* Slowest speed. Can't slow down any further */
    cur_spd->idle_hwm = UINT_MAX;
    cur_spd->user_lwm = -1;
#ifdef DEBUG
    DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: head_spd spd %d, "
        "num_spd %d\n", ddi_get_instance(cpudsp->dip),
        cpupm->head_spd->speed, cpupm->num_spd));
    for (cur_spd = cpupm->head_spd; cur_spd; cur_spd = cur_spd->down_spd) {
        DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: speed %d, "
            "down_spd spd %d, idle_hwm %d, user_lwm %d, "
            "up_spd spd %d, idle_lwm %d, user_hwm %d, "
            "quant_cnt %d\n", ddi_get_instance(cpudsp->dip),
            cur_spd->speed,
            (cur_spd->down_spd ? cur_spd->down_spd->speed : 0),
            cur_spd->idle_hwm, cur_spd->user_lwm,
            (cur_spd->up_spd ? cur_spd->up_spd->speed : 0),
            cur_spd->idle_lwm, cur_spd->user_hwm,
            cur_spd->quant_cnt));
    }
#endif	/* DEBUG */
    CPUDRV_FREE_SPEEDS(speeds, nspeeds);
    return (DDI_SUCCESS);
}

/*
 * Free CPU power management data.
 */
static void
cpudrv_free(cpudrv_devstate_t *cpudsp)
{
    cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
    cpudrv_pm_spd_t *cur_spd, *next_spd;

    cur_spd = cpupm->head_spd;
    while (cur_spd) {
        next_spd = cur_spd->down_spd;
        kmem_free(cur_spd, sizeof (cpudrv_pm_spd_t));
        cur_spd = next_spd;
    }
    bzero(cpupm, sizeof (cpudrv_pm_t));
}

/*
 * Create pm-components property.
 */
static int
cpudrv_comp_create(cpudrv_devstate_t *cpudsp)
{
    cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
    cpudrv_pm_spd_t *cur_spd;
    char **pmc;
    int size;
    char name[] = "NAME=CPU Speed";
    int i, j;
    uint_t comp_spd;
    int result = DDI_FAILURE;

    pmc = kmem_zalloc((cpupm->num_spd + 1) * sizeof (char *), KM_SLEEP);
    size = CPUDRV_COMP_SIZE();
    if (cpupm->num_spd > CPUDRV_COMP_MAX_VAL) {
        cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
            "number of speeds exceeded limits",
            ddi_get_instance(cpudsp->dip));
        kmem_free(pmc, (cpupm->num_spd + 1) * sizeof (char *));
        return (result);
    }

    for (i = cpupm->num_spd, cur_spd = cpupm->head_spd; i > 0;
        i--, cur_spd = cur_spd->down_spd) {
        cur_spd->pm_level = i;
        pmc[i] = kmem_zalloc((size * sizeof (char)), KM_SLEEP);
        comp_spd = CPUDRV_COMP_SPEED(cpupm, cur_spd);
        if (comp_spd > CPUDRV_COMP_MAX_VAL) {
            cmn_err(CE_WARN, "cpudrv_comp_create: "
                "instance %d: speed exceeded limits",
                ddi_get_instance(cpudsp->dip));
            for (j = cpupm->num_spd; j >= i; j--) {
                kmem_free(pmc[j], size * sizeof (char));
            }
            kmem_free(pmc, (cpupm->num_spd + 1) *
                sizeof (char *));
            return (result);
        }
        CPUDRV_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd)
        DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: "
            "instance %d: pm-components power level %d string '%s'\n",
            ddi_get_instance(cpudsp->dip), i, pmc[i]));
    }
    pmc[0] = kmem_zalloc(sizeof (name), KM_SLEEP);
    (void) strcat(pmc[0], name);
    DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: instance %d: "
        "pm-components component name '%s'\n",
        ddi_get_instance(cpudsp->dip), pmc[0]));

    if (ddi_prop_update_string_array(DDI_DEV_T_NONE, cpudsp->dip,
        "pm-components", pmc, cpupm->num_spd + 1) == DDI_PROP_SUCCESS) {
        result = DDI_SUCCESS;
    } else {
        cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
            "can't create pm-components property",
            ddi_get_instance(cpudsp->dip));
    }

    for (i = cpupm->num_spd; i > 0; i--) {
        kmem_free(pmc[i], size * sizeof (char));
    }
    kmem_free(pmc[0], sizeof (name));
    kmem_free(pmc, (cpupm->num_spd + 1) * sizeof (char *));
    return (result);
}
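/*
 * For illustration (hypothetical values, actual strings are produced by
 * CPUDRV_COMP_SPRINT()): on a CPU with three speeds, the resulting
 * property would look something like
 *
 *	pm-components="NAME=CPU Speed", "1=<slowest>", "2=<middle>",
 *	    "3=<full>"
 *
 * where each entry after the name maps a power level to a speed value,
 * with a higher level meaning a faster speed (the head of the speed list
 * gets the highest level).
 */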
/*
 * Mark a component idle.
 */
#define	CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm) { \
    if ((cpupm)->pm_busycnt >= 1) { \
        if (pm_idle_component((dip), CPUDRV_COMP_NUM) == \
            DDI_SUCCESS) { \
            DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
                "instance %d: pm_idle_component called\n", \
                ddi_get_instance((dip)))); \
            (cpupm)->pm_busycnt--; \
        } else { \
            cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
                "can't idle CPU component", \
                ddi_get_instance((dip))); \
        } \
    } \
}

/*
 * Marks a component busy in both the PM framework and the driver state
 * structure.
 */
#define	CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm) { \
    if ((cpupm)->pm_busycnt < 1) { \
        if (pm_busy_component((dip), CPUDRV_COMP_NUM) == \
            DDI_SUCCESS) { \
            DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
                "instance %d: pm_busy_component called\n", \
                ddi_get_instance((dip)))); \
            (cpupm)->pm_busycnt++; \
        } else { \
            cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
                "can't busy CPU component", \
                ddi_get_instance((dip))); \
        } \
    } \
}

/*
 * Marks a component busy and calls pm_raise_power().
 */
#define	CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_level) { \
    /* \
     * Mark the driver and PM framework busy first so the framework \
     * doesn't try to bring the CPU to a lower speed when we need to \
     * be at a higher speed. \
     */ \
    CPUDRV_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \
    mutex_exit(&(cpudsp)->lock); \
    DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " \
        "pm_raise_power called to %d\n", ddi_get_instance((dip)), \
        (new_level))); \
    if (pm_raise_power((dip), CPUDRV_COMP_NUM, (new_level)) != \
        DDI_SUCCESS) { \
        cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't " \
            "raise CPU power level", ddi_get_instance((dip))); \
    } \
    mutex_enter(&(cpudsp)->lock); \
}

/*
 * In order to monitor a CPU, we need to hold cpu_lock to access CPU
 * statistics. Holding cpu_lock is not allowed from a callout routine.
 * We dispatch a taskq job to do that work instead.
 */
static void
cpudrv_monitor_disp(void *arg)
{
    cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg;

    /*
     * We are here because the last task has scheduled a timeout.
     * The queue should be empty at this time.
     */
    mutex_enter(&cpudsp->cpudrv_pm.timeout_lock);
    if (!taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_monitor, arg,
        TQ_NOSLEEP)) {
        mutex_exit(&cpudsp->cpudrv_pm.timeout_lock);
        DPRINTF(D_PM_MONITOR, ("cpudrv_monitor_disp: failed to "
            "dispatch the cpudrv_monitor taskq\n"));
        mutex_enter(&cpudsp->lock);
        CPUDRV_MONITOR_INIT(cpudsp);
        mutex_exit(&cpudsp->lock);
        return;
    }
    cpudsp->cpudrv_pm.timeout_count++;
    mutex_exit(&cpudsp->cpudrv_pm.timeout_lock);
}
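/*
 * The callout-to-taskq bounce above is a general pattern: do the bare
 * minimum in callout context and re-dispatch the real work to a thread
 * that is allowed to block. A minimal sketch of the pattern follows; it
 * is compiled out and not part of the driver, the my_state_t/my_callout/
 * my_work names are hypothetical, and the dispatch-failure and teardown
 * handling that cpudrv_monitor_disp() performs is elided.
 */
#ifdef CPUDRV_EXAMPLE_SKETCHES	/* hypothetical guard; never defined */
typedef struct my_state {
    taskq_t *tq;          /* worker queue, created at attach time */
    timeout_id_t tid;     /* pending callout, if any */
} my_state_t;

static void my_work(void *arg);

static void
my_callout(void *arg)
{
    my_state_t *sp = arg;

    /*
     * Callout context: no blocking allowed; hand off and get out.
     * A real driver must handle dispatch failure here, as
     * cpudrv_monitor_disp() does.
     */
    (void) taskq_dispatch(sp->tq, my_work, sp, TQ_NOSLEEP);
}

static void
my_work(void *arg)
{
    my_state_t *sp = arg;

    /* ... heavy lifting that may block or take cpu_lock ... */

    /* Re-arm the callout for one second from now. */
    sp->tid = timeout(my_callout, sp, drv_usectohz(1000000));
}
#endif	/* CPUDRV_EXAMPLE_SKETCHES */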
/*
 * Monitors each CPU for the amount of time the idle thread was running in
 * the last quantum and arranges for the CPU to go to the lower or higher
 * speed. Called at the time interval appropriate for the current speed. The
 * time interval for normal speed is CPUDRV_QUANT_CNT_NORMAL. The time
 * interval for other speeds (including unknown speed) is
 * CPUDRV_QUANT_CNT_OTHR.
 */
static void
cpudrv_monitor(void *arg)
{
    cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg;
    cpudrv_pm_t *cpupm;
    cpudrv_pm_spd_t *cur_spd, *new_spd;
    dev_info_t *dip;
    uint_t idle_cnt, user_cnt, system_cnt;
    clock_t ticks;
    uint_t tick_cnt;
    hrtime_t msnsecs[NCMSTATES];
    boolean_t is_ready;

#define	GET_CPU_MSTATE_CNT(state, cnt) \
    msnsecs[state] = NSEC_TO_TICK(msnsecs[state]); \
    if (cpupm->lastquan_mstate[state] > msnsecs[state]) \
        msnsecs[state] = cpupm->lastquan_mstate[state]; \
    cnt = msnsecs[state] - cpupm->lastquan_mstate[state]; \
    cpupm->lastquan_mstate[state] = msnsecs[state]

    mutex_enter(&cpudsp->lock);
    cpupm = &(cpudsp->cpudrv_pm);
    if (cpupm->timeout_id == 0) {
        mutex_exit(&cpudsp->lock);
        goto do_return;
    }
    cur_spd = cpupm->cur_spd;
    dip = cpudsp->dip;

    /*
     * We assume that a CPU is initialized and has a valid cpu_t
     * structure if it is ready for cross calls. If this changes,
     * additional checks might be needed.
     *
     * Additionally, for x86 platforms we cannot power manage any one
     * instance until all instances have been initialized. That's
     * because we don't know what the CPU domains look like until all
     * instances have been initialized.
     */
    is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
    if (!is_ready) {
        DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
            "CPU not ready for x-calls\n", ddi_get_instance(dip)));
    } else if (!(is_ready = cpudrv_power_ready())) {
        DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
            "waiting for all CPUs to be power manageable\n",
            ddi_get_instance(dip)));
    }
    if (!is_ready) {
        /*
         * Make sure that we are busy so that the framework doesn't
         * try to bring us down in this situation.
         */
        CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
        CPUDRV_MONITOR_INIT(cpudsp);
        mutex_exit(&cpudsp->lock);
        goto do_return;
    }

    /*
     * Make sure that we are still not at unknown power level.
     */
    if (cur_spd == NULL) {
        DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
            "cur_spd is unknown\n", ddi_get_instance(dip)));
        CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
            CPUDRV_TOPSPEED(cpupm)->pm_level);
        /*
         * We just changed the speed. Wait until at least the next
         * call to this routine before proceeding ahead.
         */
        CPUDRV_MONITOR_INIT(cpudsp);
        mutex_exit(&cpudsp->lock);
        goto do_return;
    }

    mutex_enter(&cpu_lock);
    if (cpudsp->cp == NULL &&
        (cpudsp->cp = cpu_get(cpudsp->cpu_id)) == NULL) {
        mutex_exit(&cpu_lock);
        CPUDRV_MONITOR_INIT(cpudsp);
        mutex_exit(&cpudsp->lock);
        cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't get "
            "cpu_t", ddi_get_instance(dip));
        goto do_return;
    }

    if (!cpupm->pm_started) {
        cpupm->pm_started = B_TRUE;
        cpudrv_set_supp_freqs(cpudsp);
    }

    get_cpu_mstate(cpudsp->cp, msnsecs);
    GET_CPU_MSTATE_CNT(CMS_IDLE, idle_cnt);
    GET_CPU_MSTATE_CNT(CMS_USER, user_cnt);
    GET_CPU_MSTATE_CNT(CMS_SYSTEM, system_cnt);

    /*
     * We can't do anything when we have just switched to a state
     * because there is no valid timestamp.
     */
    if (cpupm->lastquan_ticks == 0) {
        cpupm->lastquan_ticks = NSEC_TO_TICK(gethrtime());
        mutex_exit(&cpu_lock);
        CPUDRV_MONITOR_INIT(cpudsp);
        mutex_exit(&cpudsp->lock);
        goto do_return;
    }

    /*
     * The various watermarks are based on this routine being called back
     * exactly at the requested period. This is not guaranteed
     * because this routine is called from a taskq that is dispatched
     * from a timeout routine. Handle this by finding out how many
     * ticks have elapsed since the last call and adjusting
     * the idle_cnt based on the delay added to the requested period
     * by timeout and taskq.
     */
    ticks = NSEC_TO_TICK(gethrtime());
    tick_cnt = ticks - cpupm->lastquan_ticks;
    ASSERT(tick_cnt != 0);
    cpupm->lastquan_ticks = ticks;
    mutex_exit(&cpu_lock);
    /*
     * The time taken between recording the current counts and
     * arranging the next call of this routine is an error in our
     * calculation. We minimize the error by calling
     * CPUDRV_MONITOR_INIT() here instead of at the end of this routine.
     */
    CPUDRV_MONITOR_INIT(cpudsp);
    DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_monitor: instance %d: "
        "idle count %d, user count %d, system count %d, pm_level %d, "
        "pm_busycnt %d\n", ddi_get_instance(dip), idle_cnt, user_cnt,
        system_cnt, cur_spd->pm_level, cpupm->pm_busycnt));
#ifdef DEBUG
    /*
     * Notify that timeout and taskq have caused delays and we need to
     * scale our parameters accordingly.
     *
     * To get an accurate result, don't turn on other DPRINTFs with
     * the following DPRINTF. PROM calls generated by other
     * DPRINTFs change the timing.
     */
    if (tick_cnt > cur_spd->quant_cnt) {
        DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_monitor: instance %d: "
            "tick count %d > quantum_count %u\n",
            ddi_get_instance(dip), tick_cnt, cur_spd->quant_cnt));
    }
#endif	/* DEBUG */

    /*
     * Adjust the counts based on the delay added by timeout and taskq.
     */
    idle_cnt = (idle_cnt * cur_spd->quant_cnt) / tick_cnt;
    user_cnt = (user_cnt * cur_spd->quant_cnt) / tick_cnt;

    if ((user_cnt > cur_spd->user_hwm) || (idle_cnt < cur_spd->idle_lwm &&
        cur_spd->idle_blwm_cnt >= cpudrv_idle_blwm_cnt_max)) {
        cur_spd->idle_blwm_cnt = 0;
        cur_spd->idle_bhwm_cnt = 0;
        /*
         * In the normal situation, arrange to go to the next higher
         * speed. If we are running in special direct pm mode, we
         * just stay at the current speed.
         */
        if (cur_spd == cur_spd->up_spd || cpudrv_direct_pm) {
            CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
        } else {
            new_spd = cur_spd->up_spd;
            CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
                new_spd->pm_level);
        }
    } else if ((user_cnt <= cur_spd->user_lwm) &&
        (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cpudsp->cp)) {
        cur_spd->idle_blwm_cnt = 0;
        cur_spd->idle_bhwm_cnt = 0;
        /*
         * Arrange to go to the next lower speed by informing the
         * power management framework of our idle status.
         */
        CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm);
    } else {
        /*
         * If we are between the idle watermarks and have not
         * been here enough consecutive times to be considered
         * busy, just increment the count and return.
         */
        if ((idle_cnt < cur_spd->idle_hwm) &&
            (idle_cnt >= cur_spd->idle_lwm) &&
            (cur_spd->idle_bhwm_cnt < cpudrv_idle_bhwm_cnt_max)) {
            cur_spd->idle_blwm_cnt = 0;
            cur_spd->idle_bhwm_cnt++;
            mutex_exit(&cpudsp->lock);
            goto do_return;
        }
        if (idle_cnt < cur_spd->idle_lwm) {
            cur_spd->idle_blwm_cnt++;
            cur_spd->idle_bhwm_cnt = 0;
        }
        /*
         * Arrange to stay at the current speed.
         */
        CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
    }
    mutex_exit(&cpudsp->lock);
do_return:
    mutex_enter(&cpupm->timeout_lock);
    ASSERT(cpupm->timeout_count > 0);
    cpupm->timeout_count--;
    cv_signal(&cpupm->timeout_cv);
    mutex_exit(&cpupm->timeout_lock);
}
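/*
 * Worked example of the normalization above (illustrative numbers only):
 * if the requested sampling period is quant_cnt = 100 ticks, but timeout
 * plus taskq latency stretched the actual interval to tick_cnt = 125
 * ticks, a raw idle_cnt of 110 ticks is scaled to (110 * 100) / 125 = 88
 * ticks, i.e. 88% idle over the nominal quantum, before being compared
 * against the watermarks.
 */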