/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2004-2007 Nate Lawson (SDG)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/timetc.h>
#include <sys/taskqueue.h>

#include "cpufreq_if.h"

/*
 * Common CPU frequency glue code.  Drivers for specific hardware can
 * attach this interface to allow users to get/set the CPU frequency.
 */

/*
 * Number of levels we can handle.  Levels are synthesized from settings
 * so for M settings and N drivers, there may be M*N levels.
 */
#define CF_MAX_LEVELS	256

struct cf_saved_freq {
	struct cf_level			level;
	int				priority;
	SLIST_ENTRY(cf_saved_freq)	link;
};

struct cpufreq_softc {
	struct sx			lock;
	struct cf_level			curr_level;
	int				curr_priority;
	SLIST_HEAD(, cf_saved_freq)	saved_freq;
	struct cf_level_lst		all_levels;
	int				all_count;
	int				max_mhz;
	device_t			dev;
	device_t			cf_drv_dev;
	struct sysctl_ctx_list		sysctl_ctx;
	struct task			startup_task;
	struct cf_level			*levels_buf;
};

struct cf_setting_array {
	struct cf_setting		sets[MAX_SETTINGS];
	int				count;
	TAILQ_ENTRY(cf_setting_array)	link;
};

TAILQ_HEAD(cf_setting_lst, cf_setting_array);

#define CF_MTX_INIT(x)		sx_init((x), "cpufreq lock")
#define CF_MTX_LOCK(x)		sx_xlock((x))
#define CF_MTX_UNLOCK(x)	sx_xunlock((x))
#define CF_MTX_ASSERT(x)	sx_assert((x), SX_XLOCKED)

#define CF_DEBUG(msg...)	do {		\
	if (cf_verbose)				\
		printf("cpufreq: " msg);	\
	} while (0)
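
/*
 * Illustrative note (not part of the original file): CF_DEBUG() takes
 * printf()-style arguments and only produces output when the cf_verbose
 * knob below is non-zero, e.g. (call taken from later in this file):
 *
 *	CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
 *	    device_get_nameunit(set->dev), PCPU_GET(cpuid));
 */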

static int	cpufreq_attach(device_t dev);
static void	cpufreq_startup_task(void *ctx, int pending);
static int	cpufreq_detach(device_t dev);
static int	cf_set_method(device_t dev, const struct cf_level *level,
		    int priority);
static int	cf_get_method(device_t dev, struct cf_level *level);
static int	cf_levels_method(device_t dev, struct cf_level *levels,
		    int *count);
static int	cpufreq_insert_abs(struct cpufreq_softc *sc,
		    struct cf_setting *sets, int count);
static int	cpufreq_expand_set(struct cpufreq_softc *sc,
		    struct cf_setting_array *set_arr);
static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
		    struct cf_level *dup, struct cf_setting *set);
static int	cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS);

static device_method_t cpufreq_methods[] = {
	DEVMETHOD(device_probe,		bus_generic_probe),
	DEVMETHOD(device_attach,	cpufreq_attach),
	DEVMETHOD(device_detach,	cpufreq_detach),

	DEVMETHOD(cpufreq_set,		cf_set_method),
	DEVMETHOD(cpufreq_get,		cf_get_method),
	DEVMETHOD(cpufreq_levels,	cf_levels_method),
	{0, 0}
};

static driver_t cpufreq_driver = {
	"cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
};

DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, 0, 0);

static int		cf_lowest_freq;
static int		cf_verbose;
static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "cpufreq debugging");
SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RWTUN, &cf_lowest_freq, 1,
    "Don't provide levels below this frequency.");
SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RWTUN, &cf_verbose, 1,
    "Print verbose debugging messages");
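
/*
 * Illustrative example (not part of the original file): both knobs above are
 * CTLFLAG_RWTUN, so they can be set as loader tunables or at run time, e.g.:
 *
 *	# sysctl debug.cpufreq.verbose=1
 *	# sysctl debug.cpufreq.lowest=800
 *
 * The latter hides and rejects any level whose total frequency is below
 * 800 MHz.
 */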

/*
 * This is called as the result of a hardware-specific frequency control
 * driver calling cpufreq_register.  It provides a general interface for
 * system-wide frequency controls and operates on a per-CPU basis.
 */
static int
cpufreq_attach(device_t dev)
{
	struct cpufreq_softc *sc;
	struct pcpu *pc;
	device_t parent;
	uint64_t rate;

	CF_DEBUG("initializing %s\n", device_get_nameunit(dev));
	sc = device_get_softc(dev);
	parent = device_get_parent(dev);
	sc->dev = dev;
	sysctl_ctx_init(&sc->sysctl_ctx);
	TAILQ_INIT(&sc->all_levels);
	CF_MTX_INIT(&sc->lock);
	sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
	SLIST_INIT(&sc->saved_freq);
	/* Try to get nominal CPU freq to use it as maximum later if needed */
	sc->max_mhz = cpu_get_nominal_mhz(dev);
	/* If that fails, try to measure the current rate */
	if (sc->max_mhz <= 0) {
		CF_DEBUG("Unable to obtain nominal frequency.\n");
		pc = cpu_get_pcpu(dev);
		if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0)
			sc->max_mhz = rate / 1000000;
		else
			sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
	}

	CF_DEBUG("initializing one-time data for %s\n",
	    device_get_nameunit(dev));
	sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf),
	    M_DEVBUF, M_WAITOK);
	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
	    OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
	    sc, 0, cpufreq_curr_sysctl, "I", "Current CPU frequency");
	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
	    OID_AUTO, "freq_levels",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, sc, 0,
	    cpufreq_levels_sysctl, "A", "CPU frequency levels");

	/*
	 * Queue a one-shot broadcast that levels have changed.
	 * It will run once the system has completed booting.
	 */
	TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev);
	taskqueue_enqueue(taskqueue_thread, &sc->startup_task);

	return (0);
}

/* Handle any work to be done for all drivers that attached during boot. */
static void
cpufreq_startup_task(void *ctx, int pending)
{

	cpufreq_settings_changed((device_t)ctx);
}

static int
cpufreq_detach(device_t dev)
{
	struct cpufreq_softc *sc;
	struct cf_saved_freq *saved_freq;

	CF_DEBUG("shutdown %s\n", device_get_nameunit(dev));
	sc = device_get_softc(dev);
	sysctl_ctx_free(&sc->sysctl_ctx);

	while ((saved_freq = SLIST_FIRST(&sc->saved_freq)) != NULL) {
		SLIST_REMOVE_HEAD(&sc->saved_freq, link);
		free(saved_freq, M_TEMP);
	}

	free(sc->levels_buf, M_DEVBUF);

	return (0);
}
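
/*
 * Illustrative sketch (not part of the original file): an in-kernel consumer
 * drives this interface through the CPUFREQ_LEVELS()/CPUFREQ_SET() kobj
 * methods on a CPU's "cpufreq" child device, much as the sysctl handlers
 * later in this file do.  The device variable "cf_dev" below is hypothetical.
 *
 *	struct cf_level levels[CF_MAX_LEVELS];
 *	int count = CF_MAX_LEVELS;
 *
 *	if (CPUFREQ_LEVELS(cf_dev, levels, &count) == 0 && count > 0)
 *		(void)CPUFREQ_SET(cf_dev, &levels[count - 1],
 *		    CPUFREQ_PRIO_USER);
 *
 * Levels are returned sorted from highest to lowest total frequency, so the
 * last entry selects the lowest available level.
 */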

static int
cf_set_method(device_t dev, const struct cf_level *level, int priority)
{
	struct cpufreq_softc *sc;
	const struct cf_setting *set;
	struct cf_saved_freq *saved_freq, *curr_freq;
	struct pcpu *pc;
	int error, i;
	u_char pri;

	sc = device_get_softc(dev);
	error = 0;
	set = NULL;
	saved_freq = NULL;

	/* We are going to change levels so notify the pre-change handler. */
	EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error);
	if (error != 0) {
		EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
		return (error);
	}

	CF_MTX_LOCK(&sc->lock);

#ifdef SMP
#ifdef EARLY_AP_STARTUP
	MPASS(mp_ncpus == 1 || smp_started);
#else
	/*
	 * If still booting and secondary CPUs not started yet, don't allow
	 * changing the frequency until they're online.  This is because we
	 * can't switch to them using sched_bind() and thus we'd only be
	 * switching the main CPU.  XXXTODO: Need to think more about how to
	 * handle having different CPUs at different frequencies.
	 */
	if (mp_ncpus > 1 && !smp_started) {
		device_printf(dev, "rejecting change, SMP not started yet\n");
		error = ENXIO;
		goto out;
	}
#endif
#endif /* SMP */

	/*
	 * If the requested level has a lower priority, don't allow
	 * the new level right now.
	 */
	if (priority < sc->curr_priority) {
		CF_DEBUG("ignoring, curr prio %d less than %d\n", priority,
		    sc->curr_priority);
		error = EPERM;
		goto out;
	}

	/*
	 * If the caller didn't specify a level and one is saved, prepare to
	 * restore the saved level.  If none has been saved, return an error.
	 */
	if (level == NULL) {
		saved_freq = SLIST_FIRST(&sc->saved_freq);
		if (saved_freq == NULL) {
			CF_DEBUG("NULL level, no saved level\n");
			error = ENXIO;
			goto out;
		}
		level = &saved_freq->level;
		priority = saved_freq->priority;
		CF_DEBUG("restoring saved level, freq %d prio %d\n",
		    level->total_set.freq, priority);
	}

	/* Reject levels that are below our specified threshold. */
	if (level->total_set.freq < cf_lowest_freq) {
		CF_DEBUG("rejecting freq %d, less than %d limit\n",
		    level->total_set.freq, cf_lowest_freq);
		error = EINVAL;
		goto out;
	}

	/* If already at this level, just return. */
	if (sc->curr_level.total_set.freq == level->total_set.freq) {
		CF_DEBUG("skipping freq %d, same as current level %d\n",
		    level->total_set.freq, sc->curr_level.total_set.freq);
		goto skip;
	}

	/* First, set the absolute frequency via its driver. */
	set = &level->abs_set;
	if (set->dev) {
		if (!device_is_attached(set->dev)) {
			error = ENXIO;
			goto out;
		}

		/* Bind to the target CPU before switching. */
		pc = cpu_get_pcpu(set->dev);

		/* Skip settings if CPU is not started. */
		if (pc == NULL) {
			error = 0;
			goto out;
		}
		thread_lock(curthread);
		pri = curthread->td_priority;
		sched_prio(curthread, PRI_MIN);
		sched_bind(curthread, pc->pc_cpuid);
		thread_unlock(curthread);
		CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
		    device_get_nameunit(set->dev), PCPU_GET(cpuid));
		error = CPUFREQ_DRV_SET(set->dev, set);
		thread_lock(curthread);
		sched_unbind(curthread);
		sched_prio(curthread, pri);
		thread_unlock(curthread);
		if (error) {
			goto out;
		}
	}

	/* Next, set any/all relative frequencies via their drivers. */
	for (i = 0; i < level->rel_count; i++) {
		set = &level->rel_set[i];
		if (!device_is_attached(set->dev)) {
			error = ENXIO;
			goto out;
		}

		/* Bind to the target CPU before switching. */
		pc = cpu_get_pcpu(set->dev);
		thread_lock(curthread);
		pri = curthread->td_priority;
		sched_prio(curthread, PRI_MIN);
		sched_bind(curthread, pc->pc_cpuid);
		thread_unlock(curthread);
		CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
		    device_get_nameunit(set->dev), PCPU_GET(cpuid));
		error = CPUFREQ_DRV_SET(set->dev, set);
		thread_lock(curthread);
		sched_unbind(curthread);
		sched_prio(curthread, pri);
		thread_unlock(curthread);
		if (error) {
			/* XXX Back out any successful setting? */
			goto out;
		}
	}

skip:
	/*
	 * Before recording the current level, check if we're going to a
	 * higher priority.  If so, save the previous level and priority.
	 */
	if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN &&
	    priority > sc->curr_priority) {
		CF_DEBUG("saving level, freq %d prio %d\n",
		    sc->curr_level.total_set.freq, sc->curr_priority);
		curr_freq = malloc(sizeof(*curr_freq), M_TEMP, M_NOWAIT);
		if (curr_freq == NULL) {
			error = ENOMEM;
			goto out;
		}
		curr_freq->level = sc->curr_level;
		curr_freq->priority = sc->curr_priority;
		SLIST_INSERT_HEAD(&sc->saved_freq, curr_freq, link);
	}
	sc->curr_level = *level;
	sc->curr_priority = priority;

	/* If we were restoring a saved state, reset it to "unused". */
	if (saved_freq != NULL) {
		CF_DEBUG("resetting saved level\n");
		sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
		SLIST_REMOVE_HEAD(&sc->saved_freq, link);
		free(saved_freq, M_TEMP);
	}

out:
	CF_MTX_UNLOCK(&sc->lock);

	/*
	 * We changed levels (or attempted to) so notify the post-change
	 * handler of new frequency or error.
	 */
	EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
	if (error && set)
		device_printf(set->dev, "set freq failed, err %d\n", error);

	return (error);
}

static int
cpufreq_get_frequency(device_t dev)
{
	struct cf_setting set;

	if (CPUFREQ_DRV_GET(dev, &set) != 0)
		return (-1);

	return (set.freq);
}

/* Returns the index into *levels with the match */
static int
cpufreq_get_level(device_t dev, struct cf_level *levels, int count)
{
	int i, freq;

	if ((freq = cpufreq_get_frequency(dev)) < 0)
		return (-1);
	for (i = 0; i < count; i++)
		if (freq == levels[i].total_set.freq)
			return (i);

	return (-1);
}

/*
 * Used by the cpufreq core, this function populates *level with the current
 * frequency, either from the cached value in sc->curr_level or, if the
 * lower-level driver has set the CPUFREQ_FLAG_UNCACHED flag, by querying the
 * driver itself.
 */
static int
cf_get_method(device_t dev, struct cf_level *level)
{
	struct cpufreq_softc *sc;
	struct cf_level *levels;
	struct cf_setting *curr_set;
	struct pcpu *pc;
	int bdiff, count, diff, error, i, type;
	uint64_t rate;

	sc = device_get_softc(dev);
	error = 0;
	levels = NULL;

	/*
	 * If we already know the current frequency, and the driver didn't ask
	 * for uncached usage, we're done.
	 */
	CF_MTX_LOCK(&sc->lock);
	curr_set = &sc->curr_level.total_set;
	error = CPUFREQ_DRV_TYPE(sc->cf_drv_dev, &type);
	if (error == 0 && (type & CPUFREQ_FLAG_UNCACHED)) {
		struct cf_setting set;

		/*
		 * If the driver wants to always report back the real
		 * frequency, first try the driver and if that fails, fall
		 * back to estimating.
		 */
		if (CPUFREQ_DRV_GET(sc->cf_drv_dev, &set) == 0) {
			sc->curr_level.total_set = set;
			CF_DEBUG("get returning immediate freq %d\n",
			    curr_set->freq);
			goto out;
		}
	} else if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
		CF_DEBUG("get returning known freq %d\n", curr_set->freq);
		error = 0;
		goto out;
	}
	CF_MTX_UNLOCK(&sc->lock);

	/*
	 * We need to figure out the current level.  Loop through every
	 * driver, getting the current setting.  Then, attempt to get a best
	 * match of settings against each level.
	 */
	count = CF_MAX_LEVELS;
	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
	if (levels == NULL)
		return (ENOMEM);
	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
	if (error) {
		if (error == E2BIG)
			printf("cpufreq: need to increase CF_MAX_LEVELS\n");
		free(levels, M_TEMP);
		return (error);
	}

	/*
	 * Reacquire the lock and search for the given level.
	 *
	 * XXX Note: this is not quite right since we really need to go
	 * through each level and compare both absolute and relative
	 * settings for each driver in the system before making a match.
	 * The estimation code below catches this case though.
	 */
	CF_MTX_LOCK(&sc->lock);
	i = cpufreq_get_level(sc->cf_drv_dev, levels, count);
	if (i >= 0)
		sc->curr_level = levels[i];
	else
		CF_DEBUG("Couldn't find supported level for %s\n",
		    device_get_nameunit(sc->cf_drv_dev));

	if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
		CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq);
		goto out;
	}

	/*
	 * We couldn't find an exact match, so attempt to estimate and then
	 * match against a level.
	 */
	pc = cpu_get_pcpu(dev);
	if (pc == NULL) {
		error = ENXIO;
		goto out;
	}
	cpu_est_clockrate(pc->pc_cpuid, &rate);
	rate /= 1000000;
	bdiff = 1 << 30;
	for (i = 0; i < count; i++) {
		diff = abs(levels[i].total_set.freq - rate);
		if (diff < bdiff) {
			bdiff = diff;
			sc->curr_level = levels[i];
		}
	}
	CF_DEBUG("get estimated freq %d\n", curr_set->freq);

out:
	if (error == 0)
		*level = sc->curr_level;

	CF_MTX_UNLOCK(&sc->lock);
	if (levels)
		free(levels, M_TEMP);
	return (error);
}

/*
 * Either directly obtain settings from the cpufreq driver, or build a list of
 * relative settings to be integrated later against an absolute max.
 */
static int
cpufreq_add_levels(device_t cf_dev, struct cf_setting_lst *rel_sets)
{
	struct cf_setting_array *set_arr;
	struct cf_setting *sets;
	device_t dev;
	struct cpufreq_softc *sc;
	int type, set_count, error;

	sc = device_get_softc(cf_dev);
	dev = sc->cf_drv_dev;

	/* Skip devices that aren't ready. */
	if (!device_is_attached(cf_dev))
		return (0);

	/*
	 * Get settings, skipping drivers that offer no settings or
	 * provide settings for informational purposes only.
	 */
	error = CPUFREQ_DRV_TYPE(dev, &type);
	if (error != 0 || (type & CPUFREQ_FLAG_INFO_ONLY)) {
		if (error == 0) {
			CF_DEBUG("skipping info-only driver %s\n",
			    device_get_nameunit(cf_dev));
		}
		return (error);
	}

	sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
	if (sets == NULL)
		return (ENOMEM);

	set_count = MAX_SETTINGS;
	error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
	if (error != 0 || set_count == 0)
		goto out;

	/* Add the settings to our absolute/relative lists. */
	switch (type & CPUFREQ_TYPE_MASK) {
	case CPUFREQ_TYPE_ABSOLUTE:
		error = cpufreq_insert_abs(sc, sets, set_count);
		break;
	case CPUFREQ_TYPE_RELATIVE:
		CF_DEBUG("adding %d relative settings\n", set_count);
		set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
		if (set_arr == NULL) {
			error = ENOMEM;
			goto out;
		}
		bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
		set_arr->count = set_count;
		TAILQ_INSERT_TAIL(rel_sets, set_arr, link);
		break;
	default:
		error = EINVAL;
	}

out:
	free(sets, M_TEMP);
	return (error);
}

static int
cf_levels_method(device_t dev, struct cf_level *levels, int *count)
{
	struct cf_setting_array *set_arr;
	struct cf_setting_lst rel_sets;
	struct cpufreq_softc *sc;
	struct cf_level *lev;
	struct pcpu *pc;
	int error, i;
	uint64_t rate;

	if (levels == NULL || count == NULL)
		return (EINVAL);

	TAILQ_INIT(&rel_sets);
	sc = device_get_softc(dev);

	CF_MTX_LOCK(&sc->lock);
	error = cpufreq_add_levels(sc->dev, &rel_sets);
	if (error)
		goto out;

	/*
	 * If there are no absolute levels, create a fake one at 100%.  We
	 * then cache the clockrate for later use as our base frequency.
	 */
	if (TAILQ_EMPTY(&sc->all_levels)) {
		struct cf_setting set;

		CF_DEBUG("No absolute levels returned by driver\n");

		if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) {
			sc->max_mhz = cpu_get_nominal_mhz(dev);
			/*
			 * If the CPU can't report a rate for 100%, hope
			 * the CPU is running at its nominal rate right now,
			 * and use that instead.
			 */
			if (sc->max_mhz <= 0) {
				pc = cpu_get_pcpu(dev);
				cpu_est_clockrate(pc->pc_cpuid, &rate);
				sc->max_mhz = rate / 1000000;
			}
		}
		memset(&set, CPUFREQ_VAL_UNKNOWN, sizeof(set));
		set.freq = sc->max_mhz;
		set.dev = NULL;
		error = cpufreq_insert_abs(sc, &set, 1);
		if (error)
			goto out;
	}

	/* Create a combined list of absolute + relative levels. */
	TAILQ_FOREACH(set_arr, &rel_sets, link)
		cpufreq_expand_set(sc, set_arr);

	/* If the caller doesn't have enough space, return the actual count. */
	if (sc->all_count > *count) {
		*count = sc->all_count;
		error = E2BIG;
		goto out;
	}

	/* Finally, output the list of levels. */
	i = 0;
	TAILQ_FOREACH(lev, &sc->all_levels, link) {
		/* Skip levels that have a frequency that is too low. */
		if (lev->total_set.freq < cf_lowest_freq) {
			sc->all_count--;
			continue;
		}

		levels[i] = *lev;
		i++;
	}
	*count = sc->all_count;
	error = 0;

out:
	/* Clear all levels since we regenerate them each time. */
	while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
		TAILQ_REMOVE(&sc->all_levels, lev, link);
		free(lev, M_TEMP);
	}
	sc->all_count = 0;

	CF_MTX_UNLOCK(&sc->lock);
	while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
		TAILQ_REMOVE(&rel_sets, set_arr, link);
		free(set_arr, M_TEMP);
	}
	return (error);
}
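
/*
 * Illustrative example (not part of the original file): with one absolute
 * driver reporting 2000 and 1000 MHz and one relative driver reporting 100%,
 * 75% and 50%, cf_levels_method() above synthesizes up to 2 * 3 = 6 candidate
 * levels.  The 2000 MHz * 50% candidate duplicates the existing 1000 MHz
 * absolute level and is dropped, leaving 2000, 1500, 1000, 750 and 500 MHz.
 * Duplicates and levels below debug.cpufreq.lowest are discarded, and the
 * result is sorted from highest to lowest total frequency.
 */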

/*
 * Create levels for an array of absolute settings and insert them in
 * sorted order in the specified list.
 */
static int
cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
    int count)
{
	struct cf_level_lst *list;
	struct cf_level *level, *search;
	int i, inserted;

	CF_MTX_ASSERT(&sc->lock);

	list = &sc->all_levels;
	for (i = 0; i < count; i++) {
		level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
		if (level == NULL)
			return (ENOMEM);
		level->abs_set = sets[i];
		level->total_set = sets[i];
		level->total_set.dev = NULL;
		sc->all_count++;
		inserted = 0;

		if (TAILQ_EMPTY(list)) {
			CF_DEBUG("adding abs setting %d at head\n",
			    sets[i].freq);
			TAILQ_INSERT_HEAD(list, level, link);
			continue;
		}

		TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link)
			if (sets[i].freq <= search->total_set.freq) {
				CF_DEBUG("adding abs setting %d after %d\n",
				    sets[i].freq, search->total_set.freq);
				TAILQ_INSERT_AFTER(list, search, level, link);
				inserted = 1;
				break;
			}

		if (inserted == 0) {
			TAILQ_FOREACH(search, list, link)
				if (sets[i].freq >= search->total_set.freq) {
					CF_DEBUG("adding abs setting %d before %d\n",
					    sets[i].freq, search->total_set.freq);
					TAILQ_INSERT_BEFORE(search, level, link);
					break;
				}
		}
	}

	return (0);
}

/*
 * Expand a group of relative settings, creating derived levels from them.
 */
static int
cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
{
	struct cf_level *fill, *search;
	struct cf_setting *set;
	int i;

	CF_MTX_ASSERT(&sc->lock);

	/*
	 * Walk the set of all existing levels in reverse.  This is so we
	 * create derived states from the lowest absolute settings first
	 * and discard duplicates created from higher absolute settings.
	 * For instance, a level of 50 MHz derived from 100 MHz + 50% is
	 * preferable to 200 MHz + 25% because absolute settings are more
	 * efficient since they often change the voltage as well.
	 */
	TAILQ_FOREACH_REVERSE(search, &sc->all_levels, cf_level_lst, link) {
		/* Add each setting to the level, duplicating if necessary. */
		for (i = 0; i < set_arr->count; i++) {
			set = &set_arr->sets[i];

			/*
			 * If this setting is less than 100%, split the level
			 * into two and add this setting to the new level.
			 */
			fill = search;
			if (set->freq < 10000) {
				fill = cpufreq_dup_set(sc, search, set);

				/*
				 * The new level was a duplicate of an existing
				 * level or its absolute setting is too high
				 * so we freed it.  For example, we discard a
				 * derived level of 1000 MHz/25% if a level
				 * of 500 MHz/100% already exists.
				 */
				if (fill == NULL)
					break;
			}

			/* Add this setting to the existing or new level. */
			KASSERT(fill->rel_count < MAX_SETTINGS,
			    ("cpufreq: too many relative drivers (%d)",
			    MAX_SETTINGS));
			fill->rel_set[fill->rel_count] = *set;
			fill->rel_count++;
			CF_DEBUG(
			    "expand set added rel setting %d%% to %d level\n",
			    set->freq / 100, fill->total_set.freq);
		}
	}

	return (0);
}
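
/*
 * Illustrative note (not part of the original file): relative settings are
 * expressed in hundredths of a percent, so 10000 means 100%.  The derived
 * total frequency computed in cpufreq_dup_set() below is simply
 *
 *	total = abs_freq * rel_freq / 10000
 *
 * e.g. a 2000 MHz absolute level combined with a 7500 (75%) relative setting
 * yields a 1500 MHz derived level; power is scaled the same way when known.
 */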

static struct cf_level *
cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
    struct cf_setting *set)
{
	struct cf_level_lst *list;
	struct cf_level *fill, *itr;
	struct cf_setting *fill_set, *itr_set;
	int i;

	CF_MTX_ASSERT(&sc->lock);

	/*
	 * Create a new level, copy it from the old one, and update the
	 * total frequency and power by the percentage specified in the
	 * relative setting.
	 */
	fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
	if (fill == NULL)
		return (NULL);
	*fill = *dup;
	fill_set = &fill->total_set;
	fill_set->freq =
	    ((uint64_t)fill_set->freq * set->freq) / 10000;
	if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
		fill_set->power = ((uint64_t)fill_set->power * set->freq)
		    / 10000;
	}
	if (set->lat != CPUFREQ_VAL_UNKNOWN) {
		if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
			fill_set->lat += set->lat;
		else
			fill_set->lat = set->lat;
	}
	CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq);

	/*
	 * If we copied an old level that we already modified (say, at 100%),
	 * we need to remove that setting before adding this one.  Since we
	 * process each setting array in order, we know any settings for this
	 * driver will be found at the end.
	 */
	for (i = fill->rel_count; i != 0; i--) {
		if (fill->rel_set[i - 1].dev != set->dev)
			break;
		CF_DEBUG("removed last relative driver: %s\n",
		    device_get_nameunit(set->dev));
		fill->rel_count--;
	}

	/*
	 * Insert the new level in sorted order.  If it is a duplicate of an
	 * existing level (1) or has an absolute setting higher than the
	 * existing level (2), do not add it.  We can do this since any such
	 * level is guaranteed to use less power.  For example (1), a level
	 * with one absolute setting of 800 MHz uses less power than one
	 * composed of an absolute setting of 1600 MHz and a relative setting
	 * at 50%.  Also for example (2), a level of 800 MHz/75% is preferable
	 * to 1600 MHz/25% even though the latter has a lower total frequency.
	 */
	list = &sc->all_levels;
	KASSERT(!TAILQ_EMPTY(list), ("all levels list empty in dup set"));
	TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
		itr_set = &itr->total_set;
		if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
			CF_DEBUG("dup set rejecting %d (dupe)\n",
			    fill_set->freq);
			itr = NULL;
			break;
		} else if (fill_set->freq < itr_set->freq) {
			if (fill->abs_set.freq <= itr->abs_set.freq) {
				CF_DEBUG(
				    "dup done, inserting new level %d after %d\n",
				    fill_set->freq, itr_set->freq);
				TAILQ_INSERT_AFTER(list, itr, fill, link);
				sc->all_count++;
			} else {
				CF_DEBUG("dup set rejecting %d (abs too big)\n",
				    fill_set->freq);
				itr = NULL;
			}
			break;
		}
	}

	/* We didn't find a good place for this new level so free it. */
	if (itr == NULL) {
		CF_DEBUG("dup set freeing new level %d (not optimal)\n",
		    fill_set->freq);
		free(fill, M_TEMP);
		fill = NULL;
	}

	return (fill);
}

static int
cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpufreq_softc *sc;
	struct cf_level *levels;
	int best, count, diff, bdiff, devcount, error, freq, i, n;
	device_t *devs;

	devs = NULL;
	sc = oidp->oid_arg1;
	levels = sc->levels_buf;

	error = CPUFREQ_GET(sc->dev, &levels[0]);
	if (error)
		goto out;
	freq = levels[0].total_set.freq;
	error = sysctl_handle_int(oidp, &freq, 0, req);
	if (error != 0 || req->newptr == NULL)
		goto out;

	/*
	 * While we only call cpufreq_get() on one device (assuming all
	 * CPUs have equal levels), we call cpufreq_set() on all CPUs.
	 * This is needed for some MP systems.
	 */
	error = devclass_get_devices(devclass_find("cpufreq"), &devs, &devcount);
	if (error)
		goto out;
	for (n = 0; n < devcount; n++) {
		count = CF_MAX_LEVELS;
		error = CPUFREQ_LEVELS(devs[n], levels, &count);
		if (error) {
			if (error == E2BIG)
				printf(
				    "cpufreq: need to increase CF_MAX_LEVELS\n");
			break;
		}
		best = 0;
		bdiff = 1 << 30;
		for (i = 0; i < count; i++) {
			diff = abs(levels[i].total_set.freq - freq);
			if (diff < bdiff) {
				bdiff = diff;
				best = i;
			}
		}
		error = CPUFREQ_SET(devs[n], &levels[best], CPUFREQ_PRIO_USER);
	}

out:
	if (devs)
		free(devs, M_TEMP);
	return (error);
}
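
/*
 * Illustrative example (not part of the original file; the values shown are
 * made up): the handlers above and below back the per-CPU dev.cpu.N.freq and
 * dev.cpu.N.freq_levels sysctls registered in cpufreq_attach(), e.g.:
 *
 *	# sysctl dev.cpu.0.freq_levels
 *	dev.cpu.0.freq_levels: 2400/95000 2000/76000 1600/58000
 *	# sysctl dev.cpu.0.freq=1600
 *
 * Each level is printed as "frequency/power".  Writing dev.cpu.0.freq picks
 * the level closest to the requested MHz value and applies it to every
 * cpufreq device at CPUFREQ_PRIO_USER priority.
 */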

static int
cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpufreq_softc *sc;
	struct cf_level *levels;
	struct cf_setting *set;
	struct sbuf sb;
	int count, error, i;

	sc = oidp->oid_arg1;
	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);

	/* Get settings from the device and generate the output string. */
	count = CF_MAX_LEVELS;
	levels = sc->levels_buf;
	if (levels == NULL) {
		sbuf_delete(&sb);
		return (ENOMEM);
	}
	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
	if (error) {
		if (error == E2BIG)
			printf("cpufreq: need to increase CF_MAX_LEVELS\n");
		goto out;
	}
	if (count) {
		for (i = 0; i < count; i++) {
			set = &levels[i].total_set;
			sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
		}
	} else
		sbuf_cpy(&sb, "0");
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);

out:
	sbuf_delete(&sb);
	return (error);
}

static int
cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS)
{
	device_t dev;
	struct cf_setting *sets;
	struct sbuf sb;
	int error, i, set_count;

	dev = oidp->oid_arg1;
	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);

	/* Get settings from the device and generate the output string. */
	set_count = MAX_SETTINGS;
	sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT);
	if (sets == NULL) {
		sbuf_delete(&sb);
		return (ENOMEM);
	}
	error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
	if (error)
		goto out;
	if (set_count) {
		for (i = 0; i < set_count; i++)
			sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power);
	} else
		sbuf_cpy(&sb, "0");
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);

out:
	free(sets, M_TEMP);
	sbuf_delete(&sb);
	return (error);
}

static void
cpufreq_add_freq_driver_sysctl(device_t cf_dev)
{
	struct cpufreq_softc *sc;

	sc = device_get_softc(cf_dev);
	SYSCTL_ADD_CONST_STRING(&sc->sysctl_ctx,
	    SYSCTL_CHILDREN(device_get_sysctl_tree(cf_dev)), OID_AUTO,
	    "freq_driver", CTLFLAG_RD, device_get_nameunit(sc->cf_drv_dev),
	    "cpufreq driver used by this cpu");
}

int
cpufreq_register(device_t dev)
{
	struct cpufreq_softc *sc;
	device_t cf_dev, cpu_dev;
	int error;

	/* Add a sysctl to get each driver's settings separately. */
	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
	    OID_AUTO, "freq_settings",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, dev, 0,
	    cpufreq_settings_sysctl, "A", "CPU frequency driver settings");

	/*
	 * Add only one cpufreq device to each CPU.  Currently, all CPUs
	 * must offer the same levels and be switched at the same time.
	 */
	cpu_dev = device_get_parent(dev);
	if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) {
		sc = device_get_softc(cf_dev);
		sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
		MPASS(sc->cf_drv_dev != NULL);
		return (0);
	}

	/* Add the child device and possibly sysctls. */
	cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", device_get_unit(cpu_dev));
	if (cf_dev == NULL)
		return (ENOMEM);
	device_quiet(cf_dev);

	error = device_probe_and_attach(cf_dev);
	if (error)
		return (error);

	sc = device_get_softc(cf_dev);
	sc->cf_drv_dev = dev;
	cpufreq_add_freq_driver_sysctl(cf_dev);
	return (error);
}

int
cpufreq_unregister(device_t dev)
{
	device_t cf_dev;
	struct cpufreq_softc *sc __diagused;

	/*
	 * If this is the last cpufreq child device, remove the control
	 * device as well.  We identify cpufreq children by calling a method
	 * they support.
	 */
	cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1);
	if (cf_dev == NULL) {
		device_printf(dev,
		    "warning: cpufreq_unregister called with no cpufreq device active\n");
		return (0);
	}
	sc = device_get_softc(cf_dev);
	MPASS(sc->cf_drv_dev == dev);
	device_delete_child(device_get_parent(cf_dev), cf_dev);

	return (0);
}

int
cpufreq_settings_changed(device_t dev)
{

	EVENTHANDLER_INVOKE(cpufreq_levels_changed,
	    device_get_unit(device_get_parent(dev)));
	return (0);
}
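
/*
 * Illustrative sketch (not part of the original file): a hardware-specific
 * frequency driver implements the cpufreq_if driver methods
 * (CPUFREQ_DRV_SET/GET/TYPE/SETTINGS) and hooks into this glue code roughly
 * as follows; "foofreq" is a hypothetical driver name.
 *
 *	static int
 *	foofreq_attach(device_t dev)
 *	{
 *		...
 *		cpufreq_register(dev);	 creates the cpuN "cpufreq" child
 *		return (0);
 *	}
 *
 *	static int
 *	foofreq_detach(device_t dev)
 *	{
 *		return (cpufreq_unregister(dev));
 *	}
 *
 * If the set of available settings changes later (e.g. on an AC/battery
 * transition), the driver calls cpufreq_settings_changed(dev) to fire the
 * cpufreq_levels_changed event handler.
 */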